From a006f082b8c630972a0d47bd3a840a139bf237a9 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 17 Feb 2017 22:18:12 +0300 Subject: [PATCH 001/303] mdbx: return to `devel` stage. --- README.md | 4 ++-- lmdb.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f81dfef1..bac63b48 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,9 @@ libmdbx Extended LMDB, aka "Расширенная LMDB". *The Future will Positive. Всё будет хорошо.* -[![Build Status](https://travis-ci.org/ReOpen/libmdbx.svg?branch=master)](https://travis-ci.org/ReOpen/libmdbx) +[![Build Status](https://travis-ci.org/ReOpen/libmdbx.svg?branch=devel)](https://travis-ci.org/ReOpen/libmdbx) -English version by Google [is here](https://translate.googleusercontent.com/translate_c?act=url&ie=UTF8&sl=ru&tl=en&u=https://github.com/ReOpen/libmdbx/tree/master). +English version by Google [is here](https://translate.googleusercontent.com/translate_c?act=url&ie=UTF8&sl=ru&tl=en&u=https://github.com/ReOpen/libmdbx/tree/devel). ## Кратко diff --git a/lmdb.h b/lmdb.h index 39031632..272ce965 100644 --- a/lmdb.h +++ b/lmdb.h @@ -209,7 +209,7 @@ typedef int mdb_filehandle_t; MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) /** The release date of this library version */ -#define MDB_VERSION_DATE "2017-02-17" +#define MDB_VERSION_DATE "DEVEL" /** A stringifier for the version info */ #define MDB_VERSTR(a,b,c,d) "MDBX " #a "." #b "." #c ": (" d ", https://github.com/ReOpen/libmdbx)" From e88adf39692ae084f177825e4c7449726dd6ccdb Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 21 Feb 2017 20:38:28 +0300 Subject: [PATCH 002/303] mdbx: preparation to rebirth. 
--- barriers.h | 6 +- intro.doc | 195 --------------------------------------------------- lmdb.h | 147 +++----------------------------------- mdb.c | 32 ++++----- mdb_chk.c | 4 +- reopen.h | 7 +- yota_test1.c | 4 +- yota_test2.c | 4 +- 8 files changed, 38 insertions(+), 361 deletions(-) delete mode 100644 intro.doc diff --git a/barriers.h b/barriers.h index 1e98730d..ff39cae2 100644 --- a/barriers.h +++ b/barriers.h @@ -14,11 +14,15 @@ /***************************************************************************** * Properly compiler/memory/coherence barriers - * in the most portable way for ReOpenMDBX project. + * in the most portable way for libmdbx project. * * Feedback and comments are welcome. * https://gist.github.com/leo-yuriev/ba186a6bf5cf3a27bae7 */ +#pragma once +/* *INDENT-OFF* */ +/* clang-format off */ + #if defined(__mips) && defined(__linux) /* Only MIPS has explicit cache control */ # include diff --git a/intro.doc b/intro.doc deleted file mode 100644 index 9462df18..00000000 --- a/intro.doc +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2015-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -/** @page starting Getting Started - -LMDB is compact, fast, powerful, and robust and implements a simplified -variant of the BerkeleyDB (BDB) API. (BDB is also very powerful, and verbosely -documented in its own right.) After reading this page, the main -\ref mdb documentation should make sense. Thanks to Bert Hubert -for creating the - -initial version of this writeup. - -Everything starts with an environment, created by #mdb_env_create(). 
-Once created, this environment must also be opened with #mdb_env_open(). - -#mdb_env_open() gets passed a name which is interpreted as a directory -path. Note that this directory must exist already, it is not created -for you. Within that directory, a lock file and a storage file will be -generated. If you don't want to use a directory, you can pass the -#MDB_NOSUBDIR option, in which case the path you provided is used -directly as the data file, and another file with a "-lock" suffix -added will be used for the lock file. - -Once the environment is open, a transaction can be created within it -using #mdb_txn_begin(). Transactions may be read-write or read-only, -and read-write transactions may be nested. A transaction must only -be used by one thread at a time. Transactions are always required, -even for read-only access. The transaction provides a consistent -view of the data. - -Once a transaction has been created, a database can be opened within it -using #mdb_dbi_open(). If only one database will ever be used in the -environment, a NULL can be passed as the database name. For named -databases, the #MDB_CREATE flag must be used to create the database -if it doesn't already exist. Also, #mdb_env_set_maxdbs() must be -called after #mdb_env_create() and before #mdb_env_open() to set the -maximum number of named databases you want to support. - -Note: a single transaction can open multiple databases. Generally -databases should only be opened once, by the first transaction in -the process. After the first transaction completes, the database -handles can freely be used by all subsequent transactions. - -Within a transaction, #mdb_get() can retrieve and #mdb_put() can store single -key/value pairs if that is all you need to do (but see \ref Cursors -below if you want to do more). - -A key/value pair is expressed as two #MDB_val structures. This struct -has two fields, \c mv_size and \c mv_data. The data is a \c void pointer to -an array of \c mv_size bytes. 
- -Because LMDB is very efficient (and usually zero-copy), the data returned -in an #MDB_val structure may be memory-mapped straight from disk. In -other words look but do not touch (or free() for that matter). -Once a transaction is closed, the values can no longer be used, so -make a copy if you need to keep them after that. - -@section Cursors Cursors - -To do more powerful things, we must use a cursor. - -Within the transaction, a cursor can be created with #mdb_cursor_open(). -With this cursor we can store/retrieve/delete (multiple) values using -#mdb_cursor_get(), #mdb_cursor_put(), and #mdb_cursor_del(). - -#mdb_cursor_get() positions itself depending on the cursor operation -requested, and for some operations, on the supplied key. For example, -to list all key/value pairs in a database, use operation #MDB_FIRST for -the first call to #mdb_cursor_get(), and #MDB_NEXT on subsequent calls, -until the end is hit. - -To retrieve all keys starting from a specified key value, use #MDB_SET. -For more cursor operations, see the \ref mdb docs. - -When using #mdb_cursor_put(), either the function will position the -cursor for you based on the \b key, or you can use operation -#MDB_CURRENT to use the current position of the cursor. Note that -\b key must then match the current position's key. - -@subsection summary Summarizing the Opening - -So we have a cursor in a transaction which opened a database in an -environment which is opened from a filesystem after it was -separately created. - -Or, we create an environment, open it from a filesystem, create a -transaction within it, open a database within that transaction, -and create a cursor within all of the above. - -Got it? - -@section thrproc Threads and Processes - -LMDB uses POSIX locks on files, and these locks have issues if one -process opens a file multiple times. Because of this, do not -#mdb_env_open() a file multiple times from a single process. 
Instead, -share the LMDB environment that has opened the file across all threads. -Otherwise, if a single process opens the same environment multiple times, -closing it once will remove all the locks held on it, and the other -instances will be vulnerable to corruption from other processes. - -Also note that a transaction is tied to one thread by default using -Thread Local Storage. If you want to pass read-only transactions across -threads, you can use the #MDB_NOTLS option on the environment. - -@section txns Transactions, Rollbacks, etc. - -To actually get anything done, a transaction must be committed using -#mdb_txn_commit(). Alternatively, all of a transaction's operations -can be discarded using #mdb_txn_abort(). In a read-only transaction, -any cursors will \b not automatically be freed. In a read-write -transaction, all cursors will be freed and must not be used again. - -For read-only transactions, obviously there is nothing to commit to -storage. The transaction still must eventually be aborted to close -any database handle(s) opened in it, or committed to keep the -database handles around for reuse in new transactions. - -In addition, as long as a transaction is open, a consistent view of -the database is kept alive, which requires storage. A read-only -transaction that no longer requires this consistent view should -be terminated (committed or aborted) when the view is no longer -needed (but see below for an optimization). - -There can be multiple simultaneously active read-only transactions -but only one that can write. Once a single read-write transaction -is opened, all further attempts to begin one will block until the -first one is committed or aborted. This has no effect on read-only -transactions, however, and they may continue to be opened at any time. - -@section dupkeys Duplicate Keys - -#mdb_get() and #mdb_put() respectively have no and only some support -for multiple key/value pairs with identical keys. 
If there are multiple -values for a key, #mdb_get() will only return the first value. - -When multiple values for one key are required, pass the #MDB_DUPSORT -flag to #mdb_dbi_open(). In an #MDB_DUPSORT database, by default -#mdb_put() will not replace the value for a key if the key existed -already. Instead it will add the new value to the key. In addition, -#mdb_del() will pay attention to the value field too, allowing for -specific values of a key to be deleted. - -Finally, additional cursor operations become available for -traversing through and retrieving duplicate values. - -@section optim Some Optimization - -If you frequently begin and abort read-only transactions, as an -optimization, it is possible to only reset and renew a transaction. - -#mdb_txn_reset() releases any old copies of data kept around for -a read-only transaction. To reuse this reset transaction, call -#mdb_txn_renew() on it. Any cursors in this transaction must also -be renewed using #mdb_cursor_renew(). - -Note that #mdb_txn_reset() is similar to #mdb_txn_abort() and will -close any databases you opened within the transaction. - -To permanently free a transaction, reset or not, use #mdb_txn_abort(). - -@section cleanup Cleaning Up - -For read-only transactions, any cursors created within it must -be closed using #mdb_cursor_close(). - -It is very rarely necessary to close a database handle, and in -general they should just be left open. 
- -@section onward The Full API - -The full \ref mdb documentation lists further details, like how to: - - \li size a database (the default limits are intentionally small) - \li drop and clean a database - \li detect and report errors - \li optimize (bulk) loading speed - \li (temporarily) reduce robustness to gain even more speed - \li gather statistics about the database - \li define custom sort orders - -*/ diff --git a/lmdb.h b/lmdb.h index 272ce965..48ca616f 100644 --- a/lmdb.h +++ b/lmdb.h @@ -1,141 +1,13 @@ -/** @file lmdb.h - * @brief Extended Lightning memory-mapped database library +/* + * Copyright 2015-2017 Leonid Yuriev . * - * @mainpage Extended Lightning Memory-Mapped Database (MDBX) - * - * @section intro_sec Introduction - * MDBX is a Btree-based database management library modeled loosely on the - * BerkeleyDB API, but much simplified. The entire database is exposed - * in a memory map, and all data fetches return data directly - * from the mapped memory, so no malloc's or memcpy's occur during - * data fetches. As such, the library is extremely simple because it - * requires no page caching layer of its own, and it is extremely high - * performance and memory-efficient. It is also fully transactional with - * full ACID semantics, and when the memory map is read-only, the - * database integrity cannot be corrupted by stray pointer writes from - * application code. - * - * The library is fully thread-aware and supports concurrent read/write - * access from multiple processes and threads. Data pages use a copy-on- - * write strategy so no active data pages are ever overwritten, which - * also provides resistance to corruption and eliminates the need of any - * special recovery procedures after a system crash. Writes are fully - * serialized; only one write transaction may be active at a time, which - * guarantees that writers can never deadlock. 
The database structure is - * multi-versioned so readers run with no locks; writers cannot block - * readers, and readers don't block writers. - * - * Unlike other well-known database mechanisms which use either write-ahead - * transaction logs or append-only data writes, MDBX requires no maintenance - * during operation. Both write-ahead loggers and append-only databases - * require periodic checkpointing and/or compaction of their log or database - * files otherwise they grow without bound. MDBX tracks free pages within - * the database and re-uses them for new write operations, so the database - * size does not grow without bound in normal use. - * - * The memory map can be used as a read-only or read-write map. It is - * read-only by default as this provides total immunity to corruption. - * Using read-write mode offers much higher write performance, but adds - * the possibility for stray application writes thru pointers to silently - * corrupt the database. Of course if your application code is known to - * be bug-free (...) then this is not an issue. - * - * If this is your first time using a transactional embedded key/value - * store, you may find the \ref starting page to be helpful. - * - * @section caveats_sec Caveats - * Troubleshooting the lock file: - * - * - A broken lockfile can cause sync issues. - * Stale reader transactions left behind by an aborted program - * cause further writes to grow the database quickly, and - * stale locks can block further operation. - * - * Fix: Check for stale readers periodically, using the - * #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. - * Stale writers will be cleared automatically on Linux - * using POSIX mutexes with Robust option. - * Otherwise just make all programs using the database close it; - * the lockfile is always reset on first open of the environment. 
- * - * - * Restrictions/caveats (in addition to those listed for some functions): - * - * - An MDBX configuration will often reserve considerable \b unused - * memory address space and maybe file size for future growth. - * This does not use actual memory or disk space, but users may need - * to understand the difference so they won't be scared off. - * - * - An LMDB configuration will often reserve considerable \b unused - * memory address space and maybe file size for future growth. - * This does not use actual memory or disk space, but users may need - * to understand the difference so they won't be scared off. - * - * - By default, in versions before 0.9.10, unused portions of the data - * file might receive garbage data from memory freed by other code. - * (This does not happen when using the #MDB_WRITEMAP flag.) As of - * 0.9.10 the default behavior is to initialize such memory before - * writing to the data file. Since there may be a slight performance - * cost due to this initialization, applications may disable it using - * the #MDB_NOMEMINIT flag. Applications handling sensitive data - * which must not be written should not use this flag. This flag is - * irrelevant when using #MDB_WRITEMAP. - * - * - A thread can only use one transaction at a time, plus any child - * transactions. Each transaction belongs to one thread. See below. - * The #MDB_NOTLS flag changes this for read-only transactions. - * - * - Use an MDB_env* in the process which opened it, not after fork(). - * - * - Do not have open an MDBX database twice in the same process at - * the same time. Not even from a plain open() call - close()ing it - * breaks fcntl() advisory locking. (It is OK to reopen it after - * fork() - exec*(), since the lockfile has FD_CLOEXEC set.) - * - * - Avoid long-lived transactions. Read transactions prevent - * reuse of pages freed by newer write transactions, thus the - * database can grow quickly. 
Write transactions prevent - * other write transactions, since writes are serialized. - * - * - Avoid suspending a process with active transactions. These - * would then be "long-lived" as above. Also read transactions - * suspended when writers commit could sometimes see wrong data. - * - * ...when several processes can use a database concurrently: - * - * - Avoid aborting a process with an active transaction. - * The transaction becomes "long-lived" as above until a check - * for stale readers is performed or the lockfile is reset, - * since the process may not remove it from the lockfile. - * - * This does not apply to write transactions if the system clears - * stale writers, see above. - * - * - If you do that anyway, do a periodic check for stale readers. Or - * close the environment once in a while, so the lockfile can get reset. - * - * - Do not use MDBX databases on remote filesystems, even between - * processes on the same host. This breaks flock() on some OSes, - * possibly memory map sync, and certainly sync between programs - * on different hosts. - * - * - Opening a database can fail if another process is opening or - * closing it at exactly the same time. - * - * @author Leonid Yuriev, 'ReOpen' initiative . - * Howard Chu, Symas Corp. All rights reserved. - * - * @copyright 2015-2017 Leonid Yuriev . - * 2011-2017 Howard Chu, Symas Corp. All rights reserved. + * This code is derived from "LMDB engine" written by + * Howard Chu (Symas Corporation), which itself derived from btree.c + * written by Martin Hedenfalk. * * --- * - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2015,2016 Peter-Service R&D LLC. - * @par Derived From: - * This code is derived from LMDB engine written by Howard Chu, Symas Corporation. - * - * Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. + * Portions Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -145,10 +17,9 @@ * top-level directory of the distribution or, alternatively, at * . * - * @par Derived From: - * This code is derived from btree.c written by Martin Hedenfalk. + * --- * - * Copyright (c) 2009, 2010 Martin Hedenfalk + * Portions Copyright (c) 2009, 2010 Martin Hedenfalk * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -303,7 +174,7 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel /** tie reader locktable slots to #MDB_txn objects instead of to threads */ #define MDB_NOTLS 0x200000 /** don't do any locking, caller must manage their own locks - * WARNING: ReOpenMDBX don't support this mode. */ + * WARNING: libmdbx don't support this mode. */ #define MDB_NOLOCK__UNSUPPORTED 0x400000 /** don't do readahead */ #define MDB_NORDAHEAD 0x800000 diff --git a/mdb.c b/mdb.c index 06ec817b..c26cf0d1 100644 --- a/mdb.c +++ b/mdb.c @@ -1,15 +1,13 @@ -/** @file mdb.c - * @brief Lightning memory-mapped database library - * - * A Btree-based database management library modeled loosely on the - * BerkeleyDB API, but much simplified. - */ - /* * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. + * + * This code is derived from "LMDB engine" written by + * Howard Chu (Symas Corporation), which itself derived from btree.c + * written by Martin Hedenfalk. + * + * --- + * + * Portions Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -19,9 +17,9 @@ * top-level directory of the distribution or, alternatively, at * . 
* - * This code is derived from btree.c written by Martin Hedenfalk. + * --- * - * Copyright (c) 2009, 2010 Martin Hedenfalk + * Portions Copyright (c) 2009, 2010 Martin Hedenfalk * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -47,27 +45,27 @@ /* LY: Please do not ask us for Windows support, just never! * But you can make a fork for Windows, or become maintainer for FreeBSD... */ #ifndef __gnu_linux__ -# warning "ReOpenMDBX supports only GNU Linux" +# warning "libmdbx supports only GNU Linux" #endif #include #if !defined(__GNUC__) || !__GNUC_PREREQ(4,2) - /* LY: Actualy ReOpenMDBX was not tested with compilers + /* LY: Actualy libmdbx was not tested with compilers * older than GCC 4.4 (from RHEL6). * But you could remove this #error and try to continue at your own risk. * In such case please don't rise up an issues related ONLY to old compilers. */ -# warning "ReOpenMDBX required at least GCC 4.2 compatible C/C++ compiler." +# warning "libmdbx required at least GCC 4.2 compatible C/C++ compiler." #endif #if !defined(__GNU_LIBRARY__) || !__GLIBC_PREREQ(2,12) - /* LY: Actualy ReOpenMDBX was not tested with something + /* LY: Actualy libmdbx was not tested with something * older than glibc 2.12 (from RHEL6). * But you could remove this #error and try to continue at your own risk. * In such case please don't rise up an issues related ONLY to old systems. */ -# warning "ReOpenMDBX required at least GLIBC 2.12." +# warning "libmdbx required at least GLIBC 2.12." #endif #if MDB_DEBUG diff --git a/mdb_chk.c b/mdb_chk.c index 1422eea1..db141b4b 100644 --- a/mdb_chk.c +++ b/mdb_chk.c @@ -6,12 +6,12 @@ * * This file is part of libmdbx. 
* - * ReOpenMDBX is free software; you can redistribute it and/or modify it under + * libmdbx is free software; you can redistribute it and/or modify it under * the terms of the GNU Affero General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * - * ReOpenMDBX is distributed in the hope that it will be useful, + * libmdbx is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. diff --git a/reopen.h b/reopen.h index dd214172..ee828b94 100644 --- a/reopen.h +++ b/reopen.h @@ -12,8 +12,9 @@ * . */ -#ifndef _REOPEN_H -#define _REOPEN_H +#pragma once +/* *INDENT-OFF* */ +/* clang-format off */ #ifndef __CLANG_PREREQ # ifdef __clang__ @@ -233,5 +234,3 @@ __extern_C void __assert_fail( ((void)(addr), (void)(size)) # define ATTRIBUTE_NO_SANITIZE_ADDRESS #endif /* __SANITIZE_ADDRESS__ */ - -#endif /* _REOPEN_H */ diff --git a/yota_test1.c b/yota_test1.c index be727cbf..0cad5468 100644 --- a/yota_test1.c +++ b/yota_test1.c @@ -4,12 +4,12 @@ * * This file is part of libmdbx. * - * ReOpenMDBX is free software; you can redistribute it and/or modify it under + * libmdbx is free software; you can redistribute it and/or modify it under * the terms of the GNU Affero General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * - * ReOpenMDBX is distributed in the hope that it will be useful, + * libmdbx is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. 
diff --git a/yota_test2.c b/yota_test2.c index 753bea2f..80dc4f2f 100644 --- a/yota_test2.c +++ b/yota_test2.c @@ -4,12 +4,12 @@ * * This file is part of libmdbx. * - * ReOpenMDBX is free software; you can redistribute it and/or modify it under + * libmdbx is free software; you can redistribute it and/or modify it under * the terms of the GNU Affero General Public License as published by * the Free Software Foundation; either version 3 of the License, or * (at your option) any later version. * - * ReOpenMDBX is distributed in the hope that it will be useful, + * libmdbx is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. From 2dc3e1ee5fe7c84cfe548e346cacb615f4b302a6 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 21 Feb 2017 20:16:54 +0300 Subject: [PATCH 003/303] mdbx: reformat and some cleanup (1/3 for rebirth). 
--- .travis.yml | 2 +- CHANGES | 223 - Doxyfile | 1631 ----- Makefile | 39 +- README.md | 6 +- barriers.h | 4 +- lmdb.h | 1557 ----- mdb.c | 10723 -------------------------------- mdb_chk.c | 954 --- mdb_copy.c | 81 - mdb_dump.c | 314 - mdb_load.c | 456 -- mdb_stat.c | 299 - mdbx.c | 12019 ++++++++++++++++++++++++++++++++++-- mdbx.h | 1905 +++++- mdbx_chk.c | 979 +++ mdb_copy.1 => mdbx_copy.1 | 8 +- mdbx_copy.c | 76 + mdb_dump.1 => mdbx_dump.1 | 12 +- mdbx_dump.c | 316 + mdb_load.1 => mdbx_load.1 | 14 +- mdbx_load.c | 466 ++ mdb_stat.1 => mdbx_stat.1 | 8 +- mdbx_stat.c | 306 + midl.c | 361 -- midl.h | 209 +- mtest0.c | 337 +- mtest1.c | 309 +- mtest2.c | 219 +- mtest3.c | 233 +- mtest4.c | 294 +- mtest5.c | 237 +- mtest6.c | 194 +- sample-mdb.txt | 28 +- wbench.c | 339 +- yota_test1.c | 367 +- yota_test2.c | 445 +- 37 files changed, 16897 insertions(+), 19073 deletions(-) delete mode 100644 CHANGES delete mode 100644 Doxyfile delete mode 100644 lmdb.h delete mode 100644 mdb.c delete mode 100644 mdb_chk.c delete mode 100644 mdb_copy.c delete mode 100644 mdb_dump.c delete mode 100644 mdb_load.c delete mode 100644 mdb_stat.c create mode 100644 mdbx_chk.c rename mdb_copy.1 => mdbx_copy.1 (94%) create mode 100644 mdbx_copy.c rename mdb_dump.1 => mdbx_dump.1 (94%) create mode 100644 mdbx_dump.c rename mdb_load.1 => mdbx_load.1 (94%) create mode 100644 mdbx_load.c rename mdb_stat.1 => mdbx_stat.1 (95%) create mode 100644 mdbx_stat.c delete mode 100644 midl.c diff --git a/.travis.yml b/.travis.yml index 27287a04..5b6d5ee5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,4 +12,4 @@ compiler: os: - linux -script: if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then make all lmdb check; fi +script: if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then make all check; fi diff --git a/CHANGES b/CHANGES deleted file mode 100644 index 93486855..00000000 --- a/CHANGES +++ /dev/null @@ -1,223 +0,0 @@ -MDBX - Add MDB_PREV_MULTIPLE - Add error MDB_PROBLEM, replace some MDB_CORRUPTED - Workarounds for 
glibc bugs: #21031 and 21032. - -LMDB 0.9.20 Release Engineering - Fix mdb_load with escaped plaintext (ITS#8558) - Fix mdb_cursor_last / mdb_put interaction (ITS#8557) - -LMDB 0.9.19 Release (2016/12/28) - Fix mdb_env_cwalk cursor init (ITS#8424) - Fix robust mutexes on Solaris 10/11 (ITS#8339) - Fix MDB_GET_BOTH on non-dup record (ITS#8393) - Optimize mdb_drop - Fix xcursors after mdb_cursor_del (ITS#8406) - Fix MDB_NEXT_DUP after mdb_cursor_del (ITS#8412) - Fix mdb_cursor_put resetting C_EOF (ITS#8489) - Fix mdb_env_copyfd2 to return EPIPE on SIGPIPE (ITS#8504) - Fix mdb_env_copy with empty DB (ITS#8209) - Fix behaviors with fork (ITS#8505) - Fix mdb_dbi_open with mainDB cursors (ITS#8542) - Fix F_NOCACHE on MacOS, error is non-fatal (ITS#7682) - Documentation - Cleanup doxygen nits - Note reserved vs actual mem/disk usage - - -LMDB 0.9.18 Release (2016/02/05) - already done for mdbx - Fix robust mutex detection on glibc 2.10-11 (ITS#8330) - Fix page_search_root assert on FreeDB (ITS#8336) - Fix MDB_APPENDDUP vs. 
rewrite(single item) (ITS#8334) - n/a for mdbx - Fix mdb_copy of large files on Windows - Fix subcursor move after delete (ITS#8355) - Fix mdb_midl_shrink off-by-one (ITS#8363) - n/a for mdbx - Check for utf8_to_utf16 failures (ITS#7992) - Catch strdup failure in mdb_dbi_open - Build - already done for mdbx - Additional makefile var tweaks (ITS#8169) - Documentation - Add Getting Started page - Update WRITEMAP description - -LMDB 0.9.17 Release (2015/11/30) - Fix ITS#7377 catch calloc failure - Fix ITS#8237 regression from ITS#7589 - Fix ITS#8238 page_split for DUPFIXED pages - Fix ITS#8221 MDB_PAGE_FULL on delete/rebalance - Fix ITS#8258 rebalance/split assert - Fix ITS#8263 cursor_put cursor tracking - Fix ITS#8264 cursor_del cursor tracking - Fix ITS#8310 cursor_del cursor tracking - Fix ITS#8299 mdb_del cursor tracking - Fix ITS#8300 mdb_del cursor tracking - Fix ITS#8304 mdb_del cursor tracking - Fix ITS#7771 fakepage cursor tracking - Fix ITS#7789 ensure mapsize >= pages in use - Fix ITS#7971 mdb_txn_renew0() new reader slots - already done for mdbx - Fix ITS#7969 use __sync_synchronize on non-x86 - Fix ITS#8311 page_split from update_key - Fix ITS#8312 loose pages in nested txn - Fix ITS#8313 mdb_rebalance dummy cursor - Fix ITS#8315 dirty_room in nested txn - Fix ITS#8323 dirty_list in nested txn - Fix ITS#8316 page_merge cursor tracking - Fix ITS#8319 mdb_load error messages - Fix ITS#8320 mdb_load plaintext input - Fix ITS#8321 cursor tracking - Added mdb_txn_id() (ITS#7994) - Added robust mutex support - Miscellaneous cleanup/simplification - Build - Create install dirs if needed (ITS#8256) - not affected mdbx - Fix ThreadProc decl on Win32/MSVC (ITS#8270) - not affected mdbx - Added ssize_t typedef for MSVC (ITS#8067) - not affected mdbx - Use ANSI apis on Windows (ITS#8069) - already done for mdbx - Use O_SYNC if O_DSYNC,MDB_DSYNC are not defined (ITS#7209) - already done for mdbx - Allow passing AR to make (ITS#8168) - Allow passing mandir to make 
install (ITS#8169) - -LMDB 0.9.16 Release (2015/08/14) - Fix cursor EOF bug (ITS#8190) - Fix handling of subDB records (ITS#8181) - Fix mdb_midl_shrink() usage (ITS#8200) - not affected mdbx - fix reference to EINTR on WIN32 from ITS#8106 (ITS#8192) - -LMDB 0.9.15 Release (2015/06/19) - Fix txn init (ITS#7961,#7987) - Fix MDB_PREV_DUP (ITS#7955,#7671) - Fix compact of empty env (ITS#7956) - Fix mdb_copy file mode - Fix mdb_env_close() after failed mdb_env_open() - Fix mdb_rebalance collapsing root (ITS#8062) - Fix mdb_load with large values (ITS#8066) - Fix to retry writes on EINTR (ITS#8106) - Fix mdb_cursor_del on empty DB (ITS#8109) - Fix and Rework comparison for MDB_INTEGERKEY/MDB_INTEGERDUP (ITS#8117) - Fix error handling (ITS#7959,#8157,etc.) - Fix race conditions (ITS#7969,7970) - Added workaround for fdatasync bug in ext3fs - Build - Don't use -fPIC for static lib - Update .gitignore (ITS#7952,#7953) - Cleanup for "make test" (ITS#7841), "make clean", mtest*.c - Misc. Android/Windows cleanup - Documentation - Fix MDB_APPEND doc - Fix MDB_MAXKEYSIZE doc (ITS#8156) - Fix mdb_cursor_put,mdb_cursor_del EACCES description - Fix mdb_env_sync(MDB_RDONLY env) doc (ITS#8021) - Clarify MDB_WRITEMAP doc (ITS#8021) - Clarify mdb_env_open doc - Clarify mdb_dbi_open doc - -LMDB 0.9.14 Release (2014/09/20) - Fix to support 64K page size (ITS#7713) - Fix to persist decreased as well as increased mapsizes (ITS#7789) - Fix cursor bug when deleting last node of a DUPSORT key - Fix mdb_env_info to return FIXEDMAP address - Fix ambiguous error code from writing to closed DBI (ITS#7825) - Fix mdb_copy copying past end of file (ITS#7886) - Fix cursor bugs from page_merge/rebalance - Fix to dirty fewer pages in deletes (mdb_page_loose()) - Fix mdb_dbi_open creating subDBs (ITS#7917) - Fix mdb_cursor_get(_DUP) with single value (ITS#7913) - Fix Windows compat issues in mtests (ITS#7879) - Add compacting variant of mdb_copy - Add BigEndian integer key compare code - Add 
mdb_dump/mdb_load utilities - -LMDB 0.9.13 Release (2014/06/18) - Fix mdb_page_alloc unlimited overflow page search - Documentation - Re-fix MDB_CURRENT doc (ITS#7793) - Fix MDB_GET_MULTIPLE/MDB_NEXT_MULTIPLE doc - -LMDB 0.9.12 Release (2014/06/13) - Fix MDB_GET_BOTH regression (ITS#7875,#7681) - Fix MDB_MULTIPLE writing multiple keys (ITS#7834) - Fix mdb_rebalance (ITS#7829) - Fix mdb_page_split (ITS#7815) - Fix md_entries count (ITS#7861,#7828,#7793) - Fix MDB_CURRENT (ITS#7793) - Fix possible crash on Windows DLL detach - Misc code cleanup - Documentation - mdb_cursor_put: cursor moves on error (ITS#7771) - - -LMDB 0.9.11 Release (2014/01/15) - Add mdb_env_set_assert() (ITS#7775) - Fix: invalidate txn on page allocation errors (ITS#7377) - Fix xcursor tracking in mdb_cursor_del0() (ITS#7771) - Fix corruption from deletes (ITS#7756) - Fix Windows/MSVC build issues - Raise safe limit of max MDB_MAXKEYSIZE - Misc code cleanup - Documentation - Remove spurious note about non-overlapping flags (ITS#7665) - -LMDB 0.9.10 Release (2013/11/12) - Add MDB_NOMEMINIT option - Fix mdb_page_split() again (ITS#7589) - Fix MDB_NORDAHEAD definition (ITS#7734) - Fix mdb_cursor_del() positioning (ITS#7733) - Partial fix for larger page sizes (ITS#7713) - Fix Windows64/MSVC build issues - -LMDB 0.9.9 Release (2013/10/24) - Add mdb_env_get_fd() - Add MDB_NORDAHEAD option - Add MDB_NOLOCK option - Avoid wasting space in mdb_page_split() (ITS#7589) - Fix mdb_page_merge() cursor fixup (ITS#7722) - Fix mdb_cursor_del() on last delete (ITS#7718) - Fix adding WRITEMAP on existing env (ITS#7715) - Fix nested txns (ITS#7515) - Fix mdb_env_copy() O_DIRECT bug (ITS#7682) - Fix mdb_cursor_set(SET_RANGE) return code (ITS#7681) - Fix mdb_rebalance() cursor fixup (ITS#7701) - Misc code cleanup - Documentation - Note that by default, readers need write access - - -LMDB 0.9.8 Release (2013/09/09) - Allow mdb_env_set_mapsize() on an open environment - Fix mdb_dbi_flags() (ITS#7672) - Fix 
mdb_page_unspill() in nested txns - Fix mdb_cursor_get(CURRENT|NEXT) after a delete - Fix mdb_cursor_get(DUP) to always return key (ITS#7671) - Fix mdb_cursor_del() to always advance to next item (ITS#7670) - Fix mdb_cursor_set(SET_RANGE) for tree with single page (ITS#7681) - Fix mdb_env_copy() retry open if O_DIRECT fails (ITS#7682) - Tweak mdb_page_spill() to be less aggressive - Documentation - Update caveats since mdb_reader_check() added in 0.9.7 - -LMDB 0.9.7 Release (2013/08/17) - Don't leave stale lockfile on failed RDONLY open (ITS#7664) - Fix mdb_page_split() ref beyond cursor depth - Fix read txn data race (ITS#7635) - Fix mdb_rebalance (ITS#7536, #7538) - Fix mdb_drop() (ITS#7561) - Misc DEBUG macro fixes - Add MDB_NOTLS envflag - Add mdb_env_copyfd() - Add mdb_txn_env() (ITS#7660) - Add mdb_dbi_flags() (ITS#7661) - Add mdb_env_get_maxkeysize() - Add mdb_env_reader_list()/mdb_env_reader_check() - Add mdb_page_spill/unspill, remove hard txn size limit - Use shorter names for semaphores (ITS#7615) - Build - Fix install target (ITS#7656) - Documentation - Misc updates for cursors, DB handles, data lifetime - -LMDB 0.9.6 Release (2013/02/25) - Many fixes/enhancements - -LMDB 0.9.5 Release (2012/11/30) - Renamed from libmdb to liblmdb - Many fixes/enhancements diff --git a/Doxyfile b/Doxyfile deleted file mode 100644 index 5ca2cfe8..00000000 --- a/Doxyfile +++ /dev/null @@ -1,1631 +0,0 @@ -# Doxyfile 1.7.1 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project -# -# All text after a hash (#) is considered a comment and will be ignored -# The format is: -# TAG = value [value, ...] -# For lists items can also be appended using: -# TAG += value [value, ...] 
-# Values that contain spaces should be placed between quotes (" ") - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all -# text before the first occurrence of this tag. Doxygen uses libiconv (or the -# iconv built into libc) for the transcoding. See -# http://www.gnu.org/software/libiconv for the list of possible encodings. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded -# by quotes) that should identify the project. - -PROJECT_NAME = LMDB - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. -# This could be handy for archiving the generated documentation or -# if some version control system is used. - -PROJECT_NUMBER = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) -# base path where the generated documentation will be put. -# If a relative path is entered, it will be relative to the location -# where doxygen was started. If left blank the current directory will be used. - -OUTPUT_DIRECTORY = - -# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create -# 4096 sub-directories (in 2 levels) under the output directory of each output -# format and will distribute the generated files over these directories. -# Enabling this option can be useful when feeding doxygen a huge amount of -# source files, where putting all generated files in the same directory would -# otherwise cause performance problems for the file system. - -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. 
Doxygen will use this -# information to generate all constant output in the proper language. -# The default language is English, other supported languages are: -# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, -# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, -# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English -# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, -# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, -# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will -# include brief member descriptions after the members that are listed in -# the file and class documentation (similar to JavaDoc). -# Set to NO to disable this. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend -# the brief description of a member or function before the detailed description. -# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator -# that is used to form the text in various listings. Each string -# in this list, if found as the leading text of the brief description, will be -# stripped from the text and the result after processing the whole list, is -# used as the annotated text. Otherwise, the brief description is used as-is. 
-# If left blank, the following values are used ("$name" is automatically -# replaced with the name of the entity): "The $name class" "The $name widget" -# "The $name file" "is" "provides" "specifies" "contains" -# "represents" "a" "an" "the" - -ABBREVIATE_BRIEF = - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# Doxygen will generate a detailed section even if there is only a brief -# description. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full -# path before files name in the file list and in the header files. If set -# to NO the shortest path that makes the file name unique will be used. - -FULL_PATH_NAMES = YES - -# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag -# can be used to strip a user-defined part of the path. Stripping is -# only done if one of the specified strings matches the left-hand part of -# the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the -# path to strip. - -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of -# the path mentioned in the documentation of a class, which tells -# the reader which header file to include in order to use a class. -# If left blank only the name of the header file containing the class -# definition is used. Otherwise one should specify the include paths that -# are normally passed to the compiler using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter -# (but less readable) file names. 
This can be useful if your file system -# doesn't support long names like on DOS, Mac, or CD-ROM. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen -# will interpret the first line (until the first dot) of a JavaDoc-style -# comment as the brief description. If set to NO, the JavaDoc -# comments will behave just like regular Qt-style comments -# (thus requiring an explicit @brief command for a brief description.) - -JAVADOC_AUTOBRIEF = NO - -# If the QT_AUTOBRIEF tag is set to YES then Doxygen will -# interpret the first line (until the first dot) of a Qt-style -# comment as the brief description. If set to NO, the comments -# will behave just like regular Qt-style comments (thus requiring -# an explicit \brief command for a brief description.) - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen -# treat a multi-line C++ special comment block (i.e. a block of //! or /// -# comments) as a brief description. This used to be the default behaviour. -# The new default is to treat a multi-line C++ comment block as a detailed -# description. Set this tag to YES if you prefer the old behaviour instead. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented -# member inherits the documentation from any documented member that it -# re-implements. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce -# a new page for each member. If set to NO, the documentation of a member will -# be part of the file/class/namespace that contains it. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. -# Doxygen uses this value to replace tabs by spaces in code fragments. - -TAB_SIZE = 4 - -# This tag can be used to specify a number of aliases that act -# as commands in the documentation. An alias has the form "name=value".
-# For example adding "sideeffect=\par Side Effects:\n" will allow you to -# put the command \sideeffect (or @sideeffect) in the documentation, which -# will result in a user-defined paragraph with heading "Side Effects:". -# You can put \n's in the value part of an alias to insert newlines. - -ALIASES = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C -# sources only. Doxygen will then generate output that is more tailored for C. -# For instance, some of the names that are used will be different. The list -# of all members will be omitted, etc. - -OPTIMIZE_OUTPUT_FOR_C = YES - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java -# sources only. Doxygen will then generate output that is more tailored for -# Java. For instance, namespaces will be presented as packages, qualified -# scopes will look different, etc. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources only. Doxygen will then generate output that is more tailored for -# Fortran. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for -# VHDL. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given extension. -# Doxygen has a built-in mapping, but you can override or extend it using this -# tag. The format is ext=language, where ext is a file extension, and language -# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, -# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make -# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C -# (default is Fortran), use: inc=Fortran f=C. 
Note that for custom extensions -# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. - -EXTENSION_MAPPING = - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should -# set this tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. -# func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. -# Doxygen will parse them like normal C++ but will assume all classes use public -# instead of private inheritance when no explicit protection keyword is present. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate getter -# and setter methods for a property. Setting this option to YES (the default) -# will make doxygen to replace the get and set methods by a property in the -# documentation. This will only work if the methods are indeed getting or -# setting a simple type. If this is not the case, or you want to show the -# methods anyway, you should set this option to NO. - -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES, then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. 
- -DISTRIBUTE_GROUP_DOC = YES - -# Set the SUBGROUPING tag to YES (the default) to allow class member groups of -# the same type (for instance a group of public functions) to be put as a -# subgroup of that type (e.g. under the Public Functions section). Set it to -# NO to prevent subgrouping. Alternatively, this can be done per class using -# the \nosubgrouping command. - -SUBGROUPING = YES - -INLINE_GROUPED_CLASSES = YES -# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum -# is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically -# be useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. - -TYPEDEF_HIDES_STRUCT = YES - -# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to -# determine which symbols to keep in memory and which to flush to disk. -# When the cache is full, less often used symbols will be written to disk. -# For small to medium size projects (<1000 input files) the default value is -# probably good enough. For larger projects a too small cache size can cause -# doxygen to be busy swapping symbols to and from disk most of the time -# causing a significant performance penalty. -# If the system has enough physical memory increasing the cache will improve the -# performance by keeping more symbols in memory. Note that the value works on -# a logarithmic scale so increasing the size by one will roughly double the -# memory usage. The cache size is given by this formula: -# 2^(16+SYMBOL_CACHE_SIZE).
The valid range is 0..9, the default is 0, -# corresponding to a cache size of 2^16 = 65536 symbols - -SYMBOL_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in -# documentation are documented, even if no documentation was available. -# Private class members and static file members will be hidden unless -# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES - -EXTRACT_ALL = NO - -# If the EXTRACT_PRIVATE tag is set to YES all private members of a class -# will be included in the documentation. - -EXTRACT_PRIVATE = NO - -# If the EXTRACT_STATIC tag is set to YES all static members of a file -# will be included in the documentation. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) -# defined locally in source files will be included in the documentation. -# If set to NO only classes defined in header files are included. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. When set to YES local -# methods, which are defined in the implementation section but not in -# the interface are included in the documentation. -# If set to NO (the default) only methods in the interface are included. - -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base -# name of the file that contains the anonymous namespace. By default -# anonymous namespace are hidden. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all -# undocumented members of documented classes, files or namespaces. 
-# If set to NO (the default) these members will be included in the -# various overviews, but no documentation section is generated. -# This option has no effect if EXTRACT_ALL is enabled. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. -# If set to NO (the default) these classes will be included in the various -# overviews. This option has no effect if EXTRACT_ALL is enabled. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all -# friend (class|struct|union) declarations. -# If set to NO (the default) these declarations will be included in the -# documentation. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any -# documentation blocks found inside the body of a function. -# If set to NO (the default) these blocks will be appended to the -# function's detailed documentation block. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation -# that is typed after a \internal command is included. If the tag is set -# to NO (the default) then the documentation will be excluded. -# Set it to YES to include the internal documentation. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate -# file names in lower-case letters. If set to YES upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. - -CASE_SENSE_NAMES = YES - -# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen -# will show members with their full class and namespace scopes in the -# documentation. If set to YES the scope will be hidden. 
- -HIDE_SCOPE_NAMES = NO - -# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen -# will put a list of the files that are included by a file in the documentation -# of that file. - -SHOW_INCLUDE_FILES = YES - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen -# will list include files with double quotes in the documentation -# rather than with sharp brackets. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] -# is inserted in the documentation for inline members. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen -# will sort the (detailed) documentation of file and class members -# alphabetically by member name. If set to NO the members will appear in -# declaration order. - -SORT_MEMBER_DOCS = NO - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the -# brief documentation of file, namespace and class members alphabetically -# by member name. If set to NO (the default) the members will appear in -# declaration order. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen -# will sort the (brief and detailed) documentation of class members so that -# constructors and destructors are listed first. If set to NO (the default) -# the constructors will appear in the respective orders defined by -# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. -# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO -# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the -# hierarchy of group names into alphabetical order. If set to NO (the default) -# the group names will appear in their defined order. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be -# sorted by fully-qualified names, including namespaces. 
If set to -# NO (the default), the class list will be sorted only by class name, -# not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the -# alphabetical list. - -SORT_BY_SCOPE_NAME = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or -# disable (NO) the todo list. This list is created by putting \todo -# commands in the documentation. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or -# disable (NO) the test list. This list is created by putting \test -# commands in the documentation. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or -# disable (NO) the bug list. This list is created by putting \bug -# commands in the documentation. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or -# disable (NO) the deprecated list. This list is created by putting -# \deprecated commands in the documentation. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional -# documentation sections, marked by \if sectionname ... \endif. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines -# the initial value of a variable or define consists of for it to appear in -# the documentation. If the initializer consists of more lines than specified -# here it will be hidden. Use a value of 0 to hide initializers completely. -# The appearance of the initializer of individual variables and defines in the -# documentation can be controlled using \showinitializer or \hideinitializer -# command in the documentation regardless of this setting. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated -# at the bottom of the documentation of classes and structs. 
If set to YES the -# list will mention the files that were used to generate the documentation. - -SHOW_USED_FILES = YES - -# If the sources in your project are distributed over multiple directories -# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy -# in the documentation. The default is NO. - -SHOW_DIRECTORIES = NO - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. -# This will remove the Files entry from the Quick Index and from the -# Folder Tree View (if specified). The default is YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the -# Namespaces page. -# This will remove the Namespaces entry from the Quick Index -# and from the Folder Tree View (if specified). The default is YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). Doxygen will invoke the program by executing (via -# popen()) the command <command> <input-file>, where <command> is the value of -# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file -# provided by doxygen. Whatever the program writes to standard output -# is used as the file version. See the manual for examples. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. -# You can optionally specify a file name after the option, if omitted -# DoxygenLayout.xml will be used as the name of the layout file.
- -LAYOUT_FILE = - -#--------------------------------------------------------------------------- -# configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated -# by doxygen. Possible values are YES and NO. If left blank NO is used. - -QUIET = NO - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated by doxygen. Possible values are YES and NO. If left blank -# NO is used. - -WARNINGS = YES - -# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings -# for undocumented members. If EXTRACT_ALL is set to YES then this flag will -# automatically be disabled. - -WARN_IF_UNDOCUMENTED = YES - -# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some -# parameters in a documented function, or documenting parameters that -# don't exist or using markup commands wrongly. - -WARN_IF_DOC_ERROR = YES - -# This WARN_NO_PARAMDOC option can be enabled to get warnings for -# functions that are documented, but have no documentation for their parameters -# or return value. If set to NO (the default) doxygen will only warn about -# wrong or incomplete parameter documentation, but not about the absence of -# documentation. - -WARN_NO_PARAMDOC = NO - -# The WARN_FORMAT tag determines the format of the warning messages that -# doxygen can produce. The string should contain the $file, $line, and $text -# tags, which will be replaced by the file and line number from which the -# warning originated and the warning text.
Optionally the format may contain -# $version, which will be replaced by the version of the file (if it could -# be obtained via FILE_VERSION_FILTER) - -WARN_FORMAT = "$file:$line: $text" - -# The WARN_LOGFILE tag can be used to specify a file to which warning -# and error messages should be written. If left blank the output is written -# to stderr. - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag can be used to specify the files and/or directories that contain -# documented source files. You may enter file names like "myfile.cpp" or -# directories like "/usr/src/myproject". Separate the files or directories -# with spaces. - -INPUT = lmdb.h midl.h mdb.c midl.c intro.doc - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is -# also the default input encoding. Doxygen uses libiconv (or the iconv built -# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for -# the list of possible encodings. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp -# and *.h) to filter out the source-files in the directories. If left -# blank the following patterns are tested: -# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx -# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90 - -FILE_PATTERNS = - -# The RECURSIVE tag can be used to specify whether or not subdirectories -# should be searched for input files as well. Possible values are YES and NO. -# If left blank NO is used.
- -RECURSIVE = NO - -# The EXCLUDE tag can be used to specify files and/or directories that should -# be excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. - -EXCLUDE = - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix filesystem feature) are excluded -# from the input. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. Note that the wildcards are matched -# against the file with absolute path, so to exclude all test directories -# for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test - -EXCLUDE_SYMBOLS = - -# The EXAMPLE_PATH tag can be used to specify one or more files or -# directories that contain example code fragments that are included (see -# the \include command). - -EXAMPLE_PATH = - -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp -# and *.h) to filter out the source-files in the directories. If left -# blank all files are included. - -EXAMPLE_PATTERNS = - -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude -# commands irrespective of the value of the RECURSIVE tag. -# Possible values are YES and NO. If left blank NO is used.
- -EXAMPLE_RECURSIVE = NO - -# The IMAGE_PATH tag can be used to specify one or more files or -# directories that contain images that are included in the documentation (see -# the \image command). - -IMAGE_PATH = - -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. Doxygen will invoke the filter program -# by executing (via popen()) the command <filter> <input-file>, where <filter> -# is the value of the INPUT_FILTER tag, and <input-file> is the name of an -# input file. Doxygen will then use the output that the filter program writes -# to standard output. -# If FILTER_PATTERNS is specified, this tag will be -# ignored. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. -# Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. -# The filters are a list of the form: -# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further -# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER -# is applied to all files. - -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER) will be used to filter the input files when producing source -# files to browse (i.e. when SOURCE_BROWSER is set to YES). - -FILTER_SOURCE_FILES = NO - -#--------------------------------------------------------------------------- -# configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will -# be generated. Documented entities will be cross-referenced with these sources. -# Note: To get rid of all source code in the generated output, make sure also -# VERBATIM_HEADERS is set to NO. - -SOURCE_BROWSER = NO - -# Setting the INLINE_SOURCES tag to YES will include the body -# of functions and classes directly in the documentation.
- -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct -# doxygen to hide any special comment blocks from generated source code -# fragments. Normal C and C++ comments will always remain visible. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES -# then for each documented function all documented -# functions referencing it will be listed. - -REFERENCED_BY_RELATION = NO - -# If the REFERENCES_RELATION tag is set to YES -# then for each documented function all documented entities -# called/used by that function will be listed. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) -# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from -# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will -# link to the source code. -# Otherwise they will link to the documentation. - -REFERENCES_LINK_SOURCE = YES - -# If the USE_HTAGS tag is set to YES then the references to source code -# will point to the HTML generated by the htags(1) tool instead of doxygen -# built-in source browser. The htags tool is part of GNU's global source -# tagging system (see http://www.gnu.org/software/global/global.html). You -# will need version 4.8.6 or higher. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen -# will generate a verbatim copy of the header file for each class for -# which an include is specified. Set to NO to disable this. - -VERBATIM_HEADERS = YES - -#--------------------------------------------------------------------------- -# configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index -# of all compounds will be generated. Enable this if the project -# contains a lot of classes, structs, unions or interfaces. 
- -ALPHABETICAL_INDEX = YES - -# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then -# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns -# in which this list will be split (can be a number in the range [1..20]) - -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all -# classes will be put under the same header in the alphabetical index. -# The IGNORE_PREFIX tag can be used to specify one or more prefixes that -# should be ignored while generating the index headers. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES (the default) Doxygen will -# generate HTML output. - -GENERATE_HTML = YES - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `html' will be used as the default path. - -HTML_OUTPUT = html - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for -# each generated HTML page (for example: .htm,.php,.asp). If it is left blank -# doxygen will generate files with .html extension. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a personal HTML header for -# each generated HTML page. If it is left blank doxygen will generate a -# standard header. - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a personal HTML footer for -# each generated HTML page. If it is left blank doxygen will generate a -# standard footer. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading -# style sheet that is used by each HTML page. It can be used to -# fine-tune the look of the HTML output. 
If the tag is left blank doxygen -# will generate a default style sheet. Note that doxygen will try to copy -# the style sheet file to the HTML output directory, so don't put your own -# stylesheet in the HTML output directory as well, or it will be erased! - -HTML_STYLESHEET = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. -# Doxygen will adjust the colors in the stylesheet and background images -# according to this color. Hue is specified as an angle on a colorwheel, -# see http://en.wikipedia.org/wiki/Hue for more information. -# For instance the value 0 represents red, 60 is yellow, 120 is green, -# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. -# The allowed range is 0 to 359. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of -# the colors in the HTML output. For a value of 0 the output will use -# grayscales only. A value of 255 will produce the most vivid colors. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to -# the luminance component of the colors in the HTML output. Values below -# 100 gradually make the output lighter, whereas values above 100 make -# the output darker. The value divided by 100 is the actual gamma applied, -# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, -# and 100 does not change the gamma. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting -# this to NO can help when comparing the output of multiple runs. - -HTML_TIMESTAMP = YES - -# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, -# files or namespaces will be aligned in HTML using tables. If set to -# NO a bullet list will be used. 
- -HTML_ALIGN_MEMBERS = YES - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. For this to work a browser that supports -# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox, -# Netscape 6.0+, Internet Explorer 5.0+, Konqueror, or Safari). - -HTML_DYNAMIC_SECTIONS = NO - -# If the GENERATE_DOCSET tag is set to YES, additional index files -# will be generated that can be used as input for Apple's Xcode 3 -# integrated development environment, introduced with OSX 10.5 (Leopard). -# To create a documentation set, doxygen will generate a Makefile in the -# HTML output directory. Running make will produce the docset in that -# directory and running "make install" will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find -# it at startup. -# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. - -GENERATE_DOCSET = NO - -# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the -# feed. A documentation feed provides an umbrella under which multiple -# documentation sets from a single provider (such as a company or product suite) -# can be grouped. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that -# should uniquely identify the documentation set bundle. This should be a -# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen -# will append .docset to the name. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify -# the documentation publisher. This should be a reverse domain-name style -# string, e.g. com.mycompany.MyDocSet.documentation. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
- -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES, additional index files -# will be generated that can be used as input for tools like the -# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) -# of the generated HTML documentation. - -GENERATE_HTMLHELP = NO - -# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can -# be used to specify the file name of the resulting .chm file. You -# can add a path in front of the file if the result should not be -# written to the html output directory. - -CHM_FILE = - -# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can -# be used to specify the location (absolute path including file name) of -# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run -# the HTML help compiler on the generated index.hhp. - -HHC_LOCATION = - -# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag -# controls if a separate .chi index file is generated (YES) or that -# it should be included in the master .chm file (NO). - -GENERATE_CHI = NO - -# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING -# is used to encode HtmlHelp index (hhk), content (hhc) and project file -# content. - -CHM_INDEX_ENCODING = - -# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag -# controls whether a binary table of contents is generated (YES) or a -# normal table of contents (NO) in the .chm file. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members -# to the contents of the HTML help documentation and to the tree view. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated -# that can be used as input for Qt's qhelpgenerator to generate a -# Qt Compressed Help (.qch) of the generated HTML documentation. 
- -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can -# be used to specify the file name of the resulting .qch file. -# The path specified is relative to the HTML output folder. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating -# Qt Help Project output. For more information please see -# http://doc.trolltech.com/qthelpproject.html#namespace - -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating -# Qt Help Project output. For more information please see -# http://doc.trolltech.com/qthelpproject.html#virtual-folders - -QHP_VIRTUAL_FOLDER = doc - -# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to -# add. For more information please see -# http://doc.trolltech.com/qthelpproject.html#custom-filters - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see -# -# Qt Help Project / Custom Filters. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's -# filter section matches. -# -# Qt Help Project / Filter Attributes. - -QHP_SECT_FILTER_ATTRS = - -# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can -# be used to specify the location of Qt's qhelpgenerator. -# If non-empty doxygen will try to run qhelpgenerator on the generated -# .qhp file. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files -# will be generated, which together with the HTML files, form an Eclipse help -# plugin. To install this plugin and make it available under the help contents -# menu in Eclipse, the contents of the directory containing the HTML and XML -# files needs to be copied into the plugins directory of eclipse. The name of -# the directory within the plugins directory should be the same as -# the ECLIPSE_DOC_ID value. 
After copying Eclipse needs to be restarted before -# the help appears. - -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have -# this name. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# The DISABLE_INDEX tag can be used to turn on/off the condensed index at -# top of each HTML page. The value NO (the default) enables the index and -# the value YES disables it. - -DISABLE_INDEX = NO - -# This tag can be used to set the number of enum values (range [1..20]) -# that doxygen will group on one line in the generated HTML documentation. - -ENUM_VALUES_PER_LINE = 4 - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. -# If the tag value is set to YES, a side panel will be generated -# containing a tree-like index structure (just like the one that -# is generated for HTML Help). For this to work a browser that supports -# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). -# Windows users are probably better off using the HTML help feature. - -GENERATE_TREEVIEW = NO - -# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, -# and Class Hierarchy pages using a tree view instead of an ordered list. - -USE_INLINE_TREES = NO - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be -# used to set the initial width (in pixels) of the frame in which the tree -# is shown. - -TREEVIEW_WIDTH = 250 - -# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open -# links to external symbols imported via tag files in a separate window. - -EXT_LINKS_IN_WINDOW = NO - -# Use this tag to change the font size of Latex formulas included -# as images in the HTML documentation. The default is 10. 
Note that -# when you change the font size after a successful doxygen run you need -# to manually remove any form_*.png images from the HTML output directory -# to force them to be regenerated. - -FORMULA_FONTSIZE = 10 - -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are -# not supported properly for IE 6.0, but are supported on all modern browsers. -# Note that when changing this option you need to delete any form_*.png files -# in the HTML output before the changes have effect. - -FORMULA_TRANSPARENT = YES - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box -# for the HTML output. The underlying search engine uses javascript -# and DHTML and should work on any modern browser. Note that when using -# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets -# (GENERATE_DOCSET) there is already a search function so this one should -# typically be disabled. For large projects the javascript based search engine -# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. - -SEARCHENGINE = YES - -# When the SERVER_BASED_SEARCH tag is enabled the search engine will be -# implemented using a PHP enabled web server instead of at the web client -# using Javascript. Doxygen will generate the search PHP script and index -# file to put on the web server. The advantage of the server -# based approach is that it scales better to large projects and allows -# full text search. The disadvantage is that it is more difficult to set up -# and does not have live searching capabilities. - -SERVER_BASED_SEARCH = NO - -#--------------------------------------------------------------------------- -# configuration options related to the LaTeX output -#--------------------------------------------------------------------------- - -# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will -# generate Latex output.
- -GENERATE_LATEX = NO - -# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `latex' will be used as the default path. - -LATEX_OUTPUT = latex - -# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be -# invoked. If left blank `latex' will be used as the default command name. -# Note that when enabling USE_PDFLATEX this option is only used for -# generating bitmaps for formulas in the HTML output, but not in the -# Makefile that is written to the output directory. - -LATEX_CMD_NAME = latex - -# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to -# generate index for LaTeX. If left blank `makeindex' will be used as the -# default command name. - -MAKEINDEX_CMD_NAME = makeindex - -# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact -# LaTeX documents. This may be useful for small projects and may help to -# save some trees in general. - -COMPACT_LATEX = NO - -# The PAPER_TYPE tag can be used to set the paper type that is used -# by the printer. Possible values are: a4, a4wide, letter, legal and -# executive. If left blank a4wide will be used. - -PAPER_TYPE = a4wide - -# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX -# packages that should be included in the LaTeX output. - -EXTRA_PACKAGES = - -# The LATEX_HEADER tag can be used to specify a personal LaTeX header for -# the generated latex document. The header should contain everything until -# the first chapter. If it is left blank doxygen will generate a -# standard header. Notice: only use this tag if you know what you are doing! - -LATEX_HEADER = - -# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated -# is prepared for conversion to pdf (using ps2pdf). 
The pdf file will -# contain links (just like the HTML output) instead of page references -# This makes the output suitable for online browsing using a pdf viewer. - -PDF_HYPERLINKS = YES - -# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of -# plain latex in the generated Makefile. Set this option to YES to get a -# higher quality PDF documentation. - -USE_PDFLATEX = YES - -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. -# command to the generated LaTeX files. This will instruct LaTeX to keep -# running if errors occur, instead of asking the user for help. -# This option is also used when generating formulas in HTML. - -LATEX_BATCHMODE = NO - -# If LATEX_HIDE_INDICES is set to YES then doxygen will not -# include the index chapters (such as File Index, Compound Index, etc.) -# in the output. - -LATEX_HIDE_INDICES = NO - -# If LATEX_SOURCE_CODE is set to YES then doxygen will include -# source code with syntax highlighting in the LaTeX output. -# Note that which sources are shown also depends on other settings -# such as SOURCE_BROWSER. - -LATEX_SOURCE_CODE = NO - -#--------------------------------------------------------------------------- -# configuration options related to the RTF output -#--------------------------------------------------------------------------- - -# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output -# The RTF output is optimized for Word 97 and may not look very pretty with -# other RTF readers or editors. - -GENERATE_RTF = NO - -# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `rtf' will be used as the default path. - -RTF_OUTPUT = rtf - -# If the COMPACT_RTF tag is set to YES Doxygen generates more compact -# RTF documents. This may be useful for small projects and may help to -# save some trees in general. 
- -COMPACT_RTF = NO - -# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated -# will contain hyperlink fields. The RTF file will -# contain links (just like the HTML output) instead of page references. -# This makes the output suitable for online browsing using WORD or other -# programs which support those fields. -# Note: wordpad (write) and others do not support links. - -RTF_HYPERLINKS = NO - -# Load stylesheet definitions from file. Syntax is similar to doxygen's -# config file, i.e. a series of assignments. You only have to provide -# replacements, missing definitions are set to their default value. - -RTF_STYLESHEET_FILE = - -# Set optional variables used in the generation of an rtf document. -# Syntax is similar to doxygen's config file. - -RTF_EXTENSIONS_FILE = - -#--------------------------------------------------------------------------- -# configuration options related to the man page output -#--------------------------------------------------------------------------- - -# If the GENERATE_MAN tag is set to YES (the default) Doxygen will -# generate man pages - -GENERATE_MAN = YES - -# The MAN_OUTPUT tag is used to specify where the man pages will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `man' will be used as the default path. - -MAN_OUTPUT = man - -# The MAN_EXTENSION tag determines the extension that is added to -# the generated man pages (default is the subroutine's section .3) - -MAN_EXTENSION = .3 - -# If the MAN_LINKS tag is set to YES and Doxygen generates man output, -# then it will generate one additional man file for each entity -# documented in the real man page(s). These additional files -# only source the real man page, but without them the man command -# would be unable to find the correct page. The default is NO. 
- -MAN_LINKS = NO - -#--------------------------------------------------------------------------- -# configuration options related to the XML output -#--------------------------------------------------------------------------- - -# If the GENERATE_XML tag is set to YES Doxygen will -# generate an XML file that captures the structure of -# the code including all documentation. - -GENERATE_XML = NO - -# The XML_OUTPUT tag is used to specify where the XML pages will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `xml' will be used as the default path. - -XML_OUTPUT = xml - -# The XML_SCHEMA tag can be used to specify an XML schema, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_SCHEMA = - -# The XML_DTD tag can be used to specify an XML DTD, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_DTD = - -# If the XML_PROGRAMLISTING tag is set to YES Doxygen will -# dump the program listings (including syntax highlighting -# and cross-referencing information) to the XML output. Note that -# enabling this will significantly increase the size of the XML output. - -XML_PROGRAMLISTING = YES - -#--------------------------------------------------------------------------- -# configuration options for the AutoGen Definitions output -#--------------------------------------------------------------------------- - -# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will -# generate an AutoGen Definitions (see autogen.sf.net) file -# that captures the structure of the code including all -# documentation. Note that this feature is still experimental -# and incomplete at the moment. 
- -GENERATE_AUTOGEN_DEF = NO - -#--------------------------------------------------------------------------- -# configuration options related to the Perl module output -#--------------------------------------------------------------------------- - -# If the GENERATE_PERLMOD tag is set to YES Doxygen will -# generate a Perl module file that captures the structure of -# the code including all documentation. Note that this -# feature is still experimental and incomplete at the -# moment. - -GENERATE_PERLMOD = NO - -# If the PERLMOD_LATEX tag is set to YES Doxygen will generate -# the necessary Makefile rules, Perl scripts and LaTeX code to be able -# to generate PDF and DVI output from the Perl module output. - -PERLMOD_LATEX = NO - -# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be -# nicely formatted so it can be parsed by a human reader. -# This is useful -# if you want to understand what is going on. -# On the other hand, if this -# tag is set to NO the size of the Perl module output will be much smaller -# and Perl will parse it just the same. - -PERLMOD_PRETTY = YES - -# The names of the make variables in the generated doxyrules.make file -# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. -# This is useful so different doxyrules.make files included by the same -# Makefile don't overwrite each other's variables. - -PERLMOD_MAKEVAR_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the preprocessor -#--------------------------------------------------------------------------- - -# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will -# evaluate all C-preprocessor directives found in the sources and include -# files. - -ENABLE_PREPROCESSING = YES - -# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro -# names in the source code. If set to NO (the default) only conditional -# compilation will be performed. 
Macro expansion can be done in a controlled -# way by setting EXPAND_ONLY_PREDEF to YES. - -MACRO_EXPANSION = NO - -# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES -# then the macro expansion is limited to the macros specified with the -# PREDEFINED and EXPAND_AS_DEFINED tags. - -EXPAND_ONLY_PREDEF = NO - -# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files -# in the INCLUDE_PATH (see below) will be search if a #include is found. - -SEARCH_INCLUDES = YES - -# The INCLUDE_PATH tag can be used to specify one or more directories that -# contain include files that are not input files but should be processed by -# the preprocessor. - -INCLUDE_PATH = - -# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard -# patterns (like *.h and *.hpp) to filter out the header-files in the -# directories. If left blank, the patterns specified with FILE_PATTERNS will -# be used. - -INCLUDE_FILE_PATTERNS = - -# The PREDEFINED tag can be used to specify one or more macro names that -# are defined before the preprocessor is started (similar to the -D option of -# gcc). The argument of the tag is a list of macros of the form: name -# or name=definition (no spaces). If the definition and the = are -# omitted =1 is assumed. To prevent a macro definition from being -# undefined via #undef or recursively expanded use the := operator -# instead of the = operator. - -PREDEFINED = DEBUG=2 __GNUC__=1 - -# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then -# this tag can be used to specify a list of macro names that should be expanded. -# The macro definition that is found in the sources will be used. -# Use the PREDEFINED tag if you want to use a different macro definition. 
- -EXPAND_AS_DEFINED = - -# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then -# doxygen's preprocessor will remove all function-like macros that are alone -# on a line, have an all uppercase name, and do not end with a semicolon. Such -# function macros are typically used for boiler-plate code, and will confuse -# the parser if not removed. - -SKIP_FUNCTION_MACROS = YES - -#--------------------------------------------------------------------------- -# Configuration::additions related to external references -#--------------------------------------------------------------------------- - -# The TAGFILES option can be used to specify one or more tagfiles. -# Optionally an initial location of the external documentation -# can be added for each tagfile. The format of a tag file without -# this location is as follows: -# -# TAGFILES = file1 file2 ... -# Adding location for the tag files is done as follows: -# -# TAGFILES = file1=loc1 "file2 = loc2" ... -# where "loc1" and "loc2" can be relative or absolute paths or -# URLs. If a location is present for each tag, the installdox tool -# does not have to be run to correct the links. -# Note that each tag file must have a unique name -# (where the name does NOT include the path) -# If a tag file is not located in the directory in which doxygen -# is run, you must also specify the path to the tagfile here. - -TAGFILES = tooltag=./man1 - -# When a file name is specified after GENERATE_TAGFILE, doxygen will create -# a tag file that is based on the input files it reads. - -GENERATE_TAGFILE = - -# If the ALLEXTERNALS tag is set to YES all external classes will be listed -# in the class index. If set to NO only the inherited external classes -# will be listed. - -ALLEXTERNALS = NO - -# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed -# in the modules index. If set to NO, only the current project's groups will -# be listed. 
- -EXTERNAL_GROUPS = YES - -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of `which perl'). - -PERL_PATH = /usr/bin/perl - -#--------------------------------------------------------------------------- -# Configuration options related to the dot tool -#--------------------------------------------------------------------------- - -# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will -# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base -# or super classes. Setting the tag to NO turns the diagrams off. Note that -# this option is superseded by the HAVE_DOT option below. This is only a -# fallback. It is recommended to install and use dot, since it yields more -# powerful graphs. - -CLASS_DIAGRAMS = YES - -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see -# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - -# If set to YES, the inheritance and collaboration graphs will hide -# inheritance and usage relations if the target is undocumented -# or is not a class. - -HIDE_UNDOC_RELATIONS = YES - -# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is -# available from the path. This tool is part of Graphviz, a graph visualization -# toolkit from AT&T and Lucent Bell Labs. The other options in this section -# have no effect if this option is set to NO (the default) - -HAVE_DOT = NO - -# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is -# allowed to run in parallel. When set to 0 (the default) doxygen will -# base this on the number of processors available in the system. 
You can set it -# explicitly to a value larger than 0 to get control over the balance -# between CPU load and processing speed. - -DOT_NUM_THREADS = 0 - -# By default doxygen will write a font called FreeSans.ttf to the output -# directory and reference it in all dot files that doxygen generates. This -# font does not include all possible unicode characters however, so when you need -# these (or just want a differently looking font) you can specify the font name -# using DOT_FONTNAME. You need to make sure dot is able to find the font, -# which can be done by putting it in a standard location or by setting the -# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory -# containing the font. - -DOT_FONTNAME = FreeSans.ttf - -# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. -# The default size is 10pt. - -DOT_FONTSIZE = 10 - -# By default doxygen will tell dot to use the output directory to look for the -# FreeSans.ttf font (which doxygen will put there itself). If you specify a -# different font using DOT_FONTNAME you can set the path where dot -# can find it using this tag. - -DOT_FONTPATH = - -# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect inheritance relations. Setting this tag to YES will force the -# CLASS_DIAGRAMS tag to NO. - -CLASS_GRAPH = YES - -# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect implementation dependencies (inheritance, containment, and -# class references variables) of the class with other documented classes.
- -COLLABORATION_GRAPH = YES - -# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for groups, showing the direct groups dependencies - -GROUP_GRAPHS = YES - -# If the UML_LOOK tag is set to YES doxygen will generate inheritance and -# collaboration diagrams in a style similar to the OMG's Unified Modeling -# Language. - -UML_LOOK = NO - -# If set to YES, the inheritance and collaboration graphs will show the -# relations between templates and their instances. - -TEMPLATE_RELATIONS = NO - -# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT -# tags are set to YES then doxygen will generate a graph for each documented -# file showing the direct and indirect include dependencies of the file with -# other documented files. - -INCLUDE_GRAPH = YES - -# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and -# HAVE_DOT tags are set to YES then doxygen will generate a graph for each -# documented header file showing the documented files that directly or -# indirectly include this file. - -INCLUDED_BY_GRAPH = YES - -# If the CALL_GRAPH and HAVE_DOT options are set to YES then -# doxygen will generate a call dependency graph for every global function -# or class method. Note that enabling this option will significantly increase -# the time of a run. So in most cases it will be better to enable call graphs -# for selected functions only using the \callgraph command. - -CALL_GRAPH = NO - -# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then -# doxygen will generate a caller dependency graph for every global function -# or class method. Note that enabling this option will significantly increase -# the time of a run. So in most cases it will be better to enable caller -# graphs for selected functions only using the \callergraph command. 
- -CALLER_GRAPH = NO - -# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen -# will graphical hierarchy of all classes instead of a textual one. - -GRAPHICAL_HIERARCHY = YES - -# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES -# then doxygen will show the dependencies a directory has on other directories -# in a graphical way. The dependency relations are determined by the #include -# relations between the files in the directories. - -DIRECTORY_GRAPH = YES - -# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images -# generated by dot. Possible values are png, jpg, or gif -# If left blank png will be used. - -DOT_IMAGE_FORMAT = png - -# The tag DOT_PATH can be used to specify the path where the dot tool can be -# found. If left blank, it is assumed the dot tool can be found in the path. - -DOT_PATH = - -# The DOTFILE_DIRS tag can be used to specify one or more directories that -# contain dot files that are included in the documentation (see the -# \dotfile command). - -DOTFILE_DIRS = - -# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of -# nodes that will be shown in the graph. If the number of nodes in a graph -# becomes larger than this value, doxygen will truncate the graph, which is -# visualized by representing a node as a red box. Note that doxygen if the -# number of direct children of the root node in a graph is already larger than -# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note -# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. - -DOT_GRAPH_MAX_NODES = 50 - -# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the -# graphs generated by dot. A depth value of 3 means that only nodes reachable -# from the root by following a path via at most 3 edges will be shown. Nodes -# that lay further from the root node will be omitted. 
Note that setting this -# option to 1 or 2 may greatly reduce the computation time needed for large -# code bases. Also note that the size of a graph can be further restricted by -# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. - -MAX_DOT_GRAPH_DEPTH = 0 - -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not -# seem to support this out of the box. Warning: Depending on the platform used, -# enabling this option may lead to badly anti-aliased labels on the edges of -# a graph (i.e. they become hard to read). - -DOT_TRANSPARENT = NO - -# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output -# files in one run (i.e. multiple -o and -T options on the command line). This -# makes dot run faster, but since only newer versions of dot (>1.8.10) -# support this, this feature is disabled by default. - -DOT_MULTI_TARGETS = YES - -# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will -# generate a legend page explaining the meaning of the various boxes and -# arrows in the dot generated graphs. - -GENERATE_LEGEND = YES - -# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will -# remove the intermediate dot files that are used to generate -# the various graphs. 
- -DOT_CLEANUP = YES diff --git a/Makefile b/Makefile index 66d7e278..4b6c3614 100644 --- a/Makefile +++ b/Makefile @@ -36,24 +36,21 @@ IOARENA ?= ../ioarena.git/@BUILD/src/ioarena ######################################################################## -HEADERS := lmdb.h mdbx.h +HEADERS := mdbx.h LIBRARIES := libmdbx.a libmdbx.so TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk -MANPAGES := mdb_stat.1 mdb_copy.1 mdb_dump.1 mdb_load.1 +MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 TESTS := mtest0 mtest1 mtest2 mtest3 mtest4 mtest5 mtest6 wbench \ yota_test1 yota_test2 -SRC_LMDB := mdb.c midl.c lmdb.h midl.h reopen.h barriers.h -SRC_MDBX := $(SRC_LMDB) mdbx.c mdbx.h +SRC_MDBX := mdbx.c mdbx.h reopen.h barriers.h -.PHONY: mdbx lmdb all install clean check tests coverage +.PHONY: mdbx all install clean check tests coverage all: $(LIBRARIES) $(TOOLS) mdbx: libmdbx.a libmdbx.so -lmdb: liblmdb.a liblmdb.so - tools: $(TOOLS) install: $(LIBRARIES) $(TOOLS) $(HEADERS) @@ -88,25 +85,19 @@ libmdbx.a: mdbx.o libmdbx.so: mdbx.lo $(CC) $(CFLAGS) $(LDFLAGS) -save-temps -pthread -shared $(LDOPS) -o $@ $^ -liblmdb.a: lmdb.o - $(AR) rs $@ $^ - -liblmdb.so: lmdb.lo - $(CC) $(CFLAGS) $(LDFLAGS) -pthread -shared $(LDOPS) -o $@ $^ - -mdbx_stat: mdb_stat.o mdbx.o +mdbx_stat: mdbx_stat.o mdbx.o $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ -mdbx_copy: mdb_copy.o mdbx.o +mdbx_copy: mdbx_copy.o mdbx.o $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ -mdbx_dump: mdb_dump.o mdbx.o +mdbx_dump: mdbx_dump.o mdbx.o $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ -mdbx_load: mdb_load.o mdbx.o +mdbx_load: mdbx_load.o mdbx.o $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ -mdbx_chk: mdb_chk.o mdbx.o +mdbx_chk: mdbx_chk.o mdbx.o $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ mtest0: mtest0.o mdbx.o @@ -145,16 +136,10 @@ mdbx.o: $(SRC_MDBX) mdbx.lo: $(SRC_MDBX) $(CC) $(CFLAGS) -fPIC -c mdbx.c -o $@ -lmdb.o: $(SRC_LMDB) - $(CC) $(CFLAGS) -c mdb.c -o $@ - -lmdb.lo: $(SRC_LMDB) - $(CC) 
$(CFLAGS) -fPIC -c mdb.c -o $@ - %: %.o $(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@ -%.o: %.c lmdb.h mdbx.h +%.o: %.c mdbx.h $(CC) $(CFLAGS) -c $< COFLAGS = -fprofile-arcs -ftest-coverage @@ -188,13 +173,11 @@ endef $(eval $(call bench-rule,mdbx,$(NN),libmdbx.so)) -$(eval $(call bench-rule,lmdb,$(NN))) - $(eval $(call bench-rule,dummy,$(NN))) $(eval $(call bench-rule,debug,10)) -bench: bench-lmdb.txt bench-mdbx.txt +bench: bench-mdbx.txt endif diff --git a/README.md b/README.md index bac63b48..56f5ec31 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,7 @@ RECLAIM` в _libmdbx_. сохранены мета-страницы со ссылками на страницы с новыми версиями данных, но не сами новые данные. В этом случае БД будет безвозвратна разрушена, даже если до аварии производилась -полная синхронизация данных (посредством `mdb_env_sync()`). +полная синхронизация данных (посредством `mdbx_env_sync()`). В _libmdbx_ эта проблема устранена, подробности ниже. @@ -248,7 +248,7 @@ RECLAIM` в _libmdbx_. сохранены мета-страницы со ссылками на страницы с новыми версиями данных, но не сами новые данные. В этом случае БД будет безвозвратна разрушена, даже если до аварии производилась - полная синхронизация данных (посредством `mdb_env_sync()`). + полная синхронизация данных (посредством `mdbx_env_sync()`). В _libmdbx_ эта проблема устранена путем полной переработки пути записи данных: @@ -371,5 +371,5 @@ RECLAIM` в _libmdbx_. 25. При завершении читающих транзакций, открытые в них DBI-хендлы не закрываются и не теряются при завершении таких транзакций посредством -mdb_txn_abort() или mdb_txn_reset(). Что позволяет избавится от ряда +mdbx_txn_abort() или mdbx_txn_reset(). Что позволяет избавится от ряда сложно обнаруживаемых ошибок. diff --git a/barriers.h b/barriers.h index ff39cae2..317e60bc 100644 --- a/barriers.h +++ b/barriers.h @@ -17,7 +17,7 @@ * in the most portable way for libmdbx project. * * Feedback and comments are welcome. 
- * https://gist.github.com/leo-yuriev/ba186a6bf5cf3a27bae7 */ + * https://gist.github.com/leo-yuriev/ba186a6bf5cf3a27bae7 */ #pragma once /* *INDENT-OFF* */ @@ -140,7 +140,7 @@ static MDBX_INLINE void mdbx_barrier(int type) { #define mdbx_coherent_barrier() \ mdbx_barrier(MDB_CACHE_IS_COHERENT ? MDBX_BARRIER_COMPILER : MDBX_BARRIER_MEMORY) -static MDBX_INLINE void mdb_invalidate_cache(void *addr, int nbytes) { +static MDBX_INLINE void mdbx_invalidate_cache(void *addr, int nbytes) { mdbx_coherent_barrier(); #if defined(__mips) && defined(__linux) /* MIPS has cache coherency issues. diff --git a/lmdb.h b/lmdb.h deleted file mode 100644 index 48ca616f..00000000 --- a/lmdb.h +++ /dev/null @@ -1,1557 +0,0 @@ -/* - * Copyright 2015-2017 Leonid Yuriev . - * - * This code is derived from "LMDB engine" written by - * Howard Chu (Symas Corporation), which itself derived from btree.c - * written by Martin Hedenfalk. - * - * --- - * - * Portions Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - * - * --- - * - * Portions Copyright (c) 2009, 2010 Martin Hedenfalk - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifndef _LMDB_H_ -#define _LMDB_H_ - -#ifndef MDBX_MODE_ENABLED -# define MDBX_MODE_ENABLED 0 -#endif /* MDBX_MODE_ENABLED */ - -#include -#include -#include -#if MDBX_MODE_ENABLED -# include -# include -#endif /* MDBX_MODE_ENABLED */ - -#ifdef __cplusplus -extern "C" { -#endif - -/** An abstraction for a file handle. - * On POSIX systems file handles are small integers. - */ -typedef int mdb_filehandle_t; - -/** @defgroup mdb LMDB API - * @{ - * @brief OpenLDAP Lightning Memory-Mapped Database Manager - */ -/** @defgroup Version Version Macros - * @{ - */ -/** Library major version */ -#define MDB_VERSION_MAJOR 0 -/** Library minor version */ -#define MDB_VERSION_MINOR 9 -/** Library patch version */ -#define MDB_VERSION_PATCH 19 - -/** Combine args a,b,c into a single integer for easy version comparisons */ -#define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) - -/** The full library version as a single integer */ -#define MDB_VERSION_FULL \ - MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) - -/** The release date of this library version */ -#define MDB_VERSION_DATE "DEVEL" - -/** A stringifier for the version info */ -#define MDB_VERSTR(a,b,c,d) "MDBX " #a "." #b "." #c ": (" d ", https://github.com/ReOpen/libmdbx)" - -/** A helper for the stringifier macro */ -#define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d) - -/** The full library version as a C string */ -#define MDB_VERSION_STRING \ - MDB_VERFOO(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH,MDB_VERSION_DATE) -/** @} */ - -/** @brief Opaque structure for a database environment. 
- * - * A DB environment supports multiple databases, all residing in the same - * shared-memory map. - */ -typedef struct MDB_env MDB_env; - -/** @brief Opaque structure for a transaction handle. - * - * All database operations require a transaction handle. Transactions may be - * read-only or read-write. - */ -typedef struct MDB_txn MDB_txn; - -/** @brief A handle for an individual database in the DB environment. */ -typedef unsigned MDB_dbi; - -/** @brief Opaque structure for navigating through a database */ -typedef struct MDB_cursor MDB_cursor; - -/** @brief Generic structure used for passing keys and data in and out - * of the database. - * - * Values returned from the database are valid only until a subsequent - * update operation, or the end of the transaction. Do not modify or - * free them, they commonly point into the database itself. - * - * Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive. - * The same applies to data sizes in databases with the #MDB_DUPSORT flag. - * Other data items can in theory be from 0 to 0xffffffff bytes long. - */ -#if MDBX_MODE_ENABLED -typedef struct iovec MDB_val; -# define mv_size iov_len -# define mv_data iov_base -#else -typedef struct MDB_val { - size_t mv_size; /**< size of the data item */ - void *mv_data; /**< address of the data item */ -} MDB_val; -#endif /* MDBX_MODE_ENABLED */ - -/** @brief A callback function used to compare two keys in a database */ -typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b); - -/** @brief A callback function used to relocate a position-dependent data item - * in a fixed-address database. - * - * The \b newptr gives the item's desired address in - * the memory map, and \b oldptr gives its previous address. The item's actual - * data resides at the address in \b item. This callback is expected to walk - * through the fields of the record in \b item and modify any - * values based at the \b oldptr address to be relative to the \b newptr address. 
- * @param[in,out] item The item that is to be relocated. - * @param[in] oldptr The previous address. - * @param[in] newptr The new address to relocate to. - * @param[in] relctx An application-provided context, set by #mdb_set_relctx(). - * @todo This feature is currently unimplemented. - */ -typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *relctx); - -/** @defgroup mdb_env Environment Flags - * @{ - */ - /** mmap at a fixed address (experimental) */ -#define MDB_FIXEDMAP 0x01 - /** no environment directory */ -#define MDB_NOSUBDIR 0x4000 - /** don't fsync after commit */ -#define MDB_NOSYNC 0x10000 - /** read only */ -#define MDB_RDONLY 0x20000 - /** don't fsync metapage after commit */ -#define MDB_NOMETASYNC 0x40000 - /** use writable mmap */ -#define MDB_WRITEMAP 0x80000 - /** use asynchronous msync when #MDB_WRITEMAP is used */ -#define MDB_MAPASYNC 0x100000 - /** tie reader locktable slots to #MDB_txn objects instead of to threads */ -#define MDB_NOTLS 0x200000 - /** don't do any locking, caller must manage their own locks - * WARNING: libmdbx don't support this mode. 
*/ -#define MDB_NOLOCK__UNSUPPORTED 0x400000 - /** don't do readahead */ -#define MDB_NORDAHEAD 0x800000 - /** don't initialize malloc'd memory before writing to datafile */ -#define MDB_NOMEMINIT 0x1000000 - -#if MDBX_MODE_ENABLED - /** aim to coalesce FreeDB records */ -#define MDBX_COALESCE 0x2000000 - /** LIFO policy for reclaiming FreeDB records */ -#define MDBX_LIFORECLAIM 0x4000000 -#endif /* MDBX_MODE_ENABLED */ - - /** make a steady-sync only on close and explicit env-sync */ -#define MDBX_UTTERLY_NOSYNC (MDB_NOSYNC|MDB_MAPASYNC) - /** debuging option, fill/perturb released pages */ -#define MDBX_PAGEPERTURB 0x8000000 -/** @} */ - -/** @defgroup mdb_dbi_open Database Flags - * @{ - */ - /** use reverse string keys */ -#define MDB_REVERSEKEY 0x02 - /** use sorted duplicates */ -#define MDB_DUPSORT 0x04 - /** numeric keys in native byte order, either unsigned int or #mdb_size_t. - * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdb_size_t.) - * The keys must all be of the same size. */ -#define MDB_INTEGERKEY 0x08 - /** with #MDB_DUPSORT, sorted dup items have fixed size */ -#define MDB_DUPFIXED 0x10 - /** with #MDB_DUPSORT, dups are #MDB_INTEGERKEY-style integers */ -#define MDB_INTEGERDUP 0x20 - /** with #MDB_DUPSORT, use reverse string dups */ -#define MDB_REVERSEDUP 0x40 - /** create DB if not already existing */ -#define MDB_CREATE 0x40000 -/** @} */ - -/** @defgroup mdb_put Write Flags - * @{ - */ -/** For put: Don't write if the key already exists. */ -#define MDB_NOOVERWRITE 0x10 -/** Only for #MDB_DUPSORT
- * For put: don't write if the key and data pair already exist.
- * For mdb_cursor_del: remove all duplicate data items. - */ -#define MDB_NODUPDATA 0x20 -/** For mdb_cursor_put: overwrite the current key/data pair - * MDBX allows this flag for mdb_put() for explicit overwrite/update without insertion. */ -#define MDB_CURRENT 0x40 -/** For put: Just reserve space for data, don't copy it. Return a - * pointer to the reserved space. - */ -#define MDB_RESERVE 0x10000 -/** Data is being appended, don't split full pages. */ -#define MDB_APPEND 0x20000 -/** Duplicate data is being appended, don't split full pages. */ -#define MDB_APPENDDUP 0x40000 -/** Store multiple data items in one call. Only for #MDB_DUPFIXED. */ -#define MDB_MULTIPLE 0x80000 -/* @} */ - -/** @defgroup mdb_copy Copy Flags - * @{ - */ -/** Compacting copy: Omit free space from copy, and renumber all - * pages sequentially. - */ -#define MDB_CP_COMPACT 0x01 -/* @} */ - -/** @brief Cursor Get operations. - * - * This is the set of all operations for retrieving data - * using a cursor. - */ -typedef enum MDB_cursor_op { - MDB_FIRST, /**< Position at first key/data item */ - MDB_FIRST_DUP, /**< Position at first data item of current key. - Only for #MDB_DUPSORT */ - MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ - MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for #MDB_DUPSORT */ - MDB_GET_CURRENT, /**< Return key/data at current cursor position */ - MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items - from current cursor position. Move cursor to prepare - for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ - MDB_LAST, /**< Position at last key/data item */ - MDB_LAST_DUP, /**< Position at last data item of current key. - Only for #MDB_DUPSORT */ - MDB_NEXT, /**< Position at next data item */ - MDB_NEXT_DUP, /**< Position at next data item of current key. - Only for #MDB_DUPSORT */ - MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items - from next cursor position. 
Move cursor to prepare - for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ - MDB_NEXT_NODUP, /**< Position at first data item of next key */ - MDB_PREV, /**< Position at previous data item */ - MDB_PREV_DUP, /**< Position at previous data item of current key. - Only for #MDB_DUPSORT */ - MDB_PREV_NODUP, /**< Position at last data item of previous key */ - MDB_SET, /**< Position at specified key */ - MDB_SET_KEY, /**< Position at specified key, return key + data */ - MDB_SET_RANGE, /**< Position at first key greater than or equal to specified key. */ - MDB_PREV_MULTIPLE /**< Position at previous page and return key and up to - a page of duplicate data items. Only for #MDB_DUPFIXED */ -} MDB_cursor_op; - -/** @defgroup errors Return Codes - * - * BerkeleyDB uses -30800 to -30999, we'll go under them - * @{ - */ - /** Successful result */ -#define MDB_SUCCESS 0 - /** key/data pair already exists */ -#define MDB_KEYEXIST (-30799) - /** key/data pair not found (EOF) */ -#define MDB_NOTFOUND (-30798) - /** Requested page not found - this usually indicates corruption */ -#define MDB_PAGE_NOTFOUND (-30797) - /** Located page was wrong type */ -#define MDB_CORRUPTED (-30796) - /** Update of meta page failed or environment had fatal error */ -#define MDB_PANIC (-30795) - /** Environment version mismatch */ -#define MDB_VERSION_MISMATCH (-30794) - /** File is not a valid LMDB file */ -#define MDB_INVALID (-30793) - /** Environment mapsize reached */ -#define MDB_MAP_FULL (-30792) - /** Environment maxdbs reached */ -#define MDB_DBS_FULL (-30791) - /** Environment maxreaders reached */ -#define MDB_READERS_FULL (-30790) - /** Txn has too many dirty pages */ -#define MDB_TXN_FULL (-30788) - /** Cursor stack too deep - internal error */ -#define MDB_CURSOR_FULL (-30787) - /** Page has not enough space - internal error */ -#define MDB_PAGE_FULL (-30786) - /** Database contents grew beyond environment mapsize */ -#define MDB_MAP_RESIZED (-30785) - /** Operation and DB 
incompatible, or DB type changed. This can mean: - *
    - *
  • The operation expects an #MDB_DUPSORT / #MDB_DUPFIXED database. - *
  • Opening a named DB when the unnamed DB has #MDB_DUPSORT / #MDB_INTEGERKEY. - *
  • Accessing a data record as a database, or vice versa. - *
  • The database was dropped and recreated with different flags. - *
- */ -#define MDB_INCOMPATIBLE (-30784) - /** Invalid reuse of reader locktable slot */ -#define MDB_BAD_RSLOT (-30783) - /** Transaction must abort, has a child, or is invalid */ -#define MDB_BAD_TXN (-30782) - /** Unsupported size of key/DB name/data, or wrong DUPFIXED size */ -#define MDB_BAD_VALSIZE (-30781) - /** The specified DBI was changed unexpectedly */ -#define MDB_BAD_DBI (-30780) - /** Unexpected problem - txn should abort */ -#define MDB_PROBLEM (-30779) - /** The last defined error code */ -#define MDB_LAST_ERRCODE MDB_PROBLEM -/** @} */ - -/** @brief Statistics for a database in the environment */ -typedef struct MDB_stat { - unsigned ms_psize; /**< Size of a database page. - This is currently the same for all databases. */ - unsigned ms_depth; /**< Depth (height) of the B-tree */ - size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ - size_t ms_leaf_pages; /**< Number of leaf pages */ - size_t ms_overflow_pages; /**< Number of overflow pages */ - size_t ms_entries; /**< Number of data items */ -} MDB_stat; - -typedef struct MDBX_stat { - MDB_stat base; -#if MDBX_MODE_ENABLED - /* LY: TODO */ -#endif /* MDBX_MODE_ENABLED */ -} MDBX_stat; - -/** @brief Information about the environment */ -typedef struct MDB_envinfo { - void *me_mapaddr; /**< Address of map, if fixed */ - size_t me_mapsize; /**< Size of the data memory map */ - size_t me_last_pgno; /**< ID of the last used page */ - size_t me_last_txnid; /**< ID of the last committed transaction */ - unsigned me_maxreaders; /**< max reader slots in the environment */ - unsigned me_numreaders; /**< max reader slots used in the environment */ -} MDB_envinfo; - -typedef struct MDBX_envinfo { - MDB_envinfo base; -#if MDBX_MODE_ENABLED - size_t me_tail_txnid; /**< ID of the last reader transaction */ - size_t me_meta1_txnid, me_meta1_sign; - size_t me_meta2_txnid, me_meta2_sign; -#endif /* MDBX_MODE_ENABLED */ -} MDBX_envinfo; - - /** @brief Return the LMDB library version information. 
- * - * @param[out] major if non-NULL, the library major version number is copied here - * @param[out] minor if non-NULL, the library minor version number is copied here - * @param[out] patch if non-NULL, the library patch version number is copied here - * @retval "version string" The library version as a string - */ -char *mdb_version(int *major, int *minor, int *patch); - - /** @brief Return a string describing a given error code. - * - * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) - * function. If the error code is greater than or equal to 0, then the string - * returned by the system function strerror(3) is returned. If the error code - * is less than 0, an error string corresponding to the LMDB library error is - * returned. See @ref errors for a list of LMDB-specific error codes. - * @param[in] err The error code - * @retval "error message" The description of the error - */ -char *mdb_strerror(int err); - - /** @brief Create an LMDB environment handle. - * - * This function allocates memory for a #MDB_env structure. To release - * the allocated memory and discard the handle, call #mdb_env_close(). - * Before the handle may be used, it must be opened using #mdb_env_open(). - * Various other options may also need to be set before opening the handle, - * e.g. #mdb_env_set_mapsize(), #mdb_env_set_maxreaders(), #mdb_env_set_maxdbs(), - * depending on usage requirements. - * @param[out] env The address where the new handle will be stored - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_create(MDB_env **env); - - /** @brief Open an environment handle. - * - * If this function fails, #mdb_env_close() must be called to discard the #MDB_env handle. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] path The directory in which the database files reside. This - * directory must already exist and be writable. - * @param[in] flags Special options for this environment. 
This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - * Flags set by mdb_env_set_flags() are also used. - *
    - *
  • #MDB_FIXEDMAP - * use a fixed address for the mmap region. This flag must be specified - * when creating the environment, and is stored persistently in the environment. - * If successful, the memory map will always reside at the same virtual address - * and pointers used to reference data items in the database will be constant - * across multiple invocations. This option may not always work, depending on - * how the operating system has allocated memory to shared libraries and other uses. - * The feature is highly experimental. - *
  • #MDB_NOSUBDIR - * By default, LMDB creates its environment in a directory whose - * pathname is given in \b path, and creates its data and lock files - * under that directory. With this option, \b path is used as-is for - * the database main data file. The database lock file is the \b path - * with "-lock" appended. - *
  • #MDB_RDONLY - * Open the environment in read-only mode. No write operations will be - * allowed. LMDB will still modify the lock file - except on read-only - * filesystems, where LMDB does not use locks. - *
  • #MDB_WRITEMAP - * Use a writeable memory map unless MDB_RDONLY is set. This uses - * fewer mallocs but loses protection from application bugs - * like wild pointer writes and other bad updates into the database. - * This may be slightly faster for DBs that fit entirely in RAM, but - * is slower for DBs larger than RAM. - * Incompatible with nested transactions. - * Do not mix processes with and without MDB_WRITEMAP on the same - * environment. This can defeat durability (#mdb_env_sync etc). - *
  • #MDB_NOMETASYNC - * Flush system buffers to disk only once per transaction, omit the - * metadata flush. Defer that until the system flushes files to disk, - * or next non-MDB_RDONLY commit or #mdb_env_sync(). This optimization - * maintains database integrity, but a system crash may undo the last - * committed transaction. I.e. it preserves the ACI (atomicity, - * consistency, isolation) but not D (durability) database property. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDB_NOSYNC - * Don't flush system buffers to disk when committing a transaction. - * This optimization means a system crash can corrupt the database or - * lose the last transactions if buffers are not yet flushed to disk. - * The risk is governed by how often the system flushes dirty buffers - * to disk and how often #mdb_env_sync() is called. However, if the - * filesystem preserves write order and the #MDB_WRITEMAP flag is not - * used, transactions exhibit ACI (atomicity, consistency, isolation) - * properties and only lose D (durability). I.e. database integrity - * is maintained, but a system crash may undo the final transactions. - * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with no - * hint for when to write transactions to disk, unless #mdb_env_sync() - * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDB_MAPASYNC - * When using #MDB_WRITEMAP, use asynchronous flushes to disk. - * As with #MDB_NOSYNC, a system crash can then corrupt the - * database or lose the last transactions. Calling #mdb_env_sync() - * ensures on-disk database integrity until next commit. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDB_NOTLS - * Don't use Thread-Local Storage. Tie reader locktable slots to - * #MDB_txn objects instead of to threads. I.e. #mdb_txn_reset() keeps - * the slot reseved for the #MDB_txn object. A thread may use parallel - * read-only transactions. A read-only transaction may span threads if - * the user synchronizes its use. Applications that multiplex many - * user threads over individual OS threads need this option. Such an - * application must also serialize the write transactions in an OS - * thread, since LMDB's write locking is unaware of the user threads. - *
  • #MDB_NOLOCK - * Don't do any locking. If concurrent access is anticipated, the - * caller must manage all concurrency itself. For proper operation - * the caller must enforce single-writer semantics, and must ensure - * that no readers are using old transactions while a writer is - * active. The simplest approach is to use an exclusive lock so that - * no readers may be active at all when a writer begins. - *
  • #MDB_NORDAHEAD - * Turn off readahead. Most operating systems perform readahead on - * read requests by default. This option turns it off if the OS - * supports it. Turning it off may help random read performance - * when the DB is larger than RAM and system RAM is full. - *
  • #MDB_NOMEMINIT - * Don't initialize malloc'd memory before writing to unused spaces - * in the data file. By default, memory for pages written to the data - * file is obtained using malloc. While these pages may be reused in - * subsequent transactions, freshly malloc'd pages will be initialized - * to zeroes before use. This avoids persisting leftover data from other - * code (that used the heap and subsequently freed the memory) into the - * data file. Note that many other system libraries may allocate - * and free memory from the heap for arbitrary uses. E.g., stdio may - * use the heap for file I/O buffers. This initialization step has a - * modest performance cost so some applications may want to disable - * it using this flag. This option can be a problem for applications - * which handle sensitive data like passwords, and it makes memory - * checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP, - * which writes directly to the mmap instead of using malloc for pages. The - * initialization is also skipped if #MDB_RESERVE is used; the - * caller is expected to overwrite all of the memory that was - * reserved in that case. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDBX_COALESCE - * Aim to coalesce records while reclaiming FreeDB. - * This flag may be changed at any time using #mdb_env_set_flags(). - *
  • #MDBX_LIFORECLAIM - * LIFO policy for reclaiming FreeDB records. This significantly reduce - * write IPOS in case MDB_NOSYNC with periodically checkpoints. - *
- * @param[in] mode The UNIX permissions to set on created files and semaphores. - * This parameter is ignored on Windows. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_VERSION_MISMATCH - the version of the LMDB library doesn't match the - * version that created the database environment. - *
  • #MDB_INVALID - the environment file headers are corrupted. - *
  • ENOENT - the directory specified by the path parameter doesn't exist. - *
  • EACCES - the user didn't have permission to access the environment files. - *
  • EAGAIN - the environment was locked by another process. - *
- */ -int mdb_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode); - /** @brief Copy an LMDB environment to the specified path. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. - * @note This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdb_env_create(). It - * must have already been opened successfully. - * @param[in] path The directory in which the copy will reside. This - * directory must already exist and be writable but must otherwise be - * empty. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_copy(MDB_env *env, const char *path); - - /** @brief Copy an LMDB environment to the specified file descriptor. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. - * @note This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdb_env_create(). It - * must have already been opened successfully. - * @param[in] fd The filedescriptor to write the copy to. It must - * have already been opened for Write access. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd); - - /** @brief Copy an LMDB environment to the specified path, with options. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. 
- * @note This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdb_env_create(). It - * must have already been opened successfully. - * @param[in] path The directory in which the copy will reside. This - * directory must already exist and be writable but must otherwise be - * empty. - * @param[in] flags Special options for this operation. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
    - *
  • #MDB_CP_COMPACT - Perform compaction while copying: omit free - * pages and sequentially renumber all pages in output. This option - * consumes more CPU and runs more slowly than the default. - * Currently it fails if the environment has suffered a page leak. - *
- * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_copy2(MDB_env *env, const char *path, unsigned flags); - - /** @brief Copy an LMDB environment to the specified file descriptor, - * with options. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. See - * #mdb_env_copy2() for further details. - * @note This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdb_env_create(). It - * must have already been opened successfully. - * @param[in] fd The filedescriptor to write the copy to. It must - * have already been opened for Write access. - * @param[in] flags Special options for this operation. - * See #mdb_env_copy2() for options. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_copyfd2(MDB_env *env, mdb_filehandle_t fd, unsigned flags); - - /** @brief Return statistics about the LMDB environment. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] stat The address of an #MDB_stat structure - * where the statistics will be copied - */ -int mdb_env_stat(MDB_env *env, MDB_stat *stat); - - /** @brief Return information about the LMDB environment. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] info The address of an #MDB_envinfo structure - * where the information will be copied - */ -int mdb_env_info(MDB_env *env, MDB_envinfo *info); - - /** @brief Flush the data buffers to disk. - * - * Data is always written to disk when #mdb_txn_commit() is called, - * but the operating system may keep it buffered. LMDB always flushes - * the OS buffers upon commit as well, unless the environment was - * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC.
This call is - * not valid if the environment was opened with #MDB_RDONLY. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] force If non-zero, force a synchronous flush. Otherwise - * if the environment has the #MDB_NOSYNC flag set the flushes - * will be omitted, and with #MDB_MAPASYNC they will be asynchronous. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EACCES - the environment is read-only. - *
  • EINVAL - an invalid parameter was specified. - *
  • EIO - an error occurred during synchronization. - *
- */ -int mdb_env_sync(MDB_env *env, int force); - - /** @brief Close the environment and release the memory map. - * - * Only a single thread may call this function. All transactions, databases, - * and cursors must already be closed before calling this function. Attempts to - * use any such handles after calling this function will cause a SIGSEGV. - * The environment handle will be freed and must not be used again after this call. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] dont_sync A don't-sync flag: if non-zero, the last checkpoint - * (meta-page update) will be kept "as is" and may still be "weak" - * in NOSYNC/MAPASYNC modes. Such a "weak" checkpoint will be ignored - * on the next open, and transactions since the last non-weak - * checkpoint (meta-page update) will be rolled back to guarantee consistency. - */ -void mdb_env_close(MDB_env *env); - - /** @brief Set environment flags. - * - * This may be used to set some flags in addition to those from - * #mdb_env_open(), or to unset these flags. If several threads - * change the flags at the same time, the result is undefined. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] flags The flags to change, bitwise OR'ed together - * @param[in] onoff A non-zero value sets the flags, zero clears them. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_set_flags(MDB_env *env, unsigned flags, int onoff); - - /** @brief Get environment flags. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] flags The address of an integer to store the flags - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_get_flags(MDB_env *env, unsigned *flags); - - /** @brief Return the path that was used in #mdb_env_open(). - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] path Address of a string pointer to contain the path. This - * is the actual string in the environment, not a copy. It should not be - * altered in any way. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_get_path(MDB_env *env, const char **path); - - /** @brief Return the filedescriptor for the given environment. - * - * This function may be called after fork(), so the descriptor can be - * closed before exec*(). Other LMDB file descriptors have FD_CLOEXEC. - * (Until LMDB 0.9.18, only the lockfile had that.) - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] fd Address of a mdb_filehandle_t to contain the descriptor. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd); - - /** @brief Set the size of the memory map to use for this environment. - * - * The size should be a multiple of the OS page size. The default is - * 10485760 bytes. The size of the memory map is also the maximum size - * of the database. The value should be chosen as large as possible, - * to accommodate future growth of the database. - * This function should be called after #mdb_env_create() and before #mdb_env_open(). - * It may be called at later times if no transactions are active in - * this process. Note that the library does not check for this condition, - * the caller must ensure it explicitly. - * - * The new size takes effect immediately for the current process but - * will not be persisted to any others until a write transaction has been - * committed by the current process. Also, only mapsize increases are - * persisted into the environment. - * - * If the mapsize is increased by another process, and data has grown - * beyond the range of the current mapsize, #mdb_txn_begin() will - * return #MDB_MAP_RESIZED. This function may be called with a size - * of zero to adopt the new size. - * - * Any attempt to set a size smaller than the space already consumed - * by the environment will be silently changed to the current size of the used space. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] size The size in bytes - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified, or the environment has - * an active write transaction. - *
- */ -int mdb_env_set_mapsize(MDB_env *env, size_t size); - - /** @brief Set the maximum number of threads/reader slots for the environment. - * - * This defines the number of slots in the lock table that is used to track readers in - * the environment. The default is 126. - * Starting a read-only transaction normally ties a lock table slot to the - * current thread until the environment closes or the thread exits. If - * MDB_NOTLS is in use, #mdb_txn_begin() instead ties the slot to the - * MDB_txn object until it or the #MDB_env object is destroyed. - * This function may only be called after #mdb_env_create() and before #mdb_env_open(). - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] readers The maximum number of reader lock table slots - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified, or the environment is already open. - *
- */ -int mdb_env_set_maxreaders(MDB_env *env, unsigned readers); - - /** @brief Get the maximum number of threads/reader slots for the environment. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] readers Address of an integer to store the number of readers - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_env_get_maxreaders(MDB_env *env, unsigned *readers); - - /** @brief Set the maximum number of named databases for the environment. - * - * This function is only needed if multiple databases will be used in the - * environment. Simpler applications that use the environment as a single - * unnamed database can ignore this option. - * This function may only be called after #mdb_env_create() and before #mdb_env_open(). - * - * Currently a moderate number of slots are cheap but a huge number gets - * expensive: 7-120 words per transaction, and every #mdb_dbi_open() - * does a linear search of the opened slots. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] dbs The maximum number of databases - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified, or the environment is already open. - *
- */ -int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); - - /** @brief Get the maximum size of keys and #MDB_DUPSORT data we can write. - * - * Depends on the compile-time constant #MDB_MAXKEYSIZE. Default 511. - * See @ref MDB_val. - * @param[in] env An environment handle returned by #mdb_env_create() - * @return The maximum size of a key we can write - */ -int mdb_env_get_maxkeysize(MDB_env *env); - - /** @brief Set application information associated with the #MDB_env. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] ctx An arbitrary pointer for whatever the application needs. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_set_userctx(MDB_env *env, void *ctx); - - /** @brief Get the application information associated with the #MDB_env. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @return The pointer set by #mdb_env_set_userctx(). - */ -void *mdb_env_get_userctx(MDB_env *env); - - /** @brief A callback function for most LMDB assert() failures, - * called before printing the message and aborting. - * - * @param[in] env An environment handle returned by #mdb_env_create(). - * @param[in] msg The assertion message, not including newline. - */ -typedef void MDB_assert_func(MDB_env *env, const char *msg, - const char *function, unsigned line); - - /** Set or reset the assert() callback of the environment. - * Disabled if liblmdb is built with MDB_DEBUG=0. - * @note This hack should become obsolete as lmdb's error handling matures. - * @param[in] env An environment handle returned by #mdb_env_create(). - * @param[in] func An #MDB_assert_func function, or 0. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func); - - /** @brief Create a transaction for use with the environment. - * - * The transaction handle may be discarded using #mdb_txn_abort() or #mdb_txn_commit().
- * @note A transaction and its cursors must only be used by a single - * thread, and a thread may only have a single transaction at a time. - * If #MDB_NOTLS is in use, this does not apply to read-only transactions. - * @note Cursors may not span transactions. - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] parent If this parameter is non-NULL, the new transaction - * will be a nested transaction, with the transaction indicated by \b parent - * as its parent. Transactions may be nested to any level. A parent - * transaction and its cursors may not issue any other operations than - * mdb_txn_commit and mdb_txn_abort while it has active child transactions. - * @param[in] flags Special options for this transaction. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
    - *
  • #MDB_RDONLY - * This transaction will not perform any write operations. - *
- * @param[out] txn Address where the new #MDB_txn handle will be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_PANIC - a fatal error occurred earlier and the environment - * must be shut down. - *
  • #MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's - * mapsize and this environment's map must be resized as well. - * See #mdb_env_set_mapsize(). - *
  • #MDB_READERS_FULL - a read-only transaction was requested and - * the reader lock table is full. See #mdb_env_set_maxreaders(). - *
  • ENOMEM - out of memory. - *
- */ -int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, MDB_txn **txn); - - /** @brief Returns the transaction's #MDB_env - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - */ -MDB_env *mdb_txn_env(MDB_txn *txn); - - /** @brief Return the transaction's ID. - * - * This returns the identifier associated with this transaction. For a - * read-only transaction, this corresponds to the snapshot being read; - * concurrent readers will frequently have the same transaction ID. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @return A transaction ID, valid if input is an active transaction. - */ -size_t mdb_txn_id(MDB_txn *txn); - - /** @brief Commit all the operations of a transaction into the database. - * - * The transaction handle is freed. It and its cursors must not be used - * again after this call, except with #mdb_cursor_renew(). - * - * @note MDBX-mode: - * A cursor must be closed explicitly always, before - * or after its transaction ends. It can be reused with - * #mdb_cursor_renew() before finally closing it. - * - * @note LMDB-compatible mode: - * Earlier documentation incorrectly said all cursors would be freed. - * Only write-transactions free cursors. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
  • ENOSPC - no more disk space. - *
  • EIO - a low-level I/O error occurred while writing. - *
  • ENOMEM - out of memory. - *
- */ -int mdb_txn_commit(MDB_txn *txn); - - /** @brief Abandon all the operations of the transaction instead of saving them. - * - * The transaction handle is freed. It and its cursors must not be used - * again after this call, except with #mdb_cursor_renew(). - * - * @note MDBX-mode: - * A cursor must be closed explicitly always, before - * or after its transaction ends. It can be reused with - * #mdb_cursor_renew() before finally closing it. - * - * @note LMDB-compatible mode: - * Earlier documentation incorrectly said all cursors would be freed. - * Only write-transactions free cursors. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - */ -int mdb_txn_abort(MDB_txn *txn); - - /** @brief Reset a read-only transaction. - * - * Abort the transaction like #mdb_txn_abort(), but keep the transaction - * handle. #mdb_txn_renew() may reuse the handle. This saves allocation - * overhead if the process will start a new read-only transaction soon, - * and also locking overhead if #MDB_NOTLS is in use. The reader table - * lock is released, but the table slot stays tied to its thread or - * #MDB_txn. Use mdb_txn_abort() to discard a reset handle, and to free - * its lock table slot if MDB_NOTLS is in use. - * Cursors opened within the transaction must not be used - * again after this call, except with #mdb_cursor_renew(). - * Reader locks generally don't interfere with writers, but they keep old - * versions of database pages allocated. Thus they prevent the old pages - * from being reused when writers commit new data, and so under heavy load - * the database size may grow much more rapidly than otherwise. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - */ -int mdb_txn_reset(MDB_txn *txn); - - /** @brief Renew a read-only transaction. - * - * This acquires a new reader lock for a transaction handle that had been - * released by #mdb_txn_reset(). It must be called before a reset transaction - * may be used again. 
- * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_PANIC - a fatal error occurred earlier and the environment - * must be shut down. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_txn_renew(MDB_txn *txn); - -/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ -#define mdb_open(txn,name,flags,dbi) mdb_dbi_open(txn,name,flags,dbi) -/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ -#define mdb_close(env,dbi) mdb_dbi_close(env,dbi) - - /** @brief Open a database in the environment. - * - * A database handle denotes the name and parameters of a database, - * independently of whether such a database exists. - * The database handle may be discarded by calling #mdb_dbi_close(). - * The old database handle is returned if the database was already open. - * The handle may only be closed once. - * - * The database handle will be private to the current transaction until - * the transaction is successfully committed. If the transaction is - * aborted the handle will be closed automatically. - * After a successful commit the handle will reside in the shared - * environment, and may be used by other transactions. - * - * This function must not be called from multiple concurrent - * transactions in the same process. A transaction that uses - * this function must finish (either commit or abort) before - * any other transaction in the process may use this function. - * - * To use named databases (with name != NULL), #mdb_env_set_maxdbs() - * must be called before opening the environment. Database names are - * keys in the unnamed database, and may be read but not written. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] name The name of the database to open. If only a single - * database is needed in the environment, this value may be NULL. - * @param[in] flags Special options for this database. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
    - *
  • #MDB_REVERSEKEY - * Keys are strings to be compared in reverse order, from the end - * of the strings to the beginning. By default, Keys are treated as strings and - * compared from beginning to end. - *
  • #MDB_DUPSORT - * Duplicate keys may be used in the database. (Or, from another perspective, - * keys may have multiple data items, stored in sorted order.) By default - * keys must be unique and may have only a single data item. - *
  • #MDB_INTEGERKEY - * Keys are binary integers in native byte order, either unsigned int - * or #mdb_size_t, and will be sorted as such. - * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdb_size_t.) - * The keys must all be of the same size. - *
  • #MDB_DUPFIXED - * This flag may only be used in combination with #MDB_DUPSORT. This option - * tells the library that the data items for this database are all the same - * size, which allows further optimizations in storage and retrieval. When - * all data items are the same size, the #MDB_GET_MULTIPLE, #MDB_NEXT_MULTIPLE - * and #MDB_PREV_MULTIPLE cursor operations may be used to retrieve multiple - * items at once. - *
  • #MDB_INTEGERDUP - * This option specifies that duplicate data items are binary integers, - * similar to #MDB_INTEGERKEY keys. - *
  • #MDB_REVERSEDUP - * This option specifies that duplicate data items should be compared as - * strings in reverse order. - *
  • #MDB_CREATE - * Create the named database if it doesn't exist. This option is not - * allowed in a read-only transaction or a read-only environment. - *
- * @param[out] dbi Address where the new #MDB_dbi handle will be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_NOTFOUND - the specified database doesn't exist in the environment - * and #MDB_CREATE was not specified. - *
  • #MDB_DBS_FULL - too many databases have been opened. See #mdb_env_set_maxdbs(). - *
- */ -int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi); - - /** @brief Retrieve statistics for a database. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[out] stat The address of an #MDB_stat structure - * where the statistics will be copied - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat); - - /** @brief Retrieve the DB flags for a database handle. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[out] flags Address where the flags will be returned. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); - - /** @brief Close a database handle. Normally unnecessary. Use with care: - * - * This call is not mutex protected. Handles should only be closed by - * a single thread, and only if no other threads are going to reference - * the database handle or one of its cursors any further. Do not close - * a handle if an existing transaction has modified its database. - * Doing so can cause misbehavior from database corruption to errors - * like MDB_BAD_VALSIZE (since the DB name is gone). - * - * Closing a database handle is not necessary, but lets #mdb_dbi_open() - * reuse the handle value. Usually it's better to set a bigger - * #mdb_env_set_maxdbs(), unless that value would be large. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - */ -void mdb_dbi_close(MDB_env *env, MDB_dbi dbi); - - /** @brief Empty or delete+close a database. - * - * See #mdb_dbi_close() for restrictions about closing the DB handle. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] del 0 to empty the DB, 1 to delete it from the - * environment and close the DB handle. - * @return A non-zero error value on failure and 0 on success. - */ -int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del); - - /** @brief Set a custom key comparison function for a database. 
- * - * The comparison function is called whenever it is necessary to compare a - * key specified by the application with a key currently stored in the database. - * If no comparison function is specified, and no special key flags were specified - * with #mdb_dbi_open(), the keys are compared lexically, with shorter keys collating - * before longer keys. - * @warning This function must be called before any data access functions are used, - * otherwise data corruption may occur. The same comparison function must be used by every - * program accessing the database, every time the database is used. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] cmp A #MDB_cmp_func function - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); - - /** @brief Set a custom data comparison function for a #MDB_DUPSORT database. - * - * This comparison function is called whenever it is necessary to compare a data - * item specified by the application with a data item currently stored in the database. - * This function only takes effect if the database was opened with the #MDB_DUPSORT - * flag. - * If no comparison function is specified, and no special key flags were specified - * with #mdb_dbi_open(), the data items are compared lexically, with shorter items collating - * before longer items. - * @warning This function must be called before any data access functions are used, - * otherwise data corruption may occur. The same comparison function must be used by every - * program accessing the database, every time the database is used. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] cmp A #MDB_cmp_func function - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); - - /** @brief Set a relocation function for a #MDB_FIXEDMAP database. - * - * @todo The relocation function is called whenever it is necessary to move the data - * of an item to a different position in the database (e.g. through tree - * balancing operations, shifts as a result of adds or deletes, etc.). It is - * intended to allow address/position-dependent data items to be stored in - * a database in an environment opened with the #MDB_FIXEDMAP option. - * Currently the relocation feature is unimplemented and setting - * this function has no effect. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] rel A #MDB_rel_func function - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel); - - /** @brief Set a context pointer for a #MDB_FIXEDMAP database's relocation function. - * - * See #mdb_set_relfunc and #MDB_rel_func for more details. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] ctx An arbitrary pointer for whatever the application needs. - * It will be passed to the callback function set by #mdb_set_relfunc - * as its \b relctx parameter whenever the callback is invoked. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx); - - /** @brief Get items from a database. - * - * This function retrieves key/data pairs from the database. The address - * and length of the data associated with the specified \b key are returned - * in the structure to which \b data refers. - * If the database supports duplicate keys (#MDB_DUPSORT) then the - * first data item for the key will be returned. Retrieval of other - * items requires the use of #mdb_cursor_get(). - * - * @note The memory pointed to by the returned values is owned by the - * database. The caller need not dispose of the memory, and may not - * modify it in any way. For values returned in a read-only transaction - * any modification attempts will cause a SIGSEGV. - * @note Values returned from the database are valid only until a - * subsequent update operation, or the end of the transaction. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] key The key to search for in the database - * @param[out] data The data corresponding to the key - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_NOTFOUND - the key was not in the database. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); - - /** @brief Store items into a database. - * - * This function stores key/data pairs in the database. The default behavior - * is to enter the new key/data pair, replacing any previously existing key - * if duplicates are disallowed, or adding a duplicate data item if - * duplicates are allowed (#MDB_DUPSORT). - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] key The key to store in the database - * @param[in,out] data The data to store - * @param[in] flags Special options for this operation. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
    - *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not - * already appear in the database. This flag may only be specified - * if the database was opened with #MDB_DUPSORT. The function will - * return #MDB_KEYEXIST if the key/data pair already appears in the - * database. - *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key - * does not already appear in the database. The function will return - * #MDB_KEYEXIST if the key already appears in the database, even if - * the database supports duplicates (#MDB_DUPSORT). The \b data - * parameter will be set to point to the existing item. - *
  • #MDB_RESERVE - reserve space for data of the given size, but - * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later - before - * the next update operation or the transaction ends. This saves - * an extra memcpy if the data is being generated later. - * LMDB does nothing else with this memory, the caller is expected - * to modify all of the space requested. This flag must not be - * specified if the database was opened with #MDB_DUPSORT. - *
  • #MDB_APPEND - append the given key/data pair to the end of the - * database. This option allows fast bulk loading when keys are - * already known to be in the correct order. Loading unsorted keys - * with this flag will cause a #MDB_KEYEXIST error. - *
  • #MDB_APPENDDUP - as above, but for sorted dup data. - *
- * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). - *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, - unsigned flags); - - /** @brief Delete items from a database. - * - * This function removes key/data pairs from the database. - * - * MDBX-mode: - * The data parameter is NOT ignored regardless the database does - * support sorted duplicate data items or not. If the data parameter - * is non-NULL only the matching data item will be deleted. - * - * LMDB-compatible mode: - * If the database does not support sorted duplicate data items - * (#MDB_DUPSORT) the data parameter is ignored. - * If the database supports sorted duplicates and the data parameter - * is NULL, all of the duplicate data items for the key will be - * deleted. Otherwise, if the data parameter is non-NULL - * only the matching data item will be deleted. - * - * This function will return #MDB_NOTFOUND if the specified key/data - * pair is not in the database. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] key The key to delete from the database - * @param[in] data The data to delete - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); - - /** @brief Create a cursor handle. - * - * A cursor is associated with a specific transaction and database. - * A cursor cannot be used when its database handle is closed. Nor - * when its transaction has ended, except with #mdb_cursor_renew(). - * It can be discarded with #mdb_cursor_close(). - * - * MDBX-mode: - * A cursor must be closed explicitly always, before - * or after its transaction ends. It can be reused with - * #mdb_cursor_renew() before finally closing it. - * - * LMDB-compatible mode: - * A cursor in a write-transaction can be closed before its transaction - * ends, and will otherwise be closed when its transaction ends. - * A cursor in a read-only transaction must be closed explicitly, before - * or after its transaction ends. It can be reused with - * #mdb_cursor_renew() before finally closing it. - * @note Earlier documentation said that cursors in every transaction - * were closed when the transaction committed or aborted. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[out] cursor Address where the new #MDB_cursor handle will be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); - - /** @brief Close a cursor handle. - * - * The cursor handle will be freed and must not be used again after this call. - * Its transaction must still be live if it is a write-transaction. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - */ -void mdb_cursor_close(MDB_cursor *cursor); - - /** @brief Renew a cursor handle. - * - * A cursor is associated with a specific transaction and database. - * Cursors that are only used in read-only - * transactions may be re-used, to avoid unnecessary malloc/free overhead. - * The cursor may be associated with a new read-only transaction, and - * referencing the same database handle as it was created with. - * This may be done whether the previous transaction is live or dead. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); - - /** @brief Return the cursor's transaction handle. - * - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - */ -MDB_txn *mdb_cursor_txn(MDB_cursor *cursor); - - /** @brief Return the cursor's database handle. - * - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - */ -MDB_dbi mdb_cursor_dbi(MDB_cursor *cursor); - - /** @brief Retrieve by cursor. - * - * This function retrieves key/data pairs from the database. The address and length - * of the key are returned in the object to which \b key refers (except for the - * case of the #MDB_SET option, in which the \b key object is unchanged), and - * the address and length of the data are returned in the object to which \b data - * refers. - * See #mdb_get() for restrictions on using the output values. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[in,out] key The key for a retrieved item - * @param[in,out] data The data of a retrieved item - * @param[in] op A cursor operation #MDB_cursor_op - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_NOTFOUND - no matching key found. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, - MDB_cursor_op op); - - /** @brief Store by cursor. - * - * This function stores key/data pairs into the database. - * The cursor is positioned at the new item, or on failure usually near it. - * @note Earlier documentation incorrectly said errors would leave the - * state of the cursor unchanged. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[in] key The key operated on. - * @param[in] data The data operated on. - * @param[in] flags Options for this operation. This parameter - * must be set to 0 or one of the values described here. - *
    - *
  • #MDB_CURRENT - replace the item at the current cursor position. - * The \b key parameter must still be provided, and must match it. - * If using sorted duplicates (#MDB_DUPSORT) the data item must still - * sort into the same place. This is intended to be used when the - * new data is the same size as the old. Otherwise it will simply - * perform a delete of the old record followed by an insert. - *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not - * already appear in the database. This flag may only be specified - * if the database was opened with #MDB_DUPSORT. The function will - * return #MDB_KEYEXIST if the key/data pair already appears in the - * database. - *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key - * does not already appear in the database. The function will return - * #MDB_KEYEXIST if the key already appears in the database, even if - * the database supports duplicates (#MDB_DUPSORT). - *
  • #MDB_RESERVE - reserve space for data of the given size, but - * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later - before - * the next update operation or the transaction ends. This saves - * an extra memcpy if the data is being generated later. This flag - * must not be specified if the database was opened with #MDB_DUPSORT. - *
  • #MDB_APPEND - append the given key/data pair to the end of the - * database. No key comparisons are performed. This option allows - * fast bulk loading when keys are already known to be in the - * correct order. Loading unsorted keys with this flag will cause - * a #MDB_KEYEXIST error. - *
  • #MDB_APPENDDUP - as above, but for sorted dup data. - *
  • #MDB_MULTIPLE - store multiple contiguous data elements in a - * single request. This flag may only be specified if the database - * was opened with #MDB_DUPFIXED. The \b data argument must be an - * array of two MDB_vals. The mv_size of the first MDB_val must be - * the size of a single data element. The mv_data of the first MDB_val - * must point to the beginning of the array of contiguous data elements. - * The mv_size of the second MDB_val must be the count of the number - * of data elements to store. On return this field will be set to - * the count of the number of elements actually written. The mv_data - * of the second MDB_val is unused. - *
- * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • #MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). - *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, - unsigned flags); - - /** @brief Delete current key/data pair - * - * This function deletes the key/data pair to which the cursor refers. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[in] flags Options for this operation. This parameter - * must be set to 0 or one of the values described here. - *
    - *
  • #MDB_NODUPDATA - delete all of the data items for the current key. - * This flag may only be specified if the database was opened with #MDB_DUPSORT. - *
- * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdb_cursor_del(MDB_cursor *cursor, unsigned flags); - - /** @brief Return count of duplicates for current key. - * - * This call is only valid on databases that support sorted duplicate - * data items #MDB_DUPSORT. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[out] countp Address where the count will be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - cursor is not initialized, or an invalid parameter was specified. - *
- */ -int mdb_cursor_count(MDB_cursor *cursor, size_t *countp); - - /** @brief Compare two data items according to a particular database. - * - * This returns a comparison as if the two data items were keys in the - * specified database. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] a The first item to compare - * @param[in] b The second item to compare - * @return < 0 if a < b, 0 if a == b, > 0 if a > b - */ -int mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); - - /** @brief Compare two data items according to a particular database. - * - * This returns a comparison as if the two items were data items of - * the specified database. The database must have the #MDB_DUPSORT flag. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] a The first item to compare - * @param[in] b The second item to compare - * @return < 0 if a < b, 0 if a == b, > 0 if a > b - */ -int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); - - /** @brief A callback function used to print a message from the library. - * - * @param[in] msg The string to be printed. - * @param[in] ctx An arbitrary context pointer for the callback. - * @return < 0 on failure, >= 0 on success. - */ -typedef int (MDB_msg_func)(const char *msg, void *ctx); - - /** @brief Dump the entries in the reader lock table. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] func A #MDB_msg_func function - * @param[in] ctx Anything the message function needs - * @return < 0 on failure, >= 0 on success. - */ -int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); - - /** @brief Check for stale entries in the reader lock table. 
- * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[out] dead Number of stale slots that were cleared - * @return 0 on success, non-zero on failure. - */ -int mdb_reader_check(MDB_env *env, int *dead); -/** @} */ - -char* mdb_dkey(MDB_val *key, char *buf); - -#ifdef __cplusplus -} -#endif -/** @page tools LMDB Command Line Tools - The following describes the command line tools that are available for LMDB. - \li \ref mdb_chk_1 - \li \ref mdb_copy_1 - \li \ref mdb_dump_1 - \li \ref mdb_load_1 - \li \ref mdb_stat_1 -*/ - -#endif /* _LMDB_H_ */ diff --git a/mdb.c b/mdb.c deleted file mode 100644 index c26cf0d1..00000000 --- a/mdb.c +++ /dev/null @@ -1,10723 +0,0 @@ -/* - * Copyright 2015-2017 Leonid Yuriev . - * - * This code is derived from "LMDB engine" written by - * Howard Chu (Symas Corporation), which itself derived from btree.c - * written by Martin Hedenfalk. - * - * --- - * - * Portions Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - * - * --- - * - * Portions Copyright (c) 2009, 2010 Martin Hedenfalk - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#ifndef MDB_DEBUG -# define MDB_DEBUG 0 -#endif - -#ifndef _GNU_SOURCE -# define _GNU_SOURCE -#endif - -/* LY: Please do not ask us for Windows support, just never! - * But you can make a fork for Windows, or become maintainer for FreeBSD... */ -#ifndef __gnu_linux__ -# warning "libmdbx supports only GNU Linux" -#endif - -#include - -#if !defined(__GNUC__) || !__GNUC_PREREQ(4,2) - /* LY: Actualy libmdbx was not tested with compilers - * older than GCC 4.4 (from RHEL6). - * But you could remove this #error and try to continue at your own risk. - * In such case please don't rise up an issues related ONLY to old compilers. - */ -# warning "libmdbx required at least GCC 4.2 compatible C/C++ compiler." -#endif - -#if !defined(__GNU_LIBRARY__) || !__GLIBC_PREREQ(2,12) - /* LY: Actualy libmdbx was not tested with something - * older than glibc 2.12 (from RHEL6). - * But you could remove this #error and try to continue at your own risk. - * In such case please don't rise up an issues related ONLY to old systems. - */ -# warning "libmdbx required at least GLIBC 2.12." 
-#endif - -#if MDB_DEBUG -# undef NDEBUG -#endif - -#include "./reopen.h" -#include "./barriers.h" - -#include -#include -#include -#include -#include -#ifdef HAVE_SYS_FILE_H -# include -#endif -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) -# include -# include /* defines BYTE_ORDER on HPUX and Solaris */ -#endif - -#ifndef _POSIX_SYNCHRONIZED_IO -# define fdatasync fsync -#endif - -#ifndef BYTE_ORDER -# if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) - /* Solaris just defines one or the other */ -# define LITTLE_ENDIAN 1234 -# define BIG_ENDIAN 4321 -# ifdef _LITTLE_ENDIAN -# define BYTE_ORDER LITTLE_ENDIAN -# else -# define BYTE_ORDER BIG_ENDIAN -# endif -# else -# define BYTE_ORDER __BYTE_ORDER -# endif -#endif - -#ifndef LITTLE_ENDIAN -# define LITTLE_ENDIAN __LITTLE_ENDIAN -#endif -#ifndef BIG_ENDIAN -# define BIG_ENDIAN __BIG_ENDIAN -#endif - -#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) -# define MISALIGNED_OK 1 -#endif - -#include "./lmdb.h" -#include "./midl.h" - -#if ! MDBX_MODE_ENABLED -# define MDBX_COALESCE 0 -# define MDBX_LIFORECLAIM 0 -# define MDBX_DBG_ASSERT 0 -# define MDBX_DBG_PRINT 0 -# define MDBX_DBG_TRACE 0 -# define MDBX_DBG_EXTRA 0 -# define MDBX_DBG_AUDIT 0 -# define MDBX_DBG_EDGE 0 -# define mdb_runtime_flags 0 -# define mdb_debug_logger ((void (*)(int, ...)) NULL) -# define MDBX_ONLY_FEATURE static -#else -# define MDBX_ONLY_FEATURE -#endif /* ! 
MDBX_MODE_ENABLED */ - -#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) -# error "Unknown or unsupported endianness (BYTE_ORDER)" -#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF -# error "Two's complement, reasonably sized integer types, please" -#endif - -/** @defgroup internal LMDB Internals - * @{ - */ -/** @defgroup compat Compatibility Macros - * A bunch of macros to minimize the amount of platform-specific ifdefs - * needed throughout the rest of the code. When the features this library - * needs are similar enough to POSIX to be hidden in a one-or-two line - * replacement, this macro approach is used. - * @{ - */ - - /** Features under development */ -#ifndef MDB_DEVEL -# define MDB_DEVEL 0 -#endif - - /** Wrapper around __func__, which is a C99 feature */ -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L -# define mdb_func_ __func__ -#elif (defined(__GNUC__) && __GNUC__ >= 2) || defined(__clang__) -# define mdb_func_ __FUNCTION__ -#else - /* If a debug message says (), update the #if statements above */ -# define mdb_func_ "" -#endif - -/** Some platforms define the EOWNERDEAD error code - * even though they don't support Robust Mutexes. - * Compile with -DMDB_USE_ROBUST=0. - */ -#ifndef MDB_USE_ROBUST - /* Howard Chu: Android currently lacks Robust Mutex support */ -# if defined(EOWNERDEAD) && !defined(ANDROID) \ - /* LY: glibc before 2.10 has a troubles with Robust Mutex too. */ \ - && __GLIBC_PREREQ(2,10) -# define MDB_USE_ROBUST 1 -# else -# define MDB_USE_ROBUST 0 -# endif -#endif /* MDB_USE_ROBUST */ - -/* Internal error codes, not exposed outside liblmdb */ -#define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) - - /** Mutex for the reader table (rw = r) or write transaction (rw = w). - */ -#define MDB_MUTEX(env, rw) \ - (&(env)->me_txns->mti_##rw##mutex) - - /** An abstraction for a file handle. - * On POSIX systems file handles are small integers. On Windows - * they're opaque pointers. 
- */ -#define HANDLE int - - /** A value for an invalid file handle. - * Mainly used to initialize file variables and signify that they are - * unused. - */ -#define INVALID_HANDLE_VALUE (-1) - - /** Get the size of a memory page for the system. - * This is the basic size that the platform's memory manager uses, and is - * fundamental to the use of memory-mapped files. - */ -#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) - -/** @} */ - -static int mdb_mutex_lock(MDB_env *env, pthread_mutex_t *mutex); -static int mdb_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, int rc); -static void mdb_mutex_unlock(MDB_env *env, pthread_mutex_t *mutex); - - /** A page number in the database. - * Note that 64 bit page numbers are overkill, since pages themselves - * already represent 12-13 bits of addressable memory, and the OS will - * always limit applications to a maximum of 63 bits of address space. - * - * @note In the #MDB_node structure, we only store 48 bits of this value, - * which thus limits us to only 60 bits of addressable data. - */ -typedef MDB_ID pgno_t; - - /** A transaction ID. - * See struct MDB_txn.mt_txnid for details. - */ -typedef MDB_ID txnid_t; - -/** @defgroup debug Debug Macros - * @{ - */ - /** Debuging output value of a cursor DBI: Negative in a sub-cursor. */ -#define DDBI(mc) \ - (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) -/** @} */ - - /** @brief The maximum size of a database page. - * - * It is 32k or 64k, since value-PAGEBASE must fit in - * #MDB_page.%mp_upper. - * - * LMDB will use database pages < OS pages if needed. - * That causes more I/O in write transactions: The OS must - * know (read) the whole page before writing a partial page. - * - * Note that we don't currently support Huge pages. On Linux, - * regular data files cannot use Huge pages, and in general - * Huge pages aren't actually pageable. 
We rely on the OS - * demand-pager to read our data and page it out when memory - * pressure from other processes is high. So until OSs have - * actual paging support for Huge pages, they're not viable. - */ -#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) - - /** The minimum number of keys required in a database page. - * Setting this to a larger value will place a smaller bound on the - * maximum size of a data item. Data items larger than this size will - * be pushed into overflow pages instead of being stored directly in - * the B-tree node. This value used to default to 4. With a page size - * of 4096 bytes that meant that any item larger than 1024 bytes would - * go into an overflow page. That also meant that on average 2-3KB of - * each overflow page was wasted space. The value cannot be lower than - * 2 because then there would no longer be a tree structure. With this - * value, items larger than 2KB will go into overflow pages, and on - * average only 1KB will be wasted. - */ -#define MDB_MINKEYS 2 - - /** A stamp that identifies a file as an LMDB file. - * There's nothing special about this value other than that it is easily - * recognizable, and it will reflect any byte order mismatches. - */ -#define MDB_MAGIC 0xBEEFC0DE - - /** The version number for a database's datafile format. */ -#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) - /** The version number for a database's lockfile format. */ -#define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) - - /** @brief The max size of a key we can write, or 0 for computed max. - * - * This macro should normally be left alone or set to 0. - * Note that a database with big keys or dupsort data cannot be - * reliably modified by a liblmdb which uses a smaller max. - * The default is 511 for backwards compat, or 0 when #MDB_DEVEL. - * - * Other values are allowed, for backwards compat. 
However: - * A value bigger than the computed max can break if you do not - * know what you are doing, and liblmdb <= 0.9.10 can break when - * modifying a DB with keys/dupsort data bigger than its max. - * - * Data items in an #MDB_DUPSORT database are also limited to - * this size, since they're actually keys of a sub-DB. Keys and - * #MDB_DUPSORT data items must fit on a node in a regular page. - */ -#ifndef MDB_MAXKEYSIZE -# define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) -#endif - - /** The maximum size of a key we can write to the environment. */ -#if MDB_MAXKEYSIZE -# define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) -#else -# define ENV_MAXKEY(env) ((env)->me_maxkey_limit) -#endif /* MDB_MAXKEYSIZE */ - - /** @brief The maximum size of a data item. - * - * We only store a 32 bit value for node sizes. - */ -#define MAXDATASIZE 0xffffffffUL - - /** Key size which fits in a #DKBUF. - * @ingroup debug - */ -#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) - /** A key buffer. - * @ingroup debug - * This is used for printing a hex dump of a key's contents. - */ -#define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1] - /** Display a key in hex. - * @ingroup debug - * Invoke a function to display a key in hex. - */ -#define DKEY(x) mdb_dkey(x, kbuf) - - /** An invalid page number. - * Mainly used to denote an empty tree. - */ -#define P_INVALID (~(pgno_t)0) - - /** Test if the flags \b f are set in a flag word \b w. */ -#define F_ISSET(w, f) (((w) & (f)) == (f)) - - /** Round \b n up to an even number. */ -#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ - - /** Used for offsets within a single page. - * Since memory pages are typically 4 or 8KB in size, 12-13 bits, - * this is plenty. - */ -typedef uint16_t indx_t; - - /** Default size of memory map. - * This is certainly too small for any actual applications. Apps should always set - * the size explicitly using #mdb_env_set_mapsize(). 
- */ -#define DEFAULT_MAPSIZE 1048576 - -/** @defgroup readers Reader Lock Table - * Readers don't acquire any locks for their data access. Instead, they - * simply record their transaction ID in the reader table. The reader - * mutex is needed just to find an empty slot in the reader table. The - * slot's address is saved in thread-specific data so that subsequent read - * transactions started by the same thread need no further locking to proceed. - * - * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. - * - * No reader table is used if the database is on a read-only filesystem, or - * if #MDB_NOLOCK is set. - * - * Since the database uses multi-version concurrency control, readers don't - * actually need any locking. This table is used to keep track of which - * readers are using data from which old transactions, so that we'll know - * when a particular old transaction is no longer in use. Old transactions - * that have discarded any data pages can then have those pages reclaimed - * for use by a later write transaction. - * - * The lock table is constructed such that reader slots are aligned with the - * processor's cache line size. Any slot is only ever used by one thread. - * This alignment guarantees that there will be no contention or cache - * thrashing as threads update their own slot info, and also eliminates - * any need for locking when accessing a slot. - * - * A writer thread will scan every slot in the table to determine the oldest - * outstanding reader transaction. Any freed pages older than this will be - * reclaimed by the writer. The writer doesn't use any locks when scanning - * this table. This means that there's no guarantee that the writer will - * see the most up-to-date reader info, but that's not required for correct - * operation - all we need is to know the upper bound on the oldest reader, - * we don't care at all about the newest reader. 
So the only consequence of - * reading stale information here is that old pages might hang around a - * while longer before being reclaimed. That's actually good anyway, because - * the longer we delay reclaiming old pages, the more likely it is that a - * string of contiguous pages can be found after coalescing old pages from - * many old transactions together. - * @{ - */ - /** Number of slots in the reader table. - * This value was chosen somewhat arbitrarily. 126 readers plus a - * couple mutexes fit exactly into 8KB on my development machine. - * Applications should set the table size using #mdb_env_set_maxreaders(). - */ -#define DEFAULT_READERS 126 - - /** The information we store in a single slot of the reader table. - * In addition to a transaction ID, we also record the process and - * thread ID that owns a slot, so that we can detect stale information, - * e.g. threads or processes that went away without cleaning up. - * @note We currently don't check for stale records. We simply re-init - * the table when we know that we're the only process opening the - * lock file. - */ -typedef struct MDB_rxbody { - /** Current Transaction ID when this transaction began, or (txnid_t)-1. - * Multiple readers that start at the same time will probably have the - * same ID here. Again, it's not important to exclude them from - * anything; all we need to know is which version of the DB they - * started from so we can avoid overwriting any data used in that - * particular version. - */ - volatile txnid_t mrb_txnid; - /** The process ID of the process owning this reader txn. */ - volatile pid_t mrb_pid; - /** The thread ID of the thread owning this txn. */ - volatile pthread_t mrb_tid; -} MDB_rxbody; - - /** The actual reader record, with cacheline padding. 
*/ -typedef struct MDB_reader { - union { - MDB_rxbody mrx; - /** shorthand for mrb_txnid */ -#define mr_txnid mru.mrx.mrb_txnid -#define mr_pid mru.mrx.mrb_pid -#define mr_tid mru.mrx.mrb_tid - /** cache line alignment */ - char pad[(sizeof(MDB_rxbody)+CACHELINE_SIZE-1) & ~(CACHELINE_SIZE-1)]; - } mru; -} MDB_reader; - - /** The header for the reader table. - * The table resides in a memory-mapped file. (This is a different file - * than is used for the main database.) - * - * For POSIX the actual mutexes reside in the shared memory of this - * mapped file. On Windows, mutexes are named objects allocated by the - * kernel; we store the mutex names in this mapped file so that other - * processes can grab them. This same approach is also used on - * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support - * process-shared POSIX mutexes. For these cases where a named object - * is used, the object name is derived from a 64 bit FNV hash of the - * environment pathname. As such, naming collisions are extremely - * unlikely. If a collision occurs, the results are unpredictable. - */ -typedef struct MDB_txbody { - /** Stamp identifying this as an LMDB file. It must be set - * to #MDB_MAGIC. */ - uint32_t mtb_magic; - /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ - uint32_t mtb_format; - /** Mutex protecting access to this table. - * This is the #MDB_MUTEX(env,r) reader table lock. - */ - pthread_mutex_t mtb_rmutex; - /** The ID of the last transaction committed to the database. - * This is recorded here only for convenience; the value can always - * be determined by reading the main database meta pages. - */ - volatile txnid_t mtb_txnid; - /** The number of slots that have been used in the reader table. - * This always records the maximum count, it is not decremented - * when readers release their slots. - */ - volatile unsigned mtb_numreaders; -} MDB_txbody; - - /** The actual reader table definition. 
*/ -typedef struct MDB_txninfo { - union { - MDB_txbody mtb; -#define mti_magic mt1.mtb.mtb_magic -#define mti_format mt1.mtb.mtb_format -#define mti_rmutex mt1.mtb.mtb_rmutex -#define mti_rmname mt1.mtb.mtb_rmname -#define mti_txnid mt1.mtb.mtb_txnid -#define mti_numreaders mt1.mtb.mtb_numreaders - char pad[(sizeof(MDB_txbody)+CACHELINE_SIZE-1) & ~(CACHELINE_SIZE-1)]; - } mt1; - union { - pthread_mutex_t mt2_wmutex; -# define mti_wmutex mt2.mt2_wmutex - char pad[(sizeof(pthread_mutex_t)+CACHELINE_SIZE-1) & ~(CACHELINE_SIZE-1)]; - } mt2; - MDB_reader mti_readers[1]; -} MDB_txninfo; - - /** Lockfile format signature: version, features and field layout */ -#define MDB_LOCK_FORMAT \ - ((uint32_t) \ - ((MDB_LOCK_VERSION) \ - /* Flags which describe functionality */ \ - + (0 /* SYSV_SEM_FLAG */ << 18) \ - + (1 /* MDB_PIDLOCK */ << 16))) -/** @} */ - -/** Common header for all page types. The page type depends on #mp_flags. - * - * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with - * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages - * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. - * - * #P_OVERFLOW records occupy one or more contiguous pages where only the - * first has a page header. They hold the real data of #F_BIGDATA nodes. - * - * #P_SUBP sub-pages are small leaf "pages" with duplicate data. - * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. - * (Duplicate data can also go in sub-databases, which use normal pages.) - * - * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. - * - * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once - * in the snapshot: Either used by a database or listed in a freeDB record. 
- */ -typedef struct MDB_page { -#define mp_pgno mp_p.p_pgno -#define mp_next mp_p.p_next - union { - pgno_t p_pgno; /**< page number */ - struct MDB_page *p_next; /**< for in-memory list of freed pages */ - } mp_p; - uint16_t mp_leaf2_ksize; /**< key size if this is a LEAF2 page */ -/** @defgroup mdb_page Page Flags - * @ingroup internal - * Flags for the page headers. - * @{ - */ -#define P_BRANCH 0x01 /**< branch page */ -#define P_LEAF 0x02 /**< leaf page */ -#define P_OVERFLOW 0x04 /**< overflow page */ -#define P_META 0x08 /**< meta page */ -#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ -#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ -#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ -#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ -#define P_KEEP 0x8000 /**< leave this page alone during spill */ -/** @} */ - uint16_t mp_flags; /**< @ref mdb_page */ -#define mp_lower mp_pb.pb.pb_lower -#define mp_upper mp_pb.pb.pb_upper -#define mp_pages mp_pb.pb_pages - union { - struct { - indx_t pb_lower; /**< lower bound of free space */ - indx_t pb_upper; /**< upper bound of free space */ - } pb; - uint32_t pb_pages; /**< number of overflow pages */ - } mp_pb; - indx_t mp_ptrs[1]; /**< dynamic size */ -} MDB_page; - - /** Size of the page header, excluding dynamic data at the end */ -#define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs)) - - /** Address of first usable data byte in a page, after the header */ -#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) - - /** ITS#7713, change PAGEBASE to handle 65536 byte pages */ -#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) - - /** Number of nodes on a page */ -#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1) - - /** The amount of space remaining in the page */ -#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) - - /** The percentage of space used in the page, in tenths of a percent. 
*/ -#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ - ((env)->me_psize - PAGEHDRSZ)) - /** The minimum page fill factor, in tenths of a percent. - * Pages emptier than this are candidates for merging. - */ -#define FILL_THRESHOLD 250 - - /** Test if a page is a leaf page */ -#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) - /** Test if a page is a LEAF2 page */ -#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) - /** Test if a page is a branch page */ -#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) - /** Test if a page is an overflow page */ -#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) - /** Test if a page is a sub page */ -#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) - - /** The number of overflow pages needed to store the given size. */ -#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) - - /** Link in #MDB_txn.%mt_loose_pgs list. - * Kept outside the page header, which is needed when reusing the page. - */ -#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) - - /** Header for a single key/data pair within a page. - * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. - * We guarantee 2-byte alignment for 'MDB_node's. - * - * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child - * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used - * for pgno. (Branch nodes have no flags). Lo and hi are in host byte - * order in case some accesses can be optimized to 32-bit word access. - * - * Leaf node flags describe node contents. #F_BIGDATA says the node's - * data part is the page number of an overflow page with actual data. - * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in - * a sub-page/sub-database, and named databases (just #F_SUBDATA). 
- */ -typedef struct MDB_node { - /** part of data size or pgno - * @{ */ -#if BYTE_ORDER == LITTLE_ENDIAN - unsigned short mn_lo, mn_hi; -#else - unsigned short mn_hi, mn_lo; -#endif - /** @} */ -/** @defgroup mdb_node Node Flags - * @ingroup internal - * Flags for node headers. - * @{ - */ -#define F_BIGDATA 0x01 /**< data put on overflow page */ -#define F_SUBDATA 0x02 /**< data is a sub-database */ -#define F_DUPDATA 0x04 /**< data has duplicates */ - -/** valid flags for #mdb_node_add() */ -#define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND) - -/** @} */ - unsigned short mn_flags; /**< @ref mdb_node */ - unsigned short mn_ksize; /**< key size */ - char mn_data[1]; /**< key and data are appended here */ -} MDB_node; - - /** Size of the node header, excluding dynamic data at the end */ -#define NODESIZE offsetof(MDB_node, mn_data) - - /** Bit position of top word in page number, for shifting mn_flags */ -#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) - - /** Size of a node in a branch page with a given key. - * This is just the node header plus the key, there is no data. - */ -#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) - - /** Size of a node in a leaf page with a given key and data. - * This is node header plus key plus data size. - */ -#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) - - /** Address of node \b i in page \b p */ -#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE)) - - /** Address of the key for the node */ -#define NODEKEY(node) (void *)((node)->mn_data) - - /** Address of the data for a node */ -#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) - - /** Get the page number pointed to by a branch node */ -#define NODEPGNO(node) \ - ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \ - (PGNO_TOPWORD ? 
((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0)) - /** Set the page number in a branch node */ -#define SETPGNO(node,pgno) do { \ - (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \ - if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0) - - /** Get the size of the data in a leaf node */ -#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) - /** Set the size of the data for a leaf node */ -#define SETDSZ(node,size) do { \ - (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0) - /** The size of a key in a node */ -#define NODEKSZ(node) ((node)->mn_ksize) - - /** Copy a page number from src to dst */ -#ifdef MISALIGNED_OK -# define COPY_PGNO(dst,src) dst = src -#elif SIZE_MAX > 4294967295UL -# define COPY_PGNO(dst,src) do { \ - unsigned short *s, *d; \ - s = (unsigned short *)&(src); \ - d = (unsigned short *)&(dst); \ - *d++ = *s++; \ - *d++ = *s++; \ - *d++ = *s++; \ - *d = *s; \ - } while (0) -#else -# define COPY_PGNO(dst,src) do { \ - unsigned short *s, *d; \ - s = (unsigned short *)&(src); \ - d = (unsigned short *)&(dst); \ - *d++ = *s++; \ - *d = *s; \ - } while (0) -#endif /* MISALIGNED_OK */ - -/** The address of a key in a LEAF2 page. - * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs. - * There are no node headers, keys are stored contiguously. - */ -#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks))) - - /** Set the \b node's key into \b keyptr, if requested. */ -#define MDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \ - (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } } - - /** Set the \b node's key into \b key. */ -#define MDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); } - - /** Information about a single database in the environment. 
*/ -typedef struct MDB_db { - uint32_t md_xsize; /**< also ksize for LEAF2 pages */ - uint16_t md_flags; /**< @ref mdb_dbi_open */ - uint16_t md_depth; /**< depth of this tree */ - pgno_t md_branch_pages; /**< number of internal pages */ - pgno_t md_leaf_pages; /**< number of leaf pages */ - pgno_t md_overflow_pages; /**< number of overflow pages */ - size_t md_entries; /**< number of data items */ - pgno_t md_root; /**< the root page of this tree */ -} MDB_db; - -#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ -#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) - /** #mdb_dbi_open() flags */ -#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ - MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) - - /** Handle for the DB used to track free pages. */ -#define FREE_DBI 0 - /** Handle for the default DB. */ -#define MAIN_DBI 1 - /** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ -#define CORE_DBS 2 - - /** Number of meta pages - also hardcoded elsewhere */ -#define NUM_METAS 2 - - /** Meta page content. - * A meta page is the start point for accessing a database snapshot. - * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). - */ -typedef struct MDB_meta { - /** Stamp identifying this as an LMDB file. It must be set - * to #MDB_MAGIC. */ - uint32_t mm_magic; - /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ - uint32_t mm_version; - void *mm_address; /**< address for fixed mapping */ - size_t mm_mapsize; /**< size of mmap region */ - MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ - /** The size of pages used in this DB */ -#define mm_psize mm_dbs[FREE_DBI].md_xsize - /** Any persistent environment flags. @ref mdb_env */ -#define mm_flags mm_dbs[FREE_DBI].md_flags - /** Last used page in the datafile. - * Actually the file may be shorter if the freeDB lists the final pages. 
- */ - pgno_t mm_last_pg; - volatile txnid_t mm_txnid; /**< txnid that committed this page */ -#define MDB_DATASIGN_NONE 0 -#define MDB_DATASIGN_WEAK 1 - volatile uint64_t mm_datasync_sign; -#define META_IS_WEAK(meta) ((meta)->mm_datasync_sign == MDB_DATASIGN_WEAK) -#define META_IS_STEADY(meta) ((meta)->mm_datasync_sign > MDB_DATASIGN_WEAK) - -#if MDBX_MODE_ENABLED - volatile mdbx_canary mm_canary; -#endif -} MDB_meta; - - /** Buffer for a stack-allocated meta page. - * The members define size and alignment, and silence type - * aliasing warnings. They are not used directly; that could - * mean incorrectly using several union members in parallel. - */ -typedef union MDB_metabuf { - MDB_page mb_page; - struct { - char mm_pad[PAGEHDRSZ]; - MDB_meta mm_meta; - } mb_metabuf; -} MDB_metabuf; - - /** Auxiliary DB info. - * The information here is mostly static/read-only. There is - * only a single copy of this record in the environment. - */ -typedef struct MDB_dbx { - MDB_val md_name; /**< name of the database */ - MDB_cmp_func *md_cmp; /**< function for comparing keys */ - MDB_cmp_func *md_dcmp; /**< function for comparing data items */ - MDB_rel_func *md_rel; /**< user relocate function */ - void *md_relctx; /**< user-provided context for md_rel */ -} MDB_dbx; - -#if MDBX_MODE_ENABLED -# define MDBX_MODE_SALT 0 -#else -# define MDBX_MODE_SALT 1115449266 -#endif - - /** A database transaction. - * Every operation requires a transaction handle. - */ -struct MDB_txn { -#define MDBX_MT_SIGNATURE (0x93D53A31^MDBX_MODE_SALT) - unsigned mt_signature; - MDB_txn *mt_parent; /**< parent of a nested txn */ - /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ - MDB_txn *mt_child; - pgno_t mt_next_pgno; /**< next unallocated page */ - /** The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. 
- */ - txnid_t mt_txnid; - MDB_env *mt_env; /**< the DB environment */ - /** The list of reclaimed txns from freeDB */ - MDB_IDL mt_lifo_reclaimed; - /** The list of pages that became unused during this transaction. - */ - MDB_IDL mt_free_pgs; - /** The list of loose pages that became unused and may be reused - * in this transaction, linked through #NEXT_LOOSE_PAGE(page). - */ - MDB_page *mt_loose_pgs; - /** Number of loose pages (#mt_loose_pgs) */ - int mt_loose_count; - /** The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. - */ - MDB_IDL mt_spill_pgs; - union { - /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ - MDB_ID2L dirty_list; - /** For read txns: This thread/txn's reader table slot, or NULL. */ - MDB_reader *reader; - } mt_u; - /** Array of records for each DB known in the environment. */ - MDB_dbx *mt_dbxs; - /** Array of MDB_db records for each known DB */ - MDB_db *mt_dbs; - /** Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; -/** @defgroup mt_dbflag Transaction DB Flags - * @ingroup internal - * @{ - */ -#define DB_DIRTY 0x01 /**< DB was written in this txn */ -#define DB_STALE 0x02 /**< Named-DB record is older than txnID */ -#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ -#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ -#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ -#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ -/** @} */ - /** In write txns, array of cursors for each DB */ - MDB_cursor **mt_cursors; - /** Array of flags for each DB */ - unsigned char *mt_dbflags; - /** Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. 
- */ - MDB_dbi mt_numdbs; - -/** @defgroup mdb_txn Transaction Flags - * @ingroup internal - * @{ - */ - /** #mdb_txn_begin() flags */ -#define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC|MDB_NOSYNC|MDB_RDONLY) -#define MDB_TXN_NOMETASYNC MDB_NOMETASYNC /**< don't sync meta for this txn on commit */ -#define MDB_TXN_NOSYNC MDB_NOSYNC /**< don't sync this txn on commit */ -#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ - /* internal txn flags */ -#define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */ -#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ -#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ -#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ -#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ -#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ - /** most operations on the txn are currently illegal */ -#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD) -/** @} */ - unsigned mt_flags; /**< @ref mdb_txn */ - /** #dirty_list room: Array size - \#dirty pages visible to this txn. - * Includes ancestor txns' dirty pages not hidden by other txns' - * dirty/spilled pages. Thus commit(nested txn) has room to merge - * dirty_list into mt_parent after freeing hidden mt_parent pages. - */ - unsigned mt_dirty_room; - -#if MDBX_MODE_ENABLED - mdbx_canary mt_canary; -#endif -}; - -/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. - * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to - * raise this on a 64 bit machine. - */ -#define CURSOR_STACK 32 - -struct MDB_xcursor; - - /** Cursors are used for all DB operations. - * A cursor holds a path of (page pointer, key index) from the DB - * root to a position in the DB, plus other state. #MDB_DUPSORT - * cursors include an xcursor to the current data item. 
Write txns - * track their cursors and keep them up to date when data moves. - * Exception: An xcursor's pointer to a #P_SUBP page can be stale. - * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). - */ -struct MDB_cursor { -#define MDBX_MC_SIGNATURE (0xFE05D5B1^MDBX_MODE_SALT) -#define MDBX_MC_READY4CLOSE (0x2817A047^MDBX_MODE_SALT) -#define MDBX_MC_WAIT4EOT (0x90E297A7^MDBX_MODE_SALT) - unsigned mc_signature; - /** Next cursor on this DB in this txn */ - MDB_cursor *mc_next; - /** Backup of the original cursor if this cursor is a shadow */ - MDB_cursor *mc_backup; - /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ - struct MDB_xcursor *mc_xcursor; - /** The transaction that owns this cursor */ - MDB_txn *mc_txn; - /** The database handle this cursor operates on */ - MDB_dbi mc_dbi; - /** The database record for this cursor */ - MDB_db *mc_db; - /** The database auxiliary record for this cursor */ - MDB_dbx *mc_dbx; - /** The @ref mt_dbflag for this database */ - unsigned char *mc_dbflag; - unsigned short mc_snum; /**< number of pushed pages */ - unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ -/** @defgroup mdb_cursor Cursor Flags - * @ingroup internal - * Cursor state flags. - * @{ - */ -#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ -#define C_EOF 0x02 /**< No more data */ -#define C_SUB 0x04 /**< Cursor is a sub-cursor */ -#define C_DEL 0x08 /**< last op was a cursor_del */ -#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ -#define C_RECLAIMING 0x80 /**< FreeDB lookup is prohibited */ -/** @} */ - unsigned mc_flags; /**< @ref mdb_cursor */ - MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ - indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ -}; - - /** Context for sorted-dup records. - * We could have gone to a fully recursive design, with arbitrarily - * deep nesting of sub-databases. 
But for now we only handle these - * levels - main DB, optional sub-DB, sorted-duplicate DB. - */ -typedef struct MDB_xcursor { - /** A sub-cursor for traversing the Dup DB */ - MDB_cursor mx_cursor; - /** The database record for this Dup DB */ - MDB_db mx_db; - /** The auxiliary DB record for this Dup DB */ - MDB_dbx mx_dbx; - /** The @ref mt_dbflag for this Dup DB */ - unsigned char mx_dbflag; -} MDB_xcursor; - - /** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */ -#define XCURSOR_INITED(mc) \ - ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) - - /** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed - * when the node which contains the sub-page may have moved. Called - * with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top]. - */ -#define XCURSOR_REFRESH(mc, mp, ki) do { \ - MDB_page *xr_pg = (mp); \ - MDB_node *xr_node = NODEPTR(xr_pg, ki); \ - if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ - (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ -} while (0) - - /** State of FreeDB old pages, stored in the MDB_env */ -typedef struct MDB_pgstate { - pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ - txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ -} MDB_pgstate; - - /** Context for deferred cleanup of reader's threads. - * to avoid https://github.com/ReOpen/ReOpenLDAP/issues/48 */ -typedef struct MDBX_rthc { - struct MDBX_rthc *rc_next; - pthread_t rc_thread; - MDB_reader *rc_reader; -} MDBX_rthc; - -static MDBX_rthc* mdbx_rthc_get(pthread_key_t key); - - /** The database environment. */ -struct MDB_env { -#define MDBX_ME_SIGNATURE (0x9A899641^MDBX_MODE_SALT) - unsigned me_signature; - HANDLE me_fd; /**< The main data file */ - HANDLE me_lfd; /**< The lock file */ - /** Failed to update the meta page. Probably an I/O error. */ -#define MDB_FATAL_ERROR 0x80000000U - /** Some fields are initialized. 
*/ -#define MDB_ENV_ACTIVE 0x20000000U - /** me_txkey is set */ -#define MDB_ENV_TXKEY 0x10000000U - uint32_t me_flags; /**< @ref mdb_env */ - unsigned me_psize; /**< DB page size, inited from me_os_psize */ - unsigned me_os_psize; /**< OS page size, from #GET_PAGESIZE */ - unsigned me_maxreaders; /**< size of the reader table */ - /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */ - unsigned me_close_readers; - MDB_dbi me_numdbs; /**< number of DBs opened */ - MDB_dbi me_maxdbs; /**< size of the DB table */ - pid_t me_pid; /**< process ID of this env */ - char *me_path; /**< path to the DB files */ - char *me_map; /**< the memory map of the data file */ - MDB_txninfo *me_txns; /**< the memory map of the lock file, never NULL */ - void *me_pbuf; /**< scratch area for DUPSORT put() */ - MDB_txn *me_txn; /**< current write transaction */ - MDB_txn *me_txn0; /**< prealloc'd write transaction */ - size_t me_mapsize; /**< size of the data memory map */ - pgno_t me_maxpg; /**< me_mapsize / me_psize */ - MDB_dbx *me_dbxs; /**< array of static DB info */ - uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ - unsigned *me_dbiseqs; /**< array of dbi sequence numbers */ - pthread_key_t me_txkey; /**< thread-key for readers */ - txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ - MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ -# define me_pglast me_pgstate.mf_pglast -# define me_pghead me_pgstate.mf_pghead - MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ - /** IDL of pages that became unused in a write txn */ - MDB_IDL me_free_pgs; - /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. 
*/ - MDB_ID2L me_dirty_list; - /** Max number of freelist items that can fit in a single overflow page */ - unsigned me_maxfree_1pg; - /** Max size of a node on a page */ - unsigned me_nodemax; - unsigned me_maxkey_limit; /**< max size of a key */ - int me_live_reader; /**< have liveness lock in reader table */ - void *me_userctx; /**< User-settable context */ -#if MDB_DEBUG - MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ -#endif - uint64_t me_sync_pending; /**< Total dirty/commited bytes since the last mdb_env_sync() */ - uint64_t me_sync_threshold; /**< Treshold of above to force synchronous flush */ -#if MDBX_MODE_ENABLED - MDBX_oom_func *me_oom_func; /**< Callback for kicking laggard readers */ -#endif -#ifdef USE_VALGRIND - int me_valgrind_handle; -#endif -}; - - /** Nested transaction */ -typedef struct MDB_ntxn { - MDB_txn mnt_txn; /**< the transaction */ - MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ -} MDB_ntxn; - - /** max number of pages to commit in one writev() call */ -#define MDB_COMMIT_PAGES 64 -#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES -# undef MDB_COMMIT_PAGES -# define MDB_COMMIT_PAGES IOV_MAX -#endif - - /** max bytes to write in one call */ -#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) - - /** Check \b txn and \b dbi arguments to a function */ -#define TXN_DBI_EXIST(txn, dbi, validity) \ - ((dbi)<(txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) - - /** Check for misused \b dbi handles */ -#define TXN_DBI_CHANGED(txn, dbi) \ - ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) - -#define METAPAGE_1(env) \ - (&((MDB_metabuf*) (env)->me_map)->mb_metabuf.mm_meta) - -#define METAPAGE_2(env) \ - (&((MDB_metabuf*) ((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) - -static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags); -static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); -static int 
mdb_page_touch(MDB_cursor *mc); -static int mdb_cursor_touch(MDB_cursor *mc); - -#define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \ - "reset-tmp", "fail-begin", "fail-beginchild"} -enum { - /* mdb_txn_end operation number, for logging */ - MDB_END_COMMITTED, MDB_END_EMPTY_COMMIT, MDB_END_ABORT, MDB_END_RESET, - MDB_END_RESET_TMP, MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD -}; -#define MDB_END_OPMASK 0x0F /**< mask for #mdb_txn_end() operation number */ -#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ -#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ -#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ -static int mdb_txn_end(MDB_txn *txn, unsigned mode); - -static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); -static int mdb_page_search_root(MDB_cursor *mc, - MDB_val *key, int modify); -#define MDB_PS_MODIFY 1 -#define MDB_PS_ROOTONLY 2 -#define MDB_PS_FIRST 4 -#define MDB_PS_LAST 8 -static int mdb_page_search(MDB_cursor *mc, - MDB_val *key, int flags); -static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); - -#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ -static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, - pgno_t newpgno, unsigned nflags); - -static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); -static int mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending); -static void mdb_env_close0(MDB_env *env); - -static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); -static int mdb_node_add(MDB_cursor *mc, indx_t indx, - MDB_val *key, MDB_val *data, pgno_t pgno, unsigned flags); -static void mdb_node_del(MDB_cursor *mc, int ksize); -static void mdb_node_shrink(MDB_page *mp, indx_t indx); -static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); -static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data); -static size_t 
mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); -static size_t mdb_branch_size(MDB_env *env, MDB_val *key); - -static int mdb_rebalance(MDB_cursor *mc); -static int mdb_update_key(MDB_cursor *mc, MDB_val *key); - -static void mdb_cursor_pop(MDB_cursor *mc); -static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp); - -static int mdb_cursor_del0(MDB_cursor *mc); -static int mdb_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); -static int mdb_cursor_sibling(MDB_cursor *mc, int move_right); -static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); -static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op); -static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, - int *exactp); -static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); -static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); - -static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); -static void mdb_xcursor_init0(MDB_cursor *mc); -static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); -static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); - -static int mdb_drop0(MDB_cursor *mc, int subs); -static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead); - -/** @cond */ -static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int_ai, mdb_cmp_int_a2, mdb_cmp_int_ua; -/** @endcond */ - -#ifdef __SANITIZE_THREAD__ -static pthread_mutex_t tsan_mutex = PTHREAD_MUTEX_INITIALIZER; -#endif - -/** Return the library version info. 
*/ -char * __cold -mdb_version(int *major, int *minor, int *patch) -{ - if (major) *major = MDB_VERSION_MAJOR; - if (minor) *minor = MDB_VERSION_MINOR; - if (patch) *patch = MDB_VERSION_PATCH; - return MDB_VERSION_STRING; -} - -/** Table of descriptions for LMDB @ref errors */ -static char *const mdb_errstr[] = { - "MDB_KEYEXIST: Key/data pair already exists", - "MDB_NOTFOUND: No matching key/data pair found", - "MDB_PAGE_NOTFOUND: Requested page not found", - "MDB_CORRUPTED: Located page was wrong type", - "MDB_PANIC: Update of meta page failed or environment had fatal error", - "MDB_VERSION_MISMATCH: Database environment version mismatch", - "MDB_INVALID: File is not an LMDB file", - "MDB_MAP_FULL: Environment mapsize limit reached", - "MDB_DBS_FULL: Environment maxdbs limit reached", - "MDB_READERS_FULL: Environment maxreaders limit reached", - "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", - "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", - "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", - "MDB_PAGE_FULL: Internal error - page has no more space", - "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", - "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", - "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", - "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", - "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", - "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", - "MDB_PROBLEM: Unexpected problem - txn should abort", -}; - -char * __cold -mdb_strerror(int err) -{ - int i; - if (!err) - return ("Successful return: 0"); - - if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { - i = err - MDB_KEYEXIST; - return mdb_errstr[i]; - } - - return strerror(err); -} - -#if MDBX_MODE_ENABLED -static txnid_t mdbx_oomkick(MDB_env *env, txnid_t oldest); -#endif /* MDBX_MODE_ENABLED */ - -static 
void mdb_debug_log(int type, const char *function, int line, const char *fmt, ...) - __attribute__((format(printf, 4, 5))); - -#if MDB_DEBUG - static txnid_t mdb_debug_edge; - - static void __cold - mdb_assert_fail(MDB_env *env, const char *msg, - const char *func, int line) - { - if (env && env->me_assert_func) - env->me_assert_func(env, msg, func, line); - else { - if (mdb_debug_logger) - mdb_debug_log(MDBX_DBG_ASSERT, func, line, "assert: %s\n", msg); - __assert_fail(msg, __FILE__, line, func); - } - } - -# define mdb_assert_enabled() \ - unlikely(mdb_runtime_flags & MDBX_DBG_ASSERT) - -# define mdb_audit_enabled() \ - unlikely(mdb_runtime_flags & MDBX_DBG_AUDIT) - -# define mdb_debug_enabled(type) \ - unlikely(mdb_runtime_flags & \ - (type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA))) - -#else -# ifndef NDEBUG -# define mdb_debug_enabled(type) (1) -# else -# define mdb_debug_enabled(type) (0) -# endif -# define mdb_audit_enabled() (0) -# define mdb_assert_enabled() (0) -# define mdb_assert_fail(env, msg, func, line) \ - __assert_fail(msg, __FILE__, line, func) -#endif /* MDB_DEBUG */ - -static void __cold -mdb_debug_log(int type, const char *function, int line, const char *fmt, ...) -{ - va_list args; - - va_start(args, fmt); - if (mdb_debug_logger) - mdb_debug_logger(type, function, line, fmt, args); - else { - if (function && line > 0) - fprintf(stderr, "%s:%d ", function, line); - else if (function) - fprintf(stderr, "%s: ", function); - else if (line > 0) - fprintf(stderr, "%d: ", line); - vfprintf(stderr, fmt, args); - } - va_end(args); -} - -#define mdb_print(fmt, ...) \ - mdb_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) - -#define mdb_debug(fmt, ...) do { \ - if (mdb_debug_enabled(MDBX_DBG_TRACE)) \ - mdb_debug_log(MDBX_DBG_TRACE, __FUNCTION__, __LINE__, fmt "\n", ##__VA_ARGS__); \ - } while(0) - -#define mdb_debug_print(fmt, ...) 
do { \ - if (mdb_debug_enabled(MDBX_DBG_TRACE)) \ - mdb_debug_log(MDBX_DBG_TRACE, NULL, 0, fmt, ##__VA_ARGS__); \ - } while(0) - -#define mdb_debug_extra(fmt, ...) do { \ - if (mdb_debug_enabled(MDBX_DBG_EXTRA)) \ - mdb_debug_log(MDBX_DBG_EXTRA, __FUNCTION__, __LINE__, fmt, ##__VA_ARGS__); \ - } while(0) - -#define mdb_debug_extra_print(fmt, ...) do { \ - if (mdb_debug_enabled(MDBX_DBG_EXTRA)) \ - mdb_debug_log(MDBX_DBG_EXTRA, NULL, 0, fmt, ##__VA_ARGS__); \ - } while(0) - -#define mdb_ensure_msg(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - mdb_assert_fail(env, msg, __FUNCTION__, __LINE__); \ - } while(0) - -#define mdb_ensure(env, expr) \ - mdb_ensure_msg(env, expr, #expr) - -/** assert(3) variant in environment context */ -#define mdb_assert(env, expr) \ - do { \ - if (mdb_assert_enabled()) \ - mdb_ensure(env, expr); \ - } while(0) - -/** assert(3) variant in cursor context */ -#define mdb_cassert(mc, expr) \ - mdb_assert((mc)->mc_txn->mt_env, expr) - -/** assert(3) variant in transaction context */ -#define mdb_tassert(txn, expr) \ - mdb_assert((txn)->mt_env, expr) - -/** Return the page number of \b mp which may be sub-page, for debug output */ -static MDBX_INLINE pgno_t -mdb_dbg_pgno(MDB_page *mp) -{ - pgno_t ret; - COPY_PGNO(ret, mp->mp_pgno); - return ret; -} - -/** Display a key in hexadecimal and return the address of the result. - * @param[in] key the key to display - * @param[in] buf the buffer to write into. Should always be #DKBUF. - * @return The key in hexadecimal form. - */ -char * -mdb_dkey(MDB_val *key, char *buf) -{ - char *ptr = buf; - unsigned i; - - if (!key) - return ""; - - if (key->mv_size > DKBUF_MAXKEYSIZE) - return "MDB_MAXKEYSIZE"; - /* may want to make this a dynamic check: if the key is mostly - * printable characters, print it as-is instead of converting to hex. 
*/ -#if 1 - buf[0] = '\0'; - for (i=0; imv_size; i++) - ptr += sprintf(ptr, "%02x", ((unsigned char*) key->mv_data)[i]); -#else - sprintf(buf, "%.*s", key->mv_size, key->mv_data); -#endif - return buf; -} - -#if 0 /* LY: debug stuff */ -static const char * -mdb_leafnode_type(MDB_node *n) -{ - static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; - return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : - tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; -} - -/** Display all the keys in the page. */ -static void -mdb_page_list(MDB_page *mp) -{ - pgno_t pgno = mdb_dbg_pgno(mp); - const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : ""; - MDB_node *node; - unsigned i, nkeys, nsize, total = 0; - MDB_val key; - DKBUF; - - switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { - case P_BRANCH: type = "Branch page"; break; - case P_LEAF: type = "Leaf page"; break; - case P_LEAF|P_SUBP: type = "Sub-page"; break; - case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; - case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; - case P_OVERFLOW: - mdb_print("Overflow page %zu pages %u%s\n", - pgno, mp->mp_pages, state); - return; - case P_META: - mdb_print("Meta-page %zu txnid %zu\n", - pgno, ((MDB_meta *)PAGEDATA(mp))->mm_txnid); - return; - default: - mdb_print("Bad page %zu flags 0x%X\n", pgno, mp->mp_flags); - return; - } - - nkeys = NUMKEYS(mp); - mdb_print("%s %zu numkeys %u%s\n", type, pgno, nkeys, state); - - for (i=0; imp_leaf2_ksize; - key.mv_data = LEAF2KEY(mp, i, nsize); - total += nsize; - mdb_print("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); - continue; - } - node = NODEPTR(mp, i); - key.mv_size = node->mn_ksize; - key.mv_data = node->mn_data; - nsize = NODESIZE + key.mv_size; - if (IS_BRANCH(mp)) { - mdb_print("key %u: page %zu, %s\n", i, NODEPGNO(node), DKEY(&key)); - total += nsize; - } else { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - nsize += sizeof(pgno_t); - else - nsize 
+= NODEDSZ(node); - total += nsize; - nsize += sizeof(indx_t); - mdb_print("key %u: nsize %u, %s%s\n", - i, nsize, DKEY(&key), mdb_leafnode_type(node)); - } - total = EVEN(total); - } - mdb_print("Total: header %u + contents %u + unused %u\n", - IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp)); -} - -static void -mdb_cursor_chk(MDB_cursor *mc) -{ - unsigned i; - MDB_node *node; - MDB_page *mp; - - if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; - for (i=0; imc_top; i++) { - mp = mc->mc_pg[i]; - node = NODEPTR(mp, mc->mc_ki[i]); - if (unlikely(NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)) - mdb_print("oops!\n"); - } - if (unlikely(mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))) - mdb_print("ack!\n"); - if (XCURSOR_INITED(mc)) { - node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && - mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { - mdb_print("blah!\n"); - } - } -} -#endif /* 0 */ - -/** Count all the pages in each DB and in the freelist - * and make sure it matches the actual number of pages - * being used. - * All named DBs must be open for a correct count. 
- */ -static void mdb_audit(MDB_txn *txn) -{ - MDB_cursor mc; - MDB_val key, data; - MDB_ID freecount, count; - MDB_dbi i; - int rc; - - freecount = 0; - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) - freecount += *(MDB_ID *)data.mv_data; - mdb_tassert(txn, rc == MDB_NOTFOUND); - - count = 0; - for (i = 0; imt_numdbs; i++) { - MDB_xcursor mx; - if (!(txn->mt_dbflags[i] & DB_VALID)) - continue; - mdb_cursor_init(&mc, txn, i, &mx); - if (txn->mt_dbs[i].md_root == P_INVALID) - continue; - count += txn->mt_dbs[i].md_branch_pages + - txn->mt_dbs[i].md_leaf_pages + - txn->mt_dbs[i].md_overflow_pages; - if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { - rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST); - for (; rc == MDB_SUCCESS; rc = mdb_cursor_sibling(&mc, 1)) { - unsigned j; - MDB_page *mp; - mp = mc.mc_pg[mc.mc_top]; - for (j=0; jmn_flags & F_SUBDATA) { - MDB_db db; - memcpy(&db, NODEDATA(leaf), sizeof(db)); - count += db.md_branch_pages + db.md_leaf_pages + - db.md_overflow_pages; - } - } - } - mdb_tassert(txn, rc == MDB_NOTFOUND); - } - } - if (freecount + count + NUM_METAS != txn->mt_next_pgno) { - mdb_print("audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n", - txn->mt_txnid, freecount, count+NUM_METAS, - freecount+count+NUM_METAS, txn->mt_next_pgno); - } -} - -int -mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) -{ - mdb_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); - return txn->mt_dbxs[dbi].md_cmp(a, b); -} - -int -mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) -{ - mdb_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); - return txn->mt_dbxs[dbi].md_dcmp(a, b); -} - -/** Allocate memory for a page. - * Re-use old malloc'd pages first for singletons, otherwise just malloc. - * Set #MDB_TXN_ERROR on failure. 
- */ -static MDB_page * -mdb_page_malloc(MDB_txn *txn, unsigned num) -{ - MDB_env *env = txn->mt_env; - size_t size = env->me_psize; - MDB_page *np = env->me_dpages; - if (likely(num == 1 && np)) { - ASAN_UNPOISON_MEMORY_REGION(np, size); - VALGRIND_MEMPOOL_ALLOC(env, np, size); - VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); - env->me_dpages = np->mp_next; - } else { - size *= num; - np = malloc(size); - if (unlikely(! np)) { - txn->mt_flags |= MDB_TXN_ERROR; - return np; - } - VALGRIND_MEMPOOL_ALLOC(env, np, size); - } - - if ((env->me_flags & MDB_NOMEMINIT) == 0) { - /* For a single page alloc, we init everything after the page header. - * For multi-page, we init the final page; if the caller needed that - * many pages they will be filling in at least up to the last page. */ - size_t skip = PAGEHDRSZ; - if (num > 1) - skip += (num - 1) * env->me_psize; - memset((char *) np + skip, 0, size - skip); - } - VALGRIND_MAKE_MEM_UNDEFINED(np, size); - np->mp_flags = 0; - np->mp_pages = num; - return np; -} - -/** Free a single page. - * Saves single pages to a list, for future reuse. - * (This is not used for multi-page overflow pages.) 
- */ -static MDBX_INLINE void -mdb_page_free(MDB_env *env, MDB_page *mp) -{ - mp->mp_next = env->me_dpages; - VALGRIND_MEMPOOL_FREE(env, mp); - env->me_dpages = mp; -} - -/** Free a dirty page */ -static void -mdb_dpage_free(MDB_env *env, MDB_page *dp) -{ - if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { - mdb_page_free(env, dp); - } else { - /* large pages just get freed directly */ - VALGRIND_MEMPOOL_FREE(env, dp); - free(dp); - } -} - -/** Return all dirty pages to dpage list */ -static void -mdb_dlist_free(MDB_txn *txn) -{ - MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned i, n = dl[0].mid; - - for (i = 1; i <= n; i++) { - mdb_dpage_free(env, dl[i].mptr); - } - dl[0].mid = 0; -} - -static void __cold -mdb_kill_page(MDB_env *env, pgno_t pgno) -{ - const size_t offs = env->me_psize * pgno; - const size_t shift = offsetof(MDB_page, mp_pb); - - if (env->me_flags & MDB_WRITEMAP) { - MDB_page *mp = (MDB_page *)(env->me_map + offs); - memset(&mp->mp_pb, 0x6F /* 'o', 111 */, env->me_psize - shift); - VALGRIND_MAKE_MEM_NOACCESS(&mp->mp_pb, env->me_psize - shift); - ASAN_POISON_MEMORY_REGION(&mp->mp_pb, env->me_psize - shift); - } else { - struct iovec iov[1]; - iov[0].iov_len = env->me_psize - shift; - iov[0].iov_base = alloca(iov[0].iov_len); - memset(iov[0].iov_base, 0x6F /* 'o', 111 */, iov[0].iov_len); - ssize_t rc = pwritev(env->me_fd, iov, 1, offs + shift); - assert(rc == (ssize_t) iov[0].iov_len); - (void) rc; - } -} - -/** Loosen or free a single page. - * Saves single pages to a list for future reuse - * in this same txn. It has been pulled from the freeDB - * and already resides on the dirty list, but has been - * deleted. Use these pages first before pulling again - * from the freeDB. - * - * If the page wasn't dirtied in this txn, just add it - * to this txn's free list. 
- */ -static int -mdb_page_loose(MDB_cursor *mc, MDB_page *mp) -{ - int loose = 0; - pgno_t pgno = mp->mp_pgno; - MDB_txn *txn = mc->mc_txn; - - if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { - if (txn->mt_parent) { - MDB_ID2 *dl = txn->mt_u.dirty_list; - /* If txn has a parent, make sure the page is in our - * dirty list. */ - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - if (unlikely(mp != dl[x].mptr)) { /* bad cursor? */ - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PROBLEM; - } - /* ok, it's ours */ - loose = 1; - } - } - } else { - /* no parent txn, so it's just ours */ - loose = 1; - } - } - if (loose) { - mdb_debug("loosen db %d page %zu", DDBI(mc), mp->mp_pgno); - MDB_page **link = &NEXT_LOOSE_PAGE(mp); - if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) { - mdb_kill_page(txn->mt_env, pgno); - VALGRIND_MAKE_MEM_UNDEFINED(link, sizeof(MDB_page*)); - ASAN_UNPOISON_MEMORY_REGION(link, sizeof(MDB_page*)); - } - *link = txn->mt_loose_pgs; - txn->mt_loose_pgs = mp; - txn->mt_loose_count++; - mp->mp_flags |= P_LOOSE; - } else { - int rc = mdb_midl_append(&txn->mt_free_pgs, pgno); - if (unlikely(rc)) - return rc; - } - - return MDB_SUCCESS; -} - -/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. - * @param[in] mc A cursor handle for the current operation. - * @param[in] pflags Flags of the pages to update: - * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. - * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). - * @return 0 on success, non-zero on failure. 
- */ -static int -mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) -{ - enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; - MDB_txn *txn = mc->mc_txn; - MDB_cursor *m3, *m0 = mc; - MDB_xcursor *mx; - MDB_page *dp, *mp; - MDB_node *leaf; - unsigned i, j; - int rc = MDB_SUCCESS, level; - - /* Mark pages seen by cursors: First m0, then tracked cursors */ - for (i = txn->mt_numdbs;; ) { - if (mc->mc_flags & C_INITIALIZED) { - for (m3 = mc;; m3 = &mx->mx_cursor) { - mp = NULL; - for (j=0; jmc_snum; j++) { - mp = m3->mc_pg[j]; - if ((mp->mp_flags & Mask) == pflags) - mp->mp_flags ^= P_KEEP; - } - mx = m3->mc_xcursor; - /* Proceed to mx if it is at a sub-database */ - if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) - break; - if (! (mp && (mp->mp_flags & P_LEAF))) - break; - leaf = NODEPTR(mp, m3->mc_ki[j-1]); - if (!(leaf->mn_flags & F_SUBDATA)) - break; - } - } - mc = mc->mc_next; - for (; !mc || mc == m0; mc = txn->mt_cursors[--i]) - if (i == 0) - goto mark_done; - } - -mark_done: - if (all) { - /* Mark dirty root pages */ - for (i=0; imt_numdbs; i++) { - if (txn->mt_dbflags[i] & DB_DIRTY) { - pgno_t pgno = txn->mt_dbs[i].md_root; - if (pgno == P_INVALID) - continue; - if (unlikely((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS)) - break; - if ((dp->mp_flags & Mask) == pflags && level <= 1) - dp->mp_flags ^= P_KEEP; - } - } - } - - return rc; -} - -static int mdb_page_flush(MDB_txn *txn, int keep); - -/** Spill pages from the dirty list back to disk. - * This is intended to prevent running into #MDB_TXN_FULL situations, - * but note that they may still occur in a few cases: - * 1) our estimate of the txn size could be too small. Currently this - * seems unlikely, except with a large number of #MDB_MULTIPLE items. - * 2) child txns may run out of space if their parents dirtied a - * lot of pages and never spilled them. 
TODO: we probably should do - * a preemptive spill during #mdb_txn_begin() of a child txn, if - * the parent's dirty_room is below a given threshold. - * - * Otherwise, if not using nested txns, it is expected that apps will - * not run into #MDB_TXN_FULL any more. The pages are flushed to disk - * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. - * If the txn never references them again, they can be left alone. - * If the txn only reads them, they can be used without any fuss. - * If the txn writes them again, they can be dirtied immediately without - * going thru all of the work of #mdb_page_touch(). Such references are - * handled by #mdb_page_unspill(). - * - * Also note, we never spill DB root pages, nor pages of active cursors, - * because we'll need these back again soon anyway. And in nested txns, - * we can't spill a page in a child txn if it was already spilled in a - * parent txn. That would alter the parent txns' data even though - * the child hasn't committed yet, and we'd have no way to undo it if - * the child aborted. - * - * @param[in] m0 cursor A cursor handle identifying the transaction and - * database for which we are checking space. - * @param[in] key For a put operation, the key being stored. - * @param[in] data For a put operation, the data being stored. - * @return 0 on success, non-zero on failure. 
- */ -static int -mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) -{ - MDB_txn *txn = m0->mc_txn; - MDB_page *dp; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned i, j, need; - int rc; - - if (m0->mc_flags & C_SUB) - return MDB_SUCCESS; - - /* Estimate how much space this op will take */ - i = m0->mc_db->md_depth; - /* Named DBs also dirty the main DB */ - if (m0->mc_dbi >= CORE_DBS) - i += txn->mt_dbs[MAIN_DBI].md_depth; - /* For puts, roughly factor in the key+data size */ - if (key) - i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; - i += i; /* double it for good measure */ - need = i; - - if (txn->mt_dirty_room > i) - return MDB_SUCCESS; - - if (!txn->mt_spill_pgs) { - txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); - if (unlikely(!txn->mt_spill_pgs)) - return ENOMEM; - } else { - /* purge deleted slots */ - MDB_IDL sl = txn->mt_spill_pgs; - unsigned num = sl[0]; - j=0; - for (i=1; i<=num; i++) { - if (!(sl[i] & 1)) - sl[++j] = sl[i]; - } - sl[0] = j; - } - - /* Preserve pages which may soon be dirtied again */ - rc = mdb_pages_xkeep(m0, P_DIRTY, 1); - if (unlikely(rc != MDB_SUCCESS)) - goto bailout; - - /* Less aggressive spill - we originally spilled the entire dirty list, - * with a few exceptions for cursor pages and DB root pages. But this - * turns out to be a lot of wasted effort because in a large txn many - * of those pages will need to be used again. So now we spill only 1/8th - * of the dirty pages. Testing revealed this to be a good tradeoff, - * better than 1/2, 1/4, or 1/10. */ - if (need < MDB_IDL_UM_MAX / 8) - need = MDB_IDL_UM_MAX / 8; - - /* Save the page IDs of all the pages we're flushing */ - /* flush from the tail forward, this saves a lot of shifting later on. */ - for (i=dl[0].mid; i && need; i--) { - MDB_ID pn = dl[i].mid << 1; - dp = dl[i].mptr; - if (dp->mp_flags & (P_LOOSE|P_KEEP)) - continue; - /* Can't spill twice, make sure it's not already in a parent's - * spill list. 
*/ - if (txn->mt_parent) { - MDB_txn *tx2; - for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { - if (tx2->mt_spill_pgs) { - j = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { - dp->mp_flags |= P_KEEP; - break; - } - } - } - if (tx2) - continue; - } - rc = mdb_midl_append(&txn->mt_spill_pgs, pn); - if (unlikely(rc != MDB_SUCCESS)) - goto bailout; - need--; - } - mdb_midl_sort(txn->mt_spill_pgs); - - /* Flush the spilled part of dirty list */ - rc = mdb_page_flush(txn, i); - if (unlikely(rc != MDB_SUCCESS)) - goto bailout; - - /* Reset any dirty pages we kept that page_flush didn't see */ - rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); - -bailout: - txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; - return rc; -} - -static MDBX_INLINE uint64_t -mdb_meta_sign(MDB_meta *meta) { - uint64_t sign = MDB_DATASIGN_NONE; -#if 0 /* TODO */ - sign = hippeus_hash64( - &meta->mm_mapsize, - sizeof(MDB_meta) - offsetof(MDB_meta, mm_mapsize), - meta->mm_version | (uint64_t) MDB_MAGIC << 32 - ); -#else - (void) meta; -#endif - /* LY: newer returns MDB_DATASIGN_NONE or MDB_DATASIGN_WEAK */ - return (sign > MDB_DATASIGN_WEAK) ? 
sign : ~sign; -} - -static MDBX_INLINE MDB_meta* -mdb_meta_head_w(MDB_env *env) { - MDB_meta* a = METAPAGE_1(env); - MDB_meta* b = METAPAGE_2(env); - txnid_t head_txnid = env->me_txns->mti_txnid; - - mdb_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); - if (a->mm_txnid == head_txnid) - return a; - if (likely(b->mm_txnid == head_txnid)) - return b; - - mdb_debug("me_txns->mti_txnid not match meta-pages"); - mdb_assert(env, head_txnid == a->mm_txnid || head_txnid == b->mm_txnid); - env->me_flags |= MDB_FATAL_ERROR; - return a; -} - -static MDB_meta* -mdb_meta_head_r(MDB_env *env) { - MDB_meta* a = METAPAGE_1(env); - MDB_meta* b = METAPAGE_2(env), *h; - -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - - txnid_t head_txnid = env->me_txns->mti_txnid; - mdb_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); - if (likely(a->mm_txnid == head_txnid)) { - h = a; - } else if (likely(b->mm_txnid == head_txnid)) { - h = b; - } else { - /* LY: seems got a collision with mdb_env_sync0() */ - mdbx_coherent_barrier(); - head_txnid = env->me_txns->mti_txnid; - mdb_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); - - if (likely(a->mm_txnid == head_txnid)) { - h = a; - } else if (likely(b->mm_txnid == head_txnid)) { - h = b; - } else { - /* LY: got a race again, or DB is corrupted */ - int rc = mdb_mutex_lock(env, MDB_MUTEX(env, w)); - h = mdb_meta_head_w(env); - if (rc == 0) - mdb_mutex_unlock(env, MDB_MUTEX(env, w)); - } - } - -#ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); -#endif - - return h; -} - -static MDBX_INLINE MDB_meta* -mdb_env_meta_flipflop(const MDB_env *env, MDB_meta* meta) { - return (meta == METAPAGE_1(env)) ? METAPAGE_2(env) : METAPAGE_1(env); -} - -static MDBX_INLINE int -mdb_meta_lt(MDB_meta* a, MDB_meta* b) { - return (META_IS_STEADY(a) == META_IS_STEADY(b)) - ? a->mm_txnid < b->mm_txnid : META_IS_STEADY(b); -} - -/** Find oldest txnid still referenced. 
*/ -static -txnid_t mdb_find_oldest(MDB_env *env, int *laggard) -{ -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - int i, reader; - MDB_reader *r = env->me_txns->mti_readers; - txnid_t oldest = env->me_txns->mti_txnid; - - MDB_meta* a = METAPAGE_1(env); - MDB_meta* b = METAPAGE_2(env); - if (META_IS_WEAK(a) && oldest > b->mm_txnid) - oldest = b->mm_txnid; - if (META_IS_WEAK(b) && oldest > a->mm_txnid) - oldest = a->mm_txnid; - - for (reader = -1, i = env->me_txns->mti_numreaders; --i >= 0; ) { - if (r[i].mr_pid) { - txnid_t snap = r[i].mr_txnid; - if (oldest > snap) { - oldest = snap; - reader = i; - } - } - } -#ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); -#endif - - if (laggard) - *laggard = reader; - return env->me_pgoldest = oldest; -} - -/** Add a page to the txn's dirty list */ -static void -mdb_page_dirty(MDB_txn *txn, MDB_page *mp) -{ - MDB_ID2 mid; - int rc, (*insert)(MDB_ID2L, MDB_ID2 *); - - if (txn->mt_flags & MDB_TXN_WRITEMAP) { - insert = mdb_mid2l_append; - } else { - insert = mdb_mid2l_insert; - } - mid.mid = mp->mp_pgno; - mid.mptr = mp; - rc = insert(txn->mt_u.dirty_list, &mid); - mdb_tassert(txn, rc == 0); - txn->mt_dirty_room--; -} - -/** Allocate page numbers and memory for writing. Maintain me_pglast, - * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. - * - * If there are free pages available from older transactions, they - * are re-used first. Otherwise allocate a new page at mt_next_pgno. - * Do not modify the freedB, just merge freeDB records into me_pghead[] - * and move me_pglast to say which records were consumed. Only this - * function can create me_pghead and move me_pglast/mt_next_pgno. - * @param[in] mc cursor A cursor handle identifying the transaction and - * database for which we are allocating. - * @param[in] num the number of pages to allocate. - * @param[out] mp Address of the allocated page(s). 
Requests for multiple pages - * will always be satisfied by a single contiguous chunk of memory. - * @return 0 on success, non-zero on failure. - */ - -#define MDBX_ALLOC_CACHE 1 -#define MDBX_ALLOC_GC 2 -#define MDBX_ALLOC_NEW 4 -#define MDBX_ALLOC_KICK 8 -#define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE|MDBX_ALLOC_GC|MDBX_ALLOC_NEW|MDBX_ALLOC_KICK) - -static int -mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) -{ - int rc; - MDB_txn *txn = mc->mc_txn; - MDB_env *env = txn->mt_env; - pgno_t pgno, *mop = env->me_pghead; - unsigned i = 0, j, mop_len = mop ? mop[0] : 0, n2 = num-1; - MDB_page *np; - txnid_t oldest = 0, last = 0; - MDB_cursor_op op; - MDB_cursor m2; - int found_oldest = 0; - - if (likely(flags & MDBX_ALLOC_GC)) { - flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); - if (unlikely(mc->mc_flags & C_RECLAIMING)) { - /* If mc is updating the freeDB, then the freelist cannot play - * catch-up with itself by growing while trying to save it. */ - flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_KICK | MDBX_COALESCE | MDBX_LIFORECLAIM); - } - } - - if (likely(flags & MDBX_ALLOC_CACHE)) { - /* If there are any loose pages, just use them */ - assert(mp && num); - if (likely(num == 1 && txn->mt_loose_pgs)) { - np = txn->mt_loose_pgs; - txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); - txn->mt_loose_count--; - mdb_debug("db %d use loose page %zu", DDBI(mc), np->mp_pgno); - ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize); - *mp = np; - return MDB_SUCCESS; - } - } - - /* If our dirty list is already full, we can't do anything */ - if (unlikely(txn->mt_dirty_room == 0)) { - rc = MDB_TXN_FULL; - goto fail; - } - - for (;;) { /* oom-kick retry loop */ - for (op = MDB_FIRST;; op = (flags & MDBX_LIFORECLAIM) ? MDB_PREV : MDB_NEXT) { - MDB_val key, data; - MDB_node *leaf; - pgno_t *idl; - - /* Seek a big enough contiguous page range. Prefer - * pages at the tail, just truncating the list. 
*/ - if (likely(flags & MDBX_ALLOC_CACHE) - && mop_len > n2 - && ( !(flags & MDBX_COALESCE) || op == MDB_FIRST)) { - i = mop_len; - do { - pgno = mop[i]; - if (likely(mop[i-n2] == pgno+n2)) - goto done; - } while (--i > n2); - } - - if (op == MDB_FIRST) { /* 1st iteration */ - /* Prepare to fetch more and coalesce */ - if (unlikely( !(flags & MDBX_ALLOC_GC) )) - break; - - oldest = env->me_pgoldest; - mdb_cursor_init(&m2, txn, FREE_DBI, NULL); - if (flags & MDBX_LIFORECLAIM) { - if (! found_oldest) { - oldest = mdb_find_oldest(env, NULL); - found_oldest = 1; - } - /* Begin from oldest reader if any */ - if (oldest > 2) { - last = oldest - 1; - op = MDB_SET_RANGE; - } - } else if (env->me_pglast) { - /* Continue lookup from env->me_pglast to higher/last */ - last = env->me_pglast; - op = MDB_SET_RANGE; - } - - key.mv_data = &last; - key.mv_size = sizeof(last); - } - - if (! (flags & MDBX_LIFORECLAIM) ) { - /* Do not fetch more if the record will be too recent */ - if (op != MDB_FIRST && ++last >= oldest) { - if (!found_oldest) { - oldest = mdb_find_oldest(env, NULL); - found_oldest = 1; - } - if (oldest <= last) - break; - } - } - - rc = mdb_cursor_get(&m2, &key, NULL, op); - if (rc == MDB_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { - if (op == MDB_SET_RANGE) - continue; - found_oldest = 1; - if (oldest < mdb_find_oldest(env, NULL)) { - oldest = env->me_pgoldest; - last = oldest - 1; - key.mv_data = &last; - key.mv_size = sizeof(last); - op = MDB_SET_RANGE; - rc = mdb_cursor_get(&m2, &key, NULL, op); - } - } - if (unlikely(rc)) { - if (rc == MDB_NOTFOUND) - break; - goto fail; - } - - last = *(txnid_t*)key.mv_data; - if (oldest <= last) { - if (!found_oldest) { - oldest = mdb_find_oldest(env, NULL); - found_oldest = 1; - } - if (oldest <= last) { - if (flags & MDBX_LIFORECLAIM) - continue; - break; - } - } - - if (flags & MDBX_LIFORECLAIM) { - if (txn->mt_lifo_reclaimed) { - for(j = txn->mt_lifo_reclaimed[0]; j > 0; --j) - if (txn->mt_lifo_reclaimed[j] == last) - 
break; - if (j) - continue; - } - } - - np = m2.mc_pg[m2.mc_top]; - leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); - if (unlikely((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS)) - goto fail; - - if ((flags & MDBX_LIFORECLAIM) && !txn->mt_lifo_reclaimed) { - txn->mt_lifo_reclaimed = mdb_midl_alloc(env->me_maxfree_1pg); - if (unlikely(!txn->mt_lifo_reclaimed)) { - rc = ENOMEM; - goto fail; - } - } - - idl = (MDB_ID *) data.mv_data; - mdb_tassert(txn, idl[0] == 0 || data.mv_size == (idl[0] + 1) * sizeof(MDB_ID)); - i = idl[0]; - if (!mop) { - if (unlikely(!(env->me_pghead = mop = mdb_midl_alloc(i)))) { - rc = ENOMEM; - goto fail; - } - } else { - if (unlikely((rc = mdb_midl_need(&env->me_pghead, i)) != 0)) - goto fail; - mop = env->me_pghead; - } - if (flags & MDBX_LIFORECLAIM) { - if ((rc = mdb_midl_append(&txn->mt_lifo_reclaimed, last)) != 0) - goto fail; - } - env->me_pglast = last; - - if (mdb_debug_enabled(MDBX_DBG_EXTRA)) { - mdb_debug_extra("IDL read txn %zu root %zu num %u, IDL", - last, txn->mt_dbs[FREE_DBI].md_root, i); - for (j = i; j; j--) - mdb_debug_extra_print(" %zu", idl[j]); - mdb_debug_extra_print("\n"); - } - - /* Merge in descending sorted order */ - mdb_midl_xmerge(mop, idl); - mop_len = mop[0]; - - if (unlikely((flags & MDBX_ALLOC_CACHE) == 0)) { - /* force gc reclaim mode */ - return MDB_SUCCESS; - } - - /* Don't try to coalesce too much. 
*/ - if (mop_len > MDB_IDL_UM_SIZE / 2) - break; - if (flags & MDBX_COALESCE) { - if (mop_len /* current size */ >= env->me_maxfree_1pg / 2 - || i /* prev size */ >= env->me_maxfree_1pg / 4) - flags &= ~MDBX_COALESCE; - } - } - - if ((flags & (MDBX_COALESCE|MDBX_ALLOC_CACHE)) == (MDBX_COALESCE|MDBX_ALLOC_CACHE) - && mop_len > n2) { - i = mop_len; - do { - pgno = mop[i]; - if (mop[i-n2] == pgno+n2) - goto done; - } while (--i > n2); - } - - /* Use new pages from the map when nothing suitable in the freeDB */ - i = 0; - pgno = txn->mt_next_pgno; - rc = MDB_MAP_FULL; - if (likely(pgno + num <= env->me_maxpg)) { - rc = MDB_NOTFOUND; - if (likely(flags & MDBX_ALLOC_NEW)) - goto done; - } - - if ((flags & MDBX_ALLOC_GC) - && ((flags & MDBX_ALLOC_KICK) || rc == MDB_MAP_FULL)) { - MDB_meta* head = mdb_meta_head_w(env); - MDB_meta* tail = mdb_env_meta_flipflop(env, head); - - if (oldest == tail->mm_txnid - && META_IS_WEAK(head) && !META_IS_WEAK(tail)) { - MDB_meta meta = *head; - /* LY: Here an oom was happened: - * - all pages had allocated; - * - reclaiming was stopped at the last steady-sync; - * - the head-sync is weak. - * Now we need make a sync to resume reclaiming. If both - * MDB_NOSYNC and MDB_MAPASYNC flags are set, then assume that - * utterly no-sync write mode was requested. In such case - * don't make a steady-sync, but only a legacy-mode checkpoint, - * just for resume reclaiming only, not for data consistency. */ - - mdb_debug("kick-gc: head %zu/%c, tail %zu/%c, oldest %zu, txnid %zu", - head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N', - tail->mm_txnid, META_IS_WEAK(tail) ? 
'W' : 'N', - oldest, env->me_txns->mt1.mtb.mtb_txnid ); - - int flags = env->me_flags & MDB_WRITEMAP; - if ((env->me_flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC) - flags |= MDBX_UTTERLY_NOSYNC; - - mdb_assert(env, env->me_sync_pending > 0); - if (mdb_env_sync0(env, flags, &meta) == MDB_SUCCESS) { - txnid_t snap = mdb_find_oldest(env, NULL); - if (snap > oldest) { - continue; - } - } - } - - if (rc == MDB_MAP_FULL) { -#if MDBX_MODE_ENABLED - txnid_t snap = mdbx_oomkick(env, oldest); -#else - mdb_debug("DB size maxed out"); - txnid_t snap = mdb_find_oldest(env, NULL); -#endif /* MDBX_MODE_ENABLED */ - if (snap > oldest) { - oldest = snap; - continue; - } - } - } - -fail: - if (mp) { - *mp = NULL; - txn->mt_flags |= MDB_TXN_ERROR; - } - assert(rc); - return rc; - } - -done: - assert(mp && num); - if (env->me_flags & MDB_WRITEMAP) { - np = (MDB_page *)(env->me_map + env->me_psize * pgno); - /* LY: reset no-access flag from mdb_kill_page() */ - VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); - ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize * num); - } else { - if (unlikely(!(np = mdb_page_malloc(txn, num)))) { - rc = ENOMEM; - goto fail; - } - } - if (i) { - mop[0] = mop_len -= num; - /* Move any stragglers down */ - for (j = i-num; j < mop_len; ) - mop[++j] = mop[++i]; - } else { - txn->mt_next_pgno = pgno + num; - } - - if (env->me_flags & MDBX_PAGEPERTURB) - memset(np, 0x71 /* 'q', 113 */, env->me_psize * num); - VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); - - np->mp_pgno = pgno; - np->mp_leaf2_ksize = 0; - np->mp_flags = 0; - np->mp_pages = num; - mdb_page_dirty(txn, np); - *mp = np; - - return MDB_SUCCESS; -} - -/** Copy the used portions of a non-overflow page. 
- * @param[in] dst page to copy into - * @param[in] src page to copy from - * @param[in] psize size of a page - */ -static void -mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned psize) -{ - enum { Align = sizeof(pgno_t) }; - indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; - - /* If page isn't full, just copy the used portion. Adjust - * alignment so memcpy may copy words instead of bytes. */ - if ((unused &= -Align) && !IS_LEAF2(src)) { - upper = (upper + PAGEBASE) & -Align; - memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align); - memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), - psize - upper); - } else { - memcpy(dst, src, psize - unused); - } -} - -/** Pull a page off the txn's spill list, if present. - * If a page being referenced was spilled to disk in this txn, bring - * it back and make it dirty/writable again. - * @param[in] txn the transaction handle. - * @param[in] mp the page being referenced. It must not be dirty. - * @param[out] ret the writable page, if any. ret is unchanged if - * mp wasn't spilled. - */ -static int -mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) -{ - MDB_env *env = txn->mt_env; - const MDB_txn *tx2; - unsigned x; - pgno_t pgno = mp->mp_pgno, pn = pgno << 1; - - for (tx2 = txn; tx2; tx2=tx2->mt_parent) { - if (!tx2->mt_spill_pgs) - continue; - x = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { - MDB_page *np; - int num; - if (txn->mt_dirty_room == 0) - return MDB_TXN_FULL; - if (IS_OVERFLOW(mp)) - num = mp->mp_pages; - else - num = 1; - if (env->me_flags & MDB_WRITEMAP) { - np = mp; - } else { - np = mdb_page_malloc(txn, num); - if (unlikely(!np)) - return ENOMEM; - if (num > 1) - memcpy(np, mp, num * env->me_psize); - else - mdb_page_copy(np, mp, env->me_psize); - } - if (tx2 == txn) { - /* If in current txn, this page is no longer spilled. - * If it happens to be the last page, truncate the spill list. 
- * Otherwise mark it as deleted by setting the LSB. */ - if (x == txn->mt_spill_pgs[0]) - txn->mt_spill_pgs[0]--; - else - txn->mt_spill_pgs[x] |= 1; - } /* otherwise, if belonging to a parent txn, the - * page remains spilled until child commits - */ - - mdb_page_dirty(txn, np); - np->mp_flags |= P_DIRTY; - *ret = np; - break; - } - } - return MDB_SUCCESS; -} - -/** Touch a page: make it dirty and re-insert into tree with updated pgno. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc cursor pointing to the page to be touched - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_touch(MDB_cursor *mc) -{ - MDB_page *mp = mc->mc_pg[mc->mc_top], *np; - MDB_txn *txn = mc->mc_txn; - MDB_cursor *m2, *m3; - pgno_t pgno; - int rc; - - if (!F_ISSET(mp->mp_flags, P_DIRTY)) { - if (txn->mt_flags & MDB_TXN_SPILLS) { - np = NULL; - rc = mdb_page_unspill(txn, mp, &np); - if (unlikely(rc)) - goto fail; - if (likely(np)) - goto done; - } - if (unlikely((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || - (rc = mdb_page_alloc(mc, 1, &np, MDBX_ALLOC_ALL)))) - goto fail; - pgno = np->mp_pgno; - mdb_debug("touched db %d page %zu -> %zu", DDBI(mc), mp->mp_pgno, pgno); - mdb_cassert(mc, mp->mp_pgno != pgno); - mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); - /* Update the parent page, if any, to point to the new page */ - if (mc->mc_top) { - MDB_page *parent = mc->mc_pg[mc->mc_top-1]; - MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); - SETPGNO(node, pgno); - } else { - mc->mc_db->md_root = pgno; - } - } else if (txn->mt_parent && !IS_SUBP(mp)) { - MDB_ID2 mid, *dl = txn->mt_u.dirty_list; - pgno = mp->mp_pgno; - /* If txn has a parent, make sure the page is in our - * dirty list. */ - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - if (unlikely(mp != dl[x].mptr)) { /* bad cursor? 
*/ - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PROBLEM; - } - return 0; - } - } - mdb_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); - /* No - copy it */ - np = mdb_page_malloc(txn, 1); - if (unlikely(!np)) - return ENOMEM; - mid.mid = pgno; - mid.mptr = np; - rc = mdb_mid2l_insert(dl, &mid); - mdb_cassert(mc, rc == 0); - } else { - return 0; - } - - mdb_page_copy(np, mp, txn->mt_env->me_psize); - np->mp_pgno = pgno; - np->mp_flags |= P_DIRTY; - -done: - /* Adjust cursors pointing to mp */ - mc->mc_pg[mc->mc_top] = np; - m2 = txn->mt_cursors[mc->mc_dbi]; - if (mc->mc_flags & C_SUB) { - for (; m2; m2=m2->mc_next) { - m3 = &m2->mc_xcursor->mx_cursor; - if (m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[mc->mc_top] == mp) - m3->mc_pg[mc->mc_top] = np; - } - } else { - for (; m2; m2=m2->mc_next) { - if (m2->mc_snum < mc->mc_snum) continue; - if (m2 == mc) continue; - if (m2->mc_pg[mc->mc_top] == mp) { - m2->mc_pg[mc->mc_top] = np; - if (XCURSOR_INITED(m2) && IS_LEAF(np)) - XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]); - } - } - } - return 0; - -fail: - txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -int -mdb_env_sync(MDB_env *env, int force) -{ - int rc; - pthread_mutex_t *mutex; - MDB_meta *head; - unsigned flags; - - if (unlikely(! env)) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(! env->me_txns)) - return MDB_PANIC; - - flags = env->me_flags & ~MDB_NOMETASYNC; - if (unlikely(flags & (MDB_RDONLY | MDB_FATAL_ERROR))) - return EACCES; - - head = mdb_meta_head_r(env); - if (! 
META_IS_WEAK(head) && env->me_sync_pending == 0 - && env->me_mapsize == head->mm_mapsize) - /* LY: nothing to do */ - return MDB_SUCCESS; - - if (force || head->mm_mapsize != env->me_mapsize - || (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold)) - flags &= MDB_WRITEMAP; - - /* LY: early sync before acquiring the mutex to reduce writer's latency */ - if (env->me_sync_pending > env->me_psize * 16 && (flags & MDB_NOSYNC) == 0) { - if (flags & MDB_WRITEMAP) { - size_t used_size = env->me_psize * (head->mm_last_pg + 1); - rc = msync(env->me_map, used_size, - (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC); - } else { - rc = fdatasync(env->me_fd); - } - if (unlikely(rc)) - return errno; - } - - mutex = MDB_MUTEX(env, w); - rc = mdb_mutex_lock(env, mutex); - if (unlikely(rc)) - return rc; - - /* LY: head may be changed while the mutex has been acquired. */ - head = mdb_meta_head_w(env); - rc = MDB_SUCCESS; - if (META_IS_WEAK(head) || env->me_sync_pending != 0 - || env->me_mapsize != head->mm_mapsize) { - MDB_meta meta = *head; - rc = mdb_env_sync0(env, flags, &meta); - } - - mdb_mutex_unlock(env, mutex); - return rc; -} - -/** Back up parent txn's cursors, then grab the originals for tracking */ -static int -mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) -{ - MDB_cursor *mc, *bk; - MDB_xcursor *mx; - size_t size; - int i; - - for (i = src->mt_numdbs; --i >= 0; ) { - if ((mc = src->mt_cursors[i]) != NULL) { - size = sizeof(MDB_cursor); - if (mc->mc_xcursor) - size += sizeof(MDB_xcursor); - for (; mc; mc = bk->mc_next) { - bk = malloc(size); - if (unlikely(!bk)) - return ENOMEM; - *bk = *mc; - mc->mc_backup = bk; - mc->mc_db = &dst->mt_dbs[i]; - /* Kill pointers into src to reduce abuse: The - * user may not use mc until dst ends. But we need a valid - * txn pointer here for cursor fixups to keep working. 
*/ - mc->mc_txn = dst; - mc->mc_dbflag = &dst->mt_dbflags[i]; - if ((mx = mc->mc_xcursor) != NULL) { - *(MDB_xcursor *)(bk+1) = *mx; - mx->mx_cursor.mc_txn = dst; - } - mc->mc_next = dst->mt_cursors[i]; - dst->mt_cursors[i] = mc; - } - } - } - return MDB_SUCCESS; -} - -/** Close this write txn's cursors, give parent txn's cursors back to parent. - * @param[in] txn the transaction handle. - * @param[in] merge true to keep changes to parent cursors, false to revert. - * @return 0 on success, non-zero on failure. - */ -static void -mdb_cursors_eot(MDB_txn *txn, unsigned merge) -{ - MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; - MDB_xcursor *mx; - int i; - - for (i = txn->mt_numdbs; --i >= 0; ) { - for (mc = cursors[i]; mc; mc = next) { - unsigned stage = mc->mc_signature; - mdb_ensure(NULL, stage == MDBX_MC_SIGNATURE - || stage == MDBX_MC_WAIT4EOT); - next = mc->mc_next; - if ((bk = mc->mc_backup) != NULL) { - if (merge) { - /* Commit changes to parent txn */ - mc->mc_next = bk->mc_next; - mc->mc_backup = bk->mc_backup; - mc->mc_txn = bk->mc_txn; - mc->mc_db = bk->mc_db; - mc->mc_dbflag = bk->mc_dbflag; - if ((mx = mc->mc_xcursor) != NULL) - mx->mx_cursor.mc_txn = bk->mc_txn; - } else { - /* Abort nested txn */ - *mc = *bk; - if ((mx = mc->mc_xcursor) != NULL) - *mx = *(MDB_xcursor *)(bk+1); - } -#if MDBX_MODE_ENABLED - bk->mc_signature = 0; - free(bk); - } - if (stage == MDBX_MC_WAIT4EOT) { - mc->mc_signature = 0; - free(mc); - } else { - mc->mc_signature = MDBX_MC_READY4CLOSE; - mc->mc_flags = 0 /* reset C_UNTRACK */; - } -#else - mc = bk; - } - /* Only malloced cursors are permanently tracked. */ - mc->mc_signature = 0; - free(mc); -#endif - } - cursors[i] = NULL; - } -} - -/** Set or check a pid lock. Set returns 0 on success. - * Check returns 0 if the process is certainly dead, nonzero if it may - * be alive (the lock exists or an error happened so we do not know). 
- */ -static int -mdb_reader_pid(MDB_env *env, int op, pid_t pid) -{ - for (;;) { - int rc; - struct flock lock_info; - memset(&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_WRLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = pid; - lock_info.l_len = 1; - if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { - if (op == F_GETLK && lock_info.l_type != F_UNLCK) - rc = -1; - } else if ((rc = errno) == EINTR) { - continue; - } - return rc; - } -} - -/** Common code for #mdb_txn_begin() and #mdb_txn_renew(). - * @param[in] txn the transaction handle to initialize - * @return 0 on success, non-zero on failure. - */ -static int -mdb_txn_renew0(MDB_txn *txn, unsigned flags) -{ - MDB_env *env = txn->mt_env; - unsigned i, nr; - int rc, new_notls = 0; - - if (unlikely(env->me_pid != getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; - } - - if (flags & MDB_TXN_RDONLY) { - MDBX_rthc *rthc = NULL; - MDB_reader *r = NULL; - - txn->mt_flags = MDB_TXN_RDONLY; - if (likely(env->me_flags & MDB_ENV_TXKEY)) { - mdb_assert(env, !(env->me_flags & MDB_NOTLS)); - rthc = mdbx_rthc_get(env->me_txkey); - if (unlikely(! 
rthc)) - return ENOMEM; - if (likely(rthc->rc_reader)) { - r = rthc->rc_reader; - mdb_assert(env, r->mr_pid == env->me_pid); - mdb_assert(env, r->mr_tid == pthread_self()); - } - } else { - mdb_assert(env, env->me_flags & MDB_NOTLS); - r = txn->mt_u.reader; - } - - if (likely(r)) { - if (unlikely(r->mr_pid != env->me_pid || r->mr_txnid != ~(txnid_t)0)) - return MDB_BAD_RSLOT; - } else { - pid_t pid = env->me_pid; - pthread_t tid = pthread_self(); - pthread_mutex_t *rmutex = MDB_MUTEX(env, r); - - rc = mdb_mutex_lock(env, rmutex); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - - if (unlikely(!env->me_live_reader)) { - rc = mdb_reader_pid(env, F_SETLK, pid); - if (unlikely(rc != MDB_SUCCESS)) { - mdb_mutex_unlock(env, rmutex); - return rc; - } - env->me_live_reader = 1; - } - - nr = env->me_txns->mti_numreaders; - for (i=0; ime_txns->mti_readers[i].mr_pid == 0) - break; - if (unlikely(i == env->me_maxreaders)) { - mdb_mutex_unlock(env, rmutex); - return MDB_READERS_FULL; - } - r = &env->me_txns->mti_readers[i]; - /* Claim the reader slot, carefully since other code - * uses the reader table un-mutexed: First reset the - * slot, next publish it in mti_numreaders. After - * that, it is safe for mdb_env_close() to touch it. - * When it will be closed, we can finally claim it. 
*/ - r->mr_pid = 0; - r->mr_txnid = ~(txnid_t)0; - r->mr_tid = tid; - mdbx_coherent_barrier(); -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - if (i == nr) - env->me_txns->mti_numreaders = ++nr; - if (env->me_close_readers < nr) - env->me_close_readers = nr; - r->mr_pid = pid; -#ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); -#endif - mdb_mutex_unlock(env, rmutex); - - new_notls = MDB_END_SLOT; - if (likely(rthc)) { - rthc->rc_reader = r; - new_notls = 0; - } - } - - while((env->me_flags & MDB_FATAL_ERROR) == 0) { - MDB_meta *meta = mdb_meta_head_r(txn->mt_env); - txnid_t lead = meta->mm_txnid; - r->mr_txnid = lead; - mdbx_coherent_barrier(); - - txnid_t snap = txn->mt_env->me_txns->mti_txnid; - /* LY: Retry on a race, ITS#7970. */ - if (likely(lead == snap)) { - txn->mt_txnid = lead; - txn->mt_next_pgno = meta->mm_last_pg+1; - /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); -#if MDBX_MODE_ENABLED - txn->mt_canary = meta->mm_canary; -#endif - break; - } - } - - txn->mt_u.reader = r; - txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ - } else { - /* Not yet touching txn == env->me_txn0, it may be active */ - rc = mdb_mutex_lock(env, MDB_MUTEX(env, w)); - if (unlikely(rc)) - return rc; - -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - MDB_meta *meta = mdb_meta_head_w(env); -#if MDBX_MODE_ENABLED - txn->mt_canary = meta->mm_canary; -#endif - txn->mt_txnid = meta->mm_txnid + 1; - txn->mt_flags = flags; -#ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); -#endif - -#if MDB_DEBUG - if (unlikely(txn->mt_txnid == mdb_debug_edge)) { - if (! 
mdb_debug_logger) - mdb_runtime_flags |= MDBX_DBG_TRACE | MDBX_DBG_EXTRA - | MDBX_DBG_AUDIT | MDBX_DBG_ASSERT; - mdb_debug_log(MDBX_DBG_EDGE, __FUNCTION__, __LINE__, - "on/off edge (txn %zu)", txn->mt_txnid); - } -#endif - txn->mt_child = NULL; - txn->mt_loose_pgs = NULL; - txn->mt_loose_count = 0; - txn->mt_dirty_room = MDB_IDL_UM_MAX; - txn->mt_u.dirty_list = env->me_dirty_list; - txn->mt_u.dirty_list[0].mid = 0; - txn->mt_free_pgs = env->me_free_pgs; - txn->mt_free_pgs[0] = 0; - txn->mt_spill_pgs = NULL; - if (txn->mt_lifo_reclaimed) - txn->mt_lifo_reclaimed[0] = 0; - env->me_txn = txn; - memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned)); - /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); - /* Moved to here to avoid a data race in read TXNs */ - txn->mt_next_pgno = meta->mm_last_pg+1; - } - - /* Setup db info */ - txn->mt_numdbs = env->me_numdbs; - for (i=CORE_DBS; imt_numdbs; i++) { - unsigned x = env->me_dbflags[i]; - txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; - txn->mt_dbflags[i] = (x & MDB_VALID) ? 
DB_VALID|DB_USRVALID|DB_STALE : 0; - } - txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID; - txn->mt_dbflags[FREE_DBI] = DB_VALID; - - if (unlikely(env->me_flags & MDB_FATAL_ERROR)) { - mdb_debug("environment had fatal error, must shutdown!"); - rc = MDB_PANIC; - } else if (unlikely(env->me_maxpg < txn->mt_next_pgno)) { - rc = MDB_MAP_RESIZED; - } else { - return MDB_SUCCESS; - } - mdb_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); - return rc; -} - -int -mdb_txn_renew(MDB_txn *txn) -{ - int rc; - - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY|MDB_TXN_FINISHED))) - return EINVAL; - - rc = mdb_txn_renew0(txn, MDB_TXN_RDONLY); - if (rc == MDB_SUCCESS) { - mdb_debug("renew txn %zu%c %p on mdbenv %p, root page %zu", - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', - (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); - } - return rc; -} - -int -mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, MDB_txn **ret) -{ - MDB_txn *txn; - MDB_ntxn *ntxn; - int rc, size, tsize; - - if (unlikely(!env || !ret)) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(env->me_pid != getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; - } - - flags &= MDB_TXN_BEGIN_FLAGS; - flags |= env->me_flags & MDB_WRITEMAP; - - if (unlikely(env->me_flags & MDB_RDONLY & ~flags)) /* write txn in RDONLY env */ - return EACCES; - - if (parent) { - if (unlikely(parent->mt_signature != MDBX_MT_SIGNATURE)) - return EINVAL; - - /* Nested transactions: Max 1 child, write txns only, no writemap */ - flags |= parent->mt_flags; - if (unlikely(flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED))) { - return (parent->mt_flags & MDB_TXN_RDONLY) ? 
EINVAL : MDB_BAD_TXN; - } - /* Child txns save MDB_pgstate and use own copy of cursors */ - size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1); - size += tsize = sizeof(MDB_ntxn); - } else if (flags & MDB_RDONLY) { - size = env->me_maxdbs * (sizeof(MDB_db)+1); - size += tsize = sizeof(MDB_txn); - } else { - /* Reuse preallocated write txn. However, do not touch it until - * mdb_txn_renew0() succeeds, since it currently may be active. */ - txn = env->me_txn0; - goto renew; - } - if (unlikely((txn = calloc(1, size)) == NULL)) { - mdb_debug("calloc: %s", strerror(errno)); - return ENOMEM; - } - txn->mt_dbxs = env->me_dbxs; /* static */ - txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); - txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; - txn->mt_flags = flags; - txn->mt_env = env; - - if (parent) { - unsigned i; - txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = parent->mt_dbiseqs; - txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE); - if (!txn->mt_u.dirty_list || - !(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX))) - { - free(txn->mt_u.dirty_list); - free(txn); - return ENOMEM; - } - txn->mt_txnid = parent->mt_txnid; - txn->mt_dirty_room = parent->mt_dirty_room; - txn->mt_u.dirty_list[0].mid = 0; - txn->mt_spill_pgs = NULL; - txn->mt_next_pgno = parent->mt_next_pgno; - parent->mt_flags |= MDB_TXN_HAS_CHILD; - parent->mt_child = txn; - txn->mt_parent = parent; - txn->mt_numdbs = parent->mt_numdbs; - memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); - /* Copy parent's mt_dbflags, but clear DB_NEW */ - for (i=0; imt_numdbs; i++) - txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; - rc = 0; - ntxn = (MDB_ntxn *)txn; - ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ - if (env->me_pghead) { - size = MDB_IDL_SIZEOF(env->me_pghead); - env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); - if (likely(env->me_pghead)) - memcpy(env->me_pghead, 
ntxn->mnt_pgstate.mf_pghead, size); - else - rc = ENOMEM; - } - if (likely(!rc)) - rc = mdb_cursor_shadow(parent, txn); - if (unlikely(rc)) - mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD); - } else { /* MDB_RDONLY */ - txn->mt_dbiseqs = env->me_dbiseqs; -renew: - rc = mdb_txn_renew0(txn, flags); - } - if (unlikely(rc)) { - if (txn != env->me_txn0) - free(txn); - } else { - txn->mt_signature = MDBX_MT_SIGNATURE; - *ret = txn; - mdb_debug("begin txn %zu%c %p on mdbenv %p, root page %zu", - txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', - (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root); - } - - return rc; -} - -MDB_env * -mdb_txn_env(MDB_txn *txn) -{ - if(unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) - return NULL; - return txn->mt_env; -} - -size_t -mdb_txn_id(MDB_txn *txn) -{ - if(unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) - return 0; - return txn->mt_txnid; -} - -/** Export or close DBI handles opened in this txn. */ -static void -mdb_dbis_update(MDB_txn *txn, int keep) -{ - int i; - MDB_dbi n = txn->mt_numdbs; - MDB_env *env = txn->mt_env; - unsigned char *tdbflags = txn->mt_dbflags; - - for (i = n; --i >= CORE_DBS;) { - if (tdbflags[i] & DB_NEW) { - if (keep) { - env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; - } else { - char *ptr = env->me_dbxs[i].md_name.mv_data; - if (ptr) { - env->me_dbxs[i].md_name.mv_data = NULL; - env->me_dbxs[i].md_name.mv_size = 0; - env->me_dbflags[i] = 0; - env->me_dbiseqs[i]++; - free(ptr); - } - } - } - } - if (keep && env->me_numdbs < n) - env->me_numdbs = n; -} - -/** End a transaction, except successful commit of a nested transaction. - * May be called twice for readonly txns: First reset it, then abort. 
- * @param[in] txn the transaction handle to end - * @param[in] mode why and how to end the transaction - */ -static int -mdb_txn_end(MDB_txn *txn, unsigned mode) -{ - MDB_env *env = txn->mt_env; - static const char *const names[] = MDB_END_NAMES; - - if (unlikely(txn->mt_env->me_pid != getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; - } - - /* Export or close DBI handles opened in this txn */ - mdb_dbis_update(txn, mode & MDB_END_UPDATE); - - mdb_debug("%s txn %zu%c %p on mdbenv %p, root page %zu", - names[mode & MDB_END_OPMASK], - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', - (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); - - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - if (txn->mt_u.reader) { -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - txn->mt_u.reader->mr_txnid = ~(txnid_t)0; - if (!(env->me_flags & MDB_NOTLS)) { - txn->mt_u.reader = NULL; /* txn does not own reader */ - } else if (mode & MDB_END_SLOT) { - txn->mt_u.reader->mr_pid = 0; - txn->mt_u.reader = NULL; - } /* else txn owns the slot until it does MDB_END_SLOT */ -#ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); -#endif - } - mdbx_coherent_barrier(); - txn->mt_numdbs = 0; /* prevent further DBI activity */ - txn->mt_flags |= MDB_TXN_FINISHED; - - } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { - pgno_t *pghead = env->me_pghead; - - if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ - mdb_cursors_eot(txn, 0); - if (!(env->me_flags & MDB_WRITEMAP)) { - mdb_dlist_free(txn); - } - - if (txn->mt_lifo_reclaimed) { - txn->mt_lifo_reclaimed[0] = 0; - if (txn != env->me_txn0) { - mdb_midl_free(txn->mt_lifo_reclaimed); - txn->mt_lifo_reclaimed = NULL; - } - } - txn->mt_numdbs = 0; - txn->mt_flags = MDB_TXN_FINISHED; - - if (!txn->mt_parent) { - mdb_midl_shrink(&txn->mt_free_pgs); - env->me_free_pgs = txn->mt_free_pgs; - /* me_pgstate: */ - env->me_pghead = NULL; - env->me_pglast = 0; - - env->me_txn 
= NULL; - mode = 0; /* txn == env->me_txn0, do not free() it */ - - /* The writer mutex was locked in mdb_txn_begin. */ - mdb_mutex_unlock(env, MDB_MUTEX(env, w)); - } else { - txn->mt_parent->mt_child = NULL; - txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; - env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; - mdb_midl_free(txn->mt_free_pgs); - mdb_midl_free(txn->mt_spill_pgs); - free(txn->mt_u.dirty_list); - } - - mdb_midl_free(pghead); - } - - if (mode & MDB_END_FREE) { - txn->mt_signature = 0; - free(txn); - } - - return MDB_SUCCESS; -} - -int -mdb_txn_reset(MDB_txn *txn) -{ - if (unlikely(! txn)) - return EINVAL; - - if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - /* This call is only valid for read-only txns */ - if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY))) - return EINVAL; - -#if MDBX_MODE_ENABLED - /* LY: don't close DBI-handles in MDBX mode */ - return mdb_txn_end(txn, MDB_END_RESET|MDB_END_UPDATE); -#else - return mdb_txn_end(txn, MDB_END_RESET); -#endif /* MDBX_MODE_ENABLED */ -} - -int -mdb_txn_abort(MDB_txn *txn) -{ - if (unlikely(! txn)) - return EINVAL; - - if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - -#if MDBX_MODE_ENABLED - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) - /* LY: don't close DBI-handles in MDBX mode */ - return mdb_txn_end(txn, MDB_END_ABORT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE); -#endif /* MDBX_MODE_ENABLED */ - - if (txn->mt_child) - mdb_txn_abort(txn->mt_child); - - return mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); -} - -static MDBX_INLINE int -mdb_backlog_size(MDB_txn *txn) -{ - int reclaimed = txn->mt_env->me_pghead ? txn->mt_env->me_pghead[0] : 0; - return reclaimed + txn->mt_loose_count; -} - -/* LY: Prepare a backlog of pages to modify FreeDB itself, - * while reclaiming is prohibited. It should be enough to prevent search - * in mdb_page_alloc() during a deleting, when freeDB tree is unbalanced. 
*/ -static int -mdb_prep_backlog(MDB_txn *txn, MDB_cursor *mc) -{ - /* LY: extra page(s) for b-tree rebalancing */ - const int extra = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) ? 2 : 1; - - if (mdb_backlog_size(txn) < mc->mc_db->md_depth + extra) { - int rc = mdb_cursor_touch(mc); - if (unlikely(rc)) - return rc; - - while (unlikely(mdb_backlog_size(txn) < extra)) { - rc = mdb_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC); - if (unlikely(rc)) { - if (unlikely(rc != MDB_NOTFOUND)) - return rc; - break; - } - } - } - - return MDB_SUCCESS; -} - -/** Save the freelist as of this transaction to the freeDB. - * This changes the freelist. Keep trying until it stabilizes. - */ -static int -mdb_freelist_save(MDB_txn *txn) -{ - /* env->me_pghead[] can grow and shrink during this call. - * env->me_pglast and txn->mt_free_pgs[] can only grow. - * Page numbers cannot disappear from txn->mt_free_pgs[]. */ - MDB_cursor mc; - MDB_env *env = txn->mt_env; - int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; - txnid_t pglast = 0, head_id = 0; - pgno_t freecnt = 0, *free_pgs, *mop; - ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; - unsigned cleanup_idx = 0, refill_idx = 0; - const int lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; - - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - - /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ - clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP)) - ? SSIZE_MAX : maxfree_1pg; - -again: - for (;;) { - /* Come back here after each Put() in case freelist changed */ - MDB_val key, data; - pgno_t *pgs; - ssize_t j; - - if (! lifo) { - /* If using records from freeDB which we have not yet - * deleted, delete them and any we reserved for me_pghead. 
*/ - while (pglast < env->me_pglast) { - rc = mdb_cursor_first(&mc, &key, NULL); - if (unlikely(rc)) - goto bailout; - rc = mdb_prep_backlog(txn, &mc); - if (unlikely(rc)) - goto bailout; - pglast = head_id = *(txnid_t *)key.mv_data; - total_room = head_room = 0; - more = 1; - mdb_tassert(txn, pglast <= env->me_pglast); - mc.mc_flags |= C_RECLAIMING; - rc = mdb_cursor_del(&mc, 0); - mc.mc_flags &= ~C_RECLAIMING; - if (unlikely(rc)) - goto bailout; - } - } else if (txn->mt_lifo_reclaimed) { - /* LY: cleanup reclaimed records. */ - while(cleanup_idx < txn->mt_lifo_reclaimed[0]) { - pglast = txn->mt_lifo_reclaimed[++cleanup_idx]; - key.mv_data = &pglast; - key.mv_size = sizeof(pglast); - rc = mdb_cursor_get(&mc, &key, NULL, MDB_SET); - if (likely(rc != MDB_NOTFOUND)) { - if (unlikely(rc)) - goto bailout; - rc = mdb_prep_backlog(txn, &mc); - if (unlikely(rc)) - goto bailout; - mc.mc_flags |= C_RECLAIMING; - rc = mdb_cursor_del(&mc, 0); - mc.mc_flags &= ~C_RECLAIMING; - if (unlikely(rc)) - goto bailout; - } - } - } - - if (unlikely(!env->me_pghead) && txn->mt_loose_pgs) { - /* Put loose page numbers in mt_free_pgs, since - * we may be unable to return them to me_pghead. 
*/ - MDB_page *mp = txn->mt_loose_pgs; - if (unlikely((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)) - return rc; - for (; mp; mp = NEXT_LOOSE_PAGE(mp)) - mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); - txn->mt_loose_pgs = NULL; - txn->mt_loose_count = 0; - } - - /* Save the IDL of pages freed by this txn, to a single record */ - if (freecnt < txn->mt_free_pgs[0]) { - if (unlikely(!freecnt)) { - /* Make sure last page of freeDB is touched and on freelist */ - rc = mdb_page_search(&mc, NULL, MDB_PS_LAST|MDB_PS_MODIFY); - if (unlikely(rc && rc != MDB_NOTFOUND)) - goto bailout; - } - free_pgs = txn->mt_free_pgs; - /* Write to last page of freeDB */ - key.mv_size = sizeof(txn->mt_txnid); - key.mv_data = &txn->mt_txnid; - do { - freecnt = free_pgs[0]; - data.mv_size = MDB_IDL_SIZEOF(free_pgs); - rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); - if (unlikely(rc)) - goto bailout; - /* Retry if mt_free_pgs[] grew during the Put() */ - free_pgs = txn->mt_free_pgs; - } while (freecnt < free_pgs[0]); - - mdb_midl_sort(free_pgs); - memcpy(data.mv_data, free_pgs, data.mv_size); - - if (mdb_debug_enabled(MDBX_DBG_EXTRA)) { - unsigned i = free_pgs[0]; - mdb_debug_extra("IDL write txn %zu root %zu num %u, IDL", - txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); - for (; i; i--) - mdb_debug_extra_print(" %zu", free_pgs[i]); - mdb_debug_extra_print("\n"); - } - continue; - } - - mop = env->me_pghead; - mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; - - if (mop_len && refill_idx == 0) - refill_idx = 1; - - /* Reserve records for me_pghead[]. Split it if multi-page, - * to avoid searching freeDB for a page range. Use keys in - * range [1,me_pglast]: Smaller than txnid of oldest reader. 
*/ - if (total_room >= mop_len) { - if (total_room == mop_len || --more < 0) - break; - } else if (head_room >= maxfree_1pg && head_id > 1) { - /* Keep current record (overflow page), add a new one */ - head_id--; - refill_idx++; - head_room = 0; - } - - if (lifo) { - if (refill_idx > (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)) { - /* LY: need just a txn-id for save page list. */ - rc = mdb_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); - if (likely(rc == 0)) - /* LY: ok, reclaimed from freedb. */ - continue; - if (unlikely(rc != MDB_NOTFOUND)) - /* LY: other troubles... */ - goto bailout; - - /* LY: freedb is empty, will look any free txn-id in high2low order. */ - if (unlikely(env->me_pglast < 1)) { - /* LY: not any txn in the past of freedb. */ - rc = MDB_MAP_FULL; - goto bailout; - } - - if (unlikely(! txn->mt_lifo_reclaimed)) { - txn->mt_lifo_reclaimed = mdb_midl_alloc(env->me_maxfree_1pg); - if (unlikely(! txn->mt_lifo_reclaimed)) { - rc = ENOMEM; - goto bailout; - } - } - /* LY: append the list. */ - rc = mdb_midl_append(&txn->mt_lifo_reclaimed, env->me_pglast - 1); - if (unlikely(rc)) - goto bailout; - --env->me_pglast; - /* LY: note that freeDB cleanup is not needed. 
*/ - ++cleanup_idx; - } - head_id = txn->mt_lifo_reclaimed[refill_idx]; - } - - /* (Re)write {key = head_id, IDL length = head_room} */ - total_room -= head_room; - head_room = mop_len - total_room; - if (head_room > maxfree_1pg && head_id > 1) { - /* Overflow multi-page for part of me_pghead */ - head_room /= head_id; /* amortize page sizes */ - head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); - } else if (head_room < 0) { - /* Rare case, not bothering to delete this record */ - head_room = 0; - continue; - } - key.mv_size = sizeof(head_id); - key.mv_data = &head_id; - data.mv_size = (head_room + 1) * sizeof(pgno_t); - rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); - if (unlikely(rc)) - goto bailout; - /* IDL is initially empty, zero out at least the length */ - pgs = (pgno_t *)data.mv_data; - j = head_room > clean_limit ? head_room : 0; - do { - pgs[j] = 0; - } while (--j >= 0); - total_room += head_room; - } - - mdb_tassert(txn, cleanup_idx == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); - - /* Return loose page numbers to me_pghead, though usually none are - * left at this point. The pages themselves remain in dirty_list. */ - if (txn->mt_loose_pgs) { - MDB_page *mp = txn->mt_loose_pgs; - unsigned count = txn->mt_loose_count; - MDB_IDL loose; - /* Room for loose pages + temp IDL with same */ - if ((rc = mdb_midl_need(&env->me_pghead, 2*count+1)) != 0) - goto bailout; - mop = env->me_pghead; - loose = mop + MDB_IDL_ALLOCLEN(mop) - count; - for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) - loose[ ++count ] = mp->mp_pgno; - loose[0] = count; - mdb_midl_sort(loose); - mdb_midl_xmerge(mop, loose); - txn->mt_loose_pgs = NULL; - txn->mt_loose_count = 0; - mop_len = mop[0]; - } - - /* Fill in the reserved me_pghead records */ - rc = MDB_SUCCESS; - if (mop_len) { - MDB_val key, data; - - mop += mop_len; - if (! 
lifo) { - rc = mdb_cursor_first(&mc, &key, &data); - if (unlikely(rc)) - goto bailout; - } - - for(;;) { - txnid_t id; - ssize_t len; - MDB_ID save; - - if (! lifo) { - id = *(txnid_t *)key.mv_data; - mdb_tassert(txn, id <= env->me_pglast); - } else { - mdb_tassert(txn, refill_idx > 0 && refill_idx <= txn->mt_lifo_reclaimed[0]); - id = txn->mt_lifo_reclaimed[refill_idx--]; - key.mv_data = &id; - key.mv_size = sizeof(id); - rc = mdb_cursor_get(&mc, &key, &data, MDB_SET); - if (unlikely(rc)) - goto bailout; - } - mdb_tassert(txn, cleanup_idx == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); - - len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; - mdb_tassert(txn, len >= 0); - if (len > mop_len) - len = mop_len; - data.mv_size = (len + 1) * sizeof(MDB_ID); - key.mv_data = &id; - key.mv_size = sizeof(id); - data.mv_data = mop -= len; - - save = mop[0]; - mop[0] = len; - rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT); - mdb_tassert(txn, cleanup_idx == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); - mop[0] = save; - if (unlikely(rc || (mop_len -= len) == 0)) - goto bailout; - - if (! lifo) { - rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT); - if (unlikely(rc)) - goto bailout; - } - } - } - -bailout: - if (txn->mt_lifo_reclaimed) { - mdb_tassert(txn, rc || cleanup_idx == txn->mt_lifo_reclaimed[0]); - if (rc == 0 && cleanup_idx != txn->mt_lifo_reclaimed[0]) { - mdb_tassert(txn, cleanup_idx < txn->mt_lifo_reclaimed[0]); - /* LY: zeroed cleanup_idx to force cleanup & refill created freeDB records. */ - cleanup_idx = 0; - /* LY: restart filling */ - refill_idx = total_room = head_room = 0; - more = 1; - goto again; - } - txn->mt_lifo_reclaimed[0] = 0; - if (txn != env->me_txn0) { - mdb_midl_free(txn->mt_lifo_reclaimed); - txn->mt_lifo_reclaimed = NULL; - } - } - - return rc; -} - -/** Flush (some) dirty pages to the map, after clearing their dirty flag. 
- * @param[in] txn the transaction that's being committed - * @param[in] keep number of initial pages in dirty_list to keep dirty. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_flush(MDB_txn *txn, int keep) -{ - MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned psize = env->me_psize, j; - int i, pagecount = dl[0].mid, rc; - size_t size = 0, pos = 0; - pgno_t pgno = 0; - MDB_page *dp = NULL; - struct iovec iov[MDB_COMMIT_PAGES]; - ssize_t wpos = 0, wsize = 0, wres; - size_t next_pos = 1; /* impossible pos, so pos != next_pos */ - int n = 0; - - j = i = keep; - - if (env->me_flags & MDB_WRITEMAP) { - /* Clear dirty flags */ - while (++i <= pagecount) { - dp = dl[i].mptr; - /* Don't flush this page yet */ - if (dp->mp_flags & (P_LOOSE|P_KEEP)) { - dp->mp_flags &= ~P_KEEP; - dl[++j] = dl[i]; - continue; - } - dp->mp_flags &= ~P_DIRTY; - env->me_sync_pending += IS_OVERFLOW(dp) ? psize * dp->mp_pages : psize; - } - goto done; - } - - /* Write the pages */ - for (;;) { - if (++i <= pagecount) { - dp = dl[i].mptr; - /* Don't flush this page yet */ - if (dp->mp_flags & (P_LOOSE|P_KEEP)) { - dp->mp_flags &= ~P_KEEP; - dl[i].mid = 0; - continue; - } - pgno = dl[i].mid; - /* clear dirty flag */ - dp->mp_flags &= ~P_DIRTY; - pos = pgno * psize; - size = psize; - if (IS_OVERFLOW(dp)) size *= dp->mp_pages; - env->me_sync_pending += size; - } - /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ - if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { - if (n) { -retry: - /* Write previous page(s) */ - wres = pwritev(env->me_fd, iov, n, wpos); - if (unlikely(wres != wsize)) { - if (wres < 0) { - rc = errno; - if (rc == EINTR) - goto retry; - mdb_debug("Write error: %s", strerror(rc)); - } else { - rc = EIO; /* TODO: Use which error code? 
*/ - mdb_debug("short write, filesystem full?"); - } - return rc; - } - n = 0; - } - if (i > pagecount) - break; - wpos = pos; - wsize = 0; - } - mdb_debug("committing page %zu", pgno); - next_pos = pos + size; - iov[n].iov_len = size; - iov[n].iov_base = (char *)dp; - wsize += size; - n++; - } - - mdb_invalidate_cache(env->me_map, txn->mt_next_pgno * env->me_psize); - - for (i = keep; ++i <= pagecount; ) { - dp = dl[i].mptr; - /* This is a page we skipped above */ - if (!dl[i].mid) { - dl[++j] = dl[i]; - dl[j].mid = dp->mp_pgno; - continue; - } - mdb_dpage_free(env, dp); - } - -done: - i--; - txn->mt_dirty_room += i - j; - dl[0].mid = j; - return MDB_SUCCESS; -} - -int -mdb_txn_commit(MDB_txn *txn) -{ - int rc; - unsigned i, end_mode; - MDB_env *env; - - if (unlikely(txn == NULL)) - return EINVAL; - - if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(txn->mt_env->me_pid != getpid())) { - txn->mt_env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; - } - - /* mdb_txn_end() mode for a commit which writes nothing */ - end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; - - if (txn->mt_child) { - rc = mdb_txn_commit(txn->mt_child); - txn->mt_child = NULL; - if (unlikely(rc != MDB_SUCCESS)) - goto fail; - } - - env = txn->mt_env; - - if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) { - goto done; - } - - if (unlikely(txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR))) { - mdb_debug("error flag is set, can't commit"); - if (txn->mt_parent) - txn->mt_parent->mt_flags |= MDB_TXN_ERROR; - rc = MDB_BAD_TXN; - goto fail; - } - - if (txn->mt_parent) { - MDB_txn *parent = txn->mt_parent; - MDB_page **lp; - MDB_ID2L dst, src; - MDB_IDL pspill; - unsigned x, y, len, ps_len; - - /* Append our reclaim list to parent's */ - if (txn->mt_lifo_reclaimed) { - if (parent->mt_lifo_reclaimed) { - rc = mdb_midl_append_list(&parent->mt_lifo_reclaimed, txn->mt_lifo_reclaimed); - if (unlikely(rc != MDB_SUCCESS)) - 
goto fail; - mdb_midl_free(txn->mt_lifo_reclaimed); - } else - parent->mt_lifo_reclaimed = txn->mt_lifo_reclaimed; - txn->mt_lifo_reclaimed = NULL; - } - - /* Append our free list to parent's */ - rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); - if (unlikely(rc != MDB_SUCCESS)) - goto fail; - mdb_midl_free(txn->mt_free_pgs); - /* Failures after this must either undo the changes - * to the parent or set MDB_TXN_ERROR in the parent. */ - - parent->mt_next_pgno = txn->mt_next_pgno; - parent->mt_flags = txn->mt_flags; - - /* Merge our cursors into parent's and close them */ - mdb_cursors_eot(txn, 1); - - /* Update parent's DB table. */ - memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); - parent->mt_numdbs = txn->mt_numdbs; - parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; - parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; - for (i=CORE_DBS; imt_numdbs; i++) { - /* preserve parent's DB_NEW status */ - x = parent->mt_dbflags[i] & DB_NEW; - parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; - } - - dst = parent->mt_u.dirty_list; - src = txn->mt_u.dirty_list; - /* Remove anything in our dirty list from parent's spill list */ - if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { - x = y = ps_len; - pspill[0] = (pgno_t)-1; - /* Mark our dirty pages as deleted in parent spill list */ - for (i=0, len=src[0].mid; ++i <= len; ) { - MDB_ID pn = src[i].mid << 1; - while (pn > pspill[x]) - x--; - if (pn == pspill[x]) { - pspill[x] = 1; - y = --x; - } - } - /* Squash deleted pagenums if we deleted any */ - for (x=y; ++x <= ps_len; ) - if (!(pspill[x] & 1)) - pspill[++y] = pspill[x]; - pspill[0] = y; - } - - /* Remove anything in our spill list from parent's dirty list */ - if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { - for (i=1; i<=txn->mt_spill_pgs[0]; i++) { - MDB_ID pn = txn->mt_spill_pgs[i]; - if (pn & 1) - continue; /* deleted spillpg */ - pn >>= 1; - y = mdb_mid2l_search(dst, pn); - if (y <= dst[0].mid && 
dst[y].mid == pn) { - free(dst[y].mptr); - while (y < dst[0].mid) { - dst[y] = dst[y+1]; - y++; - } - dst[0].mid--; - } - } - } - - /* Find len = length of merging our dirty list with parent's */ - x = dst[0].mid; - dst[0].mid = 0; /* simplify loops */ - if (parent->mt_parent) { - len = x + src[0].mid; - y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; - for (i = x; y && i; y--) { - pgno_t yp = src[y].mid; - while (yp < dst[i].mid) - i--; - if (yp == dst[i].mid) { - i--; - len--; - } - } - } else { /* Simplify the above for single-ancestor case */ - len = MDB_IDL_UM_MAX - txn->mt_dirty_room; - } - /* Merge our dirty list with parent's */ - y = src[0].mid; - for (i = len; y; dst[i--] = src[y--]) { - pgno_t yp = src[y].mid; - while (yp < dst[x].mid) - dst[i--] = dst[x--]; - if (yp == dst[x].mid) - free(dst[x--].mptr); - } - mdb_tassert(txn, i == x); - dst[0].mid = len; - free(txn->mt_u.dirty_list); - parent->mt_dirty_room = txn->mt_dirty_room; - if (txn->mt_spill_pgs) { - if (parent->mt_spill_pgs) { - /* TODO: Prevent failure here, so parent does not fail */ - rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); - if (unlikely(rc != MDB_SUCCESS)) - parent->mt_flags |= MDB_TXN_ERROR; - mdb_midl_free(txn->mt_spill_pgs); - mdb_midl_sort(parent->mt_spill_pgs); - } else { - parent->mt_spill_pgs = txn->mt_spill_pgs; - } - } - - /* Append our loose page list to parent's */ - for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) - ; - *lp = txn->mt_loose_pgs; - parent->mt_loose_count += txn->mt_loose_count; - - parent->mt_child = NULL; - mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); - txn->mt_signature = 0; - free(txn); - return rc; - } - - env = txn->mt_env; - if (unlikely(txn != env->me_txn)) { - mdb_debug("attempt to commit unknown transaction"); - rc = EINVAL; - goto fail; - } - - mdb_cursors_eot(txn, 0); - - if (!txn->mt_u.dirty_list[0].mid && - !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) - goto done; - - mdb_debug("committing 
txn %zu %p on mdbenv %p, root page %zu", - txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root); - - /* Update DB root pointers */ - if (txn->mt_numdbs > CORE_DBS) { - MDB_cursor mc; - MDB_dbi i; - MDB_val data; - data.mv_size = sizeof(MDB_db); - - mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); - for (i = CORE_DBS; i < txn->mt_numdbs; i++) { - if (txn->mt_dbflags[i] & DB_DIRTY) { - if (unlikely(TXN_DBI_CHANGED(txn, i))) { - rc = MDB_BAD_DBI; - goto fail; - } - data.mv_data = &txn->mt_dbs[i]; - rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, - F_SUBDATA); - if (unlikely(rc != MDB_SUCCESS)) - goto fail; - } - } - } - - rc = mdb_freelist_save(txn); - if (unlikely(rc != MDB_SUCCESS)) - goto fail; - - mdb_midl_free(env->me_pghead); - env->me_pghead = NULL; - mdb_midl_shrink(&txn->mt_free_pgs); - - if (mdb_audit_enabled()) - mdb_audit(txn); - - rc = mdb_page_flush(txn, 0); - if (likely(rc == MDB_SUCCESS)) { - MDB_meta meta; - - meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; - meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - meta.mm_last_pg = txn->mt_next_pgno - 1; - meta.mm_txnid = txn->mt_txnid; -#if MDBX_MODE_ENABLED - meta.mm_canary = txn->mt_canary; -#endif - - rc = mdb_env_sync0(env, env->me_flags | txn->mt_flags, &meta); - } - if (unlikely(rc != MDB_SUCCESS)) - goto fail; - end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; - -done: - return mdb_txn_end(txn, end_mode); - -fail: - mdb_txn_abort(txn); - return rc; -} - -/** Read the environment parameters of a DB environment before - * mapping it into memory. - * @param[in] env the environment handle - * @param[out] meta address of where to store the meta information - * @return 0 on success, non-zero on failure. - */ -static int __cold -mdb_env_read_header(MDB_env *env, MDB_meta *meta) -{ - MDB_metabuf pbuf; - MDB_page *p; - MDB_meta *m; - int i, rc, off; - enum { Size = sizeof(pbuf) }; - - /* We don't know the page size yet, so use a minimum value. 
- * Read both meta pages so we can use the latest one. - */ - - meta->mm_datasync_sign = MDB_DATASIGN_WEAK; - meta->mm_txnid = 0; - for (i=off=0; imm_psize) { - rc = pread(env->me_fd, &pbuf, Size, off); - if (rc != Size) { - if (rc == 0 && off == 0) - return ENOENT; - rc = rc < 0 ? (int) errno : MDB_INVALID; - mdb_debug("read: %s", mdb_strerror(rc)); - return rc; - } - - p = (MDB_page *)&pbuf; - - if (!F_ISSET(p->mp_flags, P_META)) { - mdb_debug("page %zu not a meta page", p->mp_pgno); - return MDB_INVALID; - } - - m = PAGEDATA(p); - if (m->mm_magic != MDB_MAGIC) { - mdb_debug("meta has invalid magic"); - return MDB_INVALID; - } - - if (m->mm_version != MDB_DATA_VERSION) { - mdb_debug("database is version %u, expected version %u", - m->mm_version, MDB_DATA_VERSION); - return MDB_VERSION_MISMATCH; - } - - if (m->mm_datasync_sign > MDB_DATASIGN_WEAK && m->mm_datasync_sign != mdb_meta_sign(m)) - continue; - - if (mdb_meta_lt(meta, m)) - *meta = *m; - } - - if (meta->mm_datasync_sign == MDB_DATASIGN_WEAK) - /* LY: Both meta-pages are weak. */ - return MDB_CORRUPTED; - - return MDB_SUCCESS; -} - -/** Fill in most of the zeroed #MDB_meta for an empty database environment */ -static void __cold -mdb_env_init_meta0(MDB_env *env, MDB_meta *meta) -{ - meta->mm_magic = MDB_MAGIC; - meta->mm_version = MDB_DATA_VERSION; - meta->mm_mapsize = env->me_mapsize; - meta->mm_psize = env->me_psize; - meta->mm_last_pg = NUM_METAS-1; - meta->mm_flags = env->me_flags & 0xffff; - meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ - meta->mm_dbs[FREE_DBI].md_root = P_INVALID; - meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; - meta->mm_datasync_sign = mdb_meta_sign(meta); -} - -/** Write the environment parameters of a freshly created DB environment. - * @param[in] env the environment handle - * @param[in] meta the #MDB_meta to write - * @return 0 on success, non-zero on failure. 
- */ -static int __cold -mdb_env_init_meta(MDB_env *env, MDB_meta *meta) -{ - MDB_page *p, *q; - int rc; - unsigned psize; - int len; - - mdb_debug("writing new meta page"); - - psize = env->me_psize; - - p = calloc(NUM_METAS, psize); - if (!p) - return ENOMEM; - p->mp_pgno = 0; - p->mp_flags = P_META; - *(MDB_meta *)PAGEDATA(p) = *meta; - - q = (MDB_page *)((char *)p + psize); - q->mp_pgno = 1; - q->mp_flags = P_META; - *(MDB_meta *)PAGEDATA(q) = *meta; - - do - len = pwrite(env->me_fd, p, psize * NUM_METAS, 0); - while (len == -1 && errno == EINTR); - - if (len < 0) - rc = errno; - else if ((unsigned) len == psize * NUM_METAS) - rc = MDB_SUCCESS; - else - rc = ENOSPC; - free(p); - return rc; -} - -static int -mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) -{ - int rc; - MDB_meta* head = mdb_meta_head_w(env); - size_t prev_mapsize = head->mm_mapsize; - size_t used_size = env->me_psize * (pending->mm_last_pg + 1); - - mdb_assert(env, pending != METAPAGE_1(env) && pending != METAPAGE_2(env)); - mdb_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0); - mdb_assert(env, META_IS_WEAK(head) || env->me_sync_pending != 0 - || env->me_mapsize != prev_mapsize); - - pending->mm_mapsize = env->me_mapsize; - mdb_assert(env, pending->mm_mapsize >= used_size); - if (unlikely(pending->mm_mapsize != prev_mapsize)) { - if (pending->mm_mapsize < prev_mapsize) { - /* LY: currently this can't happen, but force full-sync. */ - flags &= MDB_WRITEMAP; - } else { - /* Persist any increases of mapsize config */ - } - } - - if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold) - flags &= MDB_WRITEMAP; - - /* LY: step#1 - sync previously written/updated data-pages */ - if (env->me_sync_pending && (flags & MDB_NOSYNC) == 0) { - if (env->me_flags & MDB_WRITEMAP) { - int mode = (flags & MDB_MAPASYNC) ? 
MS_ASYNC : MS_SYNC; - if (unlikely(msync(env->me_map, used_size, mode))) { - rc = errno; - /* LY: msync() should never return EINTR */ - goto fail; - } - if ((flags & MDB_MAPASYNC) == 0) - env->me_sync_pending = 0; - } else { - int (*flush)(int fd) = fdatasync; - if (unlikely(prev_mapsize != pending->mm_mapsize)) { - /* LY: It is no reason to use fdatasync() here, even in case - * no such bug in a kernel. Because "no-bug" mean that a kernel - * internally do nearly the same, e.g. fdatasync() == fsync() - * when no-kernel-bug and file size was changed. - * - * So, this code is always safe and without appreciable - * performance degradation. - * - * For more info about of a corresponding fdatasync() bug - * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ - flush = fsync; - } - while(unlikely(flush(env->me_fd) < 0)) { - rc = errno; - if (rc != EINTR) - goto fail; - } - env->me_sync_pending = 0; - } - } - - /* LY: step#2 - update meta-page. */ - if (env->me_sync_pending == 0) { - pending->mm_datasync_sign = mdb_meta_sign(pending); - } else { - pending->mm_datasync_sign = - (flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC - ? MDB_DATASIGN_NONE : MDB_DATASIGN_WEAK; - } - - volatile MDB_meta* target = (pending->mm_txnid == head->mm_txnid || META_IS_WEAK(head)) - ? head : mdb_env_meta_flipflop(env, head); - off_t offset = (char*) target - env->me_map; - - MDB_meta* stay = mdb_env_meta_flipflop(env, (MDB_meta*) target); - mdb_debug("writing meta %d (%s, was %zu/%s, stay %s %zu/%s), root %zu, txn_id %zu, %s", - offset >= env->me_psize, - target == head ? "head" : "tail", target->mm_txnid, - META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" : "Legacy", - stay == head ? "head" : "tail", stay->mm_txnid, - META_IS_WEAK(stay) ? "Weak" : META_IS_STEADY(stay) ? "Steady" : "Legacy", - pending->mm_dbs[MAIN_DBI].md_root, pending->mm_txnid, - META_IS_WEAK(pending) ? "Weak" : META_IS_STEADY(pending) ? 
"Steady" : "Legacy" ); - - if (env->me_flags & MDB_WRITEMAP) { -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - /* LY: 'invalidate' the meta, - * but mdb_meta_head_r() will be confused/retired in collision case. */ - target->mm_datasync_sign = MDB_DATASIGN_WEAK; - target->mm_txnid = 0; - /* LY: update info */ - target->mm_mapsize = pending->mm_mapsize; - target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; - target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; - target->mm_last_pg = pending->mm_last_pg; -#if MDBX_MODE_ENABLED - target->mm_canary = pending->mm_canary; -#endif - /* LY: 'commit' the meta */ - target->mm_txnid = pending->mm_txnid; - target->mm_datasync_sign = pending->mm_datasync_sign; - } else { - pending->mm_magic = MDB_MAGIC; - pending->mm_version = MDB_DATA_VERSION; - pending->mm_address = head->mm_address; - retry: - rc = pwrite(env->me_fd, pending, sizeof(MDB_meta), offset); - if (unlikely(rc != sizeof(MDB_meta))) { - rc = (rc < 0) ? errno : EIO; - if (rc == EINTR) - goto retry; - - undo: - mdb_debug("write failed, disk error?"); - /* On a failure, the pagecache still contains the new data. - * Write some old data back, to prevent it from being used. */ - if (pwrite(env->me_fd, (void*) target, sizeof(MDB_meta), offset) == sizeof(MDB_meta)) { - /* LY: take a chance, if write succeeds at a magic ;) */ - goto retry; - } - goto fail; - } - mdb_invalidate_cache(env->me_map + offset, sizeof(MDB_meta)); -#ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); -#endif - } - - /* Memory ordering issues are irrelevant; since the entire writer - * is wrapped by wmutex, all of these changes will become visible - * after the wmutex is unlocked. Since the DB is multi-version, - * readers will get consistent data regardless of how fresh or - * how stale their view of these values is. 
- */ - env->me_txns->mti_txnid = pending->mm_txnid; -#ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); -#endif - - /* LY: step#3 - sync meta-pages. */ - if ((flags & (MDB_NOSYNC | MDB_NOMETASYNC)) == 0) { - if (env->me_flags & MDB_WRITEMAP) { - char* ptr = env->me_map + (offset & ~(env->me_os_psize - 1)); - int mode = (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; - if (unlikely(msync(ptr, env->me_os_psize, mode) < 0)) { - rc = errno; - goto fail; - } - } else { - while(unlikely(fdatasync(env->me_fd) < 0)) { - rc = errno; - if (rc != EINTR) - goto undo; - } - } - } - - /* LY: currently this can't happen, but... */ - if (unlikely(pending->mm_mapsize < prev_mapsize)) { - mdb_assert(env, pending->mm_mapsize == env->me_mapsize); - if (unlikely(mremap(env->me_map, prev_mapsize, pending->mm_mapsize, - MREMAP_FIXED, pending->mm_address) == MAP_FAILED)) { - rc = errno; - goto fail; - } - if (unlikely(ftruncate(env->me_fd, pending->mm_mapsize) < 0)) { - rc = errno; - goto fail; - } - } - - return MDB_SUCCESS; - -fail: - env->me_flags |= MDB_FATAL_ERROR; - return rc; -} - -int __cold -mdb_env_create(MDB_env **env) -{ - MDB_env *e; - - e = calloc(1, sizeof(MDB_env)); - if (!e) - return ENOMEM; - - e->me_maxreaders = DEFAULT_READERS; - e->me_maxdbs = e->me_numdbs = CORE_DBS; - e->me_fd = INVALID_HANDLE_VALUE; - e->me_lfd = INVALID_HANDLE_VALUE; - e->me_pid = getpid(); - GET_PAGESIZE(e->me_os_psize); - VALGRIND_CREATE_MEMPOOL(e,0,0); - e->me_signature = MDBX_ME_SIGNATURE; - *env = e; - return MDB_SUCCESS; -} - -static int __cold -mdb_env_map(MDB_env *env, void *addr, size_t usedsize) -{ - unsigned flags = env->me_flags; - - int prot = PROT_READ; - if (flags & MDB_WRITEMAP) { - prot |= PROT_WRITE; - if (ftruncate(env->me_fd, env->me_mapsize) < 0) - return errno; - } - - env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, env->me_fd, 0); - if (env->me_map == MAP_FAILED) { - env->me_map = NULL; - return errno; - } - - /* Can happen because the address argument 
to mmap() is just a - * hint. mmap() can pick another, e.g. if the range is in use. - * The MAP_FIXED flag would prevent that, but then mmap could - * instead unmap existing pages to make room for the new map. - */ - if (addr && env->me_map != addr) { - errno = 0; /* LY: clean errno as a hit for this case */ - return EBUSY; /* TODO: Make a new MDB_* error code? */ - } - - if (madvise(env->me_map, env->me_mapsize, MADV_DONTFORK)) - return errno; - -#ifdef MADV_NOHUGEPAGE - (void) madvise(env->me_map, env->me_mapsize, MADV_NOHUGEPAGE); -#endif - -#ifdef MADV_DONTDUMP - if (! (flags & MDBX_PAGEPERTURB)) { - (void) madvise(env->me_map, env->me_mapsize, MADV_DONTDUMP); - } -#endif - -#ifdef MADV_REMOVE - if (flags & MDB_WRITEMAP) { - (void) madvise(env->me_map + usedsize, env->me_mapsize - usedsize, MADV_REMOVE); - } -#endif - - /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ - if (madvise(env->me_map, env->me_mapsize, (flags & MDB_NORDAHEAD) ? MADV_RANDOM : MADV_WILLNEED)) - return errno; - - /* Lock meta pages to avoid unexpected write, - * before the data pages would be synchronized. */ - if ((flags & MDB_WRITEMAP) && mlock(env->me_map, env->me_psize * 2)) - return errno; - -#ifdef USE_VALGRIND - env->me_valgrind_handle = - VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "lmdb"); -#endif - - return MDB_SUCCESS; -} - -int __cold -mdb_env_set_mapsize(MDB_env *env, size_t size) -{ - if (unlikely(!env)) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(size < env->me_psize * 8)) - return EINVAL; - - /* If env is already open, caller is responsible for making - * sure there are no active txns. 
- */ - if (env->me_map) { - int rc; - MDB_meta *meta; - void *old; - if (env->me_txn) - return EINVAL; - meta = mdb_meta_head_w(env); - if (!size) - size = meta->mm_mapsize; - /* Silently round up to minimum if the size is too small */ - const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; - if (size < usedsize) - size = usedsize; - munmap(env->me_map, env->me_mapsize); -#ifdef USE_VALGRIND - VALGRIND_DISCARD(env->me_valgrind_handle); - env->me_valgrind_handle = -1; -#endif - env->me_mapsize = size; - old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; - rc = mdb_env_map(env, old, usedsize); - if (rc) - return rc; - } - env->me_mapsize = size; - if (env->me_psize) - env->me_maxpg = env->me_mapsize / env->me_psize; - return MDB_SUCCESS; -} - -int __cold -mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) -{ - if (unlikely(!env)) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(env->me_map)) - return EINVAL; - - env->me_maxdbs = dbs + CORE_DBS; - return MDB_SUCCESS; -} - -int __cold -mdb_env_set_maxreaders(MDB_env *env, unsigned readers) -{ - if (unlikely(!env || readers < 1)) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(env->me_map)) - return EINVAL; - - env->me_maxreaders = readers; - return MDB_SUCCESS; -} - -int __cold -mdb_env_get_maxreaders(MDB_env *env, unsigned *readers) -{ - if (!env || !readers) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - *readers = env->me_maxreaders; - return MDB_SUCCESS; -} - -static int __cold -mdb_fsize(HANDLE fd, size_t *size) -{ - struct stat st; - - if (fstat(fd, &st)) - return errno; - - *size = st.st_size; - return MDB_SUCCESS; -} - -/** Further setup required for opening an LMDB environment - */ -static int __cold -mdb_env_open2(MDB_env *env, MDB_meta *meta) -{ - unsigned flags = env->me_flags; - int i, 
newenv = 0, rc; - - if ((i = mdb_env_read_header(env, meta)) != 0) { - if (i != ENOENT) - return i; - mdb_debug("new mdbenv"); - newenv = 1; - env->me_psize = env->me_os_psize; - if (env->me_psize > MAX_PAGESIZE) - env->me_psize = MAX_PAGESIZE; - memset(meta, 0, sizeof(*meta)); - mdb_env_init_meta0(env, meta); - meta->mm_mapsize = DEFAULT_MAPSIZE; - } else { - env->me_psize = meta->mm_psize; - } - - /* Was a mapsize configured? */ - if (!env->me_mapsize) { - env->me_mapsize = meta->mm_mapsize; - } - { - /* Make sure mapsize >= committed data size. Even when using - * mm_mapsize, which could be broken in old files (ITS#7789). - */ - size_t minsize = (meta->mm_last_pg + 1) * meta->mm_psize; - if (env->me_mapsize < minsize) - env->me_mapsize = minsize; - } - meta->mm_mapsize = env->me_mapsize; - - if (newenv && !(flags & MDB_FIXEDMAP)) { - /* mdb_env_map() may grow the datafile. Write the metapages - * first, so the file will be valid if initialization fails. - * Except with FIXEDMAP, since we do not yet know mm_address. - * We could fill in mm_address later, but then a different - * program might end up doing that - one with a memory layout - * and map address which does not suit the main program. - */ - rc = mdb_env_init_meta(env, meta); - if (rc) - return rc; - newenv = 0; - } - - const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; - rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? 
meta->mm_address : NULL, usedsize); - if (rc) - return rc; - - if (newenv) { - if (flags & MDB_FIXEDMAP) - meta->mm_address = env->me_map; - i = mdb_env_init_meta(env, meta); - if (i != MDB_SUCCESS) { - return i; - } - } - - env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) - - sizeof(indx_t); - env->me_maxkey_limit = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); - env->me_maxpg = env->me_mapsize / env->me_psize; - - if (MDB_MAXKEYSIZE > env->me_maxkey_limit) - return MDB_BAD_VALSIZE; - - return MDB_SUCCESS; -} - -/****************************************************************************/ - -#ifndef MDBX_USE_THREAD_ATEXIT -# if __GLIBC_PREREQ(2,18) -# define MDBX_USE_THREAD_ATEXIT 1 -# else -# define MDBX_USE_THREAD_ATEXIT 0 -# endif -#endif - -static pthread_mutex_t mdbx_rthc_mutex = PTHREAD_MUTEX_INITIALIZER; -static MDBX_rthc *mdbx_rthc_list; -static pthread_key_t mdbx_pthread_crutch_key; - -static __inline -void mdbx_rthc_lock(void) { - mdb_ensure(NULL, pthread_mutex_lock(&mdbx_rthc_mutex) == 0); -} - -static __inline -void mdbx_rthc_unlock(void) { - mdb_ensure(NULL, pthread_mutex_unlock(&mdbx_rthc_mutex) == 0); -} - -/** Release a reader thread's slot in the reader lock table. - * This function is called automatically when a thread exits. - * @param[in] ptr This points to the MDB_rthc of a slot in the reader lock table. - */ -static __cold -void mdbx_rthc_dtor(void) -{ - /* LY: Основная задача этого деструктора была и есть в освобождении - * слота таблицы читателей при завершении треда, но тут есть пара - * не очевидных сложностей: - * - Таблица читателей располагается в разделяемой памяти, поэтому - * во избежание segfault деструктор не должен что-либо делать после - * или одновременно с mdb_env_close(). - * - Действительно, mdb_env_close() вызовет pthread_key_delete() и - * после этого glibc не будет вызывать деструктор. 
- * - ОДНАКО, это никак не решает проблему гонок между mdb_env_close() - * и завершающимися тредами. Грубо говоря, при старте mdb_env_close() - * деструктор уже может выполняться в некоторых тредах, и завершиться - * эти выполнения могут во время или после окончания mdb_env_close(). - * - БОЛЕЕ ТОГО, схожая проблема возникает при выгрузке dso/dll, - * так как в текущей glibc (2.24) подсистема ld.so ничего не знает о - * TSD-деструкторах и поэтому может выгрузить lib.so до того как - * отработали все деструкторы. - * - Исходное проявление проблемы было зафиксировано - * в https://github.com/ReOpen/ReOpenLDAP/issues/48 - * - * Предыдущее решение посредством выделяемого динамически MDB_rthc - * было не удачным, так как порождало либо утечку памяти, - * либо вероятностное обращение к уже освобожденной памяти - * из этого деструктора. - * - * Текущее решение достаточно "развесисто", но решает все описанные выше - * проблемы без пенальти по производительности. - */ - - mdbx_rthc_lock(); - - pid_t pid = getpid(); - pthread_t thread = pthread_self(); - for (MDBX_rthc** ref = &mdbx_rthc_list; *ref; ) { - MDBX_rthc* rthc = *ref; - if (rthc->rc_thread == thread) { - if (rthc->rc_reader && rthc->rc_reader->mr_pid == pid) { - rthc->rc_reader->mr_pid = 0; - mdbx_coherent_barrier(); - } - *ref = rthc->rc_next; - free(rthc); - } else { - ref = &(*ref)->rc_next; - } - } - - mdbx_rthc_unlock(); -} - -#if MDBX_USE_THREAD_ATEXIT - -extern void *__dso_handle __attribute__ ((__weak__)); -extern int __cxa_thread_atexit_impl(void (*dtor)(void*), void *obj, void *dso_symbol); - -static __cold -void mdbx_rthc__thread_atexit(void *ptr) { - mdb_ensure(NULL, ptr == pthread_getspecific(mdbx_pthread_crutch_key)); - mdb_ensure(NULL, pthread_setspecific(mdbx_pthread_crutch_key, NULL) == 0); - mdbx_rthc_dtor(); -} - -static __attribute__((constructor)) __cold -void mdbx_pthread_crutch_ctor(void) { - mdb_ensure(NULL, pthread_key_create( - &mdbx_pthread_crutch_key, NULL) == 0); -} - -#else /* 
MDBX_USE_THREAD_ATEXIT */ - -static __cold -void mdbx_rthc__thread_key_dtor(void *ptr) { - (void) ptr; - if (mdbx_pthread_crutch_key != (pthread_key_t) -1) - mdbx_rthc_dtor(); -} - -static __attribute__((constructor)) __cold -void mdbx_pthread_crutch_ctor(void) { - mdb_ensure(NULL, pthread_key_create( - &mdbx_pthread_crutch_key, mdbx_rthc__thread_key_dtor) == 0); -} - -static __attribute__((destructor)) __cold -void mdbx_pthread_crutch_dtor(void) -{ - pthread_key_delete(mdbx_pthread_crutch_key); - mdbx_pthread_crutch_key = -1; - - /* LY: Из-за race condition в pthread_key_delete() - * деструкторы уже могли начать выполняться. - * Уступая квант времени сразу после удаления ключа - * мы даем им шанс завершиться. */ - pthread_yield(); - - mdbx_rthc_lock(); - pid_t pid = getpid(); - while (mdbx_rthc_list != NULL) { - MDBX_rthc* rthc = mdbx_rthc_list; - mdbx_rthc_list = mdbx_rthc_list->rc_next; - if (rthc->rc_reader && rthc->rc_reader->mr_pid == pid) { - rthc->rc_reader->mr_pid = 0; - mdbx_coherent_barrier(); - } - free(rthc); - - /* LY: Каждый неудаленный элемент списка - это один - * не отработавший деструктор и потенциальный - * шанс получить segfault после выгрузки lib.so - * Поэтому на каждой итерации уступаем квант времени, - * в надежде что деструкторы успеют отработать. */ - mdbx_rthc_unlock(); - pthread_yield(); - mdbx_rthc_lock(); - } - mdbx_rthc_unlock(); - pthread_yield(); -} -#endif /* MDBX_USE_THREAD_ATEXIT */ - -static __cold -MDBX_rthc* mdbx_rthc_add(pthread_key_t key) -{ - MDBX_rthc *rthc = malloc(sizeof(MDBX_rthc)); - if (unlikely(rthc == NULL)) - goto bailout; - - rthc->rc_next = NULL; - rthc->rc_reader = NULL; - rthc->rc_thread = pthread_self(); - if (unlikely(pthread_setspecific(key, rthc) != 0)) - goto bailout_free; - - mdbx_rthc_lock(); - if (pthread_getspecific(mdbx_pthread_crutch_key) == NULL) { -#if MDBX_USE_THREAD_ATEXIT - void *dso_anchor = (&__dso_handle && __dso_handle) - ? 
__dso_handle : (void *)mdb_version; - if (unlikely(__cxa_thread_atexit_impl(mdbx_rthc__thread_atexit, rthc, dso_anchor) != 0)) { - mdbx_rthc_unlock(); - goto bailout_free; - } -#endif /* MDBX_USE_THREAD_ATEXIT */ - mdb_ensure(NULL, pthread_setspecific(mdbx_pthread_crutch_key, rthc) == 0); - } - rthc->rc_next = mdbx_rthc_list; - mdbx_rthc_list = rthc; - mdbx_rthc_unlock(); - return rthc; - -bailout_free: - free(rthc); -bailout: - return NULL; -} - -static __inline -MDBX_rthc* mdbx_rthc_get(pthread_key_t key) -{ - MDBX_rthc *rthc = pthread_getspecific(key); - if (likely(rthc != NULL)) - return rthc; - return mdbx_rthc_add(key); -} - -static __cold -void mdbx_rthc_cleanup(MDB_env *env) -{ - mdbx_rthc_lock(); - - MDB_reader *begin = env->me_txns->mti_readers; - MDB_reader *end = begin + env->me_close_readers; - for (MDBX_rthc** ref = &mdbx_rthc_list; *ref; ) { - MDBX_rthc* rthc = *ref; - if (rthc->rc_reader >= begin && rthc->rc_reader < end) { - if (rthc->rc_reader->mr_pid == env->me_pid) { - rthc->rc_reader->mr_pid = 0; - mdbx_coherent_barrier(); - } - *ref = rthc->rc_next; - free(rthc); - } else { - ref = &(*ref)->rc_next; - } - } - - mdbx_rthc_unlock(); -} - -/****************************************************************************/ - -/** Downgrade the exclusive lock on the region back to shared */ -static __cold -int mdb_env_share_locks(MDB_env *env, int *excl) -{ - struct flock lock_info; - int rc = 0; - - /* The shared lock replaces the existing lock */ - memset((void *)&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_RDLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = 0; - lock_info.l_len = 1; - while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && - (rc = errno) == EINTR) ; - *excl = rc ? -1 : 0; /* error may mean we lost the lock */ - - return rc; -} - -/** Try to get exclusive lock, otherwise shared. - * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. 
- */ -static int __cold -mdb_env_excl_lock(MDB_env *env, int *excl) -{ - int rc = 0; - struct flock lock_info; - - memset((void *)&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_WRLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = 0; - lock_info.l_len = 1; - while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && - (rc = errno) == EINTR) ; - if (!rc) { - *excl = 1; - } else { - lock_info.l_type = F_RDLCK; - while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && - (rc = errno) == EINTR) ; - if (rc == 0) - *excl = 0; - } - return rc; -} - -#ifdef MDB_USE_HASH -/* - * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code - * - * @(#) $Revision: 5.1 $ - * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ - * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ - * - * http://www.isthe.com/chongo/tech/comp/fnv/index.html - * - *** - * - * Please do not copyright this code. This code is in the public domain. - * - * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO - * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF - * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR - * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR - * PERFORMANCE OF THIS SOFTWARE. - * - * By: - * chongo /\oo/\ - * http://www.isthe.com/chongo/ - * - * Share and Enjoy! :-) - */ - -typedef unsigned long long mdb_hash_t; -#define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL) - -/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer - * @param[in] val value to hash - * @param[in] hval initial value for hash - * @return 64 bit hash - * - * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the - * hval arg on the first call. 
- */ -static mdb_hash_t -mdb_hash_val(MDB_val *val, mdb_hash_t hval) -{ - unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ - unsigned char *end = s + val->mv_size; - /* - * FNV-1a hash each octet of the string - */ - while (s < end) { - /* xor the bottom with the current octet */ - hval ^= (mdb_hash_t)*s++; - - /* multiply by the 64 bit FNV magic prime mod 2^64 */ - hval += (hval << 1) + (hval << 4) + (hval << 5) + - (hval << 7) + (hval << 8) + (hval << 40); - } - /* return our new hash value */ - return hval; -} - -/** Hash the string and output the encoded hash. - * This uses modified RFC1924 Ascii85 encoding to accommodate systems with - * very short name limits. We don't care about the encoding being reversible, - * we just want to preserve as many bits of the input as possible in a - * small printable string. - * @param[in] str string to hash - * @param[out] encbuf an array of 11 chars to hold the hash - */ -static const char mdb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; - -static void __cold -mdb_pack85(unsigned long l, char *out) -{ - int i; - - for (i=0; i<5; i++) { - *out++ = mdb_a85[l % 85]; - l /= 85; - } -} - -static void __cold -mdb_hash_enc(MDB_val *val, char *encbuf) -{ - mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT); - - mdb_pack85(h, encbuf); - mdb_pack85(h>>32, encbuf+5); - encbuf[10] = '\0'; -} -#endif - -/** Open and/or initialize the lock region for the environment. - * @param[in] env The LMDB environment. - * @param[in] lpath The pathname of the file used for the lock region. - * @param[in] mode The Unix permissions for the file, if we create it. - * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive - * @return 0 on success, non-zero on failure. 
- */ -static int __cold -mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) -{ - int fdflags; - int rc; - off_t size, rsize; - void *m; - - env->me_lfd = open(lpath, O_RDWR|O_CREAT|O_CLOEXEC, mode); - if (env->me_lfd == INVALID_HANDLE_VALUE) { - rc = errno; - if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { - return MDB_SUCCESS; - } - return rc; - } - - /* Lose record locks when exec*() */ - if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(env->me_lfd, F_SETFD, fdflags); - - if (!(env->me_flags & MDB_NOTLS)) { - rc = pthread_key_create(&env->me_txkey, NULL); - if (rc) - return rc; - env->me_flags |= MDB_ENV_TXKEY; - } - - /* Try to get exclusive lock. If we succeed, then - * nobody is using the lock region and we should initialize it. - */ - if ((rc = mdb_env_excl_lock(env, excl))) return rc; - - size = lseek(env->me_lfd, 0, SEEK_END); - if (size == -1) return errno; - rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); - if (size < rsize && *excl > 0) { - if (ftruncate(env->me_lfd, rsize) != 0) return errno; - } else { - rsize = size; - size = rsize - sizeof(MDB_txninfo); - env->me_maxreaders = size/sizeof(MDB_reader) + 1; - } - - m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, env->me_lfd, 0); - if (m == MAP_FAILED) - return errno; - env->me_txns = m; - -#ifdef MADV_NOHUGEPAGE - (void) madvise(env->me_txns, rsize, MADV_NOHUGEPAGE); -#endif - -#ifdef MADV_DODUMP - (void) madvise(env->me_txns, rsize, MADV_DODUMP); -#endif - - if (madvise(env->me_txns, rsize, MADV_DONTFORK) < 0) - return errno; - - if (madvise(env->me_txns, rsize, MADV_WILLNEED) < 0) - return errno; - - if (madvise(env->me_txns, rsize, MADV_RANDOM) < 0) - return errno; - - if (*excl > 0) { - /* Solaris needs this before initing a robust mutex. Otherwise - * it may skip the init and return EBUSY "seems someone already - * inited" or EINVAL "it was inited differently". 
- */ - memset(&env->me_txns->mti_rmutex, 0, sizeof(env->me_txns->mti_rmutex)); - memset(&env->me_txns->mti_wmutex, 0, sizeof(env->me_txns->mti_wmutex)); - - pthread_mutexattr_t mattr; - rc = pthread_mutexattr_init(&mattr); - if (rc) return rc; - - rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); - -#if MDB_USE_ROBUST - if(! rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); -#endif /* MDB_USE_ROBUST */ - if (! rc) rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr); - if (! rc) rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr); - - pthread_mutexattr_destroy(&mattr); - if (rc) return rc; - - env->me_txns->mti_magic = MDB_MAGIC; - env->me_txns->mti_format = MDB_LOCK_FORMAT; - env->me_txns->mti_txnid = ~0L; - env->me_txns->mti_numreaders = 0; - } else { - if (env->me_txns->mti_magic != MDB_MAGIC) { - mdb_debug("lock region has invalid magic"); - return MDB_INVALID; - } - if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { - mdb_debug("lock region has format+version 0x%x, expected 0x%x", - env->me_txns->mti_format, MDB_LOCK_FORMAT); - return MDB_VERSION_MISMATCH; - } - } - - return MDB_SUCCESS; -} - - /** The name of the lock file in the DB environment */ -#define LOCKNAME "/lock.mdb" - /** The name of the data file in the DB environment */ -#define DATANAME "/data.mdb" - /** The suffix of the lock file when no subdir is used */ -#define LOCKSUFF "-lock" - /** Only a subset of the @ref mdb_env flags can be changed - * at runtime. Changing other flags requires closing the - * environment and re-opening it with the new flags. 
- */ -#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC| \ - MDB_NOMEMINIT|MDBX_COALESCE|MDBX_PAGEPERTURB) -#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ - MDB_WRITEMAP|MDB_NOTLS|MDB_NORDAHEAD|MDBX_LIFORECLAIM) - -#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) -# error "Persistent DB flags & env flags overlap, but both go in mm_flags" -#endif - -MDBX_ONLY_FEATURE int __cold -mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, mode_t mode, int *exclusive) -{ - int oflags, rc, len, excl = -1; - char *lpath, *dpath; - - if (unlikely(!env || !path)) - return EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (env->me_fd != INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) - return EINVAL; - - len = strlen(path); - if (flags & MDB_NOSUBDIR) { - rc = len + sizeof(LOCKSUFF) + len + 1; - } else { - rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); - } - lpath = malloc(rc); - if (!lpath) - return ENOMEM; - if (flags & MDB_NOSUBDIR) { - dpath = lpath + len + sizeof(LOCKSUFF); - sprintf(lpath, "%s" LOCKSUFF, path); - strcpy(dpath, path); - } else { - dpath = lpath + len + sizeof(LOCKNAME); - sprintf(lpath, "%s" LOCKNAME, path); - sprintf(dpath, "%s" DATANAME, path); - } - - rc = MDB_SUCCESS; - flags |= env->me_flags; - if (flags & MDB_RDONLY) { - /* LY: silently ignore irrelevant flags when we're only getting read access */ - flags &= ~(MDB_WRITEMAP | MDB_MAPASYNC | MDB_NOSYNC | MDB_NOMETASYNC - | MDBX_COALESCE | MDBX_LIFORECLAIM | MDB_NOMEMINIT); - } else { - if (!((env->me_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)) - && (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) - rc = ENOMEM; - } - env->me_flags = flags |= MDB_ENV_ACTIVE; - if (rc) - goto leave; - - env->me_path = strdup(path); - env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); - env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); - env->me_dbiseqs = 
calloc(env->me_maxdbs, sizeof(unsigned)); - if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { - rc = ENOMEM; - goto leave; - } - env->me_dbxs[FREE_DBI].md_cmp = mdb_cmp_int_ai; /* aligned MDB_INTEGERKEY */ - - /* For RDONLY, get lockfile after we know datafile exists */ - if (!(flags & MDB_RDONLY)) { - rc = mdb_env_setup_locks(env, lpath, mode, &excl); - if (rc) - goto leave; - } - - if (F_ISSET(flags, MDB_RDONLY)) - oflags = O_RDONLY; - else - oflags = O_RDWR | O_CREAT; - - env->me_fd = open(dpath, oflags|O_CLOEXEC, mode); - if (env->me_fd == INVALID_HANDLE_VALUE) { - rc = errno; - goto leave; - } - - int fdflags; - if ((fdflags = fcntl(env->me_fd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(env->me_fd, F_SETFD, fdflags); - - if (flags & MDB_RDONLY) { - rc = mdb_env_setup_locks(env, lpath, mode, &excl); - if (rc) - goto leave; - } - - MDB_meta meta; - if ((rc = mdb_env_open2(env, &meta)) == MDB_SUCCESS) { - mdb_debug("opened dbenv %p", (void *) env); - if (excl > 0) { - env->me_txns->mti_txnid = meta.mm_txnid; - if (exclusive == NULL || *exclusive < 2) { - /* LY: downgrade lock only if exclusive access not requested. - * in case exclusive==1, just leave value as is. */ - rc = mdb_env_share_locks(env, &excl); - if (rc) - goto leave; - } - } else if (exclusive) { - /* LY: just indicate that is not an exclusive access. 
*/ - *exclusive = 0; - } - if (!(flags & MDB_RDONLY)) { - MDB_txn *txn; - int tsize = sizeof(MDB_txn), size = tsize + env->me_maxdbs * - (sizeof(MDB_db)+sizeof(MDB_cursor *)+sizeof(unsigned)+1); - if ((env->me_pbuf = calloc(1, env->me_psize)) && - (txn = calloc(1, size))) - { - txn->mt_dbs = (MDB_db *)((char *)txn + tsize); - txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); - txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); - txn->mt_env = env; - txn->mt_dbxs = env->me_dbxs; - txn->mt_flags = MDB_TXN_FINISHED; - env->me_txn0 = txn; - } else { - rc = ENOMEM; - } - } - } - -#if MDB_DEBUG - if (rc == MDB_SUCCESS) { - MDB_meta *meta = mdb_meta_head_r(env); - MDB_db *db = &meta->mm_dbs[MAIN_DBI]; - int toggle = ((char*) meta == PAGEDATA(env->me_map)) ? 0 : 1; - - mdb_debug("opened database version %u, pagesize %u", - meta->mm_version, env->me_psize); - mdb_debug("using meta page %d, txn %zu", toggle, meta->mm_txnid); - mdb_debug("depth: %u", db->md_depth); - mdb_debug("entries: %zu", db->md_entries); - mdb_debug("branch pages: %zu", db->md_branch_pages); - mdb_debug("leaf pages: %zu", db->md_leaf_pages); - mdb_debug("overflow pages: %zu", db->md_overflow_pages); - mdb_debug("root: %zu", db->md_root); - } -#endif - -leave: - if (rc) - mdb_env_close0(env); - free(lpath); - return rc; -} - -int __cold -mdb_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode) -{ - return mdbx_env_open_ex(env, path, flags, mode, NULL); -} - -/** Destroy resources from mdb_env_open(), clear our readers & DBIs */ -static void __cold -mdb_env_close0(MDB_env *env) -{ - int i; - - if (!(env->me_flags & MDB_ENV_ACTIVE)) - return; - env->me_flags &= ~MDB_ENV_ACTIVE; - - /* Doing this here since me_dbxs may not exist during mdb_env_close */ - if (env->me_dbxs) { - for (i = env->me_maxdbs; --i >= CORE_DBS; ) - free(env->me_dbxs[i].md_name.mv_data); - free(env->me_dbxs); - } - - 
free(env->me_pbuf); - free(env->me_dbiseqs); - free(env->me_dbflags); - free(env->me_path); - free(env->me_dirty_list); - if (env->me_txn0) - mdb_midl_free(env->me_txn0->mt_lifo_reclaimed); - free(env->me_txn0); - mdb_midl_free(env->me_free_pgs); - - if (env->me_flags & MDB_ENV_TXKEY) { - mdb_ensure(env, pthread_key_delete(env->me_txkey) == 0); - env->me_flags &= ~MDB_ENV_TXKEY; - } - - if (env->me_map) { - munmap(env->me_map, env->me_mapsize); -#ifdef USE_VALGRIND - VALGRIND_DISCARD(env->me_valgrind_handle); - env->me_valgrind_handle = -1; -#endif - } - if (env->me_fd != INVALID_HANDLE_VALUE) - (void) close(env->me_fd); - - /* Clearing readers is done in this function because - * me_txkey with its destructor must be disabled first. - * - * We skip the the reader mutex, so we touch only - * data owned by this process (me_close_readers and - * our readers), and clear each reader atomically. - */ - if (env->me_pid == getpid()) - mdbx_rthc_cleanup(env); - - munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); - env->me_txns = NULL; - env->me_pid = 0; - - if (env->me_lfd != INVALID_HANDLE_VALUE) { - (void) close(env->me_lfd); - } -} - -MDBX_ONLY_FEATURE int __cold -mdbx_env_close_ex(MDB_env *env, int dont_sync) -{ - MDB_page *dp; - int rc = MDB_SUCCESS; - - if (unlikely(!env)) - return EINVAL; - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (! 
dont_sync && env->me_txns) - rc = mdb_env_sync(env, 1); - - VALGRIND_DESTROY_MEMPOOL(env); - while ((dp = env->me_dpages) != NULL) { - ASAN_UNPOISON_MEMORY_REGION(&dp->mp_next, sizeof(dp->mp_next)); - VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); - env->me_dpages = dp->mp_next; - free(dp); - } - - mdb_env_close0(env); - env->me_signature = 0; - free(env); - - return rc; -} - -void __cold -mdb_env_close(MDB_env *env) -{ - mdbx_env_close_ex(env, 0); -} - -/* LY: fast enough on most arches - * - * / - * | -1, a < b - * cmp2int(a,b) = < 0, a == b - * | 1, a > b - * \ - */ -#if 1 -# define mdbx_cmp2int(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -# define mdbx_cmp2int(a, b) (((a) > (b)) - ((b) > (a))) -#endif - -/** Compare two items pointing at aligned unsigned int's. */ -static int __hot -mdb_cmp_int_ai(const MDB_val *a, const MDB_val *b) -{ - mdb_assert(NULL, a->mv_size == b->mv_size); - mdb_assert(NULL, 0 == (uintptr_t) a->mv_data % sizeof(int) - && 0 == (uintptr_t) b->mv_data % sizeof(int)); - - if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) - return mdbx_cmp2int( *(size_t *)a->mv_data, *(size_t *)b->mv_data ); - - mdb_assert(NULL, a->mv_size == sizeof(int) ); - return mdbx_cmp2int( *(unsigned *)a->mv_data, *(unsigned *)b->mv_data ); -} - -/** Compare two items pointing at 2-byte aligned unsigned int's. 
*/ -static int __hot -mdb_cmp_int_a2(const MDB_val *a, const MDB_val *b) -{ - mdb_assert(NULL, a->mv_size == b->mv_size); - mdb_assert(NULL, 0 == (uintptr_t) a->mv_data % sizeof(uint16_t) - && 0 == (uintptr_t) b->mv_data % sizeof(uint16_t)); -#ifdef MISALIGNED_OK - if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) - return mdbx_cmp2int( *(size_t *)a->mv_data, *(size_t *)b->mv_data ); - - mdb_assert(NULL, a->mv_size == sizeof(int) ); - return mdbx_cmp2int( *(unsigned *)a->mv_data, *(unsigned *)b->mv_data ); -#else - mdb_assert(NULL, 0 == a->mv_size % sizeof(uint16_t)); - { - int diff; - const uint16_t *pa, *pb, *end; - -#if BYTE_ORDER == LITTLE_ENDIAN - end = (const uint16_t *) a->mv_data; - pa = (const uint16_t *) ((char *) a->mv_data + a->mv_size); - pb = (const uint16_t *) ((char *) b->mv_data + a->mv_size); - do { - diff = *--pa - *--pb; -#else /* BYTE_ORDER */ - end = (const uint16_t *) ((char *) a->mv_data + a->mv_size); - pa = (const uint16_t *) a->mv_data; - pb = (const uint16_t *) b->mv_data; - do { - diff = *pa++ - *pb++; -#endif /* BYTE_ORDER */ - if (likely(diff != 0)) break; - } while(pa != end); - return diff; - } -#endif /* MISALIGNED_OK */ -} - -/** Compare two items pointing at unsigneds of unknown alignment. - * - * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp. 
- */ -static int __hot -mdb_cmp_int_ua(const MDB_val *a, const MDB_val *b) -{ - mdb_assert(NULL, a->mv_size == b->mv_size); -#if MISALIGNED_OK - if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) - return mdbx_cmp2int( *(size_t *)a->mv_data, *(size_t *)b->mv_data ); - - mdb_assert(NULL, a->mv_size == sizeof(int) ); - return mdbx_cmp2int( *(unsigned *)a->mv_data, *(unsigned *)b->mv_data ); -#else - mdb_assert(NULL, a->mv_size == sizeof(int) || a->mv_size == sizeof(size_t)); -#if BYTE_ORDER == LITTLE_ENDIAN - { - int diff; - const uint8_t *pa, *pb; - - pa = (const uint8_t *)a->mv_data + a->mv_size; - pb = (const uint8_t *)b->mv_data + a->mv_size; - - do { - diff = *--pa - *--pb; - if (likely(diff != 0)) break; - } while(pa != a->mv_data); - return diff; - } -#else /* BYTE_ORDER */ - return memcmp(a->mv_data, b->mv_data, a->mv_size); -#endif /* BYTE_ORDER */ -#endif /* MISALIGNED_OK */ -} - -/** Compare two items lexically */ -static int __hot -mdb_cmp_memn(const MDB_val *a, const MDB_val *b) -{ - /* LY: assumes that length of keys are NOT equal for most cases, - * if no then branch-prediction should mitigate the problem */ -#if 0 - /* LY: without branch instructions on x86, - * but isn't best for equal length of keys */ - int diff_len = mdbx_cmp2int(a->mv_size, b->mv_size); -#else - /* LY: best when length of keys are equal, - * but got a branch-penalty otherwise */ - if (unlikely(a->mv_size == b->mv_size)) - return memcmp(a->mv_data, b->mv_data, a->mv_size); - int diff_len = (a->mv_size < b->mv_size) ? -1 : 1; -#endif - size_t shortest = (a->mv_size < b->mv_size) ? a->mv_size : b->mv_size; - int diff_data = memcmp(a->mv_data, b->mv_data, shortest); - return likely(diff_data) ? 
diff_data : diff_len; -} - -/** Compare two items in reverse byte order */ -static int __hot -mdb_cmp_memnr(const MDB_val *a, const MDB_val *b) -{ - const uint8_t *pa, *pb, *end; - - pa = (const uint8_t *)a->mv_data + a->mv_size; - pb = (const uint8_t *)b->mv_data + b->mv_size; - size_t minlen = (a->mv_size < b->mv_size) ? a->mv_size : b->mv_size; - end = pa - minlen; - - while (pa != end) { - int diff = *--pa - *--pb; - if (likely(diff)) - return diff; - } - return mdbx_cmp2int(a->mv_size, b->mv_size); -} - -/** Search for key within a page, using binary search. - * Returns the smallest entry larger or equal to the key. - * If exactp is non-null, stores whether the found entry was an exact match - * in *exactp (1 or 0). - * Updates the cursor index with the index of the found entry. - * If no entry larger or equal to the key is found, returns NULL. - */ -static MDB_node * __hot -mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) -{ - unsigned i = 0, nkeys; - int low, high; - int rc = 0; - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_node *node = NULL; - MDB_val nodekey; - MDB_cmp_func *cmp; - DKBUF; - - nkeys = NUMKEYS(mp); - - mdb_debug("searching %u keys in %s %spage %zu", - nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - mdb_dbg_pgno(mp)); - - low = IS_LEAF(mp) ? 0 : 1; - high = nkeys - 1; - cmp = mc->mc_dbx->md_cmp; - - /* Branch pages have no data, so if using integer keys, - * alignment is guaranteed. Use faster mdb_cmp_int_ai. 
- */ - if (cmp == mdb_cmp_int_a2 && IS_BRANCH(mp)) - cmp = mdb_cmp_int_ai; - - if (IS_LEAF2(mp)) { - nodekey.mv_size = mc->mc_db->md_xsize; - node = NODEPTR(mp, 0); /* fake */ - while (low <= high) { - i = (low + high) >> 1; - nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); - rc = cmp(key, &nodekey); - mdb_debug("found leaf index %u [%s], rc = %i", - i, DKEY(&nodekey), rc); - if (rc == 0) - break; - if (rc > 0) - low = i + 1; - else - high = i - 1; - } - } else { - while (low <= high) { - i = (low + high) >> 1; - - node = NODEPTR(mp, i); - nodekey.mv_size = NODEKSZ(node); - nodekey.mv_data = NODEKEY(node); - - rc = cmp(key, &nodekey); - if (IS_LEAF(mp)) - mdb_debug("found leaf index %u [%s], rc = %i", - i, DKEY(&nodekey), rc); - else - mdb_debug("found branch index %u [%s -> %zu], rc = %i", - i, DKEY(&nodekey), NODEPGNO(node), rc); - if (rc == 0) - break; - if (rc > 0) - low = i + 1; - else - high = i - 1; - } - } - - if (rc > 0) { /* Found entry is less than the key. */ - i++; /* Skip to get the smallest entry larger than key. */ - if (!IS_LEAF2(mp)) - node = NODEPTR(mp, i); - } - if (exactp) - *exactp = (rc == 0 && nkeys > 0); - /* store the key index */ - mc->mc_ki[mc->mc_top] = i; - if (i >= nkeys) - /* There is no entry larger or equal to the key. */ - return NULL; - - /* nodeptr is fake for LEAF2 */ - return node; -} - -#if 0 -static void -mdb_cursor_adjust(MDB_cursor *mc, func) -{ - MDB_cursor *m2; - - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { - func(mc, m2); - } - } -} -#endif - -/** Pop a page off the top of the cursor's stack. */ -static void -mdb_cursor_pop(MDB_cursor *mc) -{ - if (mc->mc_snum) { - mdb_debug("popped page %zu off db %d cursor %p", - mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc); - - mc->mc_snum--; - if (mc->mc_snum) { - mc->mc_top--; - } else { - mc->mc_flags &= ~C_INITIALIZED; - } - } -} - -/** Push a page onto the top of the cursor's stack. 
- * Set #MDB_TXN_ERROR on failure. - */ -static int -mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) -{ - mdb_debug("pushing page %zu on db %d cursor %p", mp->mp_pgno, - DDBI(mc), (void *) mc); - - if (unlikely(mc->mc_snum >= CURSOR_STACK)) { - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CURSOR_FULL; - } - - mc->mc_top = mc->mc_snum++; - mc->mc_pg[mc->mc_top] = mp; - mc->mc_ki[mc->mc_top] = 0; - - return MDB_SUCCESS; -} - -/** Find the address of the page corresponding to a given page number. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc the cursor accessing the page. - * @param[in] pgno the page number for the page to retrieve. - * @param[out] ret address of a pointer where the page's address will be stored. - * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) -{ - MDB_txn *txn = mc->mc_txn; - MDB_env *env = txn->mt_env; - MDB_page *p = NULL; - int level; - - if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) { - MDB_txn *tx2 = txn; - level = 1; - do { - MDB_ID2L dl = tx2->mt_u.dirty_list; - unsigned x; - /* Spilled pages were dirtied in this txn and flushed - * because the dirty list got full. Bring this page - * back in from the map (but don't unspill it here, - * leave that unless page_touch happens again). 
*/ - if (tx2->mt_spill_pgs) { - MDB_ID pn = pgno << 1; - x = mdb_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) - goto mapped; - } - if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - p = dl[x].mptr; - goto done; - } - } - level++; - } while ((tx2 = tx2->mt_parent) != NULL); - } - - if (unlikely(pgno >= txn->mt_next_pgno)) { - mdb_debug("page %zu not found", pgno); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PAGE_NOTFOUND; - } - level = 0; - -mapped: - p = (MDB_page *)(env->me_map + env->me_psize * pgno); - -done: - *ret = p; - if (lvl) - *lvl = level; - return MDB_SUCCESS; -} - -/** Finish #mdb_page_search() / #mdb_page_search_lowest(). - * The cursor is at the root page, set up the rest of it. - */ -static int -mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) -{ - MDB_page *mp = mc->mc_pg[mc->mc_top]; - int rc; - DKBUF; - - while (IS_BRANCH(mp)) { - MDB_node *node; - indx_t i; - - mdb_debug("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp)); - /* Don't assert on branch pages in the FreeDB. We can get here - * while in the process of rebalancing a FreeDB branch page; we must - * let that proceed. 
ITS#8336 - */ - mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); - mdb_debug("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0))); - - if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { - i = 0; - if (flags & MDB_PS_LAST) { - i = NUMKEYS(mp) - 1; - /* if already init'd, see if we're already in right place */ - if (mc->mc_flags & C_INITIALIZED) { - if (mc->mc_ki[mc->mc_top] == i) { - mc->mc_top = mc->mc_snum++; - mp = mc->mc_pg[mc->mc_top]; - goto ready; - } - } - } - } else { - int exact; - node = mdb_node_search(mc, key, &exact); - if (node == NULL) - i = NUMKEYS(mp) - 1; - else { - i = mc->mc_ki[mc->mc_top]; - if (!exact) { - mdb_cassert(mc, i > 0); - i--; - } - } - mdb_debug("following index %u for key [%s]", i, DKEY(key)); - } - - mdb_cassert(mc, i < NUMKEYS(mp)); - node = NODEPTR(mp, i); - - if (unlikely((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) - return rc; - - mc->mc_ki[mc->mc_top] = i; - if (unlikely(rc = mdb_cursor_push(mc, mp))) - return rc; - -ready: - if (flags & MDB_PS_MODIFY) { - if (unlikely((rc = mdb_page_touch(mc)) != 0)) - return rc; - mp = mc->mc_pg[mc->mc_top]; - } - } - - if (unlikely(!IS_LEAF(mp))) { - mdb_debug("internal error, index points to a %02X page!?", - mp->mp_flags); - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; - } - - mdb_debug("found leaf page %zu for key [%s]", mp->mp_pgno, - key ? DKEY(key) : "null"); - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - return MDB_SUCCESS; -} - -/** Search for the lowest key under the current branch page. - * This just bypasses a NUMKEYS check in the current page - * before calling mdb_page_search_root(), because the callers - * are all in situations where the current page is known to - * be underfilled. 
- */ -static int -mdb_page_search_lowest(MDB_cursor *mc) -{ - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_node *node = NODEPTR(mp, 0); - int rc; - - if (unlikely((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) - return rc; - - mc->mc_ki[mc->mc_top] = 0; - if (unlikely(rc = mdb_cursor_push(mc, mp))) - return rc; - return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); -} - -/** Search for the page a given key should be in. - * Push it and its parent pages on the cursor stack. - * @param[in,out] mc the cursor for this operation. - * @param[in] key the key to search for, or NULL for first/last page. - * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB - * are touched (updated with new page numbers). - * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. - * This is used by #mdb_cursor_first() and #mdb_cursor_last(). - * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) -{ - int rc; - pgno_t root; - - /* Make sure the txn is still viable, then find the root from - * the txn's db table and set it as the root of the cursor's stack. 
- */ - if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) { - mdb_debug("transaction has failed, must abort"); - return MDB_BAD_TXN; - } else { - /* Make sure we're using an up-to-date root */ - if (unlikely(*mc->mc_dbflag & DB_STALE)) { - MDB_cursor mc2; - if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) - return MDB_BAD_DBI; - mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); - rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); - if (rc) - return rc; - { - MDB_val data; - int exact = 0; - uint16_t flags; - MDB_node *leaf = mdb_node_search(&mc2, - &mc->mc_dbx->md_name, &exact); - if (!exact) - return MDB_NOTFOUND; - if (unlikely((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)) - return MDB_INCOMPATIBLE; /* not a named DB */ - rc = mdb_node_read(&mc2, leaf, &data); - if (rc) - return rc; - memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), - sizeof(uint16_t)); - /* The txn may not know this DBI, or another process may - * have dropped and recreated the DB with other flags. - */ - if (unlikely((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)) - return MDB_INCOMPATIBLE; - memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); - } - *mc->mc_dbflag &= ~DB_STALE; - } - root = mc->mc_db->md_root; - - if (unlikely(root == P_INVALID)) { /* Tree is empty. 
*/ - mdb_debug("tree is empty"); - return MDB_NOTFOUND; - } - } - - mdb_cassert(mc, root > 1); - if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) - if (unlikely((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)) - return rc; - - mc->mc_snum = 1; - mc->mc_top = 0; - - mdb_debug("db %d root page %zu has flags 0x%X", - DDBI(mc), root, mc->mc_pg[0]->mp_flags); - - if (flags & MDB_PS_MODIFY) { - if (unlikely(rc = mdb_page_touch(mc))) - return rc; - } - - if (flags & MDB_PS_ROOTONLY) - return MDB_SUCCESS; - - return mdb_page_search_root(mc, key, flags); -} - -static int -mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) -{ - MDB_txn *txn = mc->mc_txn; - pgno_t pg = mp->mp_pgno; - unsigned x = 0, ovpages = mp->mp_pages; - MDB_env *env = txn->mt_env; - MDB_IDL sl = txn->mt_spill_pgs; - MDB_ID pn = pg << 1; - int rc; - - mdb_debug("free ov page %zu (%u)", pg, ovpages); - /* If the page is dirty or on the spill list we just acquired it, - * so we should give it back to our current free list, if any. - * Otherwise put it onto the list of pages we freed in this txn. - * - * Won't create me_pghead: me_pglast must be inited along with it. - * Unsupported in nested txns: They would need to hide the page - * range in ancestor txns' dirty and spilled lists. - */ - if (env->me_pghead && - !txn->mt_parent && - ((mp->mp_flags & P_DIRTY) || - (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) - { - unsigned i, j; - pgno_t *mop; - MDB_ID2 *dl, ix, iy; - rc = mdb_midl_need(&env->me_pghead, ovpages); - if (unlikely(rc)) - return rc; - if (!(mp->mp_flags & P_DIRTY)) { - /* This page is no longer spilled */ - if (x == sl[0]) - sl[0]--; - else - sl[x] |= 1; - goto release; - } - /* Remove from dirty list */ - dl = txn->mt_u.dirty_list; - x = dl[0].mid--; - for (ix = dl[x]; ix.mptr != mp; ix = iy) { - if (likely(x > 1)) { - x--; - iy = dl[x]; - dl[x] = ix; - } else { - mdb_cassert(mc, x > 1); - j = ++(dl[0].mid); - dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. 
*/ - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PROBLEM; - } - } - txn->mt_dirty_room++; - if (!(env->me_flags & MDB_WRITEMAP)) - mdb_dpage_free(env, mp); -release: - /* Insert in me_pghead */ - mop = env->me_pghead; - j = mop[0] + ovpages; - for (i = mop[0]; i && mop[i] < pg; i--) - mop[j--] = mop[i]; - while (j>i) - mop[j--] = pg++; - mop[0] += ovpages; - } else { - rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); - if (unlikely(rc)) - return rc; - } - mc->mc_db->md_overflow_pages -= ovpages; - return 0; -} - -/** Return the data associated with a given node. - * @param[in] mc The cursor for this operation. - * @param[in] leaf The node being read. - * @param[out] data Updated to point to the node's data. - * @return 0 on success, non-zero on failure. - */ -static MDBX_INLINE int -mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) -{ - MDB_page *omp; /* overflow page */ - pgno_t pgno; - int rc; - - if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { - data->mv_size = NODEDSZ(leaf); - data->mv_data = NODEDATA(leaf); - return MDB_SUCCESS; - } - - /* Read overflow data. 
- */ - data->mv_size = NODEDSZ(leaf); - memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); - if (unlikely((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0)) { - mdb_debug("read overflow page %zu failed", pgno); - return rc; - } - data->mv_data = PAGEDATA(omp); - - return MDB_SUCCESS; -} - -int -mdb_get(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data) -{ - MDB_cursor mc; - MDB_xcursor mx; - int exact = 0; - DKBUF; - - mdb_debug("===> get db %u key [%s]", dbi, DKEY(key)); - - if (unlikely(!key || !data || !txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - mdb_cursor_init(&mc, txn, dbi, &mx); - return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); -} - -/** Find a sibling for a page. - * Replaces the page at the top of the cursor's stack with the - * specified sibling, if one exists. - * @param[in] mc The cursor for this operation. - * @param[in] move_right Non-zero if the right sibling is requested, - * otherwise the left sibling. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_cursor_sibling(MDB_cursor *mc, int move_right) -{ - int rc; - MDB_node *indx; - MDB_page *mp; - - if (unlikely(mc->mc_snum < 2)) { - return MDB_NOTFOUND; /* root has no siblings */ - } - - mdb_cursor_pop(mc); - mdb_debug("parent page is page %zu, index %u", - mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); - - if (move_right - ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) - : (mc->mc_ki[mc->mc_top] == 0)) { - mdb_debug("no more keys left, moving to %s sibling", - move_right ? 
"right" : "left"); - if (unlikely((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS)) { - /* undo cursor_pop before returning */ - mc->mc_top++; - mc->mc_snum++; - return rc; - } - } else { - if (move_right) - mc->mc_ki[mc->mc_top]++; - else - mc->mc_ki[mc->mc_top]--; - mdb_debug("just moving to %s index key %u", - move_right ? "right" : "left", mc->mc_ki[mc->mc_top]); - } - mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); - - indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (unlikely((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0)) { - /* mc will be inconsistent if caller does mc_snum++ as above */ - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - return rc; - } - - mdb_cursor_push(mc, mp); - if (!move_right) - mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1; - - return MDB_SUCCESS; -} - -/** Move the cursor to the next data item. */ -static int -mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) -{ - MDB_page *mp; - MDB_node *leaf; - int rc; - - if ((mc->mc_flags & C_DEL) && op == MDB_NEXT_DUP) - return MDB_NOTFOUND; - - if (!(mc->mc_flags & C_INITIALIZED)) - return mdb_cursor_first(mc, key, data); - - mp = mc->mc_pg[mc->mc_top]; - - if (mc->mc_flags & C_EOF) { - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) - return MDB_NOTFOUND; - mc->mc_flags ^= C_EOF; - } - - if (mc->mc_db->md_flags & MDB_DUPSORT) { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_NEXT || op == MDB_NEXT_DUP) { - rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); - if (op != MDB_NEXT || rc != MDB_NOTFOUND) { - if (likely(rc == MDB_SUCCESS)) - MDB_GET_KEY(leaf, key); - return rc; - } - } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (op == MDB_NEXT_DUP) - return MDB_NOTFOUND; - } - } - - mdb_debug("cursor_next: top page is %zu in cursor %p", - mdb_dbg_pgno(mp), (void *) mc); - if (mc->mc_flags & C_DEL) { - mc->mc_flags ^= C_DEL; - goto 
skip; - } - - if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { - mdb_debug("=====> move to next sibling page"); - if (unlikely((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS)) { - mc->mc_flags |= C_EOF; - return rc; - } - mp = mc->mc_pg[mc->mc_top]; - mdb_debug("next page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); - } else - mc->mc_ki[mc->mc_top]++; - -skip: - mdb_debug("==> cursor points to page %zu with %u keys, key index %u", - mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); - - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) - return rc; - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; -} - -/** Move the cursor to the previous data item. 
*/ -static int -mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) -{ - MDB_page *mp; - MDB_node *leaf; - int rc; - - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = mdb_cursor_last(mc, key, data); - if (unlikely(rc)) - return rc; - mc->mc_ki[mc->mc_top]++; - } - - mp = mc->mc_pg[mc->mc_top]; - - if (mc->mc_db->md_flags & MDB_DUPSORT) { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_PREV || op == MDB_PREV_DUP) { - rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); - if (op != MDB_PREV || rc != MDB_NOTFOUND) { - if (likely(rc == MDB_SUCCESS)) { - MDB_GET_KEY(leaf, key); - mc->mc_flags &= ~C_EOF; - } - return rc; - } - } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (op == MDB_PREV_DUP) - return MDB_NOTFOUND; - } - } - - mdb_debug("cursor_prev: top page is %zu in cursor %p", - mdb_dbg_pgno(mp), (void *) mc); - - mc->mc_flags &= ~(C_EOF|C_DEL); - - if (mc->mc_ki[mc->mc_top] == 0) { - mdb_debug("=====> move to prev sibling page"); - if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { - return rc; - } - mp = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; - mdb_debug("prev page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); - } else - mc->mc_ki[mc->mc_top]--; - - mdb_debug("==> cursor points to page %zu with %u keys, key index %u", - mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); - - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (data) { - if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) - return rc; - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); 
- if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; -} - -/** Set the cursor on a specific data item. */ -static int -mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, - MDB_cursor_op op, int *exactp) -{ - int rc; - MDB_page *mp; - MDB_node *leaf = NULL; - DKBUF; - - if ( (mc->mc_db->md_flags & MDB_INTEGERKEY) - && unlikely( key->mv_size != sizeof(unsigned) - && key->mv_size != sizeof(size_t) )) { - mdb_cassert(mc, ! "key-size is invalid for MDB_INTEGERKEY"); - return MDB_BAD_VALSIZE; - } - - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - /* See if we're already on the right page */ - if (mc->mc_flags & C_INITIALIZED) { - MDB_val nodekey; - - mp = mc->mc_pg[mc->mc_top]; - if (!NUMKEYS(mp)) { - mc->mc_ki[mc->mc_top] = 0; - return MDB_NOTFOUND; - } - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_size = mc->mc_db->md_xsize; - nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); - } else { - leaf = NODEPTR(mp, 0); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - /* Probably happens rarely, but first node on the page - * was the one we wanted. 
- */ - mc->mc_ki[mc->mc_top] = 0; - if (exactp) - *exactp = 1; - goto set1; - } - if (rc > 0) { - unsigned i; - unsigned nkeys = NUMKEYS(mp); - if (nkeys > 1) { - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_data = LEAF2KEY(mp, - nkeys-1, nodekey.mv_size); - } else { - leaf = NODEPTR(mp, nkeys-1); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - /* last node was the one we wanted */ - mc->mc_ki[mc->mc_top] = nkeys-1; - if (exactp) - *exactp = 1; - goto set1; - } - if (rc < 0) { - if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { - /* This is definitely the right page, skip search_page */ - if (mp->mp_flags & P_LEAF2) { - nodekey.mv_data = LEAF2KEY(mp, - mc->mc_ki[mc->mc_top], nodekey.mv_size); - } else { - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDB_GET_KEY2(leaf, nodekey); - } - rc = mc->mc_dbx->md_cmp(key, &nodekey); - if (rc == 0) { - /* current node was the one we wanted */ - if (exactp) - *exactp = 1; - goto set1; - } - } - rc = 0; - mc->mc_flags &= ~C_EOF; - goto set2; - } - } - /* If any parents have right-sibs, search. - * Otherwise, there's nothing further. */ - for (i=0; imc_top; i++) - if (mc->mc_ki[i] < - NUMKEYS(mc->mc_pg[i])-1) - break; - if (i == mc->mc_top) { - /* There are no other pages */ - mc->mc_ki[mc->mc_top] = nkeys; - return MDB_NOTFOUND; - } - } - if (!mc->mc_top) { - /* There are no other pages */ - mc->mc_ki[mc->mc_top] = 0; - if (op == MDB_SET_RANGE && !exactp) { - rc = 0; - goto set1; - } else - return MDB_NOTFOUND; - } - } else { - mc->mc_pg[0] = 0; - } - - rc = mdb_page_search(mc, key, 0); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - - mp = mc->mc_pg[mc->mc_top]; - mdb_cassert(mc, IS_LEAF(mp)); - -set2: - leaf = mdb_node_search(mc, key, exactp); - if (exactp != NULL && !*exactp) { - /* MDB_SET specified and not an exact match. 
*/ - return MDB_NOTFOUND; - } - - if (leaf == NULL) { - mdb_debug("===> inexact leaf not found, goto sibling"); - if (unlikely((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS)) { - mc->mc_flags |= C_EOF; - return rc; /* no entries matched */ - } - mp = mc->mc_pg[mc->mc_top]; - mdb_cassert(mc, IS_LEAF(mp)); - leaf = NODEPTR(mp, 0); - } - -set1: - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - if (IS_LEAF2(mp)) { - if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - } - return MDB_SUCCESS; - } - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - } - if (likely(data)) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - } else { - int ex2, *ex2p; - if (op == MDB_GET_BOTH) { - ex2p = &ex2; - ex2 = 0; - } else { - ex2p = NULL; - } - rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { - MDB_val olddata; - if (unlikely((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS)) - return rc; - rc = mc->mc_dbx->md_dcmp(data, &olddata); - if (rc) { - if (op == MDB_GET_BOTH || rc > 0) - return MDB_NOTFOUND; - rc = 0; - } - *data = olddata; - } else { - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) - return rc; - } - } - - /* The key already matches in all other cases */ - if (op == MDB_SET_RANGE || op == MDB_SET_KEY) - MDB_GET_KEY(leaf, key); - mdb_debug("==> cursor placed on key [%s]", DKEY(key)); - - return rc; -} - -/** Move the cursor to the first item in the database. 
*/ -static int -mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) -{ - int rc; - MDB_node *leaf; - - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - - leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - mc->mc_ki[mc->mc_top] = 0; - - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); - return MDB_SUCCESS; - } - - if (likely(data)) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else { - if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) - return rc; - } - } - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; -} - -/** Move the cursor to the last item in the database. 
*/ -static int -mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) -{ - int rc; - MDB_node *leaf; - - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - if (likely(!(mc->mc_flags & C_EOF))) { - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_LAST); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - } - - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; - mc->mc_flags |= C_INITIALIZED|C_EOF; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); - return MDB_SUCCESS; - } - - if (likely(data)) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else { - if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) - return rc; - } - } - - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; -} - -int -mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, - MDB_cursor_op op) -{ - int rc; - int exact = 0; - int (*mfunc)(MDB_cursor *mc, MDB_val *key, MDB_val *data); - - if (unlikely(mc == NULL)) - return EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - switch (op) { - case MDB_GET_CURRENT: - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { - rc = EINVAL; - } else { - MDB_page *mp = mc->mc_pg[mc->mc_top]; - int nkeys = NUMKEYS(mp); - if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { - mc->mc_ki[mc->mc_top] = nkeys; - rc = MDB_NOTFOUND; - break; - } - rc = MDB_SUCCESS; - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], 
key->mv_size); - } else { - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDB_GET_KEY(leaf, key); - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { - mdb_xcursor_init1(mc, leaf); - rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - break; - } - rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); - } else { - rc = mdb_node_read(mc, leaf, data); - } - } - } - } - break; - case MDB_GET_BOTH: - case MDB_GET_BOTH_RANGE: - if (unlikely(data == NULL)) { - rc = EINVAL; - break; - } - if (unlikely(mc->mc_xcursor == NULL)) { - rc = MDB_INCOMPATIBLE; - break; - } - /* FALLTHRU */ - case MDB_SET: - case MDB_SET_KEY: - case MDB_SET_RANGE: - if (unlikely(key == NULL)) { - rc = EINVAL; - } else { - rc = mdb_cursor_set(mc, key, data, op, - op == MDB_SET_RANGE ? NULL : &exact); - } - break; - case MDB_GET_MULTIPLE: - if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) { - rc = EINVAL; - break; - } - if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) { - rc = MDB_INCOMPATIBLE; - break; - } - rc = MDB_SUCCESS; - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || - (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) - break; - goto fetchm; - case MDB_NEXT_MULTIPLE: - if (unlikely(data == NULL)) { - rc = EINVAL; - break; - } - if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) { - rc = MDB_INCOMPATIBLE; - break; - } - rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); - if (rc == MDB_SUCCESS) { - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - MDB_cursor *mx; -fetchm: - mx = &mc->mc_xcursor->mx_cursor; - data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * - mx->mc_db->md_xsize; - data->mv_data = PAGEDATA(mx->mc_pg[mx->mc_top]); - mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1; - } else { - rc = MDB_NOTFOUND; - } - } - break; - case MDB_PREV_MULTIPLE: - if (data == NULL) { - rc = EINVAL; - break; - } - 
if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - rc = MDB_INCOMPATIBLE; - break; - } - if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdb_cursor_last(mc, key, data); - else - rc = MDB_SUCCESS; - if (rc == MDB_SUCCESS) { - MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; - if (mx->mc_flags & C_INITIALIZED) { - rc = mdb_cursor_sibling(mx, 0); - if (rc == MDB_SUCCESS) - goto fetchm; - } else { - rc = MDB_NOTFOUND; - } - } - break; - case MDB_NEXT: - case MDB_NEXT_DUP: - case MDB_NEXT_NODUP: - rc = mdb_cursor_next(mc, key, data, op); - break; - case MDB_PREV: - case MDB_PREV_DUP: - case MDB_PREV_NODUP: - rc = mdb_cursor_prev(mc, key, data, op); - break; - case MDB_FIRST: - rc = mdb_cursor_first(mc, key, data); - break; - case MDB_FIRST_DUP: - mfunc = mdb_cursor_first; - mmove: - if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) { - rc = EINVAL; - break; - } - if (unlikely(mc->mc_xcursor == NULL)) { - rc = MDB_INCOMPATIBLE; - break; - } - { - MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - MDB_GET_KEY(leaf, key); - rc = mdb_node_read(mc, leaf, data); - break; - } - } - if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { - rc = EINVAL; - break; - } - rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); - break; - case MDB_LAST: - rc = mdb_cursor_last(mc, key, data); - break; - case MDB_LAST_DUP: - mfunc = mdb_cursor_last; - goto mmove; - default: - mdb_debug("unhandled/unimplemented cursor operation %u", op); - rc = EINVAL; - break; - } - - if (mc->mc_flags & C_DEL) - mc->mc_flags ^= C_DEL; - - return rc; -} - -/** Touch all the pages in the cursor stack. Set mc_top. - * Makes sure all the pages are writable, before attempting a write operation. - * @param[in] mc The cursor to operate on. 
- */ -static int -mdb_cursor_touch(MDB_cursor *mc) -{ - int rc = MDB_SUCCESS; - - if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { - /* Touch DB record of named DB */ - MDB_cursor mc2; - MDB_xcursor mcx; - if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) - return MDB_BAD_DBI; - mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); - rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); - if (unlikely(rc)) - return rc; - *mc->mc_dbflag |= DB_DIRTY; - } - mc->mc_top = 0; - if (mc->mc_snum) { - do { - rc = mdb_page_touch(mc); - } while (!rc && ++(mc->mc_top) < mc->mc_snum); - mc->mc_top = mc->mc_snum-1; - } - return rc; -} - -/** Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDB_NOSPILL 0x8000 - -int -mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, - unsigned flags) -{ - MDB_env *env; - MDB_node *leaf = NULL; - MDB_page *fp, *mp, *sub_root = NULL; - uint16_t fp_flags; - MDB_val xdata, *rdata, dkey, olddata; - MDB_db dummy; - int do_sub = 0, insert_key, insert_data; - unsigned mcount = 0, dcount = 0, nospill; - size_t nsize; - int rc, rc2; - unsigned nflags; - DKBUF; - - if (unlikely(mc == NULL || key == NULL)) - return EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - env = mc->mc_txn->mt_env; - - /* Check this first so counter will always be zero on any - * early failures. - */ - if (flags & MDB_MULTIPLE) { - dcount = data[1].mv_size; - data[1].mv_size = 0; - if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED))) - return MDB_INCOMPATIBLE; - } - - if (flags & MDB_RESERVE) { - if (unlikely(mc->mc_db->md_flags & (MDB_DUPSORT | MDB_REVERSEDUP))) - return MDB_INCOMPATIBLE; - } - - nospill = flags & MDB_NOSPILL; - flags &= ~MDB_NOSPILL; - - if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; - - if (unlikely(key->mv_size > ENV_MAXKEY(env))) - return MDB_BAD_VALSIZE; - -#if SIZE_MAX > MAXDATASIZE - if (unlikely(data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE))) - return MDB_BAD_VALSIZE; -#else - if ((mc->mc_db->md_flags & MDB_DUPSORT) && unlikely(data->mv_size > ENV_MAXKEY(env))) - return MDB_BAD_VALSIZE; -#endif - - if ((mc->mc_db->md_flags & MDB_INTEGERKEY) - && unlikely(key->mv_size != sizeof(unsigned) - && key->mv_size != sizeof(size_t) )) { - mdb_cassert(mc, ! "key-size is invalid for MDB_INTEGERKEY"); - return MDB_BAD_VALSIZE; - } - - if ((mc->mc_db->md_flags & MDB_INTEGERDUP) - && unlikely(data->mv_size != sizeof(unsigned) - && data->mv_size != sizeof(size_t) )) { - mdb_cassert(mc, ! "data-size is invalid MDB_INTEGERDUP"); - return MDB_BAD_VALSIZE; - } - - mdb_debug("==> put db %d key [%s], size %zu, data size %zu", - DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size); - - int dupdata_flag = 0; - if (flags & MDB_CURRENT) { - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return EINVAL; -#if MDBX_MODE_ENABLED - if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { - MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_cassert(mc, mc->mc_xcursor != NULL - && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); - if (mc->mc_xcursor->mx_db.md_entries > 1) { - rc = mdbx_cursor_del(mc, 0); - if (rc != MDB_SUCCESS) - return rc; - flags -= MDB_CURRENT; - } - } - } -#endif /* MDBX_MODE_ENABLED */ - rc = MDB_SUCCESS; - } else if (mc->mc_db->md_root == P_INVALID) { - /* new database, cursor has nothing to point to */ - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_flags &= ~C_INITIALIZED; - rc = MDB_NO_ROOT; - } else { - int exact = 0; - MDB_val d2; - if (flags & MDB_APPEND) { - MDB_val k2; - rc = mdb_cursor_last(mc, &k2, &d2); - if (rc == 0) { - rc = mc->mc_dbx->md_cmp(key, &k2); - if (rc > 0) { - rc = MDB_NOTFOUND; - 
mc->mc_ki[mc->mc_top]++; - } else { - /* new key is <= last key */ - rc = MDB_KEYEXIST; - } - } - } else { - rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); - } - if ((flags & MDB_NOOVERWRITE) && rc == 0) { - mdb_debug("duplicate key [%s]", DKEY(key)); - *data = d2; - return MDB_KEYEXIST; - } - if (rc && unlikely(rc != MDB_NOTFOUND)) - return rc; - } - - if (mc->mc_flags & C_DEL) - mc->mc_flags ^= C_DEL; - - /* Cursor is positioned, check for room in the dirty list */ - if (!nospill) { - if (flags & MDB_MULTIPLE) { - rdata = &xdata; - xdata.mv_size = data->mv_size * dcount; - } else { - rdata = data; - } - if (unlikely(rc2 = mdb_page_spill(mc, key, rdata))) - return rc2; - } - - if (rc == MDB_NO_ROOT) { - MDB_page *np; - /* new database, write a root leaf page */ - mdb_debug("allocating new root leaf page"); - if (unlikely(rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { - return rc2; - } - mdb_cursor_push(mc, np); - mc->mc_db->md_root = np->mp_pgno; - mc->mc_db->md_depth++; - *mc->mc_dbflag |= DB_DIRTY; - if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) == MDB_DUPFIXED) - np->mp_flags |= P_LEAF2; - mc->mc_flags |= C_INITIALIZED; - } else { - /* make sure all cursor pages are writable */ - rc2 = mdb_cursor_touch(mc); - if (unlikely(rc2)) - return rc2; - } - - insert_key = insert_data = rc; - if (insert_key) { - /* The key does not exist */ - mdb_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - LEAFSIZE(key, data) > env->me_nodemax) - { - /* Too big for a node, insert in sub-DB. Set up an empty - * "old sub-page" for prep_subDB to expand to a full page. 
- */ - fp_flags = P_LEAF|P_DIRTY; - fp = env->me_pbuf; - fp->mp_leaf2_ksize = data->mv_size; /* used if MDB_DUPFIXED */ - fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE); - olddata.mv_size = PAGEHDRSZ; - goto prep_subDB; - } - } else { - /* there's only a key anyway, so this is a no-op */ - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - char *ptr; - unsigned ksize = mc->mc_db->md_xsize; - if (key->mv_size != ksize) - return MDB_BAD_VALSIZE; - ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); - memcpy(ptr, key->mv_data, ksize); -fix_parent: - /* if overwriting slot 0 of leaf, need to - * update branch key if there is a parent page - */ - if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - unsigned short dtop = 1; - mc->mc_top--; - /* slot 0 is always an empty key, find real slot */ - while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - mc->mc_top--; - dtop++; - } - if (mc->mc_ki[mc->mc_top]) - rc2 = mdb_update_key(mc, key); - else - rc2 = MDB_SUCCESS; - mc->mc_top += dtop; - if (rc2) - return rc2; - } - return MDB_SUCCESS; - } - -more: - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - olddata.mv_size = NODEDSZ(leaf); - olddata.mv_data = NODEDATA(leaf); - - /* DB has dups? */ - if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { - /* Prepare (sub-)page/sub-DB to accept the new item, - * if needed. fp: old sub-page or a header faking - * it. mp: new (sub-)page. offset: growth in page - * size. xdata: node data with new page or DB. - */ - unsigned i, offset = 0; - mp = fp = xdata.mv_data = env->me_pbuf; - mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; - - /* Was a single item before, must convert now */ - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - /* Just overwrite the current item */ - if (flags & MDB_CURRENT) { - if ((flags & MDB_NODUPDATA) && !mc->mc_dbx->md_dcmp(data, &olddata)) - return MDB_KEYEXIST; - goto current; - } - - /* does data match? 
*/ - if (!mc->mc_dbx->md_dcmp(data, &olddata)) { - if (unlikely(flags & (MDB_NODUPDATA|MDB_APPENDDUP))) - return MDB_KEYEXIST; - /* overwrite it */ - goto current; - } - - /* Back up original data item */ - dupdata_flag = 1; - dkey.mv_size = olddata.mv_size; - dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size); - - /* Make sub-page header for the dup items, with dummy body */ - fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP; - fp->mp_lower = (PAGEHDRSZ-PAGEBASE); - xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - fp->mp_flags |= P_LEAF2; - fp->mp_leaf2_ksize = data->mv_size; - xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ - } else { - xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + - (dkey.mv_size & 1) + (data->mv_size & 1); - } - fp->mp_upper = xdata.mv_size - PAGEBASE; - olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ - } else if (leaf->mn_flags & F_SUBDATA) { - /* Data is on sub-DB, just store it */ - flags |= F_DUPDATA|F_SUBDATA; - goto put_sub; - } else { - /* Data is on sub-page */ - fp = olddata.mv_data; - switch (flags) { - default: - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - offset = EVEN(NODESIZE + sizeof(indx_t) + - data->mv_size); - break; - } - offset = fp->mp_leaf2_ksize; - if (SIZELEFT(fp) < offset) { - offset *= 4; /* space for 4 more */ - break; - } - /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */ - case MDB_CURRENT | MDB_NODUPDATA: - case MDB_CURRENT: - fp->mp_flags |= P_DIRTY; - COPY_PGNO(fp->mp_pgno, mp->mp_pgno); - mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; - flags |= F_DUPDATA; - goto put_sub; - } - xdata.mv_size = olddata.mv_size + offset; - } - - fp_flags = fp->mp_flags; - if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { - /* Too big for a sub-page, convert to sub-DB */ - fp_flags &= ~P_SUBP; -prep_subDB: - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - fp_flags |= P_LEAF2; - dummy.md_xsize = fp->mp_leaf2_ksize; - dummy.md_flags = 
MDB_DUPFIXED; - if (mc->mc_db->md_flags & MDB_INTEGERDUP) - dummy.md_flags |= MDB_INTEGERKEY; - } else { - dummy.md_xsize = 0; - dummy.md_flags = 0; - } - dummy.md_depth = 1; - dummy.md_branch_pages = 0; - dummy.md_leaf_pages = 1; - dummy.md_overflow_pages = 0; - dummy.md_entries = NUMKEYS(fp); - xdata.mv_size = sizeof(MDB_db); - xdata.mv_data = &dummy; - if ((rc = mdb_page_alloc(mc, 1, &mp, MDBX_ALLOC_ALL))) - return rc; - offset = env->me_psize - olddata.mv_size; - flags |= F_DUPDATA|F_SUBDATA; - dummy.md_root = mp->mp_pgno; - sub_root = mp; - } - if (mp != fp) { - mp->mp_flags = fp_flags | P_DIRTY; - mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; - mp->mp_lower = fp->mp_lower; - mp->mp_upper = fp->mp_upper + offset; - if (fp_flags & P_LEAF2) { - memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize); - } else { - memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, - olddata.mv_size - fp->mp_upper - PAGEBASE); - for (i=0; imp_ptrs[i] = fp->mp_ptrs[i] + offset; - } - } - - rdata = &xdata; - flags |= F_DUPDATA; - do_sub = 1; - if (!insert_key) - mdb_node_del(mc, 0); - goto new_sub; - } -current: - /* LMDB passes F_SUBDATA in 'flags' to write a DB record */ - if (unlikely((leaf->mn_flags ^ flags) & F_SUBDATA)) - return MDB_INCOMPATIBLE; - /* overflow page overwrites need special handling */ - if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; - pgno_t pg; - int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); - - memcpy(&pg, olddata.mv_data, sizeof(pg)); - if (unlikely((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0)) - return rc2; - ovpages = omp->mp_pages; - - /* Is the ov page large enough? */ - if (ovpages >= dpages) { - if (!(omp->mp_flags & P_DIRTY) - && (level || (env->me_flags & MDB_WRITEMAP))) { - rc = mdb_page_unspill(mc->mc_txn, omp, &omp); - if (unlikely(rc)) - return rc; - level = 0; /* dirty in this txn or clean */ - } - /* Is it dirty? 
*/ - if (omp->mp_flags & P_DIRTY) { - /* yes, overwrite it. Note in this case we don't - * bother to try shrinking the page if the new data - * is smaller than the overflow threshold. - */ - if (unlikely(level > 1)) { - /* It is writable only in a parent txn */ - MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); - MDB_ID2 id2; - if (unlikely(!np)) - return ENOMEM; - id2.mid = pg; - id2.mptr = np; - /* Note - this page is already counted in parent's dirty_room */ - rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); - mdb_cassert(mc, rc2 == 0); - /* Currently we make the page look as with put() in the - * parent txn, in case the user peeks at MDB_RESERVEd - * or unused parts. Some users treat ovpages specially. - */ - size_t sz = (size_t) env->me_psize * ovpages, off; - if (MDBX_MODE_ENABLED || !(flags & MDB_RESERVE)) { - /* Skip the part where LMDB will put *data. - * Copy end of page, adjusting alignment so - * compiler may copy words instead of bytes. - */ - off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); - memcpy((size_t *)((char *)np + off), - (size_t *)((char *)omp + off), sz - off); - sz = PAGEHDRSZ; - } - memcpy(np, omp, sz); /* Copy whole or header of page */ - omp = np; - } - SETDSZ(leaf, data->mv_size); - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = PAGEDATA(omp); - else - memcpy(PAGEDATA(omp), data->mv_data, data->mv_size); - return MDB_SUCCESS; - } - } - if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) - return rc2; - } else if (data->mv_size == olddata.mv_size) { - /* same size, just replace it. Note that we could - * also reuse this node if the new data is smaller, - * but instead we opt to shrink the node in that case. 
- */ - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = olddata.mv_data; - else if (!(mc->mc_flags & C_SUB)) - memcpy(olddata.mv_data, data->mv_data, data->mv_size); - else { - memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); - goto fix_parent; - } - return MDB_SUCCESS; - } - mdb_node_del(mc, 0); - } - - rdata = data; - -new_sub: - nflags = flags & NODE_ADD_FLAGS; - nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata); - if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { - if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA ) - nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ - if (!insert_key) - nflags |= MDB_SPLIT_REPLACE; - rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags); - } else { - /* There is room already in this leaf page. */ - rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); - if (likely(rc == 0)) { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - unsigned i = mc->mc_top; - MDB_page *mp = mc->mc_pg[i]; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue; - if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { - m3->mc_ki[i]++; - } - if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); - } - } - } - - if (likely(rc == MDB_SUCCESS)) { - /* Now store the actual data in the child DB. Note that we're - * storing the user data in the keys field, so there are strict - * size limits on dupdata. The actual data fields of the child - * DB are all zero size. */ - if (do_sub) { - int xflags; - size_t ecount; -put_sub: - xdata.mv_size = 0; - xdata.mv_data = ""; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (flags & MDB_CURRENT) { - xflags = (flags & MDB_NODUPDATA) ? 
- MDB_CURRENT|MDB_NOOVERWRITE|MDB_NOSPILL : MDB_CURRENT|MDB_NOSPILL; - } else { - mdb_xcursor_init1(mc, leaf); - xflags = (flags & MDB_NODUPDATA) ? - MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; - } - if (sub_root) - mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; - /* converted, write the original data first */ - if (dupdata_flag) { - rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); - if (unlikely(rc)) - goto bad_sub; - /* we've done our job */ - dkey.mv_size = 0; - } - if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2; - MDB_xcursor *mx = mc->mc_xcursor; - unsigned i = mc->mc_top; - MDB_page *mp = mc->mc_pg[i]; - int nkeys = NUMKEYS(mp); - - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (!(m2->mc_flags & C_INITIALIZED)) continue; - if (m2->mc_pg[i] == mp) { - if (m2->mc_ki[i] == mc->mc_ki[i]) { - mdb_xcursor_init2(m2, mx, dupdata_flag); - } else if (!insert_key && m2->mc_ki[i] < nkeys) { - XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]); - } - } - } - } - ecount = mc->mc_xcursor->mx_db.md_entries; - if (flags & MDB_APPENDDUP) - xflags |= MDB_APPEND; - rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); - if (flags & F_SUBDATA) { - void *db = NODEDATA(leaf); - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); - } - insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; - } - /* Increment count unless we just replaced an existing item. */ - if (insert_data) - mc->mc_db->md_entries++; - if (insert_key) { - /* Invalidate txn if we created an empty sub-DB */ - if (unlikely(rc)) - goto bad_sub; - /* If we succeeded and the key didn't exist before, - * make sure the cursor is marked valid. 
*/ - mc->mc_flags |= C_INITIALIZED; - } - if (flags & MDB_MULTIPLE) { - if (!rc) { - mcount++; - /* let caller know how many succeeded, if any */ - data[1].mv_size = mcount; - if (mcount < dcount) { - data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; - insert_key = insert_data = 0; - goto more; - } - } - } - return rc; -bad_sub: - if (unlikely(rc == MDB_KEYEXIST)) /* should not happen, we deleted that item */ - rc = MDB_PROBLEM; - } - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -int -mdb_cursor_del(MDB_cursor *mc, unsigned flags) -{ - MDB_node *leaf; - MDB_page *mp; - int rc; - - if (unlikely(!mc)) - return EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return EINVAL; - - if (unlikely(mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))) - return MDB_NOTFOUND; - - if (unlikely(!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))) - return rc; - - rc = mdb_cursor_touch(mc); - if (unlikely(rc)) - return rc; - - mp = mc->mc_pg[mc->mc_top]; - if (IS_LEAF2(mp)) - goto del_key; - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (flags & MDB_NODUPDATA) { - /* mdb_cursor_del0() will subtract the final entry */ - mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; - } else { - if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { - mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } - rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); - if (unlikely(rc)) - return rc; - /* If sub-DB still has entries, we're done */ - if (mc->mc_xcursor->mx_db.md_entries) { - if (leaf->mn_flags & F_SUBDATA) { - /* update subDB info */ - void *db = NODEDATA(leaf); - memcpy(db, 
&mc->mc_xcursor->mx_db, sizeof(MDB_db)); - } else { - MDB_cursor *m2; - /* shrink fake page */ - mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]); - leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - /* fix other sub-DB cursors pointed at fake pages on this page */ - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (!(m2->mc_flags & C_INITIALIZED)) continue; - if (m2->mc_pg[mc->mc_top] == mp) { - MDB_node *n2 = leaf; - if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) { - n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); - if (n2->mn_flags & F_SUBDATA) continue; - } - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); - } - } - } - mc->mc_db->md_entries--; - return rc; - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; - } - /* otherwise fall thru and delete the sub-DB */ - } - - if (leaf->mn_flags & F_SUBDATA) { - /* add all the child DB's pages to the free list */ - rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); - if (unlikely(rc)) - goto fail; - } - } - /* LMDB passes F_SUBDATA in 'flags' to delete a DB record */ - else if (unlikely((leaf->mn_flags ^ flags) & F_SUBDATA)) { - rc = MDB_INCOMPATIBLE; - goto fail; - } - - /* add overflow pages to free list */ - if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; - pgno_t pg; - - memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if (unlikely((rc = mdb_page_get(mc, pg, &omp, NULL)) || - (rc = mdb_ovpage_free(mc, omp)))) - goto fail; - } - -del_key: - return mdb_cursor_del0(mc); - -fail: - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -/** Allocate and initialize new pages for a database. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc a cursor on the database being added to. - * @param[in] flags flags defining what type of page is being allocated. - * @param[in] num the number of pages to allocate. This is usually 1, - * unless allocating overflow pages for a large record. 
- * @param[out] mp Address of a page, or NULL on failure. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) -{ - MDB_page *np; - int rc; - - if (unlikely((rc = mdb_page_alloc(mc, num, &np, MDBX_ALLOC_ALL)))) - return rc; - mdb_debug("allocated new mpage %zu, page size %u", - np->mp_pgno, mc->mc_txn->mt_env->me_psize); - np->mp_flags = flags | P_DIRTY; - np->mp_lower = (PAGEHDRSZ-PAGEBASE); - np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; - - if (IS_BRANCH(np)) - mc->mc_db->md_branch_pages++; - else if (IS_LEAF(np)) - mc->mc_db->md_leaf_pages++; - else if (IS_OVERFLOW(np)) { - mc->mc_db->md_overflow_pages += num; - np->mp_pages = num; - } - *mp = np; - - return 0; -} - -/** Calculate the size of a leaf node. - * The size depends on the environment's page size; if a data item - * is too large it will be put onto an overflow page and the node - * size will only include the key and not the data. Sizes are always - * rounded up to an even number of bytes, to guarantee 2-byte alignment - * of the #MDB_node headers. - * @param[in] env The environment handle. - * @param[in] key The key for the node. - * @param[in] data The data for the node. - * @return The number of bytes needed to store the node. - */ -static MDBX_INLINE size_t -mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) -{ - size_t sz; - - sz = LEAFSIZE(key, data); - if (sz > env->me_nodemax) { - /* put on overflow page */ - sz -= data->mv_size - sizeof(pgno_t); - } - - return EVEN(sz + sizeof(indx_t)); -} - -/** Calculate the size of a branch node. - * The size should depend on the environment's page size but since - * we currently don't support spilling large keys onto overflow - * pages, it's simply the size of the #MDB_node header plus the - * size of the key. Sizes are always rounded up to an even number - * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. 
- * @param[in] env The environment handle. - * @param[in] key The key for the node. - * @return The number of bytes needed to store the node. - */ -static MDBX_INLINE size_t -mdb_branch_size(MDB_env *env, MDB_val *key) -{ - size_t sz; - - sz = INDXSIZE(key); - if (unlikely(sz > env->me_nodemax)) { - /* put on overflow page */ - /* not implemented */ - mdb_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __FUNCTION__, __LINE__); - sz -= key->mv_size - sizeof(pgno_t); - } - - return sz + sizeof(indx_t); -} - -/** Add a node to the page pointed to by the cursor. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc The cursor for this operation. - * @param[in] indx The index on the page where the new node should be added. - * @param[in] key The key for the new node. - * @param[in] data The data for the new node, if any. - * @param[in] pgno The page number, if adding a branch node. - * @param[in] flags Flags for the node. - * @return 0 on success, non-zero on failure. Possible errors are: - *
 <ul> - *
 <li>ENOMEM - failed to allocate overflow pages for the node. - *
 <li>MDB_PAGE_FULL - there is insufficient room in the page. This error - * should never happen since all callers already calculate the - * page's free space before calling this function. - * </ul>
- */ -static int -mdb_node_add(MDB_cursor *mc, indx_t indx, - MDB_val *key, MDB_val *data, pgno_t pgno, unsigned flags) -{ - unsigned i; - size_t node_size = NODESIZE; - ssize_t room; - indx_t ofs; - MDB_node *node; - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_page *ofp = NULL; /* overflow page */ - void *ndata; - DKBUF; - - mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); - - mdb_debug("add to %s %spage %zu index %i, data size %zu key size %zu [%s]", - IS_LEAF(mp) ? "leaf" : "branch", - IS_SUBP(mp) ? "sub-" : "", - mdb_dbg_pgno(mp), indx, data ? data->mv_size : 0, - key ? key->mv_size : 0, key ? DKEY(key) : "null"); - - if (IS_LEAF2(mp)) { - mdb_cassert(mc, key); - /* Move higher keys up one slot. */ - int ksize = mc->mc_db->md_xsize, dif; - char *ptr = LEAF2KEY(mp, indx, ksize); - dif = NUMKEYS(mp) - indx; - if (dif > 0) - memmove(ptr+ksize, ptr, dif*ksize); - /* insert new key */ - memcpy(ptr, key->mv_data, ksize); - - /* Just using these for counting */ - mp->mp_lower += sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); - return MDB_SUCCESS; - } - - room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); - if (key != NULL) - node_size += key->mv_size; - if (IS_LEAF(mp)) { - mdb_cassert(mc, key && data); - if (unlikely(F_ISSET(flags, F_BIGDATA))) { - /* Data already on overflow page. */ - node_size += sizeof(pgno_t); - } else if (unlikely(node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax)) { - int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); - int rc; - /* Put data on overflow page. 
*/ - mdb_debug("data size is %zu, node would be %zu, put data on overflow page", - data->mv_size, node_size+data->mv_size); - node_size = EVEN(node_size + sizeof(pgno_t)); - if ((ssize_t)node_size > room) - goto full; - if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) - return rc; - mdb_debug("allocated overflow page %zu", ofp->mp_pgno); - flags |= F_BIGDATA; - goto update; - } else { - node_size += data->mv_size; - } - } - node_size = EVEN(node_size); - if (unlikely((ssize_t)node_size > room)) - goto full; - -update: - /* Move higher pointers up one slot. */ - for (i = NUMKEYS(mp); i > indx; i--) - mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; - - /* Adjust free space offsets. */ - ofs = mp->mp_upper - node_size; - mdb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); - mp->mp_ptrs[indx] = ofs; - mp->mp_upper = ofs; - mp->mp_lower += sizeof(indx_t); - - /* Write the node data. */ - node = NODEPTR(mp, indx); - node->mn_ksize = (key == NULL) ? 0 : key->mv_size; - node->mn_flags = flags; - if (IS_LEAF(mp)) - SETDSZ(node,data->mv_size); - else - SETPGNO(node,pgno); - - if (key) - memcpy(NODEKEY(node), key->mv_data, key->mv_size); - - if (IS_LEAF(mp)) { - ndata = NODEDATA(node); - if (unlikely(ofp == NULL)) { - if (unlikely(F_ISSET(flags, F_BIGDATA))) - memcpy(ndata, data->mv_data, sizeof(pgno_t)); - else if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = ndata; - else if (likely(ndata != data->mv_data)) - memcpy(ndata, data->mv_data, data->mv_size); - } else { - memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); - ndata = PAGEDATA(ofp); - if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = ndata; - else if (likely(ndata != data->mv_data)) - memcpy(ndata, data->mv_data, data->mv_size); - } - } - - return MDB_SUCCESS; - -full: - mdb_debug("not enough room in page %zu, got %u ptrs", - mdb_dbg_pgno(mp), NUMKEYS(mp)); - mdb_debug("upper-lower = %u - %u = %zd", mp->mp_upper,mp->mp_lower,room); - mdb_debug("node size = %zu", node_size); - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - 
return MDB_PAGE_FULL; -} - -/** Delete the specified node from a page. - * @param[in] mc Cursor pointing to the node to delete. - * @param[in] ksize The size of a node. Only used if the page is - * part of a #MDB_DUPFIXED database. - */ -static void -mdb_node_del(MDB_cursor *mc, int ksize) -{ - MDB_page *mp = mc->mc_pg[mc->mc_top]; - indx_t indx = mc->mc_ki[mc->mc_top]; - unsigned sz; - indx_t i, j, numkeys, ptr; - MDB_node *node; - char *base; - - mdb_debug("delete node %u on %s page %zu", indx, - IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp)); - numkeys = NUMKEYS(mp); - mdb_cassert(mc, indx < numkeys); - - if (IS_LEAF2(mp)) { - int x = numkeys - 1 - indx; - base = LEAF2KEY(mp, indx, ksize); - if (x) - memmove(base, base + ksize, x * ksize); - mp->mp_lower -= sizeof(indx_t); - mp->mp_upper += ksize - sizeof(indx_t); - return; - } - - node = NODEPTR(mp, indx); - sz = NODESIZE + node->mn_ksize; - if (IS_LEAF(mp)) { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - sz += sizeof(pgno_t); - else - sz += NODEDSZ(node); - } - sz = EVEN(sz); - - ptr = mp->mp_ptrs[indx]; - for (i = j = 0; i < numkeys; i++) { - if (i != indx) { - mp->mp_ptrs[j] = mp->mp_ptrs[i]; - if (mp->mp_ptrs[i] < ptr) - mp->mp_ptrs[j] += sz; - j++; - } - } - - base = (char *)mp + mp->mp_upper + PAGEBASE; - memmove(base + sz, base, ptr - mp->mp_upper); - - mp->mp_lower -= sizeof(indx_t); - mp->mp_upper += sz; -} - -/** Compact the main page after deleting a node on a subpage. - * @param[in] mp The main page to operate on. - * @param[in] indx The index of the subpage on the main page. 
- */ -static void -mdb_node_shrink(MDB_page *mp, indx_t indx) -{ - MDB_node *node; - MDB_page *sp, *xp; - char *base; - indx_t delta, nsize, len, ptr; - int i; - - node = NODEPTR(mp, indx); - sp = (MDB_page *)NODEDATA(node); - delta = SIZELEFT(sp); - nsize = NODEDSZ(node) - delta; - - /* Prepare to shift upward, set len = length(subpage part to shift) */ - if (IS_LEAF2(sp)) { - len = nsize; - if (nsize & 1) - return; /* do not make the node uneven-sized */ - } else { - xp = (MDB_page *)((char *)sp + delta); /* destination subpage */ - for (i = NUMKEYS(sp); --i >= 0; ) - xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; - len = PAGEHDRSZ; - } - sp->mp_upper = sp->mp_lower; - COPY_PGNO(sp->mp_pgno, mp->mp_pgno); - SETDSZ(node, nsize); - - /* Shift upward */ - base = (char *)mp + mp->mp_upper + PAGEBASE; - memmove(base + delta, base, (char *)sp + len - base); - - ptr = mp->mp_ptrs[indx]; - for (i = NUMKEYS(mp); --i >= 0; ) { - if (mp->mp_ptrs[i] <= ptr) - mp->mp_ptrs[i] += delta; - } - mp->mp_upper += delta; -} - -/** Initial setup of a sorted-dups cursor. - * Sorted duplicates are implemented as a sub-database for the given key. - * The duplicate data items are actually keys of the sub-database. - * Operations on the duplicate data items are performed using a sub-cursor - * initialized when the sub-database is first accessed. This function does - * the preliminary setup of the sub-cursor, filling in the fields that - * depend only on the parent DB. - * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. 
- */ -static void -mdb_xcursor_init0(MDB_cursor *mc) -{ - MDB_xcursor *mx = mc->mc_xcursor; - - mx->mx_cursor.mc_xcursor = NULL; - mx->mx_cursor.mc_txn = mc->mc_txn; - mx->mx_cursor.mc_db = &mx->mx_db; - mx->mx_cursor.mc_dbx = &mx->mx_dbx; - mx->mx_cursor.mc_dbi = mc->mc_dbi; - mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; - mx->mx_cursor.mc_snum = 0; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; - mx->mx_dbx.md_name.mv_size = 0; - mx->mx_dbx.md_name.mv_data = NULL; - mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; - mx->mx_dbx.md_dcmp = NULL; - mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; -} - -/** Final setup of a sorted-dups cursor. - * Sets up the fields that depend on the data from the main cursor. - * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized. - * @param[in] node The data containing the #MDB_db record for the - * sorted-dup database. - */ -static void -mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) -{ - MDB_xcursor *mx = mc->mc_xcursor; - - if (node->mn_flags & F_SUBDATA) { - memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); - mx->mx_cursor.mc_pg[0] = 0; - mx->mx_cursor.mc_snum = 0; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; - } else { - MDB_page *fp = NODEDATA(node); - mx->mx_db.md_xsize = 0; - mx->mx_db.md_flags = 0; - mx->mx_db.md_depth = 1; - mx->mx_db.md_branch_pages = 0; - mx->mx_db.md_leaf_pages = 1; - mx->mx_db.md_overflow_pages = 0; - mx->mx_db.md_entries = NUMKEYS(fp); - COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); - mx->mx_cursor.mc_snum = 1; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; - mx->mx_cursor.mc_pg[0] = fp; - mx->mx_cursor.mc_ki[0] = 0; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - mx->mx_db.md_flags = MDB_DUPFIXED; - mx->mx_db.md_xsize = fp->mp_leaf2_ksize; - if (mc->mc_db->md_flags & MDB_INTEGERDUP) - mx->mx_db.md_flags |= MDB_INTEGERKEY; - } - } - mdb_debug("Sub-db -%u root page %zu", mx->mx_cursor.mc_dbi, - mx->mx_db.md_root); - mx->mx_dbflag = 
DB_VALID|DB_USRVALID|DB_DUPDATA; -/* #if UINT_MAX < SIZE_MAX - if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) - mx->mx_dbx.md_cmp = mdb_cmp_clong; -#endif */ -} - -/** Fixup a sorted-dups cursor due to underlying update. - * Sets up some fields that depend on the data from the main cursor. - * Almost the same as init1, but skips initialization steps if the - * xcursor had already been used. - * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. - * @param[in] src_mx The xcursor of an up-to-date cursor. - * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. - */ -static void -mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) -{ - MDB_xcursor *mx = mc->mc_xcursor; - - if (new_dupdata) { - mx->mx_cursor.mc_snum = 1; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags |= C_INITIALIZED; - mx->mx_cursor.mc_ki[0] = 0; - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; - mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; - } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { - return; - } - mx->mx_db = src_mx->mx_db; - mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; - mdb_debug("Sub-db -%u root page %zu", mx->mx_cursor.mc_dbi, - mx->mx_db.md_root); -} - -/** Initialize a cursor for a given transaction and database. 
*/ -static void -mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) -{ - mc->mc_signature = MDBX_MC_SIGNATURE; - mc->mc_next = NULL; - mc->mc_backup = NULL; - mc->mc_dbi = dbi; - mc->mc_txn = txn; - mc->mc_db = &txn->mt_dbs[dbi]; - mc->mc_dbx = &txn->mt_dbxs[dbi]; - mc->mc_dbflag = &txn->mt_dbflags[dbi]; - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_pg[0] = 0; - mc->mc_flags = 0; - mc->mc_ki[0] = 0; - mc->mc_xcursor = NULL; - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { - mdb_tassert(txn, mx != NULL); - mx->mx_cursor.mc_signature = MDBX_MC_SIGNATURE; - mc->mc_xcursor = mx; - mdb_xcursor_init0(mc); - } - if (unlikely(*mc->mc_dbflag & DB_STALE)) { - mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); - } -} - -int -mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) -{ - MDB_cursor *mc; - size_t size = sizeof(MDB_cursor); - - if (unlikely(!ret || !txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) - return EINVAL; - - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) - return EINVAL; - - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) - size += sizeof(MDB_xcursor); - - if (likely((mc = malloc(size)) != NULL)) { - mdb_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); - if (txn->mt_cursors) { - mc->mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = mc; - mc->mc_flags |= C_UNTRACK; - } - } else { - return ENOMEM; - } - - *ret = mc; - - return MDB_SUCCESS; -} - -int -mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) -{ - if (unlikely(!mc || !txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE - && mc->mc_signature != MDBX_MC_READY4CLOSE)) - return EINVAL; - - if (unlikely(!TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID))) - 
return EINVAL; - - if (unlikely(mc->mc_backup)) - return EINVAL; - - if (unlikely((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)) { -#if MDBX_MODE_ENABLED - MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) prev = &(*prev)->mc_next; - if (*prev == mc) - *prev = mc->mc_next; - mc->mc_signature = MDBX_MC_READY4CLOSE; -#else - return EINVAL; -#endif - } - - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); - return MDB_SUCCESS; -} - -/* Return the count of duplicate data items for the current key */ -int -mdb_cursor_count(MDB_cursor *mc, size_t *countp) -{ - if (unlikely(mc == NULL || countp == NULL)) - return EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return EINVAL; - -#if MDBX_MODE_ENABLED - if (!mc->mc_snum) { - *countp = 0; - return MDB_NOTFOUND; - } - - MDB_page *mp = mc->mc_pg[mc->mc_top]; - if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)) { - *countp = 0; - return MDB_NOTFOUND; - } - - *countp = 1; - if (mc->mc_xcursor != NULL) { - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); - *countp = mc->mc_xcursor->mx_db.md_entries; - } - } -#else - if (unlikely(mc->mc_xcursor == NULL)) - return MDB_INCOMPATIBLE; - - if (!mc->mc_snum) - return MDB_NOTFOUND; - - MDB_page *mp = mc->mc_pg[mc->mc_top]; - if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)) - return MDB_NOTFOUND; - - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - *countp = 1; - } else { - if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) - return EINVAL; - *countp = 
mc->mc_xcursor->mx_db.md_entries; - } -#endif /* MDBX_MODE_ENABLED */ - return MDB_SUCCESS; -} - -void -mdb_cursor_close(MDB_cursor *mc) -{ - if (mc) { - mdb_ensure(NULL, mc->mc_signature == MDBX_MC_SIGNATURE - || mc->mc_signature == MDBX_MC_READY4CLOSE); - if (!mc->mc_backup) { - /* Remove from txn, if tracked. - * A read-only txn (!C_UNTRACK) may have been freed already, - * so do not peek inside it. Only write txns track cursors. */ - if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { - MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) prev = &(*prev)->mc_next; - if (*prev == mc) - *prev = mc->mc_next; - } - mc->mc_signature = 0; - free(mc); - } else { - /* cursor closed before nested txn ends */ - mdb_cassert(mc, mc->mc_signature == MDBX_MC_SIGNATURE); - mc->mc_signature = MDBX_MC_WAIT4EOT; - } - } -} - -MDB_txn * -mdb_cursor_txn(MDB_cursor *mc) -{ - if (unlikely(!mc || mc->mc_signature != MDBX_MC_SIGNATURE)) - return NULL; - return mc->mc_txn; -} - -MDB_dbi -mdb_cursor_dbi(MDB_cursor *mc) -{ - if (unlikely(!mc || mc->mc_signature != MDBX_MC_SIGNATURE)) - return INT_MIN; - return mc->mc_dbi; -} - -/** Replace the key for a branch node with a new key. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc Cursor pointing to the node to operate on. - * @param[in] key The new key to use. - * @return 0 on success, non-zero on failure. 
- */ -static int -mdb_update_key(MDB_cursor *mc, MDB_val *key) -{ - MDB_page *mp; - MDB_node *node; - char *base; - size_t len; - int delta, ksize, oksize; - indx_t ptr, i, numkeys, indx; - DKBUF; - - indx = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - node = NODEPTR(mp, indx); - ptr = mp->mp_ptrs[indx]; - { - MDB_val k2; - char kbuf2[DKBUF_MAXKEYSIZE*2+1]; - k2.mv_data = NODEKEY(node); - k2.mv_size = node->mn_ksize; - mdb_debug("update key %u (ofs %u) [%s] to [%s] on page %zu", - indx, ptr, - mdb_dkey(&k2, kbuf2), - DKEY(key), - mp->mp_pgno); - } - - /* Sizes must be 2-byte aligned. */ - ksize = EVEN(key->mv_size); - oksize = EVEN(node->mn_ksize); - delta = ksize - oksize; - - /* Shift node contents if EVEN(key length) changed. */ - if (delta) { - if (delta > 0 && SIZELEFT(mp) < delta) { - pgno_t pgno; - /* not enough space left, do a delete and split */ - mdb_debug("Not enough room, delta = %d, splitting...", delta); - pgno = NODEPGNO(node); - mdb_node_del(mc, 0); - return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); - } - - numkeys = NUMKEYS(mp); - for (i = 0; i < numkeys; i++) { - if (mp->mp_ptrs[i] <= ptr) - mp->mp_ptrs[i] -= delta; - } - - base = (char *)mp + mp->mp_upper + PAGEBASE; - len = ptr - mp->mp_upper + NODESIZE; - memmove(base - delta, base, len); - mp->mp_upper -= delta; - - node = NODEPTR(mp, indx); - } - - /* But even if no shift was needed, update ksize */ - if (node->mn_ksize != key->mv_size) - node->mn_ksize = key->mv_size; - - if (key->mv_size) - memcpy(NODEKEY(node), key->mv_data, key->mv_size); - - return MDB_SUCCESS; -} - -static void -mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); - -/** Perform \b act while tracking temporary cursor \b mn */ -#define WITH_CURSOR_TRACKING(mn, act) do { \ - MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ - if ((mn).mc_flags & C_SUB) { \ - dummy.mc_flags = C_INITIALIZED; \ - dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ - tracked = &dummy; \ - } else { 
\ - tracked = &(mn); \ - } \ - tracked->mc_next = *tp; \ - *tp = tracked; \ - { act; } \ - *tp = tracked->mc_next; \ -} while (0) - -/** Move a node from csrc to cdst. - */ -static int -mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) -{ - MDB_node *srcnode; - MDB_val key, data; - pgno_t srcpg; - MDB_cursor mn; - int rc; - unsigned short flags; - - DKBUF; - - /* Mark src and dst as dirty. */ - if (unlikely((rc = mdb_page_touch(csrc)) || - (rc = mdb_page_touch(cdst)))) - return rc; - - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_xsize; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size); - data.mv_size = 0; - data.mv_data = NULL; - srcpg = 0; - flags = 0; - } else { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); - mdb_cassert(csrc, !((size_t)srcnode & 1)); - srcpg = NODEPGNO(srcnode); - flags = srcnode->mn_flags; - if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - unsigned snum = csrc->mc_snum; - MDB_node *s2; - /* must find the lowest key below src */ - rc = mdb_page_search_lowest(csrc); - if (unlikely(rc)) - return rc; - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_xsize; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); - } else { - s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(s2); - key.mv_data = NODEKEY(s2); - } - csrc->mc_snum = snum--; - csrc->mc_top = snum; - } else { - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - data.mv_size = NODEDSZ(srcnode); - data.mv_data = NODEDATA(srcnode); - } - mn.mc_xcursor = NULL; - if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { - unsigned snum = cdst->mc_snum; - MDB_node *s2; - MDB_val bkey; - /* must find the lowest key below dst */ - mdb_cursor_copy(cdst, &mn); - rc = mdb_page_search_lowest(&mn); - if (unlikely(rc)) - return rc; - if 
(IS_LEAF2(mn.mc_pg[mn.mc_top])) { - bkey.mv_size = mn.mc_db->md_xsize; - bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); - } else { - s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); - bkey.mv_size = NODEKSZ(s2); - bkey.mv_data = NODEKEY(s2); - } - mn.mc_snum = snum--; - mn.mc_top = snum; - mn.mc_ki[snum] = 0; - rc = mdb_update_key(&mn, &bkey); - if (unlikely(rc)) - return rc; - } - - mdb_debug("moving %s node %u [%s] on page %zu to node %u on page %zu", - IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", - csrc->mc_ki[csrc->mc_top], - DKEY(&key), - csrc->mc_pg[csrc->mc_top]->mp_pgno, - cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno); - - /* Add the node to the destination page. */ - rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - - /* Delete the node from the source page. */ - mdb_node_del(csrc, key.mv_size); - - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = csrc->mc_dbi; - MDB_page *mpd, *mps; - - mps = csrc->mc_pg[csrc->mc_top]; - /* If we're adding on the left, bump others up */ - if (fromleft) { - mpd = cdst->mc_pg[csrc->mc_top]; - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) - continue; - if (m3 != cdst && - m3->mc_pg[csrc->mc_top] == mpd && - m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { - m3->mc_ki[csrc->mc_top]++; - } - if (m3 !=csrc && - m3->mc_pg[csrc->mc_top] == mps && - m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; - m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - m3->mc_ki[csrc->mc_top-1]++; - } - if (XCURSOR_INITED(m3) && IS_LEAF(mps)) - XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); - } - } else - /* Adding on the right, bump others down 
*/ - { - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == csrc) continue; - if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) - continue; - if (m3->mc_pg[csrc->mc_top] == mps) { - if (!m3->mc_ki[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; - m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - m3->mc_ki[csrc->mc_top-1]--; - } else { - m3->mc_ki[csrc->mc_top]--; - } - if (XCURSOR_INITED(m3) && IS_LEAF(mps)) - XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); - } - } - } - } - - /* Update the parent separators. */ - if (csrc->mc_ki[csrc->mc_top] == 0) { - if (csrc->mc_ki[csrc->mc_top-1] != 0) { - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); - } else { - srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - mdb_debug("update separator for source page %zu to [%s]", - csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)); - mdb_cursor_copy(csrc, &mn); - mn.mc_snum--; - mn.mc_top--; - /* We want mdb_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdb_update_key(&mn, &key)); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - MDB_val nullkey; - indx_t ix = csrc->mc_ki[csrc->mc_top]; - nullkey.mv_size = 0; - csrc->mc_ki[csrc->mc_top] = 0; - rc = mdb_update_key(csrc, &nullkey); - csrc->mc_ki[csrc->mc_top] = ix; - mdb_cassert(csrc, rc == MDB_SUCCESS); - } - } - - if (cdst->mc_ki[cdst->mc_top] == 0) { - if (cdst->mc_ki[cdst->mc_top-1] != 0) { - if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); - } else { - srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); - } - 
mdb_debug("update separator for destination page %zu to [%s]", - cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)); - mdb_cursor_copy(cdst, &mn); - mn.mc_snum--; - mn.mc_top--; - /* We want mdb_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdb_update_key(&mn, &key)); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { - MDB_val nullkey; - indx_t ix = cdst->mc_ki[cdst->mc_top]; - nullkey.mv_size = 0; - cdst->mc_ki[cdst->mc_top] = 0; - rc = mdb_update_key(cdst, &nullkey); - cdst->mc_ki[cdst->mc_top] = ix; - mdb_cassert(cdst, rc == MDB_SUCCESS); - } - } - - return MDB_SUCCESS; -} - -/** Merge one page into another. - * The nodes from the page pointed to by \b csrc will - * be copied to the page pointed to by \b cdst and then - * the \b csrc page will be freed. - * @param[in] csrc Cursor pointing to the source page. - * @param[in] cdst Cursor pointing to the destination page. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) -{ - MDB_page *psrc, *pdst; - MDB_node *srcnode; - MDB_val key, data; - unsigned nkeys; - int rc; - indx_t i, j; - - psrc = csrc->mc_pg[csrc->mc_top]; - pdst = cdst->mc_pg[cdst->mc_top]; - - mdb_debug("merging page %zu into %zu", psrc->mp_pgno, pdst->mp_pgno); - - mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ - mdb_cassert(csrc, cdst->mc_snum > 1); - - /* Mark dst as dirty. */ - if (unlikely(rc = mdb_page_touch(cdst))) - return rc; - - /* get dst page again now that we've touched it. */ - pdst = cdst->mc_pg[cdst->mc_top]; - - /* Move all nodes from src to dst. 
- */ - j = nkeys = NUMKEYS(pdst); - if (IS_LEAF2(psrc)) { - key.mv_size = csrc->mc_db->md_xsize; - key.mv_data = PAGEDATA(psrc); - for (i = 0; i < NUMKEYS(psrc); i++, j++) { - rc = mdb_node_add(cdst, j, &key, NULL, 0, 0); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - key.mv_data = (char *)key.mv_data + key.mv_size; - } - } else { - for (i = 0; i < NUMKEYS(psrc); i++, j++) { - srcnode = NODEPTR(psrc, i); - if (i == 0 && IS_BRANCH(psrc)) { - MDB_cursor mn; - MDB_node *s2; - mdb_cursor_copy(csrc, &mn); - mn.mc_xcursor = NULL; - /* must find the lowest key below src */ - rc = mdb_page_search_lowest(&mn); - if (unlikely(rc)) - return rc; - if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { - key.mv_size = mn.mc_db->md_xsize; - key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); - } else { - s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); - key.mv_size = NODEKSZ(s2); - key.mv_data = NODEKEY(s2); - } - } else { - key.mv_size = srcnode->mn_ksize; - key.mv_data = NODEKEY(srcnode); - } - - data.mv_size = NODEDSZ(srcnode); - data.mv_data = NODEDATA(srcnode); - rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - } - - mdb_debug("dst page %zu now has %u keys (%.1f%% filled)", - pdst->mp_pgno, NUMKEYS(pdst), - (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10); - - /* Unlink the src page from parent and add to free list. - */ - csrc->mc_top--; - mdb_node_del(csrc, 0); - if (csrc->mc_ki[csrc->mc_top] == 0) { - key.mv_size = 0; - rc = mdb_update_key(csrc, &key); - if (unlikely(rc)) { - csrc->mc_top++; - return rc; - } - } - csrc->mc_top++; - - psrc = csrc->mc_pg[csrc->mc_top]; - /* If not operating on FreeDB, allow this page to be reused - * in this txn. Otherwise just add to free list. 
- */ - rc = mdb_page_loose(csrc, psrc); - if (unlikely(rc)) - return rc; - if (IS_LEAF(psrc)) - csrc->mc_db->md_leaf_pages--; - else - csrc->mc_db->md_branch_pages--; - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = csrc->mc_dbi; - unsigned top = csrc->mc_top; - - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (csrc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == csrc) continue; - if (m3->mc_snum < csrc->mc_snum) continue; - if (m3->mc_pg[top] == psrc) { - m3->mc_pg[top] = pdst; - m3->mc_ki[top] += nkeys; - m3->mc_ki[top-1] = cdst->mc_ki[top-1]; - } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] && - m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { - m3->mc_ki[top-1]--; - } - if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) - XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]); - } - } - { - unsigned snum = cdst->mc_snum; - uint16_t depth = cdst->mc_db->md_depth; - mdb_cursor_pop(cdst); - rc = mdb_rebalance(cdst); - /* Did the tree height change? */ - if (depth != cdst->mc_db->md_depth) - snum += cdst->mc_db->md_depth - depth; - cdst->mc_snum = snum; - cdst->mc_top = snum-1; - } - return rc; -} - -/** Copy the contents of a cursor. - * @param[in] csrc The cursor to copy from. - * @param[out] cdst The cursor to copy to. - */ -static void -mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) -{ - unsigned i; - - cdst->mc_txn = csrc->mc_txn; - cdst->mc_dbi = csrc->mc_dbi; - cdst->mc_db = csrc->mc_db; - cdst->mc_dbx = csrc->mc_dbx; - cdst->mc_snum = csrc->mc_snum; - cdst->mc_top = csrc->mc_top; - cdst->mc_flags = csrc->mc_flags; - - for (i=0; imc_snum; i++) { - cdst->mc_pg[i] = csrc->mc_pg[i]; - cdst->mc_ki[i] = csrc->mc_ki[i]; - } -} - -/** Rebalance the tree after a delete operation. - * @param[in] mc Cursor pointing to the page where rebalancing - * should begin. - * @return 0 on success, non-zero on failure. 
- */ -static int -mdb_rebalance(MDB_cursor *mc) -{ - MDB_node *node; - int rc, fromleft; - unsigned ptop, minkeys, thresh; - MDB_cursor mn; - indx_t oldki; - - if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { - minkeys = 2; - thresh = 1; - } else { - minkeys = 1; - thresh = FILL_THRESHOLD; - } - mdb_debug("rebalancing %s page %zu (has %u keys, %.1f%% full)", - IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", - mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), - (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10); - - if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && - NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { - mdb_debug("no need to rebalance page %zu, above fill threshold", - mdb_dbg_pgno(mc->mc_pg[mc->mc_top])); - return MDB_SUCCESS; - } - - if (mc->mc_snum < 2) { - MDB_page *mp = mc->mc_pg[0]; - if (IS_SUBP(mp)) { - mdb_debug("Can't rebalance a subpage, ignoring"); - return MDB_SUCCESS; - } - if (NUMKEYS(mp) == 0) { - mdb_debug("tree is completely empty"); - mc->mc_db->md_root = P_INVALID; - mc->mc_db->md_depth = 0; - mc->mc_db->md_leaf_pages = 0; - rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); - if (unlikely(rc)) - return rc; - /* Adjust cursors pointing to mp */ - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_flags &= ~C_INITIALIZED; - { - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) - continue; - if (m3->mc_pg[0] == mp) { - m3->mc_snum = 0; - m3->mc_top = 0; - m3->mc_flags &= ~C_INITIALIZED; - } - } - } - } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { - int i; - mdb_debug("collapsing root page!"); - rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); - if (unlikely(rc)) - return rc; - mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); - rc = mdb_page_get(mc, mc->mc_db->md_root, 
&mc->mc_pg[0], NULL); - if (unlikely(rc)) - return rc; - mc->mc_db->md_depth--; - mc->mc_db->md_branch_pages--; - mc->mc_ki[0] = mc->mc_ki[1]; - for (i = 1; imc_db->md_depth; i++) { - mc->mc_pg[i] = mc->mc_pg[i+1]; - mc->mc_ki[i] = mc->mc_ki[i+1]; - } - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc) continue; - if (!(m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_pg[0] == mp) { - for (i=0; imc_db->md_depth; i++) { - m3->mc_pg[i] = m3->mc_pg[i+1]; - m3->mc_ki[i] = m3->mc_ki[i+1]; - } - m3->mc_snum--; - m3->mc_top--; - } - } - } - } else - mdb_debug("root page doesn't need rebalancing"); - return MDB_SUCCESS; - } - - /* The parent (branch page) must have at least 2 pointers, - * otherwise the tree is invalid. - */ - ptop = mc->mc_top-1; - mdb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); - - /* Leaf page fill factor is below the threshold. - * Try to move keys from left or right neighbor, or - * merge with a neighbor page. - */ - - /* Find neighbors. - */ - mdb_cursor_copy(mc, &mn); - mn.mc_xcursor = NULL; - - oldki = mc->mc_ki[mc->mc_top]; - if (mc->mc_ki[ptop] == 0) { - /* We're the leftmost leaf in our parent. - */ - mdb_debug("reading right neighbor"); - mn.mc_ki[ptop]++; - node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); - if (unlikely(rc)) - return rc; - mn.mc_ki[mn.mc_top] = 0; - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); - fromleft = 0; - } else { - /* There is at least one neighbor to the left. 
- */ - mdb_debug("reading left neighbor"); - mn.mc_ki[ptop]--; - node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); - if (unlikely(rc)) - return rc; - mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; - mc->mc_ki[mc->mc_top] = 0; - fromleft = 1; - } - - mdb_debug("found neighbor page %zu (%u keys, %.1f%% full)", - mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), - (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10); - - /* If the neighbor page is above threshold and has enough keys, - * move one key from it. Otherwise we should try to merge them. - * (A branch page must never have less than 2 keys.) - */ - if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { - rc = mdb_node_move(&mn, mc, fromleft); - if (fromleft) { - /* if we inserted on left, bump position up */ - oldki++; - } - } else { - if (!fromleft) { - rc = mdb_page_merge(&mn, mc); - } else { - oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); - mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; - /* We want mdb_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, - rc = mdb_page_merge(mc, &mn)); - mdb_cursor_copy(&mn, mc); - } - mc->mc_flags &= ~C_EOF; - } - mc->mc_ki[mc->mc_top] = oldki; - return rc; -} - -/** Complete a delete operation started by #mdb_cursor_del(). */ -static int -mdb_cursor_del0(MDB_cursor *mc) -{ - int rc; - MDB_page *mp; - indx_t ki; - unsigned nkeys; - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - - ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - mdb_node_del(mc, mc->mc_db->md_xsize); - mc->mc_db->md_entries--; - { - /* Adjust other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (! 
(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3 == mc || m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] == ki) { - m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDB_DUPSORT) { - /* Sub-cursor referred into dataset which is gone */ - m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - } - continue; - } else if (m3->mc_ki[mc->mc_top] > ki) { - m3->mc_ki[mc->mc_top]--; - } - if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - } - } - } - rc = mdb_rebalance(mc); - - if (likely(rc == MDB_SUCCESS)) { - /* DB is totally empty now, just bail out. - * Other cursors adjustments were already done - * by mdb_rebalance and aren't needed here. - */ - if (!mc->mc_snum) - return rc; - - mp = mc->mc_pg[mc->mc_top]; - nkeys = NUMKEYS(mp); - - /* Adjust other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdb_cursor_sibling(m3, 1); - if (rc == MDB_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDB_SUCCESS; - continue; - } - } - if (mc->mc_db->md_flags & MDB_DUPSORT) { - MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node is a fake page, it needs to be reinited - * because its data has moved. But just reset mc_pg[0] - * if the xcursor is already live. 
- */ - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - else - mdb_xcursor_init1(m3, node); - } - } - } - } - } - mc->mc_flags |= C_DEL; - } - - if (unlikely(rc)) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -int -mdb_del(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data) -{ - if (unlikely(!key || !txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - -#if ! MDBX_MODE_ENABLED - if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { - /* must ignore any data */ - data = NULL; - } -#endif - - return mdb_del0(txn, dbi, key, data, 0); -} - -static int -mdb_del0(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data, unsigned flags) -{ - MDB_cursor mc; - MDB_xcursor mx; - MDB_cursor_op op; - MDB_val rdata; - int rc, exact = 0; - DKBUF; - - mdb_debug("====> delete db %u key [%s]", dbi, DKEY(key)); - - mdb_cursor_init(&mc, txn, dbi, &mx); - - if (data) { - op = MDB_GET_BOTH; - rdata = *data; - data = &rdata; - } else { - op = MDB_SET; - flags |= MDB_NODUPDATA; - } - rc = mdb_cursor_set(&mc, key, data, op, &exact); - if (likely(rc == 0)) { - /* let mdb_page_split know about this cursor if needed: - * delete will trigger a rebalance; if it needs to move - * a node from one page to another, it will have to - * update the parent's separator key(s). If the new sepkey - * is larger than the current one, the parent page may - * run out of space, triggering a split. We need this - * cursor to be consistent until the end of the rebalance. 
- */ - mc.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &mc; - rc = mdb_cursor_del(&mc, flags); - txn->mt_cursors[dbi] = mc.mc_next; - } - return rc; -} - -/** Split a page and insert a new node. - * Set #MDB_TXN_ERROR on failure. - * @param[in,out] mc Cursor pointing to the page and desired insertion index. - * The cursor will be updated to point to the actual page and index where - * the node got inserted after the split. - * @param[in] newkey The key for the newly inserted node. - * @param[in] newdata The data for the newly inserted node. - * @param[in] newpgno The page number, if the new node is a branch node. - * @param[in] nflags The #NODE_ADD_FLAGS for the new node. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, - unsigned nflags) -{ - unsigned flags; - int rc = MDB_SUCCESS, new_root = 0, did_split = 0; - indx_t newindx; - pgno_t pgno = 0; - int i, j, split_indx, nkeys, pmax; - MDB_env *env = mc->mc_txn->mt_env; - MDB_node *node; - MDB_val sepkey, rkey, xdata, *rdata = &xdata; - MDB_page *copy = NULL; - MDB_page *mp, *rp, *pp; - int ptop; - MDB_cursor mn; - DKBUF; - - mp = mc->mc_pg[mc->mc_top]; - newindx = mc->mc_ki[mc->mc_top]; - nkeys = NUMKEYS(mp); - - mdb_debug("-----> splitting %s page %zu and adding [%s] at index %i/%i", - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, - DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys); - - /* Create a right sibling. */ - if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) - return rc; - rp->mp_leaf2_ksize = mp->mp_leaf2_ksize; - mdb_debug("new right sibling: page %zu", rp->mp_pgno); - - /* Usually when splitting the root page, the cursor - * height is 1. But when called from mdb_update_key, - * the cursor height may be greater because it walks - * up the stack while finding the branch slot to update. 
- */ - if (mc->mc_top < 1) { - if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) - goto done; - /* shift current top to make room for new parent */ - for (i=mc->mc_snum; i>0; i--) { - mc->mc_pg[i] = mc->mc_pg[i-1]; - mc->mc_ki[i] = mc->mc_ki[i-1]; - } - mc->mc_pg[0] = pp; - mc->mc_ki[0] = 0; - mc->mc_db->md_root = pp->mp_pgno; - mdb_debug("root split! new root = %zu", pp->mp_pgno); - new_root = mc->mc_db->md_depth++; - - /* Add left (implicit) pointer. */ - if (unlikely((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS)) { - /* undo the pre-push */ - mc->mc_pg[0] = mc->mc_pg[1]; - mc->mc_ki[0] = mc->mc_ki[1]; - mc->mc_db->md_root = mp->mp_pgno; - mc->mc_db->md_depth--; - goto done; - } - mc->mc_snum++; - mc->mc_top++; - ptop = 0; - } else { - ptop = mc->mc_top-1; - mdb_debug("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno); - } - - mdb_cursor_copy(mc, &mn); - mn.mc_xcursor = NULL; - mn.mc_pg[mn.mc_top] = rp; - mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; - - if (nflags & MDB_APPEND) { - mn.mc_ki[mn.mc_top] = 0; - sepkey = *newkey; - split_indx = newindx; - nkeys = 0; - } else { - split_indx = (nkeys+1) / 2; - - if (IS_LEAF2(rp)) { - char *split, *ins; - int x; - unsigned lsize, rsize, ksize; - /* Move half of the keys to the right sibling */ - x = mc->mc_ki[mc->mc_top] - split_indx; - ksize = mc->mc_db->md_xsize; - split = LEAF2KEY(mp, split_indx, ksize); - rsize = (nkeys - split_indx) * ksize; - lsize = (nkeys - split_indx) * sizeof(indx_t); - mp->mp_lower -= lsize; - rp->mp_lower += lsize; - mp->mp_upper += rsize - lsize; - rp->mp_upper -= rsize - lsize; - sepkey.mv_size = ksize; - if (newindx == split_indx) { - sepkey.mv_data = newkey->mv_data; - } else { - sepkey.mv_data = split; - } - if (x<0) { - ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); - memcpy(rp->mp_ptrs, split, rsize); - sepkey.mv_data = rp->mp_ptrs; - memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); - memcpy(ins, newkey->mv_data, ksize); - mp->mp_lower += 
sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); - } else { - if (x) - memcpy(rp->mp_ptrs, split, x * ksize); - ins = LEAF2KEY(rp, x, ksize); - memcpy(ins, newkey->mv_data, ksize); - memcpy(ins+ksize, split + x * ksize, rsize - x * ksize); - rp->mp_lower += sizeof(indx_t); - rp->mp_upper -= ksize - sizeof(indx_t); - mc->mc_ki[mc->mc_top] = x; - } - } else { - int psize, nsize, k; - /* Maximum free space in an empty page */ - pmax = env->me_psize - PAGEHDRSZ; - if (IS_LEAF(mp)) - nsize = mdb_leaf_size(env, newkey, newdata); - else - nsize = mdb_branch_size(env, newkey); - nsize = EVEN(nsize); - - /* grab a page to hold a temporary copy */ - copy = mdb_page_malloc(mc->mc_txn, 1); - if (unlikely(copy == NULL)) { - rc = ENOMEM; - goto done; - } - copy->mp_pgno = mp->mp_pgno; - copy->mp_flags = mp->mp_flags; - copy->mp_lower = (PAGEHDRSZ-PAGEBASE); - copy->mp_upper = env->me_psize - PAGEBASE; - - /* prepare to insert */ - for (i=0, j=0; imp_ptrs[j++] = 0; - } - copy->mp_ptrs[j++] = mp->mp_ptrs[i]; - } - - /* When items are relatively large the split point needs - * to be checked, because being off-by-one will make the - * difference between success or failure in mdb_node_add. - * - * It's also relevant if a page happens to be laid out - * such that one half of its nodes are all "small" and - * the other half of its nodes are "large." If the new - * item is also "large" and falls on the half with - * "large" nodes, it also may not fit. - * - * As a final tweak, if the new item goes on the last - * spot on the page (and thus, onto the new page), bias - * the split so the new page is emptier than the old page. - * This yields better packing during sequential inserts. - */ - if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { - /* Find split point */ - psize = 0; - if (newindx <= split_indx || newindx >= nkeys) { - i = 0; j = 1; - k = newindx >= nkeys ? 
nkeys : split_indx+1+IS_LEAF(mp); - } else { - i = nkeys; j = -1; - k = split_indx-1; - } - for (; i!=k; i+=j) { - if (i == newindx) { - psize += nsize; - node = NULL; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); - psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); - if (IS_LEAF(mp)) { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - psize += sizeof(pgno_t); - else - psize += NODEDSZ(node); - } - psize = EVEN(psize); - } - if (psize > pmax || i == k-j) { - split_indx = i + (j<0); - break; - } - } - } - if (split_indx == newindx) { - sepkey.mv_size = newkey->mv_size; - sepkey.mv_data = newkey->mv_data; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); - sepkey.mv_size = node->mn_ksize; - sepkey.mv_data = NODEKEY(node); - } - } - } - - mdb_debug("separator is %d [%s]", split_indx, DKEY(&sepkey)); - - /* Copy separator key to the parent. */ - if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { - int snum = mc->mc_snum; - mn.mc_snum--; - mn.mc_top--; - did_split = 1; - /* We want other splits to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); - if (unlikely(rc != MDB_SUCCESS)) - goto done; - - /* root split? */ - if (mc->mc_snum > snum) { - ptop++; - } - /* Right page might now have changed parent. - * Check if left page also changed parent. 
- */ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; imc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - mc->mc_pg[ptop] = mn.mc_pg[ptop]; - if (mn.mc_ki[ptop]) { - mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; - } else { - /* find right page's left sibling */ - mc->mc_ki[ptop] = mn.mc_ki[ptop]; - rc = mdb_cursor_sibling(mc, 0); - } - } - } else { - mn.mc_top--; - rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); - mn.mc_top++; - } - if (unlikely(rc != MDB_SUCCESS)) { - if (rc == MDB_NOTFOUND) /* improper mdb_cursor_sibling() result */ - rc = MDB_PROBLEM; - goto done; - } - if (nflags & MDB_APPEND) { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[mc->mc_top] = 0; - rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags); - if (rc) - goto done; - for (i=0; imc_top; i++) - mc->mc_ki[i] = mn.mc_ki[i]; - } else if (!IS_LEAF2(mp)) { - /* Move nodes */ - mc->mc_pg[mc->mc_top] = rp; - i = split_indx; - j = 0; - do { - if (i == newindx) { - rkey.mv_data = newkey->mv_data; - rkey.mv_size = newkey->mv_size; - if (IS_LEAF(mp)) { - rdata = newdata; - } else - pgno = newpgno; - flags = nflags; - /* Update index for the new key. */ - mc->mc_ki[mc->mc_top] = j; - } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); - rkey.mv_data = NODEKEY(node); - rkey.mv_size = node->mn_ksize; - if (IS_LEAF(mp)) { - xdata.mv_data = NODEDATA(node); - xdata.mv_size = NODEDSZ(node); - rdata = &xdata; - } else - pgno = NODEPGNO(node); - flags = node->mn_flags; - } - - if (!IS_LEAF(mp) && j == 0) { - /* First branch index doesn't need key data. 
*/ - rkey.mv_size = 0; - } - - rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags); - if (rc) - goto done; - if (i == nkeys) { - i = 0; - j = 0; - mc->mc_pg[mc->mc_top] = copy; - } else { - i++; - j++; - } - } while (i != split_indx); - - nkeys = NUMKEYS(copy); - for (i=0; imp_ptrs[i] = copy->mp_ptrs[i]; - mp->mp_lower = copy->mp_lower; - mp->mp_upper = copy->mp_upper; - memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1), - env->me_psize - copy->mp_upper - PAGEBASE); - - /* reset back to original page */ - if (newindx < split_indx) { - mc->mc_pg[mc->mc_top] = mp; - } else { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[ptop]++; - /* Make sure mc_ki is still valid. - */ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; i<=ptop; i++) { - mc->mc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - } - } - if (nflags & MDB_RESERVE) { - node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!(node->mn_flags & F_BIGDATA)) - newdata->mv_data = NODEDATA(node); - } - } else { - if (newindx >= split_indx) { - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[ptop]++; - /* Make sure mc_ki is still valid. 
- */ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { - for (i=0; i<=ptop; i++) { - mc->mc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - } - } - } - - { - /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; - nkeys = NUMKEYS(mp); - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (mc->mc_flags & C_SUB) - m3 = &m2->mc_xcursor->mx_cursor; - else - m3 = m2; - if (m3 == mc) - continue; - if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (new_root) { - int k; - /* sub cursors may be on different DB */ - if (m3->mc_pg[0] != mp) - continue; - /* root split */ - for (k=new_root; k>=0; k--) { - m3->mc_ki[k+1] = m3->mc_ki[k]; - m3->mc_pg[k+1] = m3->mc_pg[k]; - } - if (m3->mc_ki[0] >= nkeys) { - m3->mc_ki[0] = 1; - } else { - m3->mc_ki[0] = 0; - } - m3->mc_pg[0] = mc->mc_pg[0]; - m3->mc_snum++; - m3->mc_top++; - } - if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) - m3->mc_ki[mc->mc_top]++; - if (m3->mc_ki[mc->mc_top] >= nkeys) { - m3->mc_pg[mc->mc_top] = rp; - m3->mc_ki[mc->mc_top] -= nkeys; - for (i=0; imc_top; i++) { - m3->mc_ki[i] = mn.mc_ki[i]; - m3->mc_pg[i] = mn.mc_pg[i]; - } - } - } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && - m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { - m3->mc_ki[ptop]++; - } - if (XCURSOR_INITED(m3) && IS_LEAF(mp)) - XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - } - } - mdb_debug("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)); - -done: - if (copy) /* tmp page */ - mdb_page_free(env, copy); - if (unlikely(rc)) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return rc; -} - -int -mdb_put(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data, unsigned flags) -{ - MDB_cursor mc; - MDB_xcursor mx; - - if (unlikely(!key || !data || !txn)) - return EINVAL; - - if 
(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - if (unlikely(flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP - /* LY: MDB_CURRENT indicates explicit overwrite (update) for MDBX */ - | (MDBX_MODE_ENABLED ? MDB_CURRENT : 0)))) - return EINVAL; - - if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - - mdb_cursor_init(&mc, txn, dbi, &mx); - mc.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &mc; - int rc = MDB_SUCCESS; -#if MDBX_MODE_ENABLED - /* LY: support for update (explicit overwrite) */ - if (flags & MDB_CURRENT) { - rc = mdb_cursor_get(&mc, key, NULL, MDB_SET); - if (likely(rc == MDB_SUCCESS) && (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)) { - /* LY: allows update (explicit overwrite) only for unique keys */ - MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_tassert(txn, XCURSOR_INITED(&mc) && mc.mc_xcursor->mx_db.md_entries > 1); - rc = MDBX_EMULTIVAL; - } - } - } -#endif /* MDBX_MODE_ENABLED */ - if (likely(rc == MDB_SUCCESS)) - rc = mdb_cursor_put(&mc, key, data, flags); - txn->mt_cursors[dbi] = mc.mc_next; - - return rc; -} - -#ifndef MDB_WBUF -# define MDB_WBUF (1024*1024) -#endif -#define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ - - /** State needed for a double-buffering compacting copy. */ -typedef struct mdb_copy { - MDB_env *mc_env; - MDB_txn *mc_txn; - pthread_mutex_t mc_mutex; - pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ - char *mc_wbuf[2]; - char *mc_over[2]; - int mc_wlen[2]; - int mc_olen[2]; - pgno_t mc_next_pgno; - HANDLE mc_fd; - int mc_toggle; /**< Buffer number in provider */ - int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ - /** Error code. Never cleared if set. 
Both threads can set nonzero - * to fail the copy. Not mutex-protected, LMDB expects atomic int. - */ - volatile int mc_error; -} mdb_copy; - - /** Dedicated writer thread for compacting copy. */ -static void* __cold -mdb_env_copythr(void *arg) -{ - mdb_copy *my = arg; - char *ptr; - int toggle = 0, wsize, rc = 0; - int len; - -#ifdef SIGPIPE - sigset_t set; - sigemptyset(&set); - sigaddset(&set, SIGPIPE); - if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) - my->mc_error = rc; -#endif - - pthread_mutex_lock(&my->mc_mutex); - for(;;) { - while (!my->mc_new) - pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ - break; - wsize = my->mc_wlen[toggle]; - ptr = my->mc_wbuf[toggle]; -again: - rc = MDB_SUCCESS; - while (wsize > 0 && !my->mc_error) { - len = write(my->mc_fd, ptr, wsize); - if (len < 0) { - rc = errno; -#ifdef SIGPIPE - if (rc == EPIPE) { - /* Collect the pending SIGPIPE, otherwise at least OS X - * gives it to the process on thread-exit (ITS#8504). - */ - int tmp; - sigwait(&set, &tmp); - } -#endif - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - wsize -= len; - continue; - } else { - rc = EIO; - break; - } - } - if (rc) { - my->mc_error = rc; - } - /* If there's an overflow page tail, write it too */ - if (my->mc_olen[toggle]) { - wsize = my->mc_olen[toggle]; - ptr = my->mc_over[toggle]; - my->mc_olen[toggle] = 0; - goto again; - } - my->mc_wlen[toggle] = 0; - toggle ^= 1; - /* Return the empty buffer to provider */ - my->mc_new--; - pthread_cond_signal(&my->mc_cond); - } - pthread_mutex_unlock(&my->mc_mutex); - return NULL; -} - - /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. - * - * @param[in] my control structure. - * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). 
- */ -static int __cold -mdb_env_cthr_toggle(mdb_copy *my, int adjust) -{ - pthread_mutex_lock(&my->mc_mutex); - my->mc_new += adjust; - pthread_cond_signal(&my->mc_cond); - while (my->mc_new & 2) /* both buffers in use */ - pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - pthread_mutex_unlock(&my->mc_mutex); - - my->mc_toggle ^= (adjust & 1); - /* Both threads reset mc_wlen, to be safe from threading errors */ - my->mc_wlen[my->mc_toggle] = 0; - return my->mc_error; -} - - /** Depth-first tree traversal for compacting copy. - * @param[in] my control structure. - * @param[in,out] pg database root. - * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. - */ -static int __cold -mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) -{ - MDB_cursor mc; - MDB_node *ni; - MDB_page *mo, *mp, *leaf; - char *buf, *ptr; - int rc, toggle; - unsigned i; - - /* Empty DB, nothing to do */ - if (*pg == P_INVALID) - return MDB_SUCCESS; - - memset(&mc, 0, sizeof(mc)); - mc.mc_snum = 1; - mc.mc_txn = my->mc_txn; - - rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); - if (rc) - return rc; - rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); - if (rc) - return rc; - - /* Make cursor pages writable */ - buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); - if (buf == NULL) - return ENOMEM; - - for (i=0; imc_env->me_psize); - mc.mc_pg[i] = (MDB_page *)ptr; - ptr += my->mc_env->me_psize; - } - - /* This is writable space for a leaf page. Usually not needed. 
*/ - leaf = (MDB_page *)ptr; - - toggle = my->mc_toggle; - while (mc.mc_snum > 0) { - unsigned n; - mp = mc.mc_pg[mc.mc_top]; - n = NUMKEYS(mp); - - if (IS_LEAF(mp)) { - if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { - for (i=0; imn_flags & F_BIGDATA) { - MDB_page *omp; - pgno_t pg; - - /* Need writable leaf */ - if (mp != leaf) { - mc.mc_pg[mc.mc_top] = leaf; - mdb_page_copy(leaf, mp, my->mc_env->me_psize); - mp = leaf; - ni = NODEPTR(mp, i); - } - - memcpy(&pg, NODEDATA(ni), sizeof(pg)); - memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); - rc = mdb_page_get(&mc, pg, &omp, NULL); - if (rc) - goto done; - if (my->mc_wlen[toggle] >= MDB_WBUF) { - rc = mdb_env_cthr_toggle(my, 1); - if (rc) - goto done; - toggle = my->mc_toggle; - } - mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - memcpy(mo, omp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno; - my->mc_next_pgno += omp->mp_pages; - my->mc_wlen[toggle] += my->mc_env->me_psize; - if (omp->mp_pages > 1) { - my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); - my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; - rc = mdb_env_cthr_toggle(my, 1); - if (rc) - goto done; - toggle = my->mc_toggle; - } - } else if (ni->mn_flags & F_SUBDATA) { - MDB_db db; - - /* Need writable leaf */ - if (mp != leaf) { - mc.mc_pg[mc.mc_top] = leaf; - mdb_page_copy(leaf, mp, my->mc_env->me_psize); - mp = leaf; - ni = NODEPTR(mp, i); - } - - memcpy(&db, NODEDATA(ni), sizeof(db)); - my->mc_toggle = toggle; - rc = mdb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA); - if (rc) - goto done; - toggle = my->mc_toggle; - memcpy(NODEDATA(ni), &db, sizeof(db)); - } - } - } - } else { - mc.mc_ki[mc.mc_top]++; - if (mc.mc_ki[mc.mc_top] < n) { - pgno_t pg; -again: - ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); - pg = NODEPGNO(ni); - rc = mdb_page_get(&mc, pg, &mp, NULL); - if (rc) - goto done; - mc.mc_top++; - mc.mc_snum++; - mc.mc_ki[mc.mc_top] = 0; - if (IS_BRANCH(mp)) { - /* Whenever we advance to a 
sibling branch page, - * we must proceed all the way down to its first leaf. - */ - mdb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize); - goto again; - } else - mc.mc_pg[mc.mc_top] = mp; - continue; - } - } - if (my->mc_wlen[toggle] >= MDB_WBUF) { - rc = mdb_env_cthr_toggle(my, 1); - if (rc) - goto done; - toggle = my->mc_toggle; - } - mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - mdb_page_copy(mo, mp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno++; - my->mc_wlen[toggle] += my->mc_env->me_psize; - if (mc.mc_top) { - /* Update parent if there is one */ - ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]); - SETPGNO(ni, mo->mp_pgno); - mdb_cursor_pop(&mc); - } else { - /* Otherwise we're done */ - *pg = mo->mp_pgno; - break; - } - } -done: - free(buf); - return rc; -} - - /** Copy environment with compaction. */ -static int __cold -mdb_env_copyfd1(MDB_env *env, HANDLE fd) -{ - MDB_meta *mm; - MDB_page *mp; - mdb_copy my; - MDB_txn *txn = NULL; - pthread_t thr; - pgno_t root, new_root; - int rc = MDB_SUCCESS; - - memset(&my, 0, sizeof(my)); - if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) - return rc; - if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) - goto done2; - my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); - if (my.mc_wbuf[0] == NULL) { - rc = errno; - goto done; - } - memset(my.mc_wbuf[0], 0, MDB_WBUF*2); - my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; - my.mc_next_pgno = NUM_METAS; - my.mc_env = env; - my.mc_fd = fd; - rc = pthread_create(&thr, NULL, mdb_env_copythr, &my); - if (rc) - goto done; - - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) - goto finish; - - mp = (MDB_page *)my.mc_wbuf[0]; - memset(mp, 0, NUM_METAS * env->me_psize); - mp->mp_pgno = 0; - mp->mp_flags = P_META; - mm = (MDB_meta *)PAGEDATA(mp); - mdb_env_init_meta0(env, mm); - mm->mm_address = METAPAGE_1(env)->mm_address; - - mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); - mp->mp_pgno = 1; - mp->mp_flags = 
P_META; - *(MDB_meta *)PAGEDATA(mp) = *mm; - mm = (MDB_meta *)PAGEDATA(mp); - - /* Set metapage 1 with current main DB */ - root = new_root = txn->mt_dbs[MAIN_DBI].md_root; - if (root != P_INVALID) { - /* Count free pages + freeDB pages. Subtract from last_pg - * to find the new last_pg, which also becomes the new root. - */ - MDB_ID freecount = 0; - MDB_cursor mc; - MDB_val key, data; - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) - freecount += *(MDB_ID *)data.mv_data; - if (rc != MDB_NOTFOUND) - goto finish; - freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + - txn->mt_dbs[FREE_DBI].md_leaf_pages + - txn->mt_dbs[FREE_DBI].md_overflow_pages; - - new_root = txn->mt_next_pgno - 1 - freecount; - mm->mm_last_pg = new_root; - mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - mm->mm_dbs[MAIN_DBI].md_root = new_root; - } else { - /* When the DB is empty, handle it specially to - * fix any breakage like page leaks from ITS#8174. - */ - mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; - } - if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { - mm->mm_txnid = 1; /* use metapage 1 */ - } - - my.mc_wlen[0] = env->me_psize * NUM_METAS; - my.mc_txn = txn; - rc = mdb_env_cwalk(&my, &root, 0); - if (rc == MDB_SUCCESS && root != new_root) { - rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ - } - -finish: - if (rc) - my.mc_error = rc; - mdb_env_cthr_toggle(&my, 1 | MDB_EOF); - rc = pthread_join(thr, NULL); - mdb_txn_abort(txn); - -done: - free(my.mc_wbuf[0]); - pthread_cond_destroy(&my.mc_cond); -done2: - pthread_mutex_destroy(&my.mc_mutex); - return rc ? rc : my.mc_error; -} - - /** Copy environment as-is. */ -static int __cold -mdb_env_copyfd0(MDB_env *env, HANDLE fd) -{ - MDB_txn *txn = NULL; - pthread_mutex_t *wmutex = NULL; - int rc; - size_t wsize; - char *ptr; - ssize_t len; - size_t w2; - - /* Do the lock/unlock of the reader mutex before starting the - * write txn. 
Otherwise other read txns could block writers. - */ - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) - return rc; - - /* We must start the actual read txn after blocking writers */ - rc = mdb_txn_end(txn, MDB_END_RESET_TMP); - if (rc) - return rc; - - /* Temporarily block writers until we snapshot the meta pages */ - wmutex = MDB_MUTEX(env, w); - rc = mdb_mutex_lock(env, wmutex); - if (unlikely(rc)) - goto leave; - - rc = mdb_txn_renew0(txn, MDB_RDONLY); - if (rc) { - mdb_mutex_unlock(env, wmutex); - goto leave; - } - - wsize = env->me_psize * NUM_METAS; - ptr = env->me_map; - w2 = wsize; - while (w2 > 0) { - len = write(fd, ptr, w2); - if (len < 0) { - rc = errno; - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - w2 -= len; - continue; - } else { - /* Non-blocking or async handles are not supported */ - rc = EIO; - break; - } - } - mdb_mutex_unlock(env, wmutex); - - if (rc) - goto leave; - - w2 = txn->mt_next_pgno * env->me_psize; - { - size_t fsize = 0; - if ((rc = mdb_fsize(env->me_fd, &fsize))) - goto leave; - if (w2 > fsize) - w2 = fsize; - } - wsize = w2 - wsize; - while (wsize > 0) { - if (wsize > MAX_WRITE) - w2 = MAX_WRITE; - else - w2 = wsize; - len = write(fd, ptr, w2); - if (len < 0 ) { - rc = errno; - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - wsize -= len; - continue; - } else { - rc = EIO; - break; - } - } - -leave: - mdb_txn_abort(txn); - return rc; -} - -int __cold -mdb_env_copyfd2(MDB_env *env, HANDLE fd, unsigned flags) -{ - if (flags & MDB_CP_COMPACT) - return mdb_env_copyfd1(env, fd); - else - return mdb_env_copyfd0(env, fd); -} - -int __cold -mdb_env_copyfd(MDB_env *env, HANDLE fd) -{ - return mdb_env_copyfd2(env, fd, 0); -} - -int __cold -mdb_env_copy2(MDB_env *env, const char *path, unsigned flags) -{ - int rc, len; - char *lpath; - HANDLE newfd = INVALID_HANDLE_VALUE; - - if (env->me_flags & MDB_NOSUBDIR) { - lpath = (char *)path; - } else { - len = strlen(path); - len += 
sizeof(DATANAME); - lpath = malloc(len); - if (!lpath) - return ENOMEM; - sprintf(lpath, "%s" DATANAME, path); - } - - /* The destination path must exist, but the destination file must not. - * We don't want the OS to cache the writes, since the source data is - * already in the OS cache. - */ - newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0666); - if (newfd == INVALID_HANDLE_VALUE) { - rc = errno; - goto leave; - } - - int fdflags; - if ((fdflags = fcntl(newfd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(newfd, F_SETFD, fdflags); - - if (env->me_psize >= env->me_os_psize) { -#ifdef F_NOCACHE /* __APPLE__ */ - (void) fcntl(newfd, F_NOCACHE, 1); -#elif defined O_DIRECT - /* Set O_DIRECT if the file system supports it */ - if ((rc = fcntl(newfd, F_GETFL)) != -1) - (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); -#endif - } - - rc = mdb_env_copyfd2(env, newfd, flags); - -leave: - if (!(env->me_flags & MDB_NOSUBDIR)) - free(lpath); - if (newfd != INVALID_HANDLE_VALUE) - if (close(newfd) < 0 && rc == MDB_SUCCESS) - rc = errno; - - return rc; -} - -int __cold -mdb_env_copy(MDB_env *env, const char *path) -{ - return mdb_env_copy2(env, path, 0); -} - -int __cold -mdb_env_set_flags(MDB_env *env, unsigned flags, int onoff) -{ - if (unlikely(flags & ~CHANGEABLE)) - return EINVAL; - - pthread_mutex_t *mutex = MDB_MUTEX(env, w); - int rc = mdb_mutex_lock(env, mutex); - if (unlikely(rc)) - return rc; - - if (onoff) - env->me_flags |= flags; - else - env->me_flags &= ~flags; - - mdb_mutex_unlock(env, mutex); - return MDB_SUCCESS; -} - -int __cold -mdb_env_get_flags(MDB_env *env, unsigned *arg) -{ - if (unlikely(!env || !arg)) - return EINVAL; - - *arg = env->me_flags & (CHANGEABLE|CHANGELESS); - return MDB_SUCCESS; -} - -int __cold -mdb_env_set_userctx(MDB_env *env, void *ctx) -{ - if (unlikely(!env)) - return EINVAL; - env->me_userctx = ctx; - return MDB_SUCCESS; -} - -void * __cold -mdb_env_get_userctx(MDB_env *env) -{ - return env ? 
env->me_userctx : NULL; -} - -int __cold -mdb_env_set_assert(MDB_env *env, MDB_assert_func *func) -{ - if (unlikely(!env)) - return EINVAL; -#if MDB_DEBUG - env->me_assert_func = func; - return MDB_SUCCESS; -#else - (void) func; - return ENOSYS; -#endif -} - -int __cold -mdb_env_get_path(MDB_env *env, const char **arg) -{ - if (unlikely(!env || !arg)) - return EINVAL; - - *arg = env->me_path; - return MDB_SUCCESS; -} - -int __cold -mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *arg) -{ - if (unlikely(!env || !arg)) - return EINVAL; - - *arg = env->me_fd; - return MDB_SUCCESS; -} - -/** Common code for #mdb_stat() and #mdb_env_stat(). - * @param[in] env the environment to operate in. - * @param[in] db the #MDB_db record containing the stats to return. - * @param[out] arg the address of an #MDB_stat structure to receive the stats. - * @return 0, this function always succeeds. - */ -static int __cold -mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg) -{ - arg->ms_psize = env->me_psize; - arg->ms_depth = db->md_depth; - arg->ms_branch_pages = db->md_branch_pages; - arg->ms_leaf_pages = db->md_leaf_pages; - arg->ms_overflow_pages = db->md_overflow_pages; - arg->ms_entries = db->md_entries; - - return MDB_SUCCESS; -} - -MDBX_ONLY_FEATURE int __cold -mdbx_env_stat(MDB_env *env, MDBX_stat *arg, size_t bytes) -{ - MDB_meta *meta; - - if (unlikely(env == NULL || arg == NULL)) - return EINVAL; - if (unlikely(bytes != sizeof(MDBX_stat))) - return EINVAL; - - meta = mdb_meta_head_r(env); - return mdb_stat0(env, &meta->mm_dbs[MAIN_DBI], &arg->base); -} - -int __cold -mdb_env_stat(MDB_env *env, MDB_stat *arg) -{ - return mdbx_env_stat(env, (MDBX_stat *) arg, sizeof(MDB_stat)); -} - -MDBX_ONLY_FEATURE int __cold -mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) -{ - MDB_meta *meta; - - if (unlikely(env == NULL || arg == NULL)) - return EINVAL; - - if (bytes == sizeof(MDB_envinfo)) { - do { - meta = mdb_meta_head_r(env); - arg->base.me_last_txnid = meta->mm_txnid; - 
arg->base.me_last_pgno = meta->mm_last_pg; - arg->base.me_mapaddr = meta->mm_address; - arg->base.me_mapsize = env->me_mapsize; - arg->base.me_maxreaders = env->me_maxreaders; - arg->base.me_numreaders = env->me_txns->mti_numreaders; - } while (unlikely( arg->base.me_last_txnid != env->me_txns->mti_txnid)); -#if MDBX_MODE_ENABLED - } else if (bytes == sizeof(MDBX_envinfo)) { - MDB_meta *m1, *m2; - MDB_reader *r; - unsigned i; - - m1 = METAPAGE_1(env); - m2 = METAPAGE_2(env); - - do { - meta = mdb_meta_head_r(env); - arg->base.me_last_txnid = meta->mm_txnid; - arg->base.me_last_pgno = meta->mm_last_pg; - arg->me_meta1_txnid = m1->mm_txnid; - arg->me_meta1_sign = m1->mm_datasync_sign; - arg->me_meta2_txnid = m2->mm_txnid; - arg->me_meta2_sign = m2->mm_datasync_sign; - } while (unlikely( arg->base.me_last_txnid != env->me_txns->mti_txnid - || arg->me_meta1_sign != m1->mm_datasync_sign - || arg->me_meta2_sign != m2->mm_datasync_sign )); - - arg->base.me_mapaddr = meta->mm_address; - arg->base.me_mapsize = env->me_mapsize; - arg->base.me_maxreaders = env->me_maxreaders; - arg->base.me_numreaders = env->me_txns->mti_numreaders; - arg->me_tail_txnid = 0; - - r = env->me_txns->mti_readers; - arg->me_tail_txnid = arg->base.me_last_txnid; - for (i = 0; i < arg->base.me_numreaders; ++i ) { - if (r[i].mr_pid) { - txnid_t mr = r[i].mr_txnid; - if (arg->me_tail_txnid > mr) - arg->me_tail_txnid = mr; - } - } -#endif /* MDBX_MODE_ENABLED */ - } else { - return EINVAL; - } - - return MDB_SUCCESS; -} - -int __cold -mdb_env_info(MDB_env *env, MDB_envinfo *arg) -{ - return mdbx_env_info(env, (MDBX_envinfo*) arg, sizeof(MDB_envinfo)); -} - -static MDB_cmp_func* -mdbx_default_keycmp(unsigned flags) -{ - return (flags & MDB_REVERSEKEY) ? mdb_cmp_memnr : - (flags & MDB_INTEGERKEY) ? mdb_cmp_int_a2 : mdb_cmp_memn; -} - -static MDB_cmp_func* -mdbx_default_datacmp(unsigned flags) -{ - return !(flags & MDB_DUPSORT) ? 0 : - ((flags & MDB_INTEGERDUP) ? 
mdb_cmp_int_ua : - ((flags & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn)); -} - -/** Set the default comparison functions for a database. - * Called immediately after a database is opened to set the defaults. - * The user can then override them with #mdb_set_compare() or - * #mdb_set_dupsort(). - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - */ -static void -mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi) -{ - unsigned flags = txn->mt_dbs[dbi].md_flags; - txn->mt_dbxs[dbi].md_cmp = mdbx_default_keycmp(flags); - txn->mt_dbxs[dbi].md_dcmp = mdbx_default_datacmp(flags); -} - -int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi) -{ - MDB_val key, data; - MDB_dbi i; - MDB_cursor mc; - MDB_db dummy; - int rc, dbflag, exact; - unsigned unused = 0, seq; - char *namedup; - size_t len; - - if (unlikely(!txn || !dbi)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(flags & ~VALID_FLAGS)) - return EINVAL; - - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - /* main DB? */ - if (!name) { - *dbi = MAIN_DBI; - if (flags & PERSISTENT_FLAGS) { - uint16_t f2 = flags & PERSISTENT_FLAGS; - /* make sure flag changes get committed */ - if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) { - txn->mt_dbs[MAIN_DBI].md_flags |= f2; - txn->mt_flags |= MDB_TXN_DIRTY; - } - } - mdb_default_cmp(txn, MAIN_DBI); - return MDB_SUCCESS; - } - - if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { - mdb_default_cmp(txn, MAIN_DBI); - } - - /* Is the DB already open? 
*/ - len = strlen(name); - for (i=CORE_DBS; imt_numdbs; i++) { - if (!txn->mt_dbxs[i].md_name.mv_size) { - /* Remember this free slot */ - if (!unused) unused = i; - continue; - } - if (len == txn->mt_dbxs[i].md_name.mv_size && - !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { - *dbi = i; - return MDB_SUCCESS; - } - } - - /* If no free slot and max hit, fail */ - if (!unused && unlikely(txn->mt_numdbs >= txn->mt_env->me_maxdbs)) - return MDB_DBS_FULL; - - /* Cannot mix named databases with some mainDB flags */ - if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY))) - return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; - - /* Find the DB info */ - dbflag = DB_NEW|DB_VALID|DB_USRVALID; - exact = 0; - key.mv_size = len; - key.mv_data = (void *)name; - mdb_cursor_init(&mc, txn, MAIN_DBI, NULL); - rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact); - if (likely(rc == MDB_SUCCESS)) { - /* make sure this is actually a DB */ - MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if (unlikely((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)) - return MDB_INCOMPATIBLE; - } else if (! (rc == MDB_NOTFOUND && (flags & MDB_CREATE))) { - return rc; - } - - /* Done here so we cannot fail after creating a new DB */ - if (unlikely((namedup = strdup(name)) == NULL)) - return ENOMEM; - - if (unlikely(rc)) { - /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ - data.mv_size = sizeof(MDB_db); - data.mv_data = &dummy; - memset(&dummy, 0, sizeof(dummy)); - dummy.md_root = P_INVALID; - dummy.md_flags = flags & PERSISTENT_FLAGS; - WITH_CURSOR_TRACKING(mc, - rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); - dbflag |= DB_DIRTY; - } - - if (unlikely(rc)) { - free(namedup); - } else { - /* Got info, register DBI in this txn */ - unsigned slot = unused ? 
unused : txn->mt_numdbs; - txn->mt_dbxs[slot].md_name.mv_data = namedup; - txn->mt_dbxs[slot].md_name.mv_size = len; - txn->mt_dbxs[slot].md_rel = NULL; - txn->mt_dbflags[slot] = dbflag; - /* txn-> and env-> are the same in read txns, use - * tmp variable to avoid undefined assignment - */ - seq = ++txn->mt_env->me_dbiseqs[slot]; - txn->mt_dbiseqs[slot] = seq; - - memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); - *dbi = slot; - mdb_default_cmp(txn, slot); - if (!unused) { - txn->mt_numdbs++; - } - } - - return rc; -} - -MDBX_ONLY_FEATURE int __cold -mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *arg, size_t bytes) -{ - if (unlikely(!arg || !txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) - return EINVAL; - - if (unlikely(bytes != sizeof(MDBX_stat))) - return EINVAL; - - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; - - if (unlikely(txn->mt_dbflags[dbi] & DB_STALE)) { - MDB_cursor mc; - MDB_xcursor mx; - /* Stale, must read the DB's root. cursor_init does it for us. 
*/ - mdb_cursor_init(&mc, txn, dbi, &mx); - } - return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], &arg->base); -} - -int __cold -mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) -{ - return mdbx_stat(txn, dbi, (MDBX_stat*) arg, sizeof(MDB_stat)); -} - -void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) -{ - char *ptr; - if (dbi < CORE_DBS || dbi >= env->me_maxdbs) - return; - ptr = env->me_dbxs[dbi].md_name.mv_data; - /* If there was no name, this was already closed */ - if (ptr) { - env->me_dbxs[dbi].md_name.mv_data = NULL; - env->me_dbxs[dbi].md_name.mv_size = 0; - env->me_dbflags[dbi] = 0; - env->me_dbiseqs[dbi]++; - free(ptr); - } -} - -int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags) -{ - if (unlikely(!txn || !flags)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) - return EINVAL; - - *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; - return MDB_SUCCESS; -} - -/** Add all the DB's pages to the free list. - * @param[in] mc Cursor on the DB to free. - * @param[in] subs non-Zero to check for sub-DBs in this DB. - * @return 0 on success, non-zero on failure. - */ -static int -mdb_drop0(MDB_cursor *mc, int subs) -{ - int rc; - - rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); - if (likely(rc == MDB_SUCCESS)) { - MDB_txn *txn = mc->mc_txn; - MDB_node *ni; - MDB_cursor mx; - unsigned i; - - /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. - * This also avoids any P_LEAF2 pages, which have no nodes. - * Also if the DB doesn't have sub-DBs and has no overflow - * pages, omit scanning leaves. 
- */ - if ((mc->mc_flags & C_SUB) || - (!subs && !mc->mc_db->md_overflow_pages)) - mdb_cursor_pop(mc); - - mdb_cursor_copy(mc, &mx); - while (mc->mc_snum > 0) { - MDB_page *mp = mc->mc_pg[mc->mc_top]; - unsigned n = NUMKEYS(mp); - if (IS_LEAF(mp)) { - for (i=0; imn_flags & F_BIGDATA) { - MDB_page *omp; - pgno_t pg; - memcpy(&pg, NODEDATA(ni), sizeof(pg)); - rc = mdb_page_get(mc, pg, &omp, NULL); - if (unlikely(rc)) - goto done; - mdb_cassert(mc, IS_OVERFLOW(omp)); - rc = mdb_midl_append_range(&txn->mt_free_pgs, - pg, omp->mp_pages); - if (unlikely(rc)) - goto done; - mc->mc_db->md_overflow_pages -= omp->mp_pages; - if (!mc->mc_db->md_overflow_pages && !subs) - break; - } else if (subs && (ni->mn_flags & F_SUBDATA)) { - mdb_xcursor_init1(mc, ni); - rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); - if (unlikely(rc)) - goto done; - } - } - if (!subs && !mc->mc_db->md_overflow_pages) - goto pop; - } else { - if (unlikely((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0)) - goto done; - for (i=0; imt_free_pgs, pg); - } - } - if (!mc->mc_top) - break; - mc->mc_ki[mc->mc_top] = i; - rc = mdb_cursor_sibling(mc, 1); - if (rc) { - if (unlikely(rc != MDB_NOTFOUND)) - goto done; - /* no more siblings, go back to beginning - * of previous level. 
- */ -pop: - mdb_cursor_pop(mc); - mc->mc_ki[0] = 0; - for (i=1; imc_snum; i++) { - mc->mc_ki[i] = 0; - mc->mc_pg[i] = mx.mc_pg[i]; - } - } - } - /* free it */ - rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); -done: - if (unlikely(rc)) - txn->mt_flags |= MDB_TXN_ERROR; - } else if (rc == MDB_NOTFOUND) { - rc = MDB_SUCCESS; - } - mc->mc_flags &= ~C_INITIALIZED; - return rc; -} - -int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) -{ - MDB_cursor *mc, *m2; - int rc; - - if (unlikely(1 < (unsigned) del || !txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - if (unlikely(TXN_DBI_CHANGED(txn, dbi))) - return MDB_BAD_DBI; - - if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) - return EACCES; - - rc = mdb_cursor_open(txn, dbi, &mc); - if (unlikely(rc)) - return rc; - - rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); - /* Invalidate the dropped DB's cursors */ - for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) - m2->mc_flags &= ~(C_INITIALIZED|C_EOF); - if (unlikely(rc)) - goto leave; - - /* Can't delete the main DB */ - if (del && dbi >= CORE_DBS) { - rc = mdb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); - if (likely(!rc)) { - txn->mt_dbflags[dbi] = DB_STALE; - mdb_dbi_close(txn->mt_env, dbi); - } else { - txn->mt_flags |= MDB_TXN_ERROR; - } - } else { - /* reset the DB record, mark it dirty */ - txn->mt_dbflags[dbi] |= DB_DIRTY; - txn->mt_dbs[dbi].md_depth = 0; - txn->mt_dbs[dbi].md_branch_pages = 0; - txn->mt_dbs[dbi].md_leaf_pages = 0; - txn->mt_dbs[dbi].md_overflow_pages = 0; - txn->mt_dbs[dbi].md_entries = 0; - txn->mt_dbs[dbi].md_root = P_INVALID; - - txn->mt_flags |= MDB_TXN_DIRTY; - } -leave: - mdb_cursor_close(mc); - return rc; -} - -int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) -{ - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != 
MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - txn->mt_dbxs[dbi].md_cmp = cmp; - return MDB_SUCCESS; -} - -int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) -{ - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - txn->mt_dbxs[dbi].md_dcmp = cmp; - return MDB_SUCCESS; -} - -int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) -{ - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - txn->mt_dbxs[dbi].md_rel = rel; - return MDB_SUCCESS; -} - -int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) -{ - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - txn->mt_dbxs[dbi].md_relctx = ctx; - return MDB_SUCCESS; -} - -int __cold -mdb_env_get_maxkeysize(MDB_env *env) -{ - if (!env || env->me_signature != MDBX_ME_SIGNATURE) - return EINVAL; - return ENV_MAXKEY(env); -} - -int __cold -mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) -{ - unsigned i, rdrs; - MDB_reader *mr; - char buf[64]; - int rc = 0, first = 1; - - if (unlikely(!env || !func)) - return -EINVAL; - - if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - rdrs = env->me_txns->mti_numreaders; - mr = env->me_txns->mti_readers; - for (i=0; i> 1; - cursor = base + pivot + 1; - val = pid - ids[cursor]; - - if( val < 0 ) { - n = pivot; - } else if ( val > 0 ) { - base = cursor; - n -= pivot + 1; - } else { - /* found, so it's a duplicate */ - return -1; - } - } - - if( val > 0 ) { - ++cursor; - } - ids[0]++; - 
for (n = ids[0]; n > cursor; n--) - ids[n] = ids[n-1]; - ids[n] = pid; - return 0; -} - -int __cold -mdb_reader_check(MDB_env *env, int *dead) -{ - if (unlikely(!env || env->me_signature != MDBX_ME_SIGNATURE)) - return EINVAL; - if (dead) - *dead = 0; - return mdb_reader_check0(env, 0, dead); -} - -/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ -static int __cold -mdb_reader_check0(MDB_env *env, int rlocked, int *dead) -{ - pthread_mutex_t *rmutex = rlocked ? NULL : MDB_MUTEX(env, r); - unsigned i, j, rdrs; - MDB_reader *mr; - pid_t *pids, pid; - int rc = MDB_SUCCESS, count = 0; - - if (unlikely(env->me_pid != getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; - } - - rdrs = env->me_txns->mti_numreaders; - pids = malloc((rdrs+1) * sizeof(pid_t)); - if (!pids) - return ENOMEM; - pids[0] = 0; - mr = env->me_txns->mti_readers; - for (i=0; ime_pid) { - if (mdb_pid_insert(pids, pid) == 0) { - if (!mdb_reader_pid(env, F_GETLK, pid)) { - /* Stale reader found */ - j = i; - if (rmutex) { - if ((rc = pthread_mutex_lock(rmutex)) != 0) { - if ((rc = mdb_mutex_failed(env, rmutex, rc))) - break; - rdrs = 0; /* the above checked all readers */ - } else { - /* Recheck, a new process may have reused pid */ - if (mdb_reader_pid(env, F_GETLK, pid)) - j = rdrs; - } - } - for (; j < rdrs; j++) { - if (mr[j].mr_pid == pid) { - mdb_debug("clear stale reader pid %u txn %zd", - (unsigned) pid, mr[j].mr_txnid); - mr[j].mr_pid = 0; - count++; - } - } - if (rmutex) - mdb_mutex_unlock(env, rmutex); - } - } - } - } - free(pids); - if (dead) - *dead = count; - return rc; -} - -static int __cold -mdb_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, int rc) -{ -#if MDB_USE_ROBUST - if (unlikely(rc == EOWNERDEAD)) { - int rlocked, rc2; - - /* We own the mutex. Clean up after dead previous owner. 
*/ - rc = MDB_SUCCESS; - rlocked = (mutex == MDB_MUTEX(env, r)); - if (!rlocked) { - /* Keep mti_txnid updated, otherwise next writer can - * overwrite data which latest meta page refers to. - */ - #if 0 - /* LY: Hm, how this can happen, if the mti_txnid - * is updating only at the finish of a successful commit ? */ - - MDB_meta *meta = mdb_env_meta_head(env); - env->me_txns->mti_txnid = meta->mm_txnid; - #endif - /* env is hosed if the dead thread was ours */ - if (env->me_txn) { - env->me_flags |= MDB_FATAL_ERROR; - env->me_txn = NULL; - rc = MDB_PANIC; - } - } - mdb_debug("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), - (rc ? "this process' env is hosed" : "recovering")); - rc2 = mdb_reader_check0(env, rlocked, NULL); - if (rc2 == 0) - rc2 = pthread_mutex_consistent(mutex); - if (rc || (rc = rc2)) { - mdb_debug("mutex recovery failed, %s", mdb_strerror(rc)); - pthread_mutex_unlock(mutex); - } - } -#endif /* MDB_USE_ROBUST */ - if (unlikely(rc)) { - mdb_debug("lock mutex failed, %s", mdb_strerror(rc)); - if (rc != EDEADLK) { - env->me_flags |= MDB_FATAL_ERROR; - rc = MDB_PANIC; - } - } - - return rc; -} - -static int mdb_mutex_lock(MDB_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_lock(mutex); - if (unlikely(rc)) - rc = mdb_mutex_failed(env, mutex, rc); - return rc; -} - -static void mdb_mutex_unlock(MDB_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_unlock(mutex); - mdb_assert(env, rc == 0); - (void) env; - (void) rc; -} - -/** @} */ - -#include "./midl.c" diff --git a/mdb_chk.c b/mdb_chk.c deleted file mode 100644 index db141b4b..00000000 --- a/mdb_chk.c +++ /dev/null @@ -1,954 +0,0 @@ -/* mdbx_chk.c - memory-mapped database check tool */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2015,2016 Peter-Service R&D LLC. - * - * This file is part of libmdbx. 
- * - * libmdbx is free software; you can redistribute it and/or modify it under - * the terms of the GNU Affero General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * libmdbx is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "midl.h" -#include "mdbx.h" - -typedef struct flagbit { - int bit; - char *name; -} flagbit; - -flagbit dbflags[] = { - { MDB_DUPSORT, "dupsort" }, - { MDB_INTEGERKEY, "integerkey" }, - { MDB_REVERSEKEY, "reversekey" }, - { MDB_DUPFIXED, "dupfixed" }, - { MDB_REVERSEDUP, "reversedup" }, - { MDB_INTEGERDUP, "integerdup" }, - { 0, NULL } -}; - -static volatile sig_atomic_t gotsignal; - -static void signal_handler( int sig ) { - (void) sig; - gotsignal = 1; -} - -#define MAX_DBI 32768 - -#define EXIT_INTERRUPTED (EXIT_FAILURE+4) -#define EXIT_FAILURE_SYS (EXIT_FAILURE+3) -#define EXIT_FAILURE_MDB (EXIT_FAILURE+2) -#define EXIT_FAILURE_CHECK_MAJOR (EXIT_FAILURE+1) -#define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE - -struct { - const char* dbi_names[MAX_DBI]; - size_t dbi_pages[MAX_DBI]; - size_t dbi_empty_pages[MAX_DBI]; - size_t dbi_payload_bytes[MAX_DBI]; - size_t dbi_lost_bytes[MAX_DBI]; - short *pagemap; - size_t total_payload_bytes; - size_t pgcount; -} walk; - -static __attribute__((constructor)) -void init_walk(void) -{ - walk.dbi_names[0] = "@gc"; -} - -size_t total_unused_bytes; -int exclusive = 2; - -MDB_env *env; -MDB_txn *txn, *locktxn; -MDBX_envinfo info; -MDBX_stat stat; -size_t maxkeysize, reclaimable_pages, 
freedb_pages, lastpgno; -size_t userdb_count, skipped_subdb; -unsigned verbose, quiet; -const char* only_subdb; - -struct problem { - struct problem* pr_next; - size_t count; - const char* caption; -}; - -struct problem* problems_list; -size_t total_problems; - -static void __attribute__ ((format (printf, 1, 2))) -print(const char* msg, ...) { - if (! quiet) { - va_list args; - - fflush(stderr); - va_start(args, msg); - vfprintf(stdout, msg, args); - va_end(args); - } -} - -static void __attribute__ ((format (printf, 1, 2))) -error(const char* msg, ...) { - total_problems++; - - if (! quiet) { - va_list args; - - fflush(stdout); - va_start(args, msg); - vfprintf(stderr, msg, args); - va_end(args); - fflush(NULL); - } -} - -static void pagemap_cleanup(void) { - int i; - - for( i = 1; i < MAX_DBI; ++i ) { - if (walk.dbi_names[i]) { - free((void *) walk.dbi_names[i]); - walk.dbi_names[i] = NULL; - } - } - - free(walk.pagemap); - walk.pagemap = NULL; -} - -static int pagemap_lookup_dbi(const char* dbi) { - static int last; - int i; - - if (last > 0 && strcmp(walk.dbi_names[last], dbi) == 0) - return last; - - for(i = 1; walk.dbi_names[i] && last < MAX_DBI; ++i) - if (strcmp(walk.dbi_names[i], dbi) == 0) - return last = i; - - if (i == MAX_DBI) - return -1; - - walk.dbi_names[i] = strdup(dbi); - - if (verbose > 1) { - print(" - found '%s' area\n", dbi); - fflush(NULL); - } - - return last = i; -} - -static void problem_add(const char* object, size_t entry_number, const char* msg, const char *extra, ...) { - total_problems++; - - if (! quiet) { - int need_fflush = 0; - struct problem* p; - - for (p = problems_list; p; p = p->pr_next) - if (p->caption == msg) - break; - - if (! 
p) { - p = calloc(1, sizeof(*p)); - p->caption = msg; - p->pr_next = problems_list; - problems_list = p; - need_fflush = 1; - } - - p->count++; - if (verbose > 1) { - print(" %s #%zu: %s", object, entry_number, msg); - if (extra) { - va_list args; - printf(" ("); - va_start(args, extra); - vfprintf(stdout, extra, args); - va_end(args); - printf(")"); - } - printf("\n"); - if (need_fflush) - fflush(NULL); - } - } -} - -static struct problem* problems_push() { - struct problem* p = problems_list; - problems_list = NULL; - return p; -} - -static size_t problems_pop(struct problem* list) { - size_t count = 0; - - if (problems_list) { - int i; - - print(" - problems: "); - for (i = 0; problems_list; ++i) { - struct problem* p = problems_list->pr_next; - count += problems_list->count; - print("%s%s (%zu)", i ? ", " : "", problems_list->caption, problems_list->count); - free(problems_list); - problems_list = p; - } - print("\n"); - fflush(NULL); - } - - problems_list = list; - return count; -} - -static int pgvisitor(size_t pgno, unsigned pgnumber, void* ctx, const char* dbi, - const char* type, int nentries, int payload_bytes, int header_bytes, int unused_bytes) -{ - (void) ctx; - - if (type) { - size_t page_bytes = payload_bytes + header_bytes + unused_bytes; - size_t page_size = pgnumber * stat.base.ms_psize; - int index = pagemap_lookup_dbi(dbi); - if (index < 0) - return ENOMEM; - - if (verbose > 2 && (!only_subdb || strcmp(only_subdb, dbi) == 0)) { - if (pgnumber == 1) - print(" %s-page %zu", type, pgno); - else - print(" %s-span %zu[%u]", type, pgno, pgnumber); - print(" of %s: header %i, payload %i, unused %i\n", - dbi, header_bytes, payload_bytes, unused_bytes); - } - - walk.pgcount += pgnumber; - - if (unused_bytes < 0 || (size_t) unused_bytes > page_size) - problem_add("page", pgno, "illegal unused-bytes", "%zu < %i < %zu", - 0, unused_bytes, stat.base.ms_psize); - - if (header_bytes < (int) sizeof(long) || (size_t) header_bytes >= stat.base.ms_psize - 
sizeof(long)) - problem_add("page", pgno, "illegal header-length", "%zu < %i < %zu", - sizeof(long), header_bytes, stat.base.ms_psize - sizeof(long)); - if (payload_bytes < 1) { - if (nentries > 1) { - problem_add("page", pgno, "zero size-of-entry", "payload %i bytes, %i entries", - payload_bytes, nentries); - if ((size_t) header_bytes + unused_bytes < page_size) { - /* LY: hush a misuse error */ - page_bytes = page_size; - } - } else { - problem_add("page", pgno, "empty", "payload %i bytes, %i entries", - payload_bytes, nentries); - walk.dbi_empty_pages[index] += 1; - } - } - - if (page_bytes != page_size) { - problem_add("page", pgno, "misused", "%zu != %zu (%ih + %ip + %iu)", - page_size, page_bytes, header_bytes, payload_bytes, unused_bytes); - if (page_size > page_bytes) - walk.dbi_lost_bytes[index] += page_size - page_bytes; - } else { - walk.dbi_payload_bytes[index] += payload_bytes + header_bytes; - walk.total_payload_bytes += payload_bytes + header_bytes; - } - - if (pgnumber) { - do { - if (pgno >= lastpgno) - problem_add("page", pgno, "wrong page-no", - "%zu > %zi", pgno, lastpgno); - else if (walk.pagemap[pgno]) - problem_add("page", pgno, "already used", - "in %s", walk.dbi_names[walk.pagemap[pgno]]); - else { - walk.pagemap[pgno] = index; - walk.dbi_pages[index] += 1; - } - ++pgno; - } while(--pgnumber); - } - } - - return gotsignal ? 
EINTR : MDB_SUCCESS; -} - -typedef int (visitor)(size_t record_number, MDB_val *key, MDB_val* data); -static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent); - -static int handle_userdb(size_t record_number, MDB_val *key, MDB_val* data) { - (void) record_number; - (void) key; - (void) data; - return MDB_SUCCESS; -} - -static int handle_freedb(size_t record_number, MDB_val *key, MDB_val* data) { - char *bad = ""; - size_t pg, prev; - ssize_t i, number, span = 0; - size_t *iptr = data->mv_data, txnid = *(size_t*)key->mv_data; - - if (key->mv_size != sizeof(txnid)) - problem_add("entry", record_number, "wrong txn-id size", "key-size %zi", key->mv_size); - else if (txnid < 1 || txnid > info.base.me_last_txnid) - problem_add("entry", record_number, "wrong txn-id", "%zu", txnid); - - if (data->mv_size < sizeof(size_t) || data->mv_size % sizeof(size_t)) - problem_add("entry", record_number, "wrong idl size", "%zu", data->mv_size); - else { - number = *iptr++; - if (number >= MDB_IDL_UM_MAX) - problem_add("entry", record_number, "wrong idl length", "%zi", number); - else if ((number + 1) * sizeof(size_t) != data->mv_size) - problem_add("entry", record_number, "mismatch idl length", "%zi != %zu", - number * sizeof(size_t), data->mv_size); - else { - freedb_pages += number; - if (info.me_tail_txnid > txnid) - reclaimable_pages += number; - for (i = number, prev = 1; --i >= 0; ) { - pg = iptr[i]; - if (pg < 2 /* META_PAGE */ || pg > info.base.me_last_pgno) - problem_add("entry", record_number, "wrong idl entry", "2 < %zi < %zi", - pg, info.base.me_last_pgno); - else if (pg <= prev) { - bad = " [bad sequence]"; - problem_add("entry", record_number, "bad sequence", "%zi <= %zi", - pg, prev); - } - prev = pg; - pg += span; - for (; i >= span && iptr[i - span] == pg; span++, pg++) ; - } - if (verbose > 2 && !only_subdb) { - print(" transaction %zu, %zd pages, maxspan %zd%s\n", - *(size_t *)key->mv_data, number, span, bad); - if (verbose > 3) { - int j = 
number - 1; - while (j >= 0) { - pg = iptr[j]; - for (span = 1; --j >= 0 && iptr[j] == pg + span; span++) ; - if (span > 1) - print(" %9zu[%zd]\n", pg, span); - else - print(" %9zu\n", pg); - } - } - } - } - } - - return MDB_SUCCESS; -} - -static int handle_maindb(size_t record_number, MDB_val *key, MDB_val* data) { - char *name; - int rc; - size_t i; - - name = key->mv_data; - for(i = 0; i < key->mv_size; ++i) { - if (name[i] < ' ') - return handle_userdb(record_number, key, data); - } - - name = malloc(key->mv_size + 1); - memcpy(name, key->mv_data, key->mv_size); - name[key->mv_size] = '\0'; - userdb_count++; - - rc = process_db(-1, name, handle_userdb, 0); - free(name); - if (rc != MDB_INCOMPATIBLE) - return rc; - - return handle_userdb(record_number, key, data); -} - -static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) -{ - MDB_cursor *mc; - MDBX_stat ms; - MDB_val key, data; - MDB_val prev_key, prev_data; - unsigned flags; - int rc, i; - struct problem* saved_list; - size_t problems_count; - - unsigned record_count = 0, dups = 0; - size_t key_bytes = 0, data_bytes = 0; - - if (0 > (int) dbi) { - rc = mdbx_dbi_open(txn, name, 0, &dbi); - if (rc) { - if (!name || rc != MDB_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ { - error(" - mdbx_open '%s' failed, error %d %s\n", - name ? name : "main", rc, mdbx_strerror(rc)); - } - return rc; - } - } - - if (dbi >= 2 /* CORE_DBS */ && name && only_subdb && strcmp(only_subdb, name)) { - if (verbose) { - print("Skip processing '%s'...\n", name); - fflush(NULL); - } - skipped_subdb++; - return MDB_SUCCESS; - } - - if (! silent && verbose) { - print("Processing '%s'...\n", name ? 
name : "main"); - fflush(NULL); - } - - rc = mdbx_dbi_flags(txn, dbi, &flags); - if (rc) { - error(" - mdbx_dbi_flags failed, error %d %s\n", rc, mdbx_strerror(rc)); - return rc; - } - - rc = mdbx_stat(txn, dbi, &ms, sizeof(ms)); - if (rc) { - error(" - mdbx_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); - return rc; - } - - if (! silent && verbose) { - print(" - dbi-id %d, flags:", dbi); - if (! flags) - print(" none"); - else { - for (i=0; dbflags[i].bit; i++) - if (flags & dbflags[i].bit) - print(" %s", dbflags[i].name); - } - print(" (0x%02X)\n", flags); - if (verbose > 1) { - print(" - page size %u, entries %zu\n", ms.base.ms_psize, ms.base.ms_entries); - print(" - b-tree depth %u, pages: branch %zu, leaf %zu, overflow %zu\n", - ms.base.ms_depth, ms.base.ms_branch_pages, ms.base.ms_leaf_pages, ms.base.ms_overflow_pages); - } - } - - rc = mdbx_cursor_open(txn, dbi, &mc); - if (rc) { - error(" - mdbx_cursor_open failed, error %d %s\n", rc, mdbx_strerror(rc)); - return rc; - } - - saved_list = problems_push(); - prev_key.mv_data = NULL; - prev_data.mv_size = 0; - rc = mdbx_cursor_get(mc, &key, &data, MDB_FIRST); - while (rc == MDB_SUCCESS) { - if (gotsignal) { - print(" - interrupted by signal\n"); - fflush(NULL); - rc = EINTR; - goto bailout; - } - - if (key.mv_size > maxkeysize) { - problem_add("entry", record_count, "key length exceeds max-key-size", - "%zu > %zu", key.mv_size, maxkeysize); - } else if ((flags & MDB_INTEGERKEY) - && key.mv_size != sizeof(size_t) && key.mv_size != sizeof(int)) { - problem_add("entry", record_count, "wrong key length", - "%zu != %zu", key.mv_size, sizeof(size_t)); - } - - if ((flags & MDB_INTEGERDUP) - && data.mv_size != sizeof(size_t) && data.mv_size != sizeof(int)) { - problem_add("entry", record_count, "wrong data length", - "%zu != %zu", data.mv_size, sizeof(size_t)); - } - - if (prev_key.mv_data) { - if ((flags & MDB_DUPFIXED) && prev_data.mv_size != data.mv_size) { - problem_add("entry", record_count, "different data 
length", - "%zu != %zu", prev_data.mv_size, data.mv_size); - } - - int cmp = mdbx_cmp(txn, dbi, &prev_key, &key); - if (cmp > 0) { - problem_add("entry", record_count, "broken ordering of entries", NULL); - } else if (cmp == 0) { - ++dups; - if (! (flags & MDB_DUPSORT)) - problem_add("entry", record_count, "duplicated entries", NULL); - else if (flags & MDB_INTEGERDUP) { - cmp = mdbx_dcmp(txn, dbi, &prev_data, &data); - if (cmp > 0) - problem_add("entry", record_count, "broken ordering of multi-values", NULL); - } - } - } else if (verbose) { - if (flags & MDB_INTEGERKEY) - print(" - fixed key-size %zu\n", key.mv_size ); - if (flags & (MDB_INTEGERDUP | MDB_DUPFIXED)) - print(" - fixed data-size %zu\n", data.mv_size ); - } - - if (handler) { - rc = handler(record_count, &key, &data); - if (rc) - goto bailout; - } - - record_count++; - key_bytes += key.mv_size; - data_bytes += data.mv_size; - - prev_key = key; - prev_data = data; - rc = mdbx_cursor_get(mc, &key, &data, MDB_NEXT); - } - if (rc != MDB_NOTFOUND) - error(" - mdbx_cursor_get failed, error %d %s\n", rc, mdbx_strerror(rc)); - else - rc = 0; - - if (record_count != ms.base.ms_entries) - problem_add("entry", record_count, "differentent number of entries", - "%zu != %zu", record_count, ms.base.ms_entries); -bailout: - problems_count = problems_pop(saved_list); - if (! 
silent && verbose) { - print(" - summary: %u records, %u dups, %zu key's bytes, %zu data's bytes, %zu problems\n", - record_count, dups, key_bytes, data_bytes, problems_count); - fflush(NULL); - } - - mdbx_cursor_close(mc); - return rc || problems_count; -} - -static void usage(char *prog) -{ - fprintf(stderr, "usage: %s dbpath [-V] [-v] [-n] [-q] [-w] [-c] [-d] [-s subdb]\n" - " -V\t\tshow version\n" - " -v\t\tmore verbose, could be used multiple times\n" - " -n\t\tNOSUBDIR mode for open\n" - " -q\t\tbe quiet\n" - " -w\t\tlock DB for writing while checking\n" - " -d\t\tdisable page-by-page traversal of b-tree\n" - " -s subdb\tprocess a specific subdatabase only\n" - " -c\t\tforce cooperative mode (don't try exclusive)\n", prog); - exit(EXIT_INTERRUPTED); -} - -const char* meta_synctype(size_t sign) { - switch(sign) { - case 0: - return "no-sync/legacy"; - case 1: - return "weak"; - default: - return "steady"; - } -} - -int meta_lt(size_t txn1, size_t sign1, size_t txn2, size_t sign2) { - return ((sign1 > 1) == (sign2 > 1)) ? 
txn1 < txn2 : txn2 && sign2 > 1; -} - -int main(int argc, char *argv[]) -{ - int i, rc; - char *prog = argv[0]; - char *envname; - int envflags = MDB_RDONLY; - int problems_maindb = 0, problems_freedb = 0, problems_meta = 0; - int dont_traversal = 0; - size_t n; - struct timespec timestamp_start, timestamp_finish; - double elapsed; - - atexit(pagemap_cleanup); - - if (clock_gettime(CLOCK_MONOTONIC, ×tamp_start)) { - rc = errno; - error("clock_gettime failed, error %d %s\n", rc, mdbx_strerror(rc)); - return EXIT_FAILURE_SYS; - } - - if (argc < 2) { - usage(prog); - } - - while ((i = getopt(argc, argv, "Vvqnwcds:")) != EOF) { - switch(i) { - case 'V': - printf("%s\n", MDB_VERSION_STRING); - exit(EXIT_SUCCESS); - break; - case 'v': - verbose++; - break; - case 'q': - quiet = 1; - break; - case 'n': - envflags |= MDB_NOSUBDIR; - break; - case 'w': - envflags &= ~MDB_RDONLY; - break; - case 'c': - exclusive = 0; - break; - case 'd': - dont_traversal = 1; - break; - case 's': - if (only_subdb && strcmp(only_subdb, optarg)) - usage(prog); - only_subdb = optarg; - break; - default: - usage(prog); - } - } - - if (optind != argc - 1) - usage(prog); - -#ifdef SIGPIPE - signal(SIGPIPE, signal_handler); -#endif -#ifdef SIGHUP - signal(SIGHUP, signal_handler); -#endif - signal(SIGINT, signal_handler); - signal(SIGTERM, signal_handler); - - envname = argv[optind]; - print("Running mdbx_chk for '%s' in %s mode...\n", - envname, (envflags & MDB_RDONLY) ? "read-only" : "write-lock"); - fflush(NULL); - - rc = mdbx_env_create(&env); - if (rc) { - error("mdbx_env_create failed, error %d %s\n", rc, mdbx_strerror(rc)); - return rc < 0 ? 
EXIT_FAILURE_MDB : EXIT_FAILURE_SYS; - } - - rc = mdbx_env_get_maxkeysize(env); - if (rc < 0) { - error("mdbx_env_get_maxkeysize failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - maxkeysize = rc; - - rc = mdbx_env_set_maxdbs(env, MAX_DBI); - if (rc < 0) { - error("mdbx_env_set_maxdbs failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - rc = mdbx_env_open_ex(env, envname, envflags, 0664, &exclusive); - if (rc) { - error("mdbx_env_open failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - if (verbose) - print(" - %s mode\n", exclusive ? "monopolistic" : "cooperative"); - - if (! (envflags & MDB_RDONLY)) { - rc = mdbx_txn_begin(env, NULL, 0, &locktxn); - if (rc) { - error("mdbx_txn_begin(lock-write) failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - } - - rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) { - error("mdbx_txn_begin(read-only) failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - rc = mdbx_env_info(env, &info, sizeof(info)); - if (rc) { - error("mdbx_env_info failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - rc = mdbx_env_stat(env, &stat, sizeof(stat)); - if (rc) { - error("mdbx_env_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - lastpgno = info.base.me_last_pgno + 1; - errno = 0; - - if (verbose) { - double k = 1024.0; - const char sf[] = "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ - for(i = 0; sf[i+1] && info.base.me_mapsize / k > 1000.0; ++i) - k *= 1024; - print(" - map size %zu (%.2f %cb)\n", info.base.me_mapsize, - info.base.me_mapsize / k, sf[i]); - if (info.base.me_mapaddr) - print(" - mapaddr %p\n", info.base.me_mapaddr); - print(" - pagesize %u, max keysize %zu (%s), max readers %u\n", - stat.base.ms_psize, maxkeysize, - (maxkeysize == 511) ? "default" : - (maxkeysize == 0) ? 
"devel" : "custom", - info.base.me_maxreaders); - print(" - transactions: last %zu, bottom %zu, lag reading %zi\n", info.base.me_last_txnid, - info.me_tail_txnid, info.base.me_last_txnid - info.me_tail_txnid); - - print(" - meta-1: %s %zu, %s", - meta_synctype(info.me_meta1_sign), info.me_meta1_txnid, - meta_lt(info.me_meta1_txnid, info.me_meta1_sign, - info.me_meta2_txnid, info.me_meta2_sign) ? "tail" : "head"); - if (info.me_meta1_txnid > info.base.me_last_txnid) - print(", rolled-back %zu (%zu >>> %zu)", - info.me_meta1_txnid - info.base.me_last_txnid, - info.me_meta1_txnid, info.base.me_last_txnid); - print("\n"); - - print(" - meta-2: %s %zu, %s", - meta_synctype(info.me_meta2_sign), info.me_meta2_txnid, - meta_lt(info.me_meta2_txnid, info.me_meta2_sign, - info.me_meta1_txnid, info.me_meta1_sign) ? "tail" : "head"); - if (info.me_meta2_txnid > info.base.me_last_txnid) - print(", rolled-back %zu (%zu >>> %zu)", - info.me_meta2_txnid - info.base.me_last_txnid, - info.me_meta2_txnid, info.base.me_last_txnid); - print("\n"); - } - - if (exclusive > 1) { - if (verbose) - print(" - perform full check last-txn-id with meta-pages\n"); - - if (! meta_lt(info.me_meta1_txnid, info.me_meta1_sign, - info.me_meta2_txnid, info.me_meta2_sign) - && info.me_meta1_txnid != info.base.me_last_txnid) { - print(" - meta-1 txn-id mismatch last-txn-id (%zi != %zi)\n", - info.me_meta1_txnid, info.base.me_last_txnid); - ++problems_meta; - } - - if (! meta_lt(info.me_meta2_txnid, info.me_meta2_sign, - info.me_meta1_txnid, info.me_meta1_sign) - && info.me_meta2_txnid != info.base.me_last_txnid) { - print(" - meta-2 txn-id mismatch last-txn-id (%zi != %zi)\n", - info.me_meta2_txnid, info.base.me_last_txnid); - ++problems_meta; - } - } else if (locktxn) { - if (verbose) - print(" - perform lite check last-txn-id with meta-pages (not a monopolistic mode)\n"); - size_t last = (info.me_meta2_txnid > info.me_meta1_txnid) ? 
info.me_meta2_txnid : info.me_meta1_txnid; - if (last != info.base.me_last_txnid) { - print(" - last-meta mismatch last-txn-id (%zi != %zi)\n", - last, info.base.me_last_txnid); - ++problems_meta; - } - } else if (verbose) { - print(" - skip check last-txn-id with meta-pages (monopolistic or write-lock mode only)\n"); - } - - if (!dont_traversal) { - struct problem* saved_list; - size_t traversal_problems; - size_t empty_pages, lost_bytes; - - print("Traversal b-tree...\n"); - fflush(NULL); - walk.pagemap = calloc(lastpgno, sizeof(*walk.pagemap)); - if (! walk.pagemap) { - rc = errno ? errno : ENOMEM; - error("calloc failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - saved_list = problems_push(); - rc = mdbx_env_pgwalk(txn, pgvisitor, NULL); - traversal_problems = problems_pop(saved_list); - - if (rc) { - if (rc == EINTR && gotsignal) { - print(" - interrupted by signal\n"); - fflush(NULL); - } else { - error("mdbx_env_pgwalk failed, error %d %s\n", rc, mdbx_strerror(rc)); - } - goto bailout; - } - - for( n = 0; n < lastpgno; ++n) - if (! 
walk.pagemap[n]) - walk.dbi_pages[0] += 1; - - empty_pages = lost_bytes = 0; - for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { - empty_pages += walk.dbi_empty_pages[i]; - lost_bytes += walk.dbi_lost_bytes[i]; - } - - if (verbose) { - size_t total_page_bytes = walk.pgcount * stat.base.ms_psize; - print(" - dbi pages: %zu total", walk.pgcount); - if (verbose > 1) - for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) - print(", %s %zu", walk.dbi_names[i], walk.dbi_pages[i]); - print(", %s %zu\n", walk.dbi_names[0], walk.dbi_pages[0]); - if (verbose > 1) { - print(" - space info: total %zu bytes, payload %zu (%.1f%%), unused %zu (%.1f%%)\n", - total_page_bytes, walk.total_payload_bytes, - walk.total_payload_bytes * 100.0 / total_page_bytes, - total_page_bytes - walk.total_payload_bytes, - (total_page_bytes - walk.total_payload_bytes) * 100.0 / total_page_bytes); - for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { - size_t dbi_bytes = walk.dbi_pages[i] * stat.base.ms_psize; - print(" %s: subtotal %zu bytes (%.1f%%), payload %zu (%.1f%%), unused %zu (%.1f%%)", - walk.dbi_names[i], - dbi_bytes, dbi_bytes * 100.0 / total_page_bytes, - walk.dbi_payload_bytes[i], walk.dbi_payload_bytes[i] * 100.0 / dbi_bytes, - dbi_bytes - walk.dbi_payload_bytes[i], - (dbi_bytes - walk.dbi_payload_bytes[i]) * 100.0 / dbi_bytes); - if (walk.dbi_empty_pages[i]) - print(", %zu empty pages", walk.dbi_empty_pages[i]); - if (walk.dbi_lost_bytes[i]) - print(", %zu bytes lost", walk.dbi_lost_bytes[i]); - print("\n"); - } - } - print(" - summary: average fill %.1f%%", walk.total_payload_bytes * 100.0 / total_page_bytes); - if (empty_pages) - print(", %zu empty pages", empty_pages); - if (lost_bytes) - print(", %zu bytes lost", lost_bytes); - print(", %zu problems\n", traversal_problems); - } - } else if (verbose) { - print("Skipping b-tree walk...\n"); - fflush(NULL); - } - - if (! 
verbose) - print("Iterating DBIs...\n"); - problems_maindb = process_db(-1, /* MAIN_DBI */ NULL, NULL, 0); - problems_freedb = process_db(0 /* FREE_DBI */, "free", handle_freedb, 0); - - if (verbose) { - size_t value = info.base.me_mapsize / stat.base.ms_psize; - double percent = value / 100.0; - print(" - pages info: %zu total", value); - print(", allocated %zu (%.1f%%)", lastpgno, lastpgno / percent); - - if (verbose > 1) { - value = info.base.me_mapsize / stat.base.ms_psize - lastpgno; - print(", remained %zu (%.1f%%)", value, value / percent); - - value = lastpgno - freedb_pages; - print(", used %zu (%.1f%%)", value, value / percent); - - print(", gc %zu (%.1f%%)", freedb_pages, freedb_pages / percent); - - value = freedb_pages - reclaimable_pages; - print(", detained %zu (%.1f%%)", value, value / percent); - - print(", reclaimable %zu (%.1f%%)", reclaimable_pages, reclaimable_pages / percent); - } - - value = info.base.me_mapsize / stat.base.ms_psize - lastpgno + reclaimable_pages; - print(", available %zu (%.1f%%)\n", value, value / percent); - } - - if (problems_maindb == 0 && problems_freedb == 0) { - if (!dont_traversal && (exclusive || locktxn)) { - if (walk.pgcount != lastpgno - freedb_pages) { - error("used pages mismatch (%zu != %zu)\n", walk.pgcount, lastpgno - freedb_pages); - } - if (walk.dbi_pages[0] != freedb_pages) { - error("gc pages mismatch (%zu != %zu)\n", walk.dbi_pages[0], freedb_pages); - } - } else if (verbose) { - print(" - skip check used and gc pages (btree-traversal with monopolistic or write-lock mode only)\n"); - } - - if (! process_db(-1, NULL, handle_maindb, 1)) { - if (! userdb_count && verbose) - print(" - does not contain multiple databases\n"); - } - } - -bailout: - if (txn) - mdbx_txn_abort(txn); - if (locktxn) - mdbx_txn_abort(locktxn); - if (env) - mdbx_env_close(env); - fflush(NULL); - if (rc) { - if (rc < 0) - return gotsignal ? 
EXIT_INTERRUPTED : EXIT_FAILURE_SYS; - return EXIT_FAILURE_MDB; - } - - if (clock_gettime(CLOCK_MONOTONIC, ×tamp_finish)) { - rc = errno; - error("clock_gettime failed, error %d %s\n", rc, mdbx_strerror(rc)); - return EXIT_FAILURE_SYS; - } - - elapsed = timestamp_finish.tv_sec - timestamp_start.tv_sec - + (timestamp_finish.tv_nsec - timestamp_start.tv_nsec) * 1e-9; - - total_problems += problems_meta; - if (total_problems || problems_maindb || problems_freedb) { - print("Total %zu error(s) is detected, elapsed %.3f seconds.\n", - total_problems, elapsed); - if (problems_meta || problems_maindb || problems_freedb) - return EXIT_FAILURE_CHECK_MAJOR; - return EXIT_FAILURE_CHECK_MINOR; - } - print("No error is detected, elapsed %.3f seconds\n", elapsed); - return EXIT_SUCCESS; -} diff --git a/mdb_copy.c b/mdb_copy.c deleted file mode 100644 index 43bee869..00000000 --- a/mdb_copy.c +++ /dev/null @@ -1,81 +0,0 @@ -/* mdb_copy.c - memory-mapped database backup tool */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2012-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . 
- */ - -#include -#include -#include -#include "mdbx.h" - -static void -sighandle(int sig) -{ - (void) sig; -} - -int main(int argc,char * argv[]) -{ - int rc; - MDB_env *env = NULL; - const char *progname = argv[0], *act; - unsigned flags = MDB_RDONLY; - unsigned cpflags = 0; - - for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) { - if (argv[1][1] == 'n' && argv[1][2] == '\0') - flags |= MDB_NOSUBDIR; - else if (argv[1][1] == 'c' && argv[1][2] == '\0') - cpflags |= MDB_CP_COMPACT; - else if (argv[1][1] == 'V' && argv[1][2] == '\0') { - printf("%s\n", MDB_VERSION_STRING); - exit(0); - } else - argc = 0; - } - - if (argc<2 || argc>3) { - fprintf(stderr, "usage: %s [-V] [-c] [-n] srcpath [dstpath]\n", progname); - exit(EXIT_FAILURE); - } - -#ifdef SIGPIPE - signal(SIGPIPE, sighandle); -#endif -#ifdef SIGHUP - signal(SIGHUP, sighandle); -#endif - signal(SIGINT, sighandle); - signal(SIGTERM, sighandle); - - act = "opening environment"; - rc = mdb_env_create(&env); - if (rc == MDB_SUCCESS) { - rc = mdb_env_open(env, argv[1], flags, 0640); - } - if (rc == MDB_SUCCESS) { - act = "copying"; - if (argc == 2) - rc = mdb_env_copyfd2(env, STDOUT_FILENO, cpflags); - else - rc = mdb_env_copy2(env, argv[2], cpflags); - } - if (rc) - fprintf(stderr, "%s: %s failed, error %d (%s)\n", - progname, act, rc, mdb_strerror(rc)); - mdb_env_close(env); - - return rc ? EXIT_FAILURE : EXIT_SUCCESS; -} diff --git a/mdb_dump.c b/mdb_dump.c deleted file mode 100644 index 0b5db58e..00000000 --- a/mdb_dump.c +++ /dev/null @@ -1,314 +0,0 @@ -/* mdb_dump.c - memory-mapped database dump tool */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. 
- * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#include -#include -#include -#include -#include -#include -#include -#include "mdbx.h" - -#define PRINT 1 -static int mode; - -typedef struct flagbit { - int bit; - char *name; -} flagbit; - -flagbit dbflags[] = { - { MDB_REVERSEKEY, "reversekey" }, - { MDB_DUPSORT, "dupsort" }, - { MDB_INTEGERKEY, "integerkey" }, - { MDB_DUPFIXED, "dupfixed" }, - { MDB_INTEGERDUP, "integerdup" }, - { MDB_REVERSEDUP, "reversedup" }, - { 0, NULL } -}; - -static volatile sig_atomic_t gotsig; - -static void dumpsig( int sig ) -{ - (void) sig; - gotsig = 1; -} - -static const char hexc[] = "0123456789abcdef"; - -static void hex(unsigned char c) -{ - putchar(hexc[c >> 4]); - putchar(hexc[c & 0xf]); -} - -static void text(MDB_val *v) -{ - unsigned char *c, *end; - - putchar(' '); - c = v->mv_data; - end = c + v->mv_size; - while (c < end) { - if (isprint(*c)) { - putchar(*c); - } else { - putchar('\\'); - hex(*c); - } - c++; - } - putchar('\n'); -} - -static void byte(MDB_val *v) -{ - unsigned char *c, *end; - - putchar(' '); - c = v->mv_data; - end = c + v->mv_size; - while (c < end) { - hex(*c++); - } - putchar('\n'); -} - -/* Dump in BDB-compatible format */ -static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) -{ - MDB_cursor *mc; - MDB_stat ms; - MDB_val key, data; - MDB_envinfo info; - unsigned int flags; - int rc, i; - - rc = mdb_dbi_flags(txn, dbi, &flags); - if (rc) return rc; - - rc = mdb_stat(txn, dbi, &ms); - if (rc) return rc; - - rc = mdb_env_info(mdb_txn_env(txn), &info); - if (rc) return rc; - - printf("VERSION=3\n"); - printf("format=%s\n", mode & PRINT ? 
"print" : "bytevalue"); - if (name) - printf("database=%s\n", name); - printf("type=btree\n"); - printf("mapsize=%zu\n", info.me_mapsize); - if (info.me_mapaddr) - printf("mapaddr=%p\n", info.me_mapaddr); - printf("maxreaders=%u\n", info.me_maxreaders); - - for (i=0; dbflags[i].bit; i++) - if (flags & dbflags[i].bit) - printf("%s=1\n", dbflags[i].name); - - printf("db_pagesize=%d\n", ms.ms_psize); - printf("HEADER=END\n"); - - rc = mdb_cursor_open(txn, dbi, &mc); - if (rc) return rc; - - while ((rc = mdb_cursor_get(mc, &key, &data, MDB_NEXT)) == MDB_SUCCESS) { - if (gotsig) { - rc = EINTR; - break; - } - if (mode & PRINT) { - text(&key); - text(&data); - } else { - byte(&key); - byte(&data); - } - } - printf("DATA=END\n"); - if (rc == MDB_NOTFOUND) - rc = MDB_SUCCESS; - - return rc; -} - -static void usage(char *prog) -{ - fprintf(stderr, "usage: %s [-V] [-f output] [-l] [-n] [-p] [-a|-s subdb] dbpath\n", prog); - exit(EXIT_FAILURE); -} - -int main(int argc, char *argv[]) -{ - int i, rc; - MDB_env *env; - MDB_txn *txn; - MDB_dbi dbi; - char *prog = argv[0]; - char *envname; - char *subname = NULL; - int alldbs = 0, envflags = 0, list = 0; - - if (argc < 2) { - usage(prog); - } - - /* -a: dump main DB and all subDBs - * -s: dump only the named subDB - * -n: use NOSUBDIR flag on env_open - * -p: use printable characters - * -f: write to file instead of stdout - * -V: print version and exit - * (default) dump only the main DB - */ - while ((i = getopt(argc, argv, "af:lnps:V")) != EOF) { - switch(i) { - case 'V': - printf("%s\n", MDB_VERSION_STRING); - exit(0); - break; - case 'l': - list = 1; - /*FALLTHROUGH*/; - case 'a': - if (subname) - usage(prog); - alldbs++; - break; - case 'f': - if (freopen(optarg, "w", stdout) == NULL) { - fprintf(stderr, "%s: %s: reopen: %s\n", - prog, optarg, strerror(errno)); - exit(EXIT_FAILURE); - } - break; - case 'n': - envflags |= MDB_NOSUBDIR; - break; - case 'p': - mode |= PRINT; - break; - case 's': - if (alldbs) - usage(prog); - 
subname = optarg; - break; - default: - usage(prog); - } - } - - if (optind != argc - 1) - usage(prog); - -#ifdef SIGPIPE - signal(SIGPIPE, dumpsig); -#endif -#ifdef SIGHUP - signal(SIGHUP, dumpsig); -#endif - signal(SIGINT, dumpsig); - signal(SIGTERM, dumpsig); - - envname = argv[optind]; - rc = mdb_env_create(&env); - if (rc) { - fprintf(stderr, "mdb_env_create failed, error %d %s\n", rc, mdb_strerror(rc)); - return EXIT_FAILURE; - } - - if (alldbs || subname) { - mdb_env_set_maxdbs(env, 2); - } - - rc = mdb_env_open(env, envname, envflags | MDB_RDONLY, 0664); - if (rc) { - fprintf(stderr, "mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) { - fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - - rc = mdb_open(txn, subname, 0, &dbi); - if (rc) { - fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - - if (alldbs) { - MDB_cursor *cursor; - MDB_val key; - int count = 0; - - rc = mdb_cursor_open(txn, dbi, &cursor); - if (rc) { - fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - while ((rc = mdb_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { - char *str; - MDB_dbi db2; - if (memchr(key.mv_data, '\0', key.mv_size)) - continue; - count++; - str = malloc(key.mv_size+1); - memcpy(str, key.mv_data, key.mv_size); - str[key.mv_size] = '\0'; - rc = mdb_open(txn, str, 0, &db2); - if (rc == MDB_SUCCESS) { - if (list) { - printf("%s\n", str); - list++; - } else { - rc = dumpit(txn, db2, str); - if (rc) - break; - } - mdb_close(env, db2); - } - free(str); - if (rc) continue; - } - mdb_cursor_close(cursor); - if (!count) { - fprintf(stderr, "%s: %s does not contain multiple databases\n", prog, envname); - rc = MDB_NOTFOUND; - } else if (rc == MDB_INCOMPATIBLE) { - /* LY: the record it not a named sub-db. 
*/ - rc = MDB_SUCCESS; - } - } else { - rc = dumpit(txn, dbi, subname); - } - if (rc && rc != MDB_NOTFOUND) - fprintf(stderr, "%s: %s: %s\n", prog, envname, mdb_strerror(rc)); - - mdb_close(env, dbi); -txn_abort: - mdb_txn_abort(txn); -env_close: - mdb_env_close(env); - - return rc ? EXIT_FAILURE : EXIT_SUCCESS; -} diff --git a/mdb_load.c b/mdb_load.c deleted file mode 100644 index e2cddd53..00000000 --- a/mdb_load.c +++ /dev/null @@ -1,456 +0,0 @@ -/* mdb_load.c - memory-mapped database load tool */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#include -#include -#include -#include -#include -#include -#include "mdbx.h" - -#define PRINT 1 -#define NOHDR 2 -static int mode; - -static char *subname = NULL; - -static size_t lineno; -static int version; - -static int dbi_flags; - -static char *prog; - -static int Eof; - -static MDB_envinfo info; - -static MDB_val kbuf, dbuf; - -#define STRLENOF(s) (sizeof(s)-1) - -typedef struct flagbit { - int bit; - char *name; - int len; -} flagbit; - -#define S(s) s, STRLENOF(s) - -flagbit dbflags[] = { - { MDB_REVERSEKEY, S("reversekey") }, - { MDB_DUPSORT, S("dupsort") }, - { MDB_INTEGERKEY, S("integerkey") }, - { MDB_DUPFIXED, S("dupfixed") }, - { MDB_INTEGERDUP, S("integerdup") }, - { MDB_REVERSEDUP, S("reversedup") }, - { 0, NULL, 0 } -}; - -static void readhdr(void) -{ - char *ptr; - - dbi_flags = 0; - while (fgets(dbuf.mv_data, dbuf.mv_size, stdin) != NULL) { - lineno++; - if (!strncmp(dbuf.mv_data, "db_pagesize=", STRLENOF("db_pagesize=")) - || !strncmp(dbuf.mv_data, "duplicates=", 
STRLENOF("duplicates="))) { - /* LY: silently ignore information fields. */ - continue; - } else if (!strncmp(dbuf.mv_data, "VERSION=", STRLENOF("VERSION="))) { - version=atoi((char *)dbuf.mv_data+STRLENOF("VERSION=")); - if (version > 3) { - fprintf(stderr, "%s: line %zd: unsupported VERSION %d\n", - prog, lineno, version); - exit(EXIT_FAILURE); - } - } else if (!strncmp(dbuf.mv_data, "HEADER=END", STRLENOF("HEADER=END"))) { - break; - } else if (!strncmp(dbuf.mv_data, "format=", STRLENOF("format="))) { - if (!strncmp((char *)dbuf.mv_data+STRLENOF("FORMAT="), "print", STRLENOF("print"))) - mode |= PRINT; - else if (strncmp((char *)dbuf.mv_data+STRLENOF("FORMAT="), "bytevalue", STRLENOF("bytevalue"))) { - fprintf(stderr, "%s: line %zd: unsupported FORMAT %s\n", - prog, lineno, (char *)dbuf.mv_data+STRLENOF("FORMAT=")); - exit(EXIT_FAILURE); - } - } else if (!strncmp(dbuf.mv_data, "database=", STRLENOF("database="))) { - ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); - if (ptr) *ptr = '\0'; - if (subname) free(subname); - subname = strdup((char *)dbuf.mv_data+STRLENOF("database=")); - } else if (!strncmp(dbuf.mv_data, "type=", STRLENOF("type="))) { - if (strncmp((char *)dbuf.mv_data+STRLENOF("type="), "btree", STRLENOF("btree"))) { - fprintf(stderr, "%s: line %zd: unsupported type %s\n", - prog, lineno, (char *)dbuf.mv_data+STRLENOF("type=")); - exit(EXIT_FAILURE); - } - } else if (!strncmp(dbuf.mv_data, "mapaddr=", STRLENOF("mapaddr="))) { - int i; - ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); - if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data+STRLENOF("mapaddr="), "%p", &info.me_mapaddr); - if (i != 1) { - fprintf(stderr, "%s: line %zd: invalid mapaddr %s\n", - prog, lineno, (char *)dbuf.mv_data+STRLENOF("mapaddr=")); - exit(EXIT_FAILURE); - } - } else if (!strncmp(dbuf.mv_data, "mapsize=", STRLENOF("mapsize="))) { - int i; - ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); - if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data+STRLENOF("mapsize="), 
"%zu", &info.me_mapsize); - if (i != 1) { - fprintf(stderr, "%s: line %zd: invalid mapsize %s\n", - prog, lineno, (char *)dbuf.mv_data+STRLENOF("mapsize=")); - exit(EXIT_FAILURE); - } - } else if (!strncmp(dbuf.mv_data, "maxreaders=", STRLENOF("maxreaders="))) { - int i; - ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); - if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data+STRLENOF("maxreaders="), "%u", &info.me_maxreaders); - if (i != 1) { - fprintf(stderr, "%s: line %zd: invalid maxreaders %s\n", - prog, lineno, (char *)dbuf.mv_data+STRLENOF("maxreaders=")); - exit(EXIT_FAILURE); - } - } else { - int i; - for (i=0; dbflags[i].bit; i++) { - if (!strncmp(dbuf.mv_data, dbflags[i].name, dbflags[i].len) && - ((char *)dbuf.mv_data)[dbflags[i].len] == '=') { - if (((char *)dbuf.mv_data)[dbflags[i].len+1] == '1') - dbi_flags |= dbflags[i].bit; - break; - } - } - if (!dbflags[i].bit) { - ptr = memchr(dbuf.mv_data, '=', dbuf.mv_size); - if (!ptr) { - fprintf(stderr, "%s: line %zd: unexpected format\n", - prog, lineno); - exit(EXIT_FAILURE); - } else { - *ptr = '\0'; - fprintf(stderr, "%s: line %zd: unrecognized keyword ignored: %s\n", - prog, lineno, (char *)dbuf.mv_data); - } - } - } - } -} - -static void badend(void) -{ - fprintf(stderr, "%s: line %zd: unexpected end of input\n", - prog, lineno); -} - -static int unhex(unsigned char *c2) -{ - int x, c; - x = *c2++ & 0x4f; - if (x & 0x40) - x -= 55; - c = x << 4; - x = *c2 & 0x4f; - if (x & 0x40) - x -= 55; - c |= x; - return c; -} - -static int readline(MDB_val *out, MDB_val *buf) -{ - unsigned char *c1, *c2, *end; - size_t len, l2; - int c; - - if (!(mode & NOHDR)) { - c = fgetc(stdin); - if (c == EOF) { - Eof = 1; - return EOF; - } - if (c != ' ') { - lineno++; - if (fgets(buf->mv_data, buf->mv_size, stdin) == NULL) { -badend: - Eof = 1; - badend(); - return EOF; - } - if (c == 'D' && !strncmp(buf->mv_data, "ATA=END", STRLENOF("ATA=END"))) - return EOF; - goto badend; - } - } - if (fgets(buf->mv_data, buf->mv_size, 
stdin) == NULL) { - Eof = 1; - return EOF; - } - lineno++; - - c1 = buf->mv_data; - len = strlen((char *)c1); - l2 = len; - - /* Is buffer too short? */ - while (c1[len-1] != '\n') { - buf->mv_data = realloc(buf->mv_data, buf->mv_size*2); - if (!buf->mv_data) { - Eof = 1; - fprintf(stderr, "%s: line %zd: out of memory, line too long\n", - prog, lineno); - return EOF; - } - c1 = buf->mv_data; - c1 += l2; - if (fgets((char *)c1, buf->mv_size+1, stdin) == NULL) { - Eof = 1; - badend(); - return EOF; - } - buf->mv_size *= 2; - len = strlen((char *)c1); - l2 += len; - } - c1 = c2 = buf->mv_data; - len = l2; - c1[--len] = '\0'; - end = c1 + len; - - if (mode & PRINT) { - while (c2 < end) { - if (*c2 == '\\') { - if (c2[1] == '\\') { - c1++; c2 += 2; - } else { - if (c2+3 > end || !isxdigit(c2[1]) || !isxdigit(c2[2])) { - Eof = 1; - badend(); - return EOF; - } - *c1++ = unhex(++c2); - c2 += 2; - } - } else { - /* copies are redundant when no escapes were used */ - *c1++ = *c2++; - } - } - } else { - /* odd length not allowed */ - if (len & 1) { - Eof = 1; - badend(); - return EOF; - } - while (c2 < end) { - if (!isxdigit(*c2) || !isxdigit(c2[1])) { - Eof = 1; - badend(); - return EOF; - } - *c1++ = unhex(c2); - c2 += 2; - } - } - c2 = out->mv_data = buf->mv_data; - out->mv_size = c1 - c2; - - return 0; -} - -static void usage(void) -{ - fprintf(stderr, "usage: %s [-V] [-f input] [-n] [-s name] [-N] [-T] dbpath\n", prog); - exit(EXIT_FAILURE); -} - -int main(int argc, char *argv[]) -{ - int i, rc; - MDB_env *env; - MDB_txn *txn; - MDB_cursor *mc; - MDB_dbi dbi; - char *envname; - int envflags = 0, putflags = 0; - - prog = argv[0]; - - if (argc < 2) { - usage(); - } - - /* -f: load file instead of stdin - * -n: use NOSUBDIR flag on env_open - * -s: load into named subDB - * -N: use NOOVERWRITE on puts - * -T: read plaintext - * -V: print version and exit - */ - while ((i = getopt(argc, argv, "f:ns:NTV")) != EOF) { - switch(i) { - case 'V': - printf("%s\n", 
MDB_VERSION_STRING); - exit(0); - break; - case 'f': - if (freopen(optarg, "r", stdin) == NULL) { - fprintf(stderr, "%s: %s: reopen: %s\n", - prog, optarg, strerror(errno)); - exit(EXIT_FAILURE); - } - break; - case 'n': - envflags |= MDB_NOSUBDIR; - break; - case 's': - subname = strdup(optarg); - break; - case 'N': - putflags = MDB_NOOVERWRITE|MDB_NODUPDATA; - break; - case 'T': - mode |= NOHDR | PRINT; - break; - default: - usage(); - } - } - - if (optind != argc - 1) - usage(); - - dbuf.mv_size = 4096; - dbuf.mv_data = malloc(dbuf.mv_size); - - if (!(mode & NOHDR)) - readhdr(); - - envname = argv[optind]; - rc = mdb_env_create(&env); - if (rc) { - fprintf(stderr, "mdb_env_create failed, error %d %s\n", rc, mdb_strerror(rc)); - return EXIT_FAILURE; - } - - mdb_env_set_maxdbs(env, 2); - - if (info.me_maxreaders) - mdb_env_set_maxreaders(env, info.me_maxreaders); - - if (info.me_mapsize) - mdb_env_set_mapsize(env, info.me_mapsize); - - if (info.me_mapaddr) - envflags |= MDB_FIXEDMAP; - - rc = mdb_env_open(env, envname, envflags, 0664); - if (rc) { - fprintf(stderr, "mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - - kbuf.mv_size = mdb_env_get_maxkeysize(env) * 2 + 2; - kbuf.mv_data = malloc(kbuf.mv_size); - - while(!Eof) { - MDB_val key, data; - int batch = 0; - - rc = mdb_txn_begin(env, NULL, 0, &txn); - if (rc) { - fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - - rc = mdb_open(txn, subname, dbi_flags|MDB_CREATE, &dbi); - if (rc) { - fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - - rc = mdb_cursor_open(txn, dbi, &mc); - if (rc) { - fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - - while(1) { - rc = readline(&key, &kbuf); - if (rc) /* rc == EOF */ - break; - - rc = readline(&data, &dbuf); - if (rc) { - fprintf(stderr, "%s: line %zd: failed to read key value\n", prog, 
lineno); - goto txn_abort; - } - - rc = mdb_cursor_put(mc, &key, &data, putflags); - if (rc == MDB_KEYEXIST && putflags) - continue; - if (rc) { - fprintf(stderr, "mdb_cursor_put failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - batch++; - if (batch == 100) { - rc = mdb_txn_commit(txn); - if (rc) { - fprintf(stderr, "%s: line %zd: txn_commit: %s\n", - prog, lineno, mdb_strerror(rc)); - goto env_close; - } - rc = mdb_txn_begin(env, NULL, 0, &txn); - if (rc) { - fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - rc = mdb_cursor_open(txn, dbi, &mc); - if (rc) { - fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - batch = 0; - } - } - rc = mdb_txn_commit(txn); - txn = NULL; - if (rc) { - fprintf(stderr, "%s: line %zd: txn_commit: %s\n", - prog, lineno, mdb_strerror(rc)); - goto env_close; - } - mdb_dbi_close(env, dbi); - if(!(mode & NOHDR)) - readhdr(); - } - -txn_abort: - mdb_txn_abort(txn); -env_close: - mdb_env_close(env); - - return rc ? EXIT_FAILURE : EXIT_SUCCESS; -} diff --git a/mdb_stat.c b/mdb_stat.c deleted file mode 100644 index d47ffe9e..00000000 --- a/mdb_stat.c +++ /dev/null @@ -1,299 +0,0 @@ -/* mdb_stat.c - memory-mapped database status tool */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . 
- */ - -#include -#include -#include -#include -#include "mdbx.h" - -static void prstat(MDBX_stat *ms) -{ -#if 0 - printf(" Page size: %u\n", ms->base.ms_psize); -#endif - printf(" Tree depth: %u\n", ms->base.ms_depth); - printf(" Branch pages: %zu\n", ms->base.ms_branch_pages); - printf(" Leaf pages: %zu\n", ms->base.ms_leaf_pages); - printf(" Overflow pages: %zu\n", ms->base.ms_overflow_pages); - printf(" Entries: %zu\n", ms->base.ms_entries); -} - -static void usage(char *prog) -{ - fprintf(stderr, "usage: %s [-V] [-n] [-e] [-r[r]] [-f[f[f]]] [-a|-s subdb] dbpath\n", prog); - exit(EXIT_FAILURE); -} - -int main(int argc, char *argv[]) -{ - int i, rc; - MDB_env *env; - MDB_txn *txn; - MDB_dbi dbi; - MDBX_stat mst; - MDBX_envinfo mei; - char *prog = argv[0]; - char *envname; - char *subname = NULL; - int alldbs = 0, envinfo = 0, envflags = 0, freinfo = 0, rdrinfo = 0; - - if (argc < 2) { - usage(prog); - } - - /* -a: print stat of main DB and all subDBs - * -s: print stat of only the named subDB - * -e: print env info - * -f: print freelist info - * -r: print reader info - * -n: use NOSUBDIR flag on env_open - * -V: print version and exit - * (default) print stat of only the main DB - */ - while ((i = getopt(argc, argv, "Vaefnrs:")) != EOF) { - switch(i) { - case 'V': - printf("%s\n", MDB_VERSION_STRING); - exit(0); - break; - case 'a': - if (subname) - usage(prog); - alldbs++; - break; - case 'e': - envinfo++; - break; - case 'f': - freinfo++; - break; - case 'n': - envflags |= MDB_NOSUBDIR; - break; - case 'r': - rdrinfo++; - break; - case 's': - if (alldbs) - usage(prog); - subname = optarg; - break; - default: - usage(prog); - } - } - - if (optind != argc - 1) - usage(prog); - - envname = argv[optind]; - rc = mdb_env_create(&env); - if (rc) { - fprintf(stderr, "mdb_env_create failed, error %d %s\n", rc, mdb_strerror(rc)); - return EXIT_FAILURE; - } - - if (alldbs || subname) { - mdb_env_set_maxdbs(env, 4); - } - - rc = mdb_env_open(env, envname, envflags | 
MDB_RDONLY, 0664); - if (rc) { - fprintf(stderr, "mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - - if (envinfo) { - (void)mdbx_env_stat(env, &mst, sizeof(mst)); - (void)mdbx_env_info(env, &mei, sizeof(mei)); - printf("Environment Info\n"); - printf(" Map address: %p\n", mei.base.me_mapaddr); - printf(" Map size: %zu\n", mei.base.me_mapsize); - printf(" Page size: %u\n", mst.base.ms_psize); - printf(" Max pages: %zu\n", mei.base.me_mapsize / mst.base.ms_psize); - printf(" Number of pages used: %zu\n", mei.base.me_last_pgno+1); - printf(" Last transaction ID: %zu\n", mei.base.me_last_txnid); - printf(" Tail transaction ID: %zu (%zi)\n", - mei.me_tail_txnid, mei.me_tail_txnid - mei.base.me_last_txnid); - printf(" Max readers: %u\n", mei.base.me_maxreaders); - printf(" Number of readers used: %u\n", mei.base.me_numreaders); - } else { - /* LY: zap warnings from gcc */ - memset(&mst, 0, sizeof(mst)); - memset(&mei, 0, sizeof(mei)); - } - - if (rdrinfo) { - printf("Reader Table Status\n"); - rc = mdb_reader_list(env, (MDB_msg_func *)fputs, stdout); - if (rdrinfo > 1) { - int dead; - mdb_reader_check(env, &dead); - printf(" %d stale readers cleared.\n", dead); - rc = mdb_reader_list(env, (MDB_msg_func *)fputs, stdout); - } - if (!(subname || alldbs || freinfo)) - goto env_close; - } - - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) { - fprintf(stderr, "mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc)); - goto env_close; - } - - if (freinfo) { - MDB_cursor *cursor; - MDB_val key, data; - size_t pages = 0, *iptr; - size_t reclaimable = 0; - - printf("Freelist Status\n"); - dbi = 0; - rc = mdb_cursor_open(txn, dbi, &cursor); - if (rc) { - fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - rc = mdbx_stat(txn, dbi, &mst, sizeof(mst)); - if (rc) { - fprintf(stderr, "mdb_stat failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - prstat(&mst); - while 
((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - iptr = data.mv_data; - pages += *iptr; - if (envinfo && mei.me_tail_txnid > *(size_t *)key.mv_data) - reclaimable += *iptr; - if (freinfo > 1) { - char *bad = ""; - size_t pg, prev; - ssize_t i, j, span = 0; - j = *iptr++; - for (i = j, prev = 1; --i >= 0; ) { - pg = iptr[i]; - if (pg <= prev) - bad = " [bad sequence]"; - prev = pg; - pg += span; - for (; i >= span && iptr[i-span] == pg; span++, pg++) ; - } - printf(" Transaction %zu, %zd pages, maxspan %zd%s\n", - *(size_t *)key.mv_data, j, span, bad); - if (freinfo > 2) { - for (--j; j >= 0; ) { - pg = iptr[j]; - for (span=1; --j >= 0 && iptr[j] == pg+span; span++) ; - if (span>1) - printf(" %9zu[%zd]\n", pg, span); - else - printf(" %9zu\n", pg); - } - } - } - } - mdb_cursor_close(cursor); - if (envinfo) { - size_t value = mei.base.me_mapsize / mst.base.ms_psize; - double percent = value / 100.0; - printf("Page Allocation Info\n"); - printf(" Max pages: %9zu 100%%\n", value); - - value = mei.base.me_last_pgno+1; - printf(" Number of pages used: %zu %.1f%%\n", value, value / percent); - - value = mei.base.me_mapsize / mst.base.ms_psize - (mei.base.me_last_pgno+1); - printf(" Remained: %zu %.1f%%\n", value, value / percent); - - value = mei.base.me_last_pgno+1 - pages; - printf(" Used now: %zu %.1f%%\n", value, value / percent); - - value = pages; - printf(" Unallocated: %zu %.1f%%\n", value, value / percent); - - value = pages - reclaimable; - printf(" Detained: %zu %.1f%%\n", value, value / percent); - - value = reclaimable; - printf(" Reclaimable: %zu %.1f%%\n", value, value / percent); - - value = mei.base.me_mapsize / mst.base.ms_psize - (mei.base.me_last_pgno+1) + reclaimable; - printf(" Available: %zu %.1f%%\n", value, value / percent); - } else - printf(" Free pages: %zu\n", pages); - } - - rc = mdb_open(txn, subname, 0, &dbi); - if (rc) { - fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - - rc = 
mdbx_stat(txn, dbi, &mst, sizeof(mst)); - if (rc) { - fprintf(stderr, "mdb_stat failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - printf("Status of %s\n", subname ? subname : "Main DB"); - prstat(&mst); - - if (alldbs) { - MDB_cursor *cursor; - MDB_val key; - - rc = mdb_cursor_open(txn, dbi, &cursor); - if (rc) { - fprintf(stderr, "mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - while ((rc = mdb_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { - char *str; - MDB_dbi db2; - if (memchr(key.mv_data, '\0', key.mv_size)) - continue; - str = malloc(key.mv_size+1); - memcpy(str, key.mv_data, key.mv_size); - str[key.mv_size] = '\0'; - rc = mdb_open(txn, str, 0, &db2); - if (rc == MDB_SUCCESS) - printf("Status of %s\n", str); - free(str); - if (rc) continue; - rc = mdbx_stat(txn, db2, &mst, sizeof(mst)); - if (rc) { - fprintf(stderr, "mdb_stat failed, error %d %s\n", rc, mdb_strerror(rc)); - goto txn_abort; - } - prstat(&mst); - mdb_close(env, db2); - } - mdb_cursor_close(cursor); - } - - if (rc == MDB_NOTFOUND) - rc = MDB_SUCCESS; - - mdb_close(env, dbi); -txn_abort: - mdb_txn_abort(txn); -env_close: - mdb_env_close(env); - - return rc ? EXIT_FAILURE : EXIT_SUCCESS; -} diff --git a/mdbx.c b/mdbx.c index 66c377d8..b96857c1 100644 --- a/mdbx.c +++ b/mdbx.c @@ -1,7 +1,13 @@ /* * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. + * + * This code is derived from "LMDB engine" written by + * Howard Chu (Symas Corporation), which itself derived from btree.c + * written by Martin Hedenfalk. + * + * --- + * + * Portions Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. 
 * * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -10,415 +16,11265 @@ * A copy of this license is available in the file LICENSE in the * top-level directory of the distribution or, alternatively, at * . + * + * --- + * + * Portions Copyright (c) 2009, 2010 Martin Hedenfalk + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include + #include "mdbx.h" -int mdb_runtime_flags = MDBX_DBG_PRINT +#ifndef MDB_DEBUG +#define MDB_DEBUG 0 +#endif + +/* LY: Please do not ask us for Windows support, just never! + * But you can make a fork for Windows, or become maintainer for FreeBSD... */ +#ifndef __gnu_linux__ +#warning "libmdbx supports only GNU Linux" +#endif + +#if !defined(__GNUC__) || !__GNUC_PREREQ(4, 2) +/* LY: Actually libmdbx was not tested with compilers + * older than GCC 4.4 (from RHEL6). + * But you could remove this #warning and try to continue at your own risk. + * In such case please don't raise issues related ONLY to old compilers. + */ +#warning "libmdbx required at least GCC 4.2 compatible C/C++ compiler."
+#endif + +#if !defined(__GNU_LIBRARY__) || !__GLIBC_PREREQ(2, 12) +/* LY: Actually libmdbx was not tested with something + * older than glibc 2.12 (from RHEL6). + * But you could remove this #warning and try to continue at your own risk. + * In such case please don't raise issues related ONLY to old systems. + */ +#warning "libmdbx required at least GLIBC 2.12." +#endif + #if MDB_DEBUG - | MDBX_DBG_ASSERT +#undef NDEBUG +#endif + +#include +#include +#include +#include +#include +#ifdef HAVE_SYS_FILE_H +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) +#include +#include /* defines BYTE_ORDER on HPUX and Solaris */ +#endif + +#ifndef _POSIX_SYNCHRONIZED_IO +#define fdatasync fsync +#endif + +#ifndef BYTE_ORDER +#if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && \ + !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) +/* Solaris just defines one or the other */ +#define LITTLE_ENDIAN 1234 +#define BIG_ENDIAN 4321 +#ifdef _LITTLE_ENDIAN +#define BYTE_ORDER LITTLE_ENDIAN +#else +#define BYTE_ORDER BIG_ENDIAN +#endif +#else +#define BYTE_ORDER __BYTE_ORDER +#endif +#endif + +#ifndef LITTLE_ENDIAN +#define LITTLE_ENDIAN __LITTLE_ENDIAN +#endif +#ifndef BIG_ENDIAN +#define BIG_ENDIAN __BIG_ENDIAN +#endif + +#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) +#define MISALIGNED_OK 1 +#endif + +#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) +#error "Unknown or unsupported endianness (BYTE_ORDER)" +#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF +#error "Two's complement, reasonably sized integer types, please" +#endif + +#include "./barriers.h" +#include "./midl.h" +#include "./reopen.h" + +/** Search for an ID in an IDL. + * @param[in] ids The IDL to search. + * @param[in] id The ID to search for. + * @return The index of the first ID greater than or equal to \b id.
+ */ +static unsigned mdbx_midl_search(MDB_IDL ids, MDB_ID id); + +/** Allocate an IDL. + * Allocates memory for an IDL of the given size. + * @return IDL on success, NULL on failure. + */ +static MDB_IDL mdbx_midl_alloc(int num); + +/** Free an IDL. + * @param[in] ids The IDL to free. + */ +static void mdbx_midl_free(MDB_IDL ids); + +/** Shrink an IDL. + * Return the IDL to the default size if it has grown larger. + * @param[in,out] idp Address of the IDL to shrink. + */ +static void mdbx_midl_shrink(MDB_IDL *idp); + +/** Make room for num additional elements in an IDL. + * @param[in,out] idp Address of the IDL. + * @param[in] num Number of elements to make room for. + * @return 0 on success, ENOMEM on failure. + */ +static int mdbx_midl_need(MDB_IDL *idp, unsigned num); + +/** Append an ID onto an IDL. + * @param[in,out] idp Address of the IDL to append to. + * @param[in] id The ID to append. + * @return 0 on success, ENOMEM if the IDL is too large. + */ +static int mdbx_midl_append(MDB_IDL *idp, MDB_ID id); + +/** Append an IDL onto an IDL. + * @param[in,out] idp Address of the IDL to append to. + * @param[in] app The IDL to append. + * @return 0 on success, ENOMEM if the IDL is too large. + */ +static int mdbx_midl_append_list(MDB_IDL *idp, MDB_IDL app); + +/** Append an ID range onto an IDL. + * @param[in,out] idp Address of the IDL to append to. + * @param[in] id The lowest ID to append. + * @param[in] n Number of IDs to append. + * @return 0 on success, ENOMEM if the IDL is too large. + */ +static int mdbx_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n); + +/** Merge an IDL onto an IDL. The destination IDL must be big enough. + * @param[in] idl The IDL to merge into. + * @param[in] merge The IDL to merge. + */ +static void mdbx_midl_xmerge(MDB_IDL idl, MDB_IDL merge); + +/** Sort an IDL. + * @param[in,out] ids The IDL to sort. + */ +static void mdbx_midl_sort(MDB_IDL ids); + +/** Search for an ID in an ID2L. + * @param[in] ids The ID2L to search. 
+ * @param[in] id The ID to search for. + * @return The index of the first ID2 whose \b mid member is greater than + * or equal to \b id. + */ +static unsigned mdbx_mid2l_search(MDB_ID2L ids, MDB_ID id); + +/** Insert an ID2 into a ID2L. + * @param[in,out] ids The ID2L to insert into. + * @param[in] id The ID2 to insert. + * @return 0 on success, -1 if the ID was already present in the ID2L. + */ +static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id); + +/** Append an ID2 into a ID2L. + * @param[in,out] ids The ID2L to append into. + * @param[in] id The ID2 to append. + * @return 0 on success, -2 if the ID2L is too big. + */ +static int mdbx_mid2l_append(MDB_ID2L ids, MDB_ID2 *id); + +int mdbx_runtime_flags = MDBX_DBG_PRINT +#if MDB_DEBUG + | MDBX_DBG_ASSERT #endif #if MDB_DEBUG > 1 - | MDBX_DBG_TRACE + | MDBX_DBG_TRACE #endif #if MDB_DEBUG > 2 - | MDBX_DBG_AUDIT + | MDBX_DBG_AUDIT #endif #if MDB_DEBUG > 3 - | MDBX_DBG_EXTRA + | MDBX_DBG_EXTRA #endif - ; + ; -static MDBX_debug_func *mdb_debug_logger; +static MDBX_debug_func *mdbx_debug_logger; -int mdbx_setup_debug(int flags, MDBX_debug_func* logger, long edge_txn); +int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn); -#include "mdb.c" +/** Features under development */ +#ifndef MDB_DEVEL +#define MDB_DEVEL 0 +#endif -int __cold -mdbx_setup_debug(int flags, MDBX_debug_func* logger, long edge_txn) { - unsigned ret = mdb_runtime_flags; - if (flags != (int) MDBX_DBG_DNT) - mdb_runtime_flags = flags; - if (logger != (MDBX_debug_func*) MDBX_DBG_DNT) - mdb_debug_logger = logger; - if (edge_txn != (long) MDBX_DBG_DNT) { +/** Wrapper around __func__, which is a C99 feature */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define mdbx_func_ __func__ +#elif (defined(__GNUC__) && __GNUC__ >= 2) || defined(__clang__) +#define mdbx_func_ __FUNCTION__ +#else +/* If a debug message says (), update the #if statements above */ +#define mdbx_func_ "" +#endif + +/** Some platforms define the 
EOWNERDEAD error code + * even though they don't support Robust Mutexes. + * Compile with -DMDB_USE_ROBUST=0. + */ +#ifndef MDB_USE_ROBUST +/* Howard Chu: Android currently lacks Robust Mutex support */ +#if defined(EOWNERDEAD) && \ + !defined(ANDROID) /* LY: glibc before 2.10 has a troubles with Robust \ + Mutex too. */ \ + && __GLIBC_PREREQ(2, 10) +#define MDB_USE_ROBUST 1 +#else +#define MDB_USE_ROBUST 0 +#endif +#endif /* MDB_USE_ROBUST */ + +/* Internal error codes, not exposed outside liblmdb */ +#define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) + +/** Mutex for the reader table (rw = r) or write transaction (rw = w). + */ +#define MDB_MUTEX(env, rw) (&(env)->me_txns->mti_##rw##mutex) + +/** An abstraction for a file handle. + * On POSIX systems file handles are small integers. On Windows + * they're opaque pointers. + */ +#define HANDLE int + +/** A value for an invalid file handle. + * Mainly used to initialize file variables and signify that they are + * unused. + */ +#define INVALID_HANDLE_VALUE (-1) + +/** Get the size of a memory page for the system. + * This is the basic size that the platform's memory manager uses, and is + * fundamental to the use of memory-mapped files. + */ +#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) + +/** @} */ + +static int mdbx_mutex_lock(MDB_env *env, pthread_mutex_t *mutex); +static int mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, int rc); +static void mdbx_mutex_unlock(MDB_env *env, pthread_mutex_t *mutex); + +/** A page number in the database. + * Note that 64 bit page numbers are overkill, since pages themselves + * already represent 12-13 bits of addressable memory, and the OS will + * always limit applications to a maximum of 63 bits of address space. + * + * @note In the #MDB_node structure, we only store 48 bits of this value, + * which thus limits us to only 60 bits of addressable data. + */ +typedef MDB_ID pgno_t; + +/** A transaction ID. + * See struct MDB_txn.mt_txnid for details. 
+ */ +typedef MDB_ID txnid_t; + +/** @defgroup debug Debug Macros + * @{ + */ +/** Debugging output value of a cursor DBI: Negative in a sub-cursor. */ +#define DDBI(mc) \ + (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) +/** @} */ + +/** @brief The maximum size of a database page. + * + * It is 32k or 64k, since value-PAGEBASE must fit in + * #MDB_page.%mp_upper. + * + * LMDB will use database pages < OS pages if needed. + * That causes more I/O in write transactions: The OS must + * know (read) the whole page before writing a partial page. + * + * Note that we don't currently support Huge pages. On Linux, + * regular data files cannot use Huge pages, and in general + * Huge pages aren't actually pageable. We rely on the OS + * demand-pager to read our data and page it out when memory + * pressure from other processes is high. So until OSs have + * actual paging support for Huge pages, they're not viable. + */ +#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) + +/** The minimum number of keys required in a database page. + * Setting this to a larger value will place a smaller bound on the + * maximum size of a data item. Data items larger than this size will + * be pushed into overflow pages instead of being stored directly in + * the B-tree node. This value used to default to 4. With a page size + * of 4096 bytes that meant that any item larger than 1024 bytes would + * go into an overflow page. That also meant that on average 2-3KB of + * each overflow page was wasted space. The value cannot be lower than + * 2 because then there would no longer be a tree structure. With this + * value, items larger than 2KB will go into overflow pages, and on + * average only 1KB will be wasted. + */ +#define MDB_MINKEYS 2 + +/** A stamp that identifies a file as an LMDB file. + * There's nothing special about this value other than that it is easily + * recognizable, and it will reflect any byte order mismatches.
+ */ +#define MDB_MAGIC 0xBEEFC0DE + +/** The version number for a database's datafile format. */ +#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) +/** The version number for a database's lockfile format. */ +#define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) + +/** @brief The max size of a key we can write, or 0 for computed max. + * + * This macro should normally be left alone or set to 0. + * Note that a database with big keys or dupsort data cannot be + * reliably modified by a liblmdb which uses a smaller max. + * The default is 511 for backwards compat, or 0 when #MDB_DEVEL. + * + * Other values are allowed, for backwards compat. However: + * A value bigger than the computed max can break if you do not + * know what you are doing, and liblmdb <= 0.9.10 can break when + * modifying a DB with keys/dupsort data bigger than its max. + * + * Data items in an #MDB_DUPSORT database are also limited to + * this size, since they're actually keys of a sub-DB. Keys and + * #MDB_DUPSORT data items must fit on a node in a regular page. + */ +#ifndef MDB_MAXKEYSIZE +#define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) +#endif + +/** The maximum size of a key we can write to the environment. */ +#if MDB_MAXKEYSIZE +#define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) +#else +#define ENV_MAXKEY(env) ((env)->me_maxkey_limit) +#endif /* MDB_MAXKEYSIZE */ + +/** @brief The maximum size of a data item. + * + * We only store a 32 bit value for node sizes. + */ +#define MAXDATASIZE 0xffffffffUL + +/** Key size which fits in a #DKBUF. + * @ingroup debug + */ +#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) +/** A key buffer. + * @ingroup debug + * This is used for printing a hex dump of a key's contents. + */ +#define DKBUF char kbuf[DKBUF_MAXKEYSIZE * 2 + 1] +/** Display a key in hex. + * @ingroup debug + * Invoke a function to display a key in hex. + */ +#define DKEY(x) mdbx_dkey(x, kbuf) + +/** An invalid page number. + * Mainly used to denote an empty tree. 
+ */ +#define P_INVALID (~(pgno_t)0) + +/** Test if the flags \b f are set in a flag word \b w. */ +#define F_ISSET(w, f) (((w) & (f)) == (f)) + +/** Round \b n up to an even number. */ +#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ + +/** Used for offsets within a single page. + * Since memory pages are typically 4 or 8KB in size, 12-13 bits, + * this is plenty. + */ +typedef uint16_t indx_t; + +/** Default size of memory map. + * This is certainly too small for any actual applications. Apps should + *always set + * the size explicitly using #mdbx_env_set_mapsize(). + */ +#define DEFAULT_MAPSIZE 1048576 + +/** @defgroup readers Reader Lock Table + * Readers don't acquire any locks for their data access. Instead, they + * simply record their transaction ID in the reader table. The reader + * mutex is needed just to find an empty slot in the reader table. The + * slot's address is saved in thread-specific data so that subsequent + *read + * transactions started by the same thread need no further locking to + *proceed. + * + * If #MDB_NOTLS is set, the slot address is not saved in thread-specific + *data. + * + * No reader table is used if the database is on a read-only filesystem, + *or + * if #MDB_NOLOCK is set. + * + * Since the database uses multi-version concurrency control, readers + *don't + * actually need any locking. This table is used to keep track of which + * readers are using data from which old transactions, so that we'll know + * when a particular old transaction is no longer in use. Old + *transactions + * that have discarded any data pages can then have those pages reclaimed + * for use by a later write transaction. + * + * The lock table is constructed such that reader slots are aligned with + *the + * processor's cache line size. Any slot is only ever used by one thread. 
+ * This alignment guarantees that there will be no contention or cache + * thrashing as threads update their own slot info, and also eliminates + * any need for locking when accessing a slot. + * + * A writer thread will scan every slot in the table to determine the + *oldest + * outstanding reader transaction. Any freed pages older than this will + *be + * reclaimed by the writer. The writer doesn't use any locks when + *scanning + * this table. This means that there's no guarantee that the writer will + * see the most up-to-date reader info, but that's not required for + *correct + * operation - all we need is to know the upper bound on the oldest + *reader, + * we don't care at all about the newest reader. So the only consequence + *of + * reading stale information here is that old pages might hang around a + * while longer before being reclaimed. That's actually good anyway, + *because + * the longer we delay reclaiming old pages, the more likely it is that a + * string of contiguous pages can be found after coalescing old pages + *from + * many old transactions together. + * @{ + */ +/** Number of slots in the reader table. + * This value was chosen somewhat arbitrarily. 126 readers plus a + * couple mutexes fit exactly into 8KB on my development machine. + * Applications should set the table size using + *#mdbx_env_set_maxreaders(). + */ +#define DEFAULT_READERS 126 + +/** The information we store in a single slot of the reader table. + * In addition to a transaction ID, we also record the process and + * thread ID that owns a slot, so that we can detect stale information, + * e.g. threads or processes that went away without cleaning up. + * @note We currently don't check for stale records. We simply re-init + * the table when we know that we're the only process opening the + * lock file. + */ +typedef struct MDB_rxbody { + /** Current Transaction ID when this transaction began, or (txnid_t)-1. 
+ * Multiple readers that start at the same time will probably have the + * same ID here. Again, it's not important to exclude them from + * anything; all we need to know is which version of the DB they + * started from so we can avoid overwriting any data used in that + * particular version. + */ + volatile txnid_t mrb_txnid; + /** The process ID of the process owning this reader txn. */ + volatile pid_t mrb_pid; + /** The thread ID of the thread owning this txn. */ + volatile pthread_t mrb_tid; +} MDB_rxbody; + +/** The actual reader record, with cacheline padding. */ +typedef struct MDB_reader { + union { + MDB_rxbody mrx; +/** shorthand for mrb_txnid */ +#define mr_txnid mru.mrx.mrb_txnid +#define mr_pid mru.mrx.mrb_pid +#define mr_tid mru.mrx.mrb_tid + /** cache line alignment */ + char pad[(sizeof(MDB_rxbody) + CACHELINE_SIZE - 1) & ~(CACHELINE_SIZE - 1)]; + } mru; +} MDB_reader; + +/** The header for the reader table. + * The table resides in a memory-mapped file. (This is a different file + * than is used for the main database.) + * + * For POSIX the actual mutexes reside in the shared memory of this + * mapped file. On Windows, mutexes are named objects allocated by the + * kernel; we store the mutex names in this mapped file so that other + * processes can grab them. This same approach is also used on + * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support + * process-shared POSIX mutexes. For these cases where a named object + * is used, the object name is derived from a 64 bit FNV hash of the + * environment pathname. As such, naming collisions are extremely + * unlikely. If a collision occurs, the results are unpredictable. + */ +typedef struct MDB_txbody { + /** Stamp identifying this as an LMDB file. It must be set + * to #MDB_MAGIC. */ + uint32_t mtb_magic; + /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ + uint32_t mtb_format; + /** Mutex protecting access to this table. 
+ * This is the #MDB_MUTEX(env,r) reader table lock. + */ + pthread_mutex_t mtb_rmutex; + /** The ID of the last transaction committed to the database. + * This is recorded here only for convenience; the value can always + * be determined by reading the main database meta pages. + */ + volatile txnid_t mtb_txnid; + /** The number of slots that have been used in the reader table. + * This always records the maximum count, it is not decremented + * when readers release their slots. + */ + volatile unsigned mtb_numreaders; +} MDB_txbody; + +/** The actual reader table definition. */ +typedef struct MDB_txninfo { + union { + MDB_txbody mtb; +#define mti_magic mt1.mtb.mtb_magic +#define mti_format mt1.mtb.mtb_format +#define mti_rmutex mt1.mtb.mtb_rmutex +#define mti_rmname mt1.mtb.mtb_rmname +#define mti_txnid mt1.mtb.mtb_txnid +#define mti_numreaders mt1.mtb.mtb_numreaders + char pad[(sizeof(MDB_txbody) + CACHELINE_SIZE - 1) & ~(CACHELINE_SIZE - 1)]; + } mt1; + union { + pthread_mutex_t mt2_wmutex; +#define mti_wmutex mt2.mt2_wmutex + char pad[(sizeof(pthread_mutex_t) + CACHELINE_SIZE - 1) & + ~(CACHELINE_SIZE - 1)]; + } mt2; + MDB_reader mti_readers[1]; +} MDB_txninfo; + +/** Lockfile format signature: version, features and field layout */ +#define MDB_LOCK_FORMAT \ + ((uint32_t)((MDB_LOCK_VERSION) /* Flags which describe functionality */ \ + + (0 /* SYSV_SEM_FLAG */ << 18) + (1 /* MDB_PIDLOCK */ << 16))) +/** @} */ + +/** Common header for all page types. The page type depends on #mp_flags. + * + * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with + * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages + * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. + * + * #P_OVERFLOW records occupy one or more contiguous pages where only the + * first has a page header. They hold the real data of #F_BIGDATA nodes. + * + * #P_SUBP sub-pages are small leaf "pages" with duplicate data. 
+ * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. + * (Duplicate data can also go in sub-databases, which use normal pages.) + * + * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. + * + * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once + * in the snapshot: Either used by a database or listed in a freeDB record. + */ +typedef struct MDB_page { +#define mp_pgno mp_p.p_pgno +#define mp_next mp_p.p_next + union { + pgno_t p_pgno; /**< page number */ + struct MDB_page *p_next; /**< for in-memory list of freed pages */ + } mp_p; + uint16_t mp_leaf2_ksize; /**< key size if this is a LEAF2 page */ + /** @defgroup mdbx_page Page Flags + * @ingroup internal + * Flags for the page headers. + * @{ + */ +#define P_BRANCH 0x01 /**< branch page */ +#define P_LEAF 0x02 /**< leaf page */ +#define P_OVERFLOW 0x04 /**< overflow page */ +#define P_META 0x08 /**< meta page */ +#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ +#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ +#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ +#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ +#define P_KEEP 0x8000 /**< leave this page alone during spill */ + /** @} */ + uint16_t mp_flags; /**< @ref mdbx_page */ +#define mp_lower mp_pb.pb.pb_lower +#define mp_upper mp_pb.pb.pb_upper +#define mp_pages mp_pb.pb_pages + union { + struct { + indx_t pb_lower; /**< lower bound of free space */ + indx_t pb_upper; /**< upper bound of free space */ + } pb; + uint32_t pb_pages; /**< number of overflow pages */ + } mp_pb; + indx_t mp_ptrs[1]; /**< dynamic size */ +} MDB_page; + +/** Size of the page header, excluding dynamic data at the end */ +#define PAGEHDRSZ ((unsigned)offsetof(MDB_page, mp_ptrs)) + +/** Address of first usable data byte in a page, after the header */ +#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) + +/** ITS#7713, change PAGEBASE to handle 65536 byte pages */ 
+#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) + +/** Number of nodes on a page */ +#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1) + +/** The amount of space remaining in the page */ +#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) + +/** The percentage of space used in the page, in tenths of a percent. */ +#define PAGEFILL(env, p) \ + (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ + ((env)->me_psize - PAGEHDRSZ)) +/** The minimum page fill factor, in tenths of a percent. + * Pages emptier than this are candidates for merging. + */ +#define FILL_THRESHOLD 250 + +/** Test if a page is a leaf page */ +#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) +/** Test if a page is a LEAF2 page */ +#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) +/** Test if a page is a branch page */ +#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) +/** Test if a page is an overflow page */ +#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) +/** Test if a page is a sub page */ +#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) + +/** The number of overflow pages needed to store the given size. */ +#define OVPAGES(size, psize) ((PAGEHDRSZ - 1 + (size)) / (psize) + 1) + +/** Link in #MDB_txn.%mt_loose_pgs list. + * Kept outside the page header, which is needed when reusing the page. + */ +#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) + +/** Header for a single key/data pair within a page. + * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. + * We guarantee 2-byte alignment for 'MDB_node's. + * + * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child + * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used + * for pgno. (Branch nodes have no flags). Lo and hi are in host byte + * order in case some accesses can be optimized to 32-bit word access. + * + * Leaf node flags describe node contents. 
#F_BIGDATA says the node's + * data part is the page number of an overflow page with actual data. + * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in + * a sub-page/sub-database, and named databases (just #F_SUBDATA). + */ +typedef struct MDB_node { +/** part of data size or pgno + * @{ */ +#if BYTE_ORDER == LITTLE_ENDIAN + unsigned short mn_lo, mn_hi; +#else + unsigned short mn_hi, mn_lo; +#endif +/** @} */ +/** @defgroup mdbx_node Node Flags + * @ingroup internal + * Flags for node headers. + * @{ + */ +#define F_BIGDATA 0x01 /**< data put on overflow page */ +#define F_SUBDATA 0x02 /**< data is a sub-database */ +#define F_DUPDATA 0x04 /**< data has duplicates */ + +/** valid flags for #mdbx_node_add() */ +#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDB_RESERVE | MDB_APPEND) + + /** @} */ + unsigned short mn_flags; /**< @ref mdbx_node */ + unsigned short mn_ksize; /**< key size */ + char mn_data[1]; /**< key and data are appended here */ +} MDB_node; + +/** Size of the node header, excluding dynamic data at the end */ +#define NODESIZE offsetof(MDB_node, mn_data) + +/** Bit position of top word in page number, for shifting mn_flags */ +#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) + +/** Size of a node in a branch page with a given key. + * This is just the node header plus the key, there is no data. + */ +#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) + +/** Size of a node in a leaf page with a given key and data. + * This is node header plus key plus data size. 
+ */ +#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) + +/** Address of node \b i in page \b p */ +#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE)) + +/** Address of the key for the node */ +#define NODEKEY(node) (void *)((node)->mn_data) + +/** Address of the data for a node */ +#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) + +/** Get the page number pointed to by a branch node */ +#define NODEPGNO(node) \ + ((node)->mn_lo | ((pgno_t)(node)->mn_hi << 16) | \ + (PGNO_TOPWORD ? ((pgno_t)(node)->mn_flags << PGNO_TOPWORD) : 0)) +/** Set the page number in a branch node */ +#define SETPGNO(node, pgno) \ + do { \ + (node)->mn_lo = (pgno)&0xffff; \ + (node)->mn_hi = (pgno) >> 16; \ + if (PGNO_TOPWORD) \ + (node)->mn_flags = (pgno) >> PGNO_TOPWORD; \ + } while (0) + +/** Get the size of the data in a leaf node */ +#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) +/** Set the size of the data for a leaf node */ +#define SETDSZ(node, size) \ + do { \ + (node)->mn_lo = (size)&0xffff; \ + (node)->mn_hi = (size) >> 16; \ + } while (0) +/** The size of a key in a node */ +#define NODEKSZ(node) ((node)->mn_ksize) + +/** Copy a page number from src to dst */ +#ifdef MISALIGNED_OK +#define COPY_PGNO(dst, src) dst = src +#elif SIZE_MAX > 4294967295UL +#define COPY_PGNO(dst, src) \ + do { \ + unsigned short *s, *d; \ + s = (unsigned short *)&(src); \ + d = (unsigned short *)&(dst); \ + *d++ = *s++; \ + *d++ = *s++; \ + *d++ = *s++; \ + *d = *s; \ + } while (0) +#else +#define COPY_PGNO(dst, src) \ + do { \ + unsigned short *s, *d; \ + s = (unsigned short *)&(src); \ + d = (unsigned short *)&(dst); \ + *d++ = *s++; \ + *d = *s; \ + } while (0) +#endif /* MISALIGNED_OK */ + +/** The address of a key in a LEAF2 page. + * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate + *sub-DBs. + * There are no node headers, keys are stored contiguously. 
+ */ +#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i) * (ks))) + +/** Set the \b node's key into \b keyptr, if requested. */ +#define MDB_GET_KEY(node, keyptr) \ + { \ + if ((keyptr) != NULL) { \ + (keyptr)->mv_size = NODEKSZ(node); \ + (keyptr)->mv_data = NODEKEY(node); \ + } \ + } + +/** Set the \b node's key into \b key. */ +#define MDB_GET_KEY2(node, key) \ + { \ + key.mv_size = NODEKSZ(node); \ + key.mv_data = NODEKEY(node); \ + } + +/** Information about a single database in the environment. */ +typedef struct MDB_db { + uint32_t md_xsize; /**< also ksize for LEAF2 pages */ + uint16_t md_flags; /**< @ref mdbx_dbi_open */ + uint16_t md_depth; /**< depth of this tree */ + pgno_t md_branch_pages; /**< number of internal pages */ + pgno_t md_leaf_pages; /**< number of leaf pages */ + pgno_t md_overflow_pages; /**< number of overflow pages */ + size_t md_entries; /**< number of data items */ + pgno_t md_root; /**< the root page of this tree */ +} MDB_db; + +#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ +#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) +/** #mdbx_dbi_open() flags */ +#define VALID_FLAGS \ + (MDB_REVERSEKEY | MDB_DUPSORT | MDB_INTEGERKEY | MDB_DUPFIXED | \ + MDB_INTEGERDUP | MDB_REVERSEDUP | MDB_CREATE) + +/** Handle for the DB used to track free pages. */ +#define FREE_DBI 0 +/** Handle for the default DB. */ +#define MAIN_DBI 1 +/** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ +#define CORE_DBS 2 + +/** Number of meta pages - also hardcoded elsewhere */ +#define NUM_METAS 2 + +/** Meta page content. + * A meta page is the start point for accessing a database snapshot. + * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). + */ +typedef struct MDB_meta { + /** Stamp identifying this as an LMDB file. It must be set + * to #MDB_MAGIC. */ + uint32_t mm_magic; + /** Version number of this file. Must be set to #MDB_DATA_VERSION. 
*/ + uint32_t mm_version; + void *mm_address; /**< address for fixed mapping */ + size_t mm_mapsize; /**< size of mmap region */ + MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ + /** The size of pages used in this DB */ +#define mm_psize mm_dbs[FREE_DBI].md_xsize +/** Any persistent environment flags. @ref mdbx_env */ +#define mm_flags mm_dbs[FREE_DBI].md_flags + /** Last used page in the datafile. + * Actually the file may be shorter if the freeDB lists the final pages. + */ + pgno_t mm_last_pg; + volatile txnid_t mm_txnid; /**< txnid that committed this page */ +#define MDB_DATASIGN_NONE 0 +#define MDB_DATASIGN_WEAK 1 + volatile uint64_t mm_datasync_sign; +#define META_IS_WEAK(meta) ((meta)->mm_datasync_sign == MDB_DATASIGN_WEAK) +#define META_IS_STEADY(meta) ((meta)->mm_datasync_sign > MDB_DATASIGN_WEAK) + +#if MDBX_MODE_ENABLED + volatile mdbx_canary mm_canary; +#endif +} MDB_meta; + +/** Buffer for a stack-allocated meta page. + * The members define size and alignment, and silence type + * aliasing warnings. They are not used directly; that could + * mean incorrectly using several union members in parallel. + */ +typedef union MDB_metabuf { + MDB_page mb_page; + struct { + char mm_pad[PAGEHDRSZ]; + MDB_meta mm_meta; + } mb_metabuf; +} MDB_metabuf; + +/** Auxiliary DB info. + * The information here is mostly static/read-only. There is + * only a single copy of this record in the environment. + */ +typedef struct MDB_dbx { + MDB_val md_name; /**< name of the database */ + MDB_cmp_func *md_cmp; /**< function for comparing keys */ + MDB_cmp_func *md_dcmp; /**< function for comparing data items */ + MDB_rel_func *md_rel; /**< user relocate function */ + void *md_relctx; /**< user-provided context for md_rel */ +} MDB_dbx; + +#if MDBX_MODE_ENABLED +#define MDBX_MODE_SALT 0 +#else +#error !? +#endif + +/** A database transaction. + * Every operation requires a transaction handle. 
+ */ +struct MDB_txn { +#define MDBX_MT_SIGNATURE (0x93D53A31 ^ MDBX_MODE_SALT) + unsigned mt_signature; + MDB_txn *mt_parent; /**< parent of a nested txn */ + /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ + MDB_txn *mt_child; + pgno_t mt_next_pgno; /**< next unallocated page */ + /** The ID of this transaction. IDs are integers incrementing from 1. + * Only committed write transactions increment the ID. If a transaction + * aborts, the ID may be re-used by the next writer. + */ + txnid_t mt_txnid; + MDB_env *mt_env; /**< the DB environment */ + /** The list of reclaimed txns from freeDB */ + MDB_IDL mt_lifo_reclaimed; + /** The list of pages that became unused during this transaction. + */ + MDB_IDL mt_free_pgs; + /** The list of loose pages that became unused and may be reused + * in this transaction, linked through #NEXT_LOOSE_PAGE(page). + */ + MDB_page *mt_loose_pgs; + /** Number of loose pages (#mt_loose_pgs) */ + int mt_loose_count; + /** The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. + */ + MDB_IDL mt_spill_pgs; + union { + /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ + MDB_ID2L dirty_list; + /** For read txns: This thread/txn's reader table slot, or NULL. */ + MDB_reader *reader; + } mt_u; + /** Array of records for each DB known in the environment. 
*/ + MDB_dbx *mt_dbxs; + /** Array of MDB_db records for each known DB */ + MDB_db *mt_dbs; + /** Array of sequence numbers for each DB handle */ + unsigned *mt_dbiseqs; +/** @defgroup mt_dbflag Transaction DB Flags + * @ingroup internal + * @{ + */ +#define DB_DIRTY 0x01 /**< DB was written in this txn */ +#define DB_STALE 0x02 /**< Named-DB record is older than txnID */ +#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ +#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ +#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ +#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ + /** @} */ + /** In write txns, array of cursors for each DB */ + MDB_cursor **mt_cursors; + /** Array of flags for each DB */ + unsigned char *mt_dbflags; + /** Number of DB records in use, or 0 when the txn is finished. + * This number only ever increments until the txn finishes; we + * don't decrement it when individual DB handles are closed. + */ + MDB_dbi mt_numdbs; + +/** @defgroup mdbx_txn Transaction Flags + * @ingroup internal + * @{ + */ +/** #mdbx_txn_begin() flags */ +#define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC | MDB_NOSYNC | MDB_RDONLY) +#define MDB_TXN_NOMETASYNC \ + MDB_NOMETASYNC /**< don't sync meta for this txn on commit */ +#define MDB_TXN_NOSYNC MDB_NOSYNC /**< don't sync this txn on commit */ +#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ + /* internal txn flags */ +#define MDB_TXN_WRITEMAP \ + MDB_WRITEMAP /**< copy of #MDB_env flag in writers \ + */ +#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ +#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ +#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ +#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ +#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ +/** most operations on the txn are currently illegal */ +#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED 
| MDB_TXN_ERROR | MDB_TXN_HAS_CHILD) + /** @} */ + unsigned mt_flags; /**< @ref mdbx_txn */ + /** #dirty_list room: Array size - \#dirty pages visible to this txn. + * Includes ancestor txns' dirty pages not hidden by other txns' + * dirty/spilled pages. Thus commit(nested txn) has room to merge + * dirty_list into mt_parent after freeing hidden mt_parent pages. + */ + unsigned mt_dirty_room; + +#if MDBX_MODE_ENABLED + mdbx_canary mt_canary; +#endif +}; + +/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. + * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to + * raise this on a 64 bit machine. + */ +#define CURSOR_STACK 32 + +struct MDB_xcursor; + +/** Cursors are used for all DB operations. + * A cursor holds a path of (page pointer, key index) from the DB + * root to a position in the DB, plus other state. #MDB_DUPSORT + * cursors include an xcursor to the current data item. Write txns + * track their cursors and keep them up to date when data moves. + * Exception: An xcursor's pointer to a #P_SUBP page can be stale. + * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). 
+ */ +struct MDB_cursor { +#define MDBX_MC_SIGNATURE (0xFE05D5B1 ^ MDBX_MODE_SALT) +#define MDBX_MC_READY4CLOSE (0x2817A047 ^ MDBX_MODE_SALT) +#define MDBX_MC_WAIT4EOT (0x90E297A7 ^ MDBX_MODE_SALT) + unsigned mc_signature; + /** Next cursor on this DB in this txn */ + MDB_cursor *mc_next; + /** Backup of the original cursor if this cursor is a shadow */ + MDB_cursor *mc_backup; + /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ + struct MDB_xcursor *mc_xcursor; + /** The transaction that owns this cursor */ + MDB_txn *mc_txn; + /** The database handle this cursor operates on */ + MDB_dbi mc_dbi; + /** The database record for this cursor */ + MDB_db *mc_db; + /** The database auxiliary record for this cursor */ + MDB_dbx *mc_dbx; + /** The @ref mt_dbflag for this database */ + unsigned char *mc_dbflag; + unsigned short mc_snum; /**< number of pushed pages */ + unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ + /** @defgroup mdbx_cursor Cursor Flags + * @ingroup internal + * Cursor state flags. + * @{ + */ +#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ +#define C_EOF 0x02 /**< No more data */ +#define C_SUB 0x04 /**< Cursor is a sub-cursor */ +#define C_DEL 0x08 /**< last op was a cursor_del */ +#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ +#define C_RECLAIMING 0x80 /**< FreeDB lookup is prohibited */ + /** @} */ + unsigned mc_flags; /**< @ref mdbx_cursor */ + MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ + indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ +}; + +/** Context for sorted-dup records. + * We could have gone to a fully recursive design, with arbitrarily + * deep nesting of sub-databases. But for now we only handle these + * levels - main DB, optional sub-DB, sorted-duplicate DB. 
+ */ +typedef struct MDB_xcursor { + /** A sub-cursor for traversing the Dup DB */ + MDB_cursor mx_cursor; + /** The database record for this Dup DB */ + MDB_db mx_db; + /** The auxiliary DB record for this Dup DB */ + MDB_dbx mx_dbx; + /** The @ref mt_dbflag for this Dup DB */ + unsigned char mx_dbflag; +} MDB_xcursor; + +/** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */ +#define XCURSOR_INITED(mc) \ + ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + +/** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed + * when the node which contains the sub-page may have moved. Called + * with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top]. + */ +#define XCURSOR_REFRESH(mc, mp, ki) \ + do { \ + MDB_page *xr_pg = (mp); \ + MDB_node *xr_node = NODEPTR(xr_pg, ki); \ + if ((xr_node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) \ + (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ + } while (0) + +/** State of FreeDB old pages, stored in the MDB_env */ +typedef struct MDB_pgstate { + pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ + txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ +} MDB_pgstate; + +/** Context for deferred cleanup of reader's threads. + * to avoid https://github.com/ReOpen/ReOpenLDAP/issues/48 */ +typedef struct MDBX_rthc { + struct MDBX_rthc *rc_next; + pthread_t rc_thread; + MDB_reader *rc_reader; +} MDBX_rthc; + +static MDBX_rthc *mdbx_rthc_get(pthread_key_t key); + +/** The database environment. */ +struct MDB_env { +#define MDBX_ME_SIGNATURE (0x9A899641 ^ MDBX_MODE_SALT) + unsigned me_signature; + HANDLE me_fd; /**< The main data file */ + HANDLE me_lfd; /**< The lock file */ + /** Failed to update the meta page. Probably an I/O error. */ +#define MDB_FATAL_ERROR 0x80000000U +/** Some fields are initialized. 
*/ +#define MDB_ENV_ACTIVE 0x20000000U +/** me_txkey is set */ +#define MDB_ENV_TXKEY 0x10000000U + uint32_t me_flags; /**< @ref mdbx_env */ + unsigned me_psize; /**< DB page size, inited from me_os_psize */ + unsigned me_os_psize; /**< OS page size, from #GET_PAGESIZE */ + unsigned me_maxreaders; /**< size of the reader table */ + /** Max #MDB_txninfo.%mti_numreaders of interest to #mdbx_env_close() */ + unsigned me_close_readers; + MDB_dbi me_numdbs; /**< number of DBs opened */ + MDB_dbi me_maxdbs; /**< size of the DB table */ + pid_t me_pid; /**< process ID of this env */ + char *me_path; /**< path to the DB files */ + char *me_map; /**< the memory map of the data file */ + MDB_txninfo *me_txns; /**< the memory map of the lock file, never NULL */ + void *me_pbuf; /**< scratch area for DUPSORT put() */ + MDB_txn *me_txn; /**< current write transaction */ + MDB_txn *me_txn0; /**< prealloc'd write transaction */ + size_t me_mapsize; /**< size of the data memory map */ + pgno_t me_maxpg; /**< me_mapsize / me_psize */ + MDB_dbx *me_dbxs; /**< array of static DB info */ + uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ + unsigned *me_dbiseqs; /**< array of dbi sequence numbers */ + pthread_key_t me_txkey; /**< thread-key for readers */ + txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ + MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ +#define me_pglast me_pgstate.mf_pglast +#define me_pghead me_pgstate.mf_pghead + MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ + /** IDL of pages that became unused in a write txn */ + MDB_IDL me_free_pgs; + /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. 
*/ + MDB_ID2L me_dirty_list; + /** Max number of freelist items that can fit in a single overflow page */ + unsigned me_maxfree_1pg; + /** Max size of a node on a page */ + unsigned me_nodemax; + unsigned me_maxkey_limit; /**< max size of a key */ + int me_live_reader; /**< have liveness lock in reader table */ + void *me_userctx; /**< User-settable context */ #if MDB_DEBUG - mdb_debug_edge = edge_txn; + MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ #endif + uint64_t me_sync_pending; /**< Total dirty/commited bytes since the last + mdbx_env_sync() */ + uint64_t + me_sync_threshold; /**< Treshold of above to force synchronous flush */ +#if MDBX_MODE_ENABLED + MDBX_oom_func *me_oom_func; /**< Callback for kicking laggard readers */ +#endif +#ifdef USE_VALGRIND + int me_valgrind_handle; +#endif +}; + +/** Nested transaction */ +typedef struct MDB_ntxn { + MDB_txn mnt_txn; /**< the transaction */ + MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ +} MDB_ntxn; + +/** max number of pages to commit in one writev() call */ +#define MDB_COMMIT_PAGES 64 +#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES +#undef MDB_COMMIT_PAGES +#define MDB_COMMIT_PAGES IOV_MAX +#endif + +/** max bytes to write in one call */ +#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) + +/** Check \b txn and \b dbi arguments to a function */ +#define TXN_DBI_EXIST(txn, dbi, validity) \ + ((dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) + +/** Check for misused \b dbi handles */ +#define TXN_DBI_CHANGED(txn, dbi) \ + ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) + +#define METAPAGE_1(env) (&((MDB_metabuf *)(env)->me_map)->mb_metabuf.mm_meta) + +#define METAPAGE_2(env) \ + (&((MDB_metabuf *)((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) + +static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags); +static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, + MDB_page **mp); +static 
int mdbx_page_touch(MDB_cursor *mc); +static int mdbx_cursor_touch(MDB_cursor *mc); + +#define MDB_END_NAMES \ + { \ + "committed", "empty-commit", "abort", "reset", "reset-tmp", "fail-begin", \ + "fail-beginchild" \ + } +enum { + /* mdbx_txn_end operation number, for logging */ + MDB_END_COMMITTED, + MDB_END_EMPTY_COMMIT, + MDB_END_ABORT, + MDB_END_RESET, + MDB_END_RESET_TMP, + MDB_END_FAIL_BEGIN, + MDB_END_FAIL_BEGINCHILD +}; +#define MDB_END_OPMASK 0x0F /**< mask for #mdbx_txn_end() operation number */ +#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ +#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ +#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ +static int mdbx_txn_end(MDB_txn *txn, unsigned mode); + +static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); +static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); +#define MDB_PS_MODIFY 1 +#define MDB_PS_ROOTONLY 2 +#define MDB_PS_FIRST 4 +#define MDB_PS_LAST 8 +static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags); +static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); + +#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ +static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, + pgno_t newpgno, unsigned nflags); + +static int mdbx_env_read_header(MDB_env *env, MDB_meta *meta); +static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending); +static void mdbx_env_close0(MDB_env *env); + +static MDB_node *mdbx_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); +static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, + MDB_val *data, pgno_t pgno, unsigned flags); +static void mdbx_node_del(MDB_cursor *mc, int ksize); +static void mdbx_node_shrink(MDB_page *mp, indx_t indx); +static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); +static int mdbx_node_read(MDB_cursor *mc, MDB_node *leaf, 
MDB_val *data); +static size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); +static size_t mdbx_branch_size(MDB_env *env, MDB_val *key); + +static int mdbx_rebalance(MDB_cursor *mc); +static int mdbx_update_key(MDB_cursor *mc, MDB_val *key); + +static void mdbx_cursor_pop(MDB_cursor *mc); +static int mdbx_cursor_push(MDB_cursor *mc, MDB_page *mp); + +static int mdbx_cursor_del0(MDB_cursor *mc); +static int mdbx_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + unsigned flags); +static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right); +static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op); +static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op); +static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op, int *exactp); +static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); +static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); + +static void mdbx_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, + MDB_xcursor *mx); +static void mdbx_xcursor_init0(MDB_cursor *mc); +static void mdbx_xcursor_init1(MDB_cursor *mc, MDB_node *node); +static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); + +static int mdbx_drop0(MDB_cursor *mc, int subs); +static int mdbx_reader_check0(MDB_env *env, int rlocked, int *dead); + +/** @cond */ +static MDB_cmp_func mdbx_cmp_memn, mdbx_cmp_memnr, mdbx_cmp_int_ai, + mdbx_cmp_int_a2, mdbx_cmp_int_ua; +/** @endcond */ + +#ifdef __SANITIZE_THREAD__ +static pthread_mutex_t tsan_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif + +/** Return the library version info. 
*/ +char *__cold mdbx_version(int *major, int *minor, int *patch) { + if (major) + *major = MDB_VERSION_MAJOR; + if (minor) + *minor = MDB_VERSION_MINOR; + if (patch) + *patch = MDB_VERSION_PATCH; + return MDB_VERSION_STRING; +} + +/** Table of descriptions for LMDB @ref errors */ +static char *const mdbx_errstr[] = { + "MDB_KEYEXIST: Key/data pair already exists", + "MDB_NOTFOUND: No matching key/data pair found", + "MDB_PAGE_NOTFOUND: Requested page not found", + "MDB_CORRUPTED: Located page was wrong type", + "MDB_PANIC: Update of meta page failed or environment had fatal error", + "MDB_VERSION_MISMATCH: Database environment version mismatch", + "MDB_INVALID: File is not an LMDB file", + "MDB_MAP_FULL: Environment mapsize limit reached", + "MDB_DBS_FULL: Environment maxdbs limit reached", + "MDB_READERS_FULL: Environment maxreaders limit reached", + "MDB_TLS_FULL: Thread-local storage keys full - too many environments " + "open", + "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too " + "big", + "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", + "MDB_PAGE_FULL: Internal error - page has no more space", + "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", + "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", + "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", + "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", + "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong " + "DUPFIXED size", + "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", + "MDB_PROBLEM: Unexpected problem - txn should abort", +}; + +char *__cold mdbx_strerror(int err) { + int i; + if (!err) + return ("Successful return: 0"); + + if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { + i = err - MDB_KEYEXIST; + return mdbx_errstr[i]; + } + + return strerror(err); +} + +#if MDBX_MODE_ENABLED +static txnid_t mdbx_oomkick(MDB_env *env, txnid_t oldest); +#endif /* 
MDBX_MODE_ENABLED */ + +static void mdbx_debug_log(int type, const char *function, int line, + const char *fmt, ...) + __attribute__((format(printf, 4, 5))); + +#if MDB_DEBUG +static txnid_t mdbx_debug_edge; + +static void __cold mdbx_assert_fail(MDB_env *env, const char *msg, + const char *func, int line) { + if (env && env->me_assert_func) + env->me_assert_func(env, msg, func, line); + else { + if (mdbx_debug_logger) + mdbx_debug_log(MDBX_DBG_ASSERT, func, line, "assert: %s\n", msg); + __assert_fail(msg, __FILE__, line, func); + } +} + +#define mdbx_assert_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_ASSERT) + +#define mdbx_audit_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_AUDIT) + +#define mdbx_debug_enabled(type) \ + unlikely(mdbx_runtime_flags &(type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA))) + +#else +#ifndef NDEBUG +#define mdbx_debug_enabled(type) (1) +#else +#define mdbx_debug_enabled(type) (0) +#endif +#define mdbx_audit_enabled() (0) +#define mdbx_assert_enabled() (0) +#define mdbx_assert_fail(env, msg, func, line) \ + __assert_fail(msg, __FILE__, line, func) +#endif /* MDB_DEBUG */ + +static void __cold mdbx_debug_log(int type, const char *function, int line, + const char *fmt, ...) { + va_list args; + + va_start(args, fmt); + if (mdbx_debug_logger) + mdbx_debug_logger(type, function, line, fmt, args); + else { + if (function && line > 0) + fprintf(stderr, "%s:%d ", function, line); + else if (function) + fprintf(stderr, "%s: ", function); + else if (line > 0) + fprintf(stderr, "%d: ", line); + vfprintf(stderr, fmt, args); + } + va_end(args); +} + +#define mdbx_print(fmt, ...) \ + mdbx_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) + +#define mdbx_debug(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ + mdbx_debug_log(MDBX_DBG_TRACE, __FUNCTION__, __LINE__, fmt "\n", \ + ##__VA_ARGS__); \ + } while (0) + +#define mdbx_debug_print(fmt, ...) 
\ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ + mdbx_debug_log(MDBX_DBG_TRACE, NULL, 0, fmt, ##__VA_ARGS__); \ + } while (0) + +#define mdbx_debug_extra(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) \ + mdbx_debug_log(MDBX_DBG_EXTRA, __FUNCTION__, __LINE__, fmt, \ + ##__VA_ARGS__); \ + } while (0) + +#define mdbx_debug_extra_print(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) \ + mdbx_debug_log(MDBX_DBG_EXTRA, NULL, 0, fmt, ##__VA_ARGS__); \ + } while (0) + +#define mdbx_ensure_msg(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + mdbx_assert_fail(env, msg, __FUNCTION__, __LINE__); \ + } while (0) + +#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) + +/** assert(3) variant in environment context */ +#define mdbx_assert(env, expr) \ + do { \ + if (mdbx_assert_enabled()) \ + mdbx_ensure(env, expr); \ + } while (0) + +/** assert(3) variant in cursor context */ +#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) + +/** assert(3) variant in transaction context */ +#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) + +/** Return the page number of \b mp which may be sub-page, for debug output */ +static MDBX_INLINE pgno_t mdbx_dbg_pgno(MDB_page *mp) { + pgno_t ret; + COPY_PGNO(ret, mp->mp_pgno); + return ret; +} + +/** Display a key in hexadecimal and return the address of the result. + * @param[in] key the key to display + * @param[in] buf the buffer to write into. Should always be #DKBUF. + * @return The key in hexadecimal form. + */ +char *mdbx_dkey(MDB_val *key, char *buf) { + char *ptr = buf; + unsigned i; + + if (!key) + return ""; + + if (key->mv_size > DKBUF_MAXKEYSIZE) + return "MDB_MAXKEYSIZE"; +/* may want to make this a dynamic check: if the key is mostly + * printable characters, print it as-is instead of converting to hex. 
*/ +#if 1 + buf[0] = '\0'; + for (i = 0; i < key->mv_size; i++) + ptr += sprintf(ptr, "%02x", ((unsigned char *)key->mv_data)[i]); +#else + sprintf(buf, "%.*s", key->mv_size, key->mv_data); +#endif + return buf; +} + +#if 0 /* LY: debug stuff */ +static const char * +mdbx_leafnode_type(MDB_node *n) +{ + static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; + return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : + tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; +} + +/** Display all the keys in the page. */ +static void +mdbx_page_list(MDB_page *mp) +{ + pgno_t pgno = mdbx_dbg_pgno(mp); + const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : ""; + MDB_node *node; + unsigned i, nkeys, nsize, total = 0; + MDB_val key; + DKBUF; + + switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { + case P_BRANCH: type = "Branch page"; break; + case P_LEAF: type = "Leaf page"; break; + case P_LEAF|P_SUBP: type = "Sub-page"; break; + case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; + case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; + case P_OVERFLOW: + mdbx_print("Overflow page %zu pages %u%s\n", + pgno, mp->mp_pages, state); + return; + case P_META: + mdbx_print("Meta-page %zu txnid %zu\n", + pgno, ((MDB_meta *)PAGEDATA(mp))->mm_txnid); + return; + default: + mdbx_print("Bad page %zu flags 0x%X\n", pgno, mp->mp_flags); + return; } - return ret; -} - -static txnid_t __cold -mdbx_oomkick(MDB_env *env, txnid_t oldest) -{ - int retry; - txnid_t snap; - mdb_debug("DB size maxed out"); - - for(retry = 0; ; ++retry) { - int reader; - - if (mdb_reader_check(env, NULL)) - break; - - snap = mdb_find_oldest(env, &reader); - if (oldest < snap || reader < 0) { - if (retry && env->me_oom_func) { - /* LY: notify end of oom-loop */ - env->me_oom_func(env, 0, 0, oldest, snap - oldest, -retry); - } - return snap; - } - - MDB_reader *r; - pthread_t tid; - pid_t pid; - int rc; - - if (!env->me_oom_func) - 
break; - - r = &env->me_txns->mti_readers[ reader ]; - pid = r->mr_pid; - tid = r->mr_tid; - if (r->mr_txnid != oldest || pid <= 0) - continue; - - rc = env->me_oom_func(env, pid, (void*) tid, oldest, - mdb_meta_head_w(env)->mm_txnid - oldest, retry); - if (rc < 0) - break; - - if (rc) { - r->mr_txnid = ~(txnid_t)0; - if (rc > 1) { - r->mr_tid = 0; - r->mr_pid = 0; - mdbx_coherent_barrier(); - } - } - } - - if (retry && env->me_oom_func) { - /* LY: notify end of oom-loop */ - env->me_oom_func(env, 0, 0, oldest, 0, -retry); - } - return mdb_find_oldest(env, NULL); -} - -int __cold -mdbx_env_set_syncbytes(MDB_env *env, size_t bytes) -{ - if (unlikely(!env)) - return EINVAL; - - if(unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - env->me_sync_threshold = bytes; - return env->me_map ? mdb_env_sync(env, 0) : MDB_SUCCESS; -} - -void __cold -mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oomfunc) -{ - if (likely(env && env->me_signature == MDBX_ME_SIGNATURE)) - env->me_oom_func = oomfunc; -} - -MDBX_oom_func* __cold -mdbx_env_get_oomfunc(MDB_env *env) -{ - return likely(env && env->me_signature == MDBX_ME_SIGNATURE) - ? env->me_oom_func : NULL; -} - -ATTRIBUTE_NO_SANITIZE_THREAD /* LY: avoid tsan-trap by me_txn, mm_last_pg and mt_next_pgno */ -int mdbx_txn_straggler(MDB_txn *txn, int *percent) -{ - MDB_env *env; - MDB_meta *meta; - txnid_t lag; - - if(unlikely(!txn)) - return -EINVAL; - - if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(! txn->mt_u.reader)) - return -1; - - env = txn->mt_env; - meta = mdb_meta_head_r(env); - if (percent) { - size_t maxpg = env->me_maxpg; - size_t last = meta->mm_last_pg + 1; - if (env->me_txn) - last = env->me_txn0->mt_next_pgno; - *percent = (last * 100ull + maxpg / 2) / maxpg; - } - lag = meta->mm_txnid - txn->mt_u.reader->mr_txnid; - return (0 > (long) lag) ? 
~0u >> 1: lag; -} - -typedef struct mdb_walk_ctx { - MDB_txn *mw_txn; - void *mw_user; - MDBX_pgvisitor_func *mw_visitor; -} mdb_walk_ctx_t; - -/** Depth-first tree traversal. */ -static int __cold -mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int deep) -{ - MDB_page *mp; - int rc, i, nkeys; - unsigned header_size, unused_size, payload_size, align_bytes; - const char* type; - - if (pg == P_INVALID) - return MDB_SUCCESS; /* empty db */ - - MDB_cursor mc; - memset(&mc, 0, sizeof(mc)); - mc.mc_snum = 1; - mc.mc_txn = ctx->mw_txn; - - rc = mdb_page_get(&mc, pg, &mp, NULL); - if (rc) - return rc; - if (pg != mp->mp_p.p_pgno) - return MDB_CORRUPTED; nkeys = NUMKEYS(mp); - header_size = IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower; - unused_size = SIZELEFT(mp); - payload_size = 0; + mdbx_print("%s %zu numkeys %u%s\n", type, pgno, nkeys, state); - /* LY: Don't use mask here, e.g bitwise (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). - * Pages should not me marked dirty/loose or otherwise. 
*/ - switch (mp->mp_flags) { - case P_BRANCH: - type = "branch"; - if (nkeys < 1) - return MDB_CORRUPTED; - break; - case P_LEAF: - type = "leaf"; - break; - case P_LEAF|P_SUBP: - type = "dupsort-subleaf"; - break; - case P_LEAF|P_LEAF2: - type = "dupfixed-leaf"; - break; - case P_LEAF|P_LEAF2|P_SUBP: - type = "dupsort-dupfixed-subleaf"; - break; - case P_META: - case P_OVERFLOW: - default: - return MDB_CORRUPTED; - } - - for (align_bytes = i = 0; i < nkeys; - align_bytes += ((payload_size + align_bytes) & 1), i++) { - MDB_node *node; - - if (IS_LEAF2(mp)) { - /* LEAF2 pages have no mp_ptrs[] or node headers */ - payload_size += mp->mp_leaf2_ksize; + for (i=0; imp_leaf2_ksize; + key.mv_data = LEAF2KEY(mp, i, nsize); + total += nsize; + mdbx_print("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); continue; } - node = NODEPTR(mp, i); - payload_size += NODESIZE + node->mn_ksize; - + key.mv_size = node->mn_ksize; + key.mv_data = node->mn_data; + nsize = NODESIZE + key.mv_size; if (IS_BRANCH(mp)) { - rc = mdb_env_walk(ctx, dbi, NODEPGNO(node), deep); - if (rc) - return rc; - continue; - } - - assert(IS_LEAF(mp)); - if (node->mn_flags & F_BIGDATA) { - MDB_page *omp; - pgno_t *opg; - size_t over_header, over_payload, over_unused; - - payload_size += sizeof(pgno_t); - opg = NODEDATA(node); - rc = mdb_page_get(&mc, *opg, &omp, NULL); - if (rc) - return rc; - if (*opg != omp->mp_p.p_pgno) - return MDB_CORRUPTED; - /* LY: Don't use mask here, e.g bitwise (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). - * Pages should not me marked dirty/loose or otherwise. 
*/ - if (P_OVERFLOW != omp->mp_flags) - return MDB_CORRUPTED; - - over_header = PAGEHDRSZ; - over_payload = NODEDSZ(node); - over_unused = omp->mp_pages * ctx->mw_txn->mt_env->me_psize - - over_payload - over_header; - - rc = ctx->mw_visitor(*opg, omp->mp_pages, ctx->mw_user, dbi, - "overflow-data", 1, over_payload, over_header, over_unused); - if (rc) - return rc; - continue; - } - - payload_size += NODEDSZ(node); - if (node->mn_flags & F_SUBDATA) { - MDB_db *db = NODEDATA(node); - char* name = NULL; - - if (! (node->mn_flags & F_DUPDATA)) { - name = NODEKEY(node); - int namelen = (char*) db - name; - name = memcpy(alloca(namelen + 1), name, namelen); - name[namelen] = 0; - } - rc = mdb_env_walk(ctx, (name && name[0]) ? name : dbi, - db->md_root, deep + 1); - if (rc) - return rc; + mdbx_print("key %u: page %zu, %s\n", i, NODEPGNO(node), DKEY(&key)); + total += nsize; + } else { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + nsize += sizeof(pgno_t); + else + nsize += NODEDSZ(node); + total += nsize; + nsize += sizeof(indx_t); + mdbx_print("key %u: nsize %u, %s%s\n", + i, nsize, DKEY(&key), mdbx_leafnode_type(node)); } + total = EVEN(total); } - - return ctx->mw_visitor(mp->mp_p.p_pgno, 1, ctx->mw_user, dbi, type, - nkeys, payload_size, header_size, unused_size + align_bytes); + mdbx_print("Total: header %u + contents %u + unused %u\n", + IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp)); } -int __cold -mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func* visitor, void* user) +static void +mdbx_cursor_chk(MDB_cursor *mc) { - mdb_walk_ctx_t ctx; - int rc; - - if (unlikely(!txn)) - return MDB_BAD_TXN; - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - ctx.mw_txn = txn; - ctx.mw_user = user; - ctx.mw_visitor = visitor; - - rc = visitor(0, 2, user, "lmdb", "meta", 2, sizeof(MDB_meta)*2, PAGEHDRSZ*2, - (txn->mt_env->me_psize - sizeof(MDB_meta) - PAGEHDRSZ) *2); - if (! 
rc) - rc = mdb_env_walk(&ctx, "free", txn->mt_dbs[FREE_DBI].md_root, 0); - if (! rc) - rc = mdb_env_walk(&ctx, "main", txn->mt_dbs[MAIN_DBI].md_root, 0); - if (! rc) - rc = visitor(P_INVALID, 0, user, NULL, NULL, 0, 0, 0, 0); - return rc; -} - -int mdbx_canary_put(MDB_txn *txn, const mdbx_canary* canary) -{ - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) - return EACCES; - - if (likely(canary)) { - txn->mt_canary.x = canary->x; - txn->mt_canary.y = canary->y; - txn->mt_canary.z = canary->z; - } - txn->mt_canary.v = txn->mt_txnid; - - return MDB_SUCCESS; -} - -size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary* canary) -{ - if(unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) - return 0; - - if (likely(canary)) - *canary = txn->mt_canary; - - return txn->mt_txnid; -} - -int mdbx_cursor_on_first(MDB_cursor *mc) -{ - if (unlikely(mc == NULL)) - return EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (!(mc->mc_flags & C_INITIALIZED)) - return MDBX_RESULT_FALSE; - unsigned i; - for(i = 0; i < mc->mc_snum; ++i) { - if (mc->mc_ki[i]) - return MDBX_RESULT_FALSE; + MDB_node *node; + MDB_page *mp; + + if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; + for (i=0; imc_top; i++) { + mp = mc->mc_pg[i]; + node = NODEPTR(mp, mc->mc_ki[i]); + if (unlikely(NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)) + mdbx_print("oops!\n"); } - - return MDBX_RESULT_TRUE; -} - -int mdbx_cursor_on_last(MDB_cursor *mc) -{ - if (unlikely(mc == NULL)) - return EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (!(mc->mc_flags & C_INITIALIZED)) - return MDBX_RESULT_FALSE; - - unsigned i; - for(i = 0; i < mc->mc_snum; ++i) { - unsigned nkeys = NUMKEYS(mc->mc_pg[i]); - if (mc->mc_ki[i] < nkeys - 1) - return MDBX_RESULT_FALSE; + if 
(unlikely(mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))) + mdbx_print("ack!\n"); + if (XCURSOR_INITED(mc)) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && + mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { + mdbx_print("blah!\n"); + } } +} +#endif /* 0 */ - return MDBX_RESULT_TRUE; +/** Count all the pages in each DB and in the freelist + * and make sure it matches the actual number of pages + * being used. + * All named DBs must be open for a correct count. + */ +static void mdbx_audit(MDB_txn *txn) { + MDB_cursor mc; + MDB_val key, data; + MDB_ID freecount, count; + MDB_dbi i; + int rc; + + freecount = 0; + mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); + while ((rc = mdbx_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + freecount += *(MDB_ID *)data.mv_data; + mdbx_tassert(txn, rc == MDB_NOTFOUND); + + count = 0; + for (i = 0; i < txn->mt_numdbs; i++) { + MDB_xcursor mx; + if (!(txn->mt_dbflags[i] & DB_VALID)) + continue; + mdbx_cursor_init(&mc, txn, i, &mx); + if (txn->mt_dbs[i].md_root == P_INVALID) + continue; + count += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages + + txn->mt_dbs[i].md_overflow_pages; + if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { + rc = mdbx_page_search(&mc, NULL, MDB_PS_FIRST); + for (; rc == MDB_SUCCESS; rc = mdbx_cursor_sibling(&mc, 1)) { + unsigned j; + MDB_page *mp; + mp = mc.mc_pg[mc.mc_top]; + for (j = 0; j < NUMKEYS(mp); j++) { + MDB_node *leaf = NODEPTR(mp, j); + if (leaf->mn_flags & F_SUBDATA) { + MDB_db db; + memcpy(&db, NODEDATA(leaf), sizeof(db)); + count += + db.md_branch_pages + db.md_leaf_pages + db.md_overflow_pages; + } + } + } + mdbx_tassert(txn, rc == MDB_NOTFOUND); + } + } + if (freecount + count + NUM_METAS != txn->mt_next_pgno) { + mdbx_print( + "audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n", + txn->mt_txnid, freecount, count + NUM_METAS, + freecount + count + NUM_METAS, txn->mt_next_pgno); + } } -int 
mdbx_cursor_eof(MDB_cursor *mc) +int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { + mdbx_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + return txn->mt_dbxs[dbi].md_cmp(a, b); +} + +int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { + mdbx_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + return txn->mt_dbxs[dbi].md_dcmp(a, b); +} + +/** Allocate memory for a page. + * Re-use old malloc'd pages first for singletons, otherwise just malloc. + * Set #MDB_TXN_ERROR on failure. + */ +static MDB_page *mdbx_page_malloc(MDB_txn *txn, unsigned num) { + MDB_env *env = txn->mt_env; + size_t size = env->me_psize; + MDB_page *np = env->me_dpages; + if (likely(num == 1 && np)) { + ASAN_UNPOISON_MEMORY_REGION(np, size); + VALGRIND_MEMPOOL_ALLOC(env, np, size); + VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); + env->me_dpages = np->mp_next; + } else { + size *= num; + np = malloc(size); + if (unlikely(!np)) { + txn->mt_flags |= MDB_TXN_ERROR; + return np; + } + VALGRIND_MEMPOOL_ALLOC(env, np, size); + } + + if ((env->me_flags & MDB_NOMEMINIT) == 0) { + /* For a single page alloc, we init everything after the page header. + * For multi-page, we init the final page; if the caller needed that + * many pages they will be filling in at least up to the last page. */ + size_t skip = PAGEHDRSZ; + if (num > 1) + skip += (num - 1) * env->me_psize; + memset((char *)np + skip, 0, size - skip); + } + VALGRIND_MAKE_MEM_UNDEFINED(np, size); + np->mp_flags = 0; + np->mp_pages = num; + return np; +} + +/** Free a single page. + * Saves single pages to a list, for future reuse. + * (This is not used for multi-page overflow pages.) 
+ */ +static MDBX_INLINE void mdbx_page_free(MDB_env *env, MDB_page *mp) { + mp->mp_next = env->me_dpages; + VALGRIND_MEMPOOL_FREE(env, mp); + env->me_dpages = mp; +} + +/** Free a dirty page */ +static void mdbx_dpage_free(MDB_env *env, MDB_page *dp) { + if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { + mdbx_page_free(env, dp); + } else { + /* large pages just get freed directly */ + VALGRIND_MEMPOOL_FREE(env, dp); + free(dp); + } +} + +/** Return all dirty pages to dpage list */ +static void mdbx_dlist_free(MDB_txn *txn) { + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned i, n = dl[0].mid; + + for (i = 1; i <= n; i++) { + mdbx_dpage_free(env, dl[i].mptr); + } + dl[0].mid = 0; +} + +static void __cold mdbx_kill_page(MDB_env *env, pgno_t pgno) { + const size_t offs = env->me_psize * pgno; + const size_t shift = offsetof(MDB_page, mp_pb); + + if (env->me_flags & MDB_WRITEMAP) { + MDB_page *mp = (MDB_page *)(env->me_map + offs); + memset(&mp->mp_pb, 0x6F /* 'o', 111 */, env->me_psize - shift); + VALGRIND_MAKE_MEM_NOACCESS(&mp->mp_pb, env->me_psize - shift); + ASAN_POISON_MEMORY_REGION(&mp->mp_pb, env->me_psize - shift); + } else { + struct iovec iov[1]; + iov[0].iov_len = env->me_psize - shift; + iov[0].iov_base = alloca(iov[0].iov_len); + memset(iov[0].iov_base, 0x6F /* 'o', 111 */, iov[0].iov_len); + ssize_t rc = pwritev(env->me_fd, iov, 1, offs + shift); + assert(rc == (ssize_t)iov[0].iov_len); + (void)rc; + } +} + +/** Loosen or free a single page. + * Saves single pages to a list for future reuse + * in this same txn. It has been pulled from the freeDB + * and already resides on the dirty list, but has been + * deleted. Use these pages first before pulling again + * from the freeDB. + * + * If the page wasn't dirtied in this txn, just add it + * to this txn's free list. 
+ */ +static int mdbx_page_loose(MDB_cursor *mc, MDB_page *mp) { + int loose = 0; + pgno_t pgno = mp->mp_pgno; + MDB_txn *txn = mc->mc_txn; + + if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { + if (txn->mt_parent) { + MDB_ID2 *dl = txn->mt_u.dirty_list; + /* If txn has a parent, make sure the page is in our + * dirty list. */ + if (dl[0].mid) { + unsigned x = mdbx_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (unlikely(mp != dl[x].mptr)) { /* bad cursor? */ + mc->mc_flags &= ~(C_INITIALIZED | C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PROBLEM; + } + /* ok, it's ours */ + loose = 1; + } + } + } else { + /* no parent txn, so it's just ours */ + loose = 1; + } + } + if (loose) { + mdbx_debug("loosen db %d page %zu", DDBI(mc), mp->mp_pgno); + MDB_page **link = &NEXT_LOOSE_PAGE(mp); + if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) { + mdbx_kill_page(txn->mt_env, pgno); + VALGRIND_MAKE_MEM_UNDEFINED(link, sizeof(MDB_page *)); + ASAN_UNPOISON_MEMORY_REGION(link, sizeof(MDB_page *)); + } + *link = txn->mt_loose_pgs; + txn->mt_loose_pgs = mp; + txn->mt_loose_count++; + mp->mp_flags |= P_LOOSE; + } else { + int rc = mdbx_midl_append(&txn->mt_free_pgs, pgno); + if (unlikely(rc)) + return rc; + } + + return MDB_SUCCESS; +} + +/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. + * @param[in] mc A cursor handle for the current operation. + * @param[in] pflags Flags of the pages to update: + * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. + * @param[in] all No shortcuts. Needed except after a full #mdbx_page_flush(). + * @return 0 on success, non-zero on failure. 
+ */ +static int mdbx_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { + enum { Mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP }; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m3, *m0 = mc; + MDB_xcursor *mx; + MDB_page *dp, *mp; + MDB_node *leaf; + unsigned i, j; + int rc = MDB_SUCCESS, level; + + /* Mark pages seen by cursors: First m0, then tracked cursors */ + for (i = txn->mt_numdbs;;) { + if (mc->mc_flags & C_INITIALIZED) { + for (m3 = mc;; m3 = &mx->mx_cursor) { + mp = NULL; + for (j = 0; j < m3->mc_snum; j++) { + mp = m3->mc_pg[j]; + if ((mp->mp_flags & Mask) == pflags) + mp->mp_flags ^= P_KEEP; + } + mx = m3->mc_xcursor; + /* Proceed to mx if it is at a sub-database */ + if (!(mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) + break; + if (!(mp && (mp->mp_flags & P_LEAF))) + break; + leaf = NODEPTR(mp, m3->mc_ki[j - 1]); + if (!(leaf->mn_flags & F_SUBDATA)) + break; + } + } + mc = mc->mc_next; + for (; !mc || mc == m0; mc = txn->mt_cursors[--i]) + if (i == 0) + goto mark_done; + } + +mark_done: + if (all) { + /* Mark dirty root pages */ + for (i = 0; i < txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + pgno_t pgno = txn->mt_dbs[i].md_root; + if (pgno == P_INVALID) + continue; + if (unlikely((rc = mdbx_page_get(m0, pgno, &dp, &level)) != + MDB_SUCCESS)) + break; + if ((dp->mp_flags & Mask) == pflags && level <= 1) + dp->mp_flags ^= P_KEEP; + } + } + } + + return rc; +} + +static int mdbx_page_flush(MDB_txn *txn, int keep); + +/** Spill pages from the dirty list back to disk. + * This is intended to prevent running into #MDB_TXN_FULL situations, + * but note that they may still occur in a few cases: + * 1) our estimate of the txn size could be too small. Currently this + * seems unlikely, except with a large number of #MDB_MULTIPLE items. + * 2) child txns may run out of space if their parents dirtied a + * lot of pages and never spilled them. 
TODO: we probably should do + * a preemptive spill during #mdbx_txn_begin() of a child txn, if + * the parent's dirty_room is below a given threshold. + * + * Otherwise, if not using nested txns, it is expected that apps will + * not run into #MDB_TXN_FULL any more. The pages are flushed to disk + * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. + * If the txn never references them again, they can be left alone. + * If the txn only reads them, they can be used without any fuss. + * If the txn writes them again, they can be dirtied immediately without + * going thru all of the work of #mdbx_page_touch(). Such references are + * handled by #mdbx_page_unspill(). + * + * Also note, we never spill DB root pages, nor pages of active cursors, + * because we'll need these back again soon anyway. And in nested txns, + * we can't spill a page in a child txn if it was already spilled in a + * parent txn. That would alter the parent txns' data even though + * the child hasn't committed yet, and we'd have no way to undo it if + * the child aborted. + * + * @param[in] m0 cursor A cursor handle identifying the transaction and + * database for which we are checking space. + * @param[in] key For a put operation, the key being stored. + * @param[in] data For a put operation, the data being stored. + * @return 0 on success, non-zero on failure. 
+ */ +static int mdbx_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) { + MDB_txn *txn = m0->mc_txn; + MDB_page *dp; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned i, j, need; + int rc; + + /* Sub-cursors (C_SUB) never trigger a spill */ if (m0->mc_flags & C_SUB) + return MDB_SUCCESS; + + /* Estimate how much space this op will take */ + i = m0->mc_db->md_depth; + /* Named DBs also dirty the main DB */ + if (m0->mc_dbi >= CORE_DBS) + i += txn->mt_dbs[MAIN_DBI].md_depth; + /* For puts, roughly factor in the key+data size */ + if (key) + i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; + i += i; /* double it for good measure */ + need = i; + + /* Enough dirty room is left for this op: nothing to spill */ if (txn->mt_dirty_room > i) + return MDB_SUCCESS; + + if (!txn->mt_spill_pgs) { + txn->mt_spill_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX); + if (unlikely(!txn->mt_spill_pgs)) + return ENOMEM; /* NOTE(review): direct return, so MDB_TXN_ERROR is not set here, unlike the bailout path */ + } else { + /* purge deleted slots (entries whose LSB is set), compacting the list in place */ + MDB_IDL sl = txn->mt_spill_pgs; + unsigned num = sl[0]; + j = 0; + for (i = 1; i <= num; i++) { + if (!(sl[i] & 1)) + sl[++j] = sl[i]; + } + sl[0] = j; + } + + /* Preserve pages which may soon be dirtied again */ + rc = mdbx_pages_xkeep(m0, P_DIRTY, 1); + if (unlikely(rc != MDB_SUCCESS)) + goto bailout; + + /* Less aggressive spill - we originally spilled the entire dirty list, + * with a few exceptions for cursor pages and DB root pages. But this + * turns out to be a lot of wasted effort because in a large txn many + * of those pages will need to be used again. So now we spill only 1/8th + * of the dirty pages. Testing revealed this to be a good tradeoff, + * better than 1/2, 1/4, or 1/10. */ + if (need < MDB_IDL_UM_MAX / 8) + need = MDB_IDL_UM_MAX / 8; + + /* Save the page IDs of all the pages we're flushing */ + /* flush from the tail forward, this saves a lot of shifting later on. */ + for (i = dl[0].mid; i && need; i--) { + MDB_ID pn = dl[i].mid << 1; /* spill-list entries are pgno << 1, which keeps the LSB free as the deleted mark */ + dp = dl[i].mptr; + if (dp->mp_flags & (P_LOOSE | P_KEEP)) + continue; + /* Can't spill twice, make sure it's not already in a parent's + * spill list.
*/ + if (txn->mt_parent) { + MDB_txn *tx2; + for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { + if (tx2->mt_spill_pgs) { + j = mdbx_midl_search(tx2->mt_spill_pgs, pn); + if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { + dp->mp_flags |= P_KEEP; + break; + } + } + } + /* tx2 is non-NULL only when the loop above broke out, i.e. an ancestor already spilled this page */ if (tx2) + continue; + } + rc = mdbx_midl_append(&txn->mt_spill_pgs, pn); + if (unlikely(rc != MDB_SUCCESS)) + goto bailout; + need--; + } + mdbx_midl_sort(txn->mt_spill_pgs); + + /* Flush the spilled part of dirty list */ + rc = mdbx_page_flush(txn, i); + if (unlikely(rc != MDB_SUCCESS)) + goto bailout; + + /* Reset any dirty pages we kept that page_flush didn't see */ + rc = mdbx_pages_xkeep(m0, P_DIRTY | P_KEEP, i); + +bailout: + /* Mark the txn: MDB_TXN_ERROR when rc is non-zero, MDB_TXN_SPILLS otherwise */ txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; + return rc; +} + +/* Data signature of a meta page. The hashing branch is compiled out (#if 0), so sign starts and stays at MDB_DATASIGN_NONE before the final adjustment. */ static MDBX_INLINE uint64_t mdbx_meta_sign(MDB_meta *meta) { + uint64_t sign = MDB_DATASIGN_NONE; +#if 0 /* TODO */ + sign = hippeus_hash64( + &meta->mm_mapsize, + sizeof(MDB_meta) - offsetof(MDB_meta, mm_mapsize), + meta->mm_version | (uint64_t) MDB_MAGIC << 32 + ); +#else + (void)meta; +#endif + /* LY: never returns MDB_DATASIGN_NONE or MDB_DATASIGN_WEAK: a value that is not above MDB_DATASIGN_WEAK is bit-inverted */ + return (sign > MDB_DATASIGN_WEAK) ?
sign : ~sign; +} + +/** Return the meta page whose mm_txnid equals me_txns->mti_txnid (writer-side variant, no retry). If neither page matches, MDB_FATAL_ERROR is set in env->me_flags and the first meta page is returned. */ static MDBX_INLINE MDB_meta *mdbx_meta_head_w(MDB_env *env) { + MDB_meta *a = METAPAGE_1(env); + MDB_meta *b = METAPAGE_2(env); + txnid_t head_txnid = env->me_txns->mti_txnid; + + mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); + if (a->mm_txnid == head_txnid) + return a; + if (likely(b->mm_txnid == head_txnid)) + return b; + + mdbx_debug("me_txns->mti_txnid not match meta-pages"); + mdbx_assert(env, head_txnid == a->mm_txnid || head_txnid == b->mm_txnid); + env->me_flags |= MDB_FATAL_ERROR; + return a; +} + +/** Return the meta page whose mm_txnid equals me_txns->mti_txnid, with retries: on a mismatch mti_txnid is re-read after mdbx_coherent_barrier(), and if that still fails the answer is taken from mdbx_meta_head_w() around an attempt to lock the writer mutex. */ static MDB_meta *mdbx_meta_head_r(MDB_env *env) { + MDB_meta *a = METAPAGE_1(env); + MDB_meta *b = METAPAGE_2(env), *h; + +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + + txnid_t head_txnid = env->me_txns->mti_txnid; + mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); + if (likely(a->mm_txnid == head_txnid)) { + h = a; + } else if (likely(b->mm_txnid == head_txnid)) { + h = b; + } else { + /* LY: seems we got a collision with mdbx_env_sync0() */ + mdbx_coherent_barrier(); + head_txnid = env->me_txns->mti_txnid; + mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); + + if (likely(a->mm_txnid == head_txnid)) { + h = a; + } else if (likely(b->mm_txnid == head_txnid)) { + h = b; + } else { + /* LY: got a race again, or DB is corrupted */ + int rc = mdbx_mutex_lock(env, MDB_MUTEX(env, w)); + /* NOTE: mdbx_meta_head_w() is called whether or not the lock was obtained; only a successful lock is released */ h = mdbx_meta_head_w(env); + if (rc == 0) + mdbx_mutex_unlock(env, MDB_MUTEX(env, w)); + } + } + +#ifdef __SANITIZE_THREAD__ + pthread_mutex_unlock(&tsan_mutex); +#endif + + return h; +} + +/** Given one of the two meta pages, return the other one. */ static MDBX_INLINE MDB_meta *mdbx_env_meta_flipflop(const MDB_env *env, + MDB_meta *meta) { + return (meta == METAPAGE_1(env)) ? METAPAGE_2(env) : METAPAGE_1(env); +} + +/** Ordering predicate for meta pages: when both pages have the same steadiness the txnids are compared, otherwise a is less than b exactly when b is the steady one. */ static MDBX_INLINE int mdbx_meta_lt(MDB_meta *a, MDB_meta *b) { + return (META_IS_STEADY(a) == META_IS_STEADY(b)) ? a->mm_txnid < b->mm_txnid + : META_IS_STEADY(b); +} + +/** Find oldest txnid still referenced.
*/ +static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + int i, reader; + MDB_reader *r = env->me_txns->mti_readers; + txnid_t oldest = env->me_txns->mti_txnid; /* start from the txnid published in me_txns */ + + MDB_meta *a = METAPAGE_1(env); + MDB_meta *b = METAPAGE_2(env); + /* If one meta page is weak, do not go past the txnid of the other meta page */ if (META_IS_WEAK(a) && oldest > b->mm_txnid) + oldest = b->mm_txnid; + if (META_IS_WEAK(b) && oldest > a->mm_txnid) + oldest = a->mm_txnid; + + /* Scan the reader table downwards; slots with a zero mr_pid are skipped */ for (reader = -1, i = env->me_txns->mti_numreaders; --i >= 0;) { + if (r[i].mr_pid) { + txnid_t snap = r[i].mr_txnid; + if (oldest > snap) { + oldest = snap; + reader = i; + } + } + } +#ifdef __SANITIZE_THREAD__ + pthread_mutex_unlock(&tsan_mutex); +#endif + + /* Optional out-parameter: index of the reader slot that last lowered the result, or -1 if none did */ if (laggard) + *laggard = reader; + return env->me_pgoldest = oldest; /* the result is also stored in env->me_pgoldest */ +} + +/** Add a page to the txn's dirty list: appended with mdbx_mid2l_append() when MDB_TXN_WRITEMAP is set, otherwise added with mdbx_mid2l_insert(). Decrements mt_dirty_room. */ +static void mdbx_page_dirty(MDB_txn *txn, MDB_page *mp) { + MDB_ID2 mid; + int rc, (*insert)(MDB_ID2L, MDB_ID2 *); + + if (txn->mt_flags & MDB_TXN_WRITEMAP) { + insert = mdbx_mid2l_append; + } else { + insert = mdbx_mid2l_insert; + } + mid.mid = mp->mp_pgno; + mid.mptr = mp; + rc = insert(txn->mt_u.dirty_list, &mid); + mdbx_tassert(txn, rc == 0); /* NOTE(review): the insert result is only checked by this assertion */ + txn->mt_dirty_room--; +} + +/** Allocate page numbers and memory for writing. Maintain me_pglast, + * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. + * + * If there are free pages available from older transactions, they + * are re-used first. Otherwise allocate a new page at mt_next_pgno. + * Do not modify the freedB, just merge freeDB records into me_pghead[] + * and move me_pglast to say which records were consumed. Only this + * function can create me_pghead and move me_pglast/mt_next_pgno. + * @param[in] mc cursor A cursor handle identifying the transaction and + * database for which we are allocating. + * @param[in] num the number of pages to allocate. + * @param[out] mp Address of the allocated page(s).
Requests for multiple + *pages + * will always be satisfied by a single contiguous chunk of memory. + * @return 0 on success, non-zero on failure. + */ + +#define MDBX_ALLOC_CACHE 1 +#define MDBX_ALLOC_GC 2 +#define MDBX_ALLOC_NEW 4 +#define MDBX_ALLOC_KICK 8 +#define MDBX_ALLOC_ALL \ + (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW | MDBX_ALLOC_KICK) + +static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { + int rc; + MDB_txn *txn = mc->mc_txn; + MDB_env *env = txn->mt_env; + pgno_t pgno, *mop = env->me_pghead; + unsigned i = 0, j, mop_len = mop ? mop[0] : 0, n2 = num - 1; + MDB_page *np; + txnid_t oldest = 0, last = 0; + MDB_cursor_op op; + MDB_cursor m2; + int found_oldest = 0; + + if (likely(flags & MDBX_ALLOC_GC)) { + flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); + if (unlikely(mc->mc_flags & C_RECLAIMING)) { + /* If mc is updating the freeDB, then the freelist cannot play + * catch-up with itself by growing while trying to save it. */ + flags &= + ~(MDBX_ALLOC_GC | MDBX_ALLOC_KICK | MDBX_COALESCE | MDBX_LIFORECLAIM); + } + } + + if (likely(flags & MDBX_ALLOC_CACHE)) { + /* If there are any loose pages, just use them */ + assert(mp && num); + if (likely(num == 1 && txn->mt_loose_pgs)) { + np = txn->mt_loose_pgs; + txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); + txn->mt_loose_count--; + mdbx_debug("db %d use loose page %zu", DDBI(mc), np->mp_pgno); + ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize); + *mp = np; + return MDB_SUCCESS; + } + } + + /* If our dirty list is already full, we can't do anything */ + if (unlikely(txn->mt_dirty_room == 0)) { + rc = MDB_TXN_FULL; + goto fail; + } + + for (;;) { /* oom-kick retry loop */ + for (op = MDB_FIRST;; + op = (flags & MDBX_LIFORECLAIM) ? MDB_PREV : MDB_NEXT) { + MDB_val key, data; + MDB_node *leaf; + pgno_t *idl; + + /* Seek a big enough contiguous page range. Prefer + * pages at the tail, just truncating the list. 
*/ + if (likely(flags & MDBX_ALLOC_CACHE) && mop_len > n2 && + (!(flags & MDBX_COALESCE) || op == MDB_FIRST)) { + i = mop_len; + do { + pgno = mop[i]; + if (likely(mop[i - n2] == pgno + n2)) + goto done; + } while (--i > n2); + } + + if (op == MDB_FIRST) { /* 1st iteration */ + /* Prepare to fetch more and coalesce */ + if (unlikely(!(flags & MDBX_ALLOC_GC))) + break; + + oldest = env->me_pgoldest; + mdbx_cursor_init(&m2, txn, FREE_DBI, NULL); + if (flags & MDBX_LIFORECLAIM) { + if (!found_oldest) { + oldest = mdbx_find_oldest(env, NULL); + found_oldest = 1; + } + /* Begin from oldest reader if any */ + if (oldest > 2) { + last = oldest - 1; + op = MDB_SET_RANGE; + } + } else if (env->me_pglast) { + /* Continue lookup from env->me_pglast to higher/last */ + last = env->me_pglast; + op = MDB_SET_RANGE; + } + + key.mv_data = &last; + key.mv_size = sizeof(last); + } + + if (!(flags & MDBX_LIFORECLAIM)) { + /* Do not fetch more if the record will be too recent */ + if (op != MDB_FIRST && ++last >= oldest) { + if (!found_oldest) { + oldest = mdbx_find_oldest(env, NULL); + found_oldest = 1; + } + if (oldest <= last) + break; + } + } + + rc = mdbx_cursor_get(&m2, &key, NULL, op); + if (rc == MDB_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { + if (op == MDB_SET_RANGE) + continue; + found_oldest = 1; + if (oldest < mdbx_find_oldest(env, NULL)) { + oldest = env->me_pgoldest; + last = oldest - 1; + key.mv_data = &last; + key.mv_size = sizeof(last); + op = MDB_SET_RANGE; + rc = mdbx_cursor_get(&m2, &key, NULL, op); + } + } + if (unlikely(rc)) { + if (rc == MDB_NOTFOUND) + break; + goto fail; + } + + last = *(txnid_t *)key.mv_data; + if (oldest <= last) { + if (!found_oldest) { + oldest = mdbx_find_oldest(env, NULL); + found_oldest = 1; + } + if (oldest <= last) { + if (flags & MDBX_LIFORECLAIM) + continue; + break; + } + } + + if (flags & MDBX_LIFORECLAIM) { + if (txn->mt_lifo_reclaimed) { + for (j = txn->mt_lifo_reclaimed[0]; j > 0; --j) + if (txn->mt_lifo_reclaimed[j] == last) + 
break; + if (j) + continue; + } + } + + np = m2.mc_pg[m2.mc_top]; + leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); + if (unlikely((rc = mdbx_node_read(&m2, leaf, &data)) != MDB_SUCCESS)) + goto fail; + + if ((flags & MDBX_LIFORECLAIM) && !txn->mt_lifo_reclaimed) { + txn->mt_lifo_reclaimed = mdbx_midl_alloc(env->me_maxfree_1pg); + if (unlikely(!txn->mt_lifo_reclaimed)) { + rc = ENOMEM; + goto fail; + } + } + + idl = (MDB_ID *)data.mv_data; + mdbx_tassert(txn, idl[0] == 0 || + data.mv_size == (idl[0] + 1) * sizeof(MDB_ID)); + i = idl[0]; + if (!mop) { + if (unlikely(!(env->me_pghead = mop = mdbx_midl_alloc(i)))) { + rc = ENOMEM; + goto fail; + } + } else { + if (unlikely((rc = mdbx_midl_need(&env->me_pghead, i)) != 0)) + goto fail; + mop = env->me_pghead; + } + if (flags & MDBX_LIFORECLAIM) { + if ((rc = mdbx_midl_append(&txn->mt_lifo_reclaimed, last)) != 0) + goto fail; + } + env->me_pglast = last; + + if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { + mdbx_debug_extra("IDL read txn %zu root %zu num %u, IDL", last, + txn->mt_dbs[FREE_DBI].md_root, i); + for (j = i; j; j--) + mdbx_debug_extra_print(" %zu", idl[j]); + mdbx_debug_extra_print("\n"); + } + + /* Merge in descending sorted order */ + mdbx_midl_xmerge(mop, idl); + mop_len = mop[0]; + + if (unlikely((flags & MDBX_ALLOC_CACHE) == 0)) { + /* force gc reclaim mode */ + return MDB_SUCCESS; + } + + /* Don't try to coalesce too much. 
*/ + if (mop_len > MDB_IDL_UM_SIZE / 2) + break; + if (flags & MDBX_COALESCE) { + if (mop_len /* current size */ >= env->me_maxfree_1pg / 2 || + i /* prev size */ >= env->me_maxfree_1pg / 4) + flags &= ~MDBX_COALESCE; + } + } + + if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == + (MDBX_COALESCE | MDBX_ALLOC_CACHE) && + mop_len > n2) { + i = mop_len; + do { + pgno = mop[i]; + if (mop[i - n2] == pgno + n2) + goto done; + } while (--i > n2); + } + + /* Use new pages from the map when nothing suitable in the freeDB */ + i = 0; + pgno = txn->mt_next_pgno; + rc = MDB_MAP_FULL; + if (likely(pgno + num <= env->me_maxpg)) { + rc = MDB_NOTFOUND; + if (likely(flags & MDBX_ALLOC_NEW)) + goto done; + } + + if ((flags & MDBX_ALLOC_GC) && + ((flags & MDBX_ALLOC_KICK) || rc == MDB_MAP_FULL)) { + MDB_meta *head = mdbx_meta_head_w(env); + MDB_meta *tail = mdbx_env_meta_flipflop(env, head); + + if (oldest == tail->mm_txnid && META_IS_WEAK(head) && + !META_IS_WEAK(tail)) { + MDB_meta meta = *head; + /* LY: Here an oom was happened: + * - all pages had allocated; + * - reclaiming was stopped at the last steady-sync; + * - the head-sync is weak. + * Now we need make a sync to resume reclaiming. If both + * MDB_NOSYNC and MDB_MAPASYNC flags are set, then assume that + * utterly no-sync write mode was requested. In such case + * don't make a steady-sync, but only a legacy-mode checkpoint, + * just for resume reclaiming only, not for data consistency. */ + + mdbx_debug("kick-gc: head %zu/%c, tail %zu/%c, oldest %zu, txnid %zu", + head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N', + tail->mm_txnid, META_IS_WEAK(tail) ? 
'W' : 'N', oldest, + env->me_txns->mt1.mtb.mtb_txnid); + + int flags = env->me_flags & MDB_WRITEMAP; + if ((env->me_flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC) + flags |= MDBX_UTTERLY_NOSYNC; + + mdbx_assert(env, env->me_sync_pending > 0); + if (mdbx_env_sync0(env, flags, &meta) == MDB_SUCCESS) { + txnid_t snap = mdbx_find_oldest(env, NULL); + if (snap > oldest) { + continue; + } + } + } + + if (rc == MDB_MAP_FULL) { +#if MDBX_MODE_ENABLED + txnid_t snap = mdbx_oomkick(env, oldest); +#else + mdbx_debug("DB size maxed out"); + txnid_t snap = mdbx_find_oldest(env, NULL); +#endif /* MDBX_MODE_ENABLED */ + if (snap > oldest) { + oldest = snap; + continue; + } + } + } + + fail: + if (mp) { + *mp = NULL; + txn->mt_flags |= MDB_TXN_ERROR; + } + assert(rc); + return rc; + } + +done: + assert(mp && num); + if (env->me_flags & MDB_WRITEMAP) { + np = (MDB_page *)(env->me_map + env->me_psize * pgno); + /* LY: reset no-access flag from mdbx_kill_page() */ + VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); + ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize * num); + } else { + if (unlikely(!(np = mdbx_page_malloc(txn, num)))) { + rc = ENOMEM; + goto fail; + } + } + if (i) { + mop[0] = mop_len -= num; + /* Move any stragglers down */ + for (j = i - num; j < mop_len;) + mop[++j] = mop[++i]; + } else { + txn->mt_next_pgno = pgno + num; + } + + if (env->me_flags & MDBX_PAGEPERTURB) + memset(np, 0x71 /* 'q', 113 */, env->me_psize * num); + VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); + + np->mp_pgno = pgno; + np->mp_leaf2_ksize = 0; + np->mp_flags = 0; + np->mp_pages = num; + mdbx_page_dirty(txn, np); + *mp = np; + + return MDB_SUCCESS; +} + +/** Copy the used portions of a non-overflow page. 
+ * @param[in] dst page to copy into + * @param[in] src page to copy from + * @param[in] psize size of a page + */ +static void mdbx_page_copy(MDB_page *dst, MDB_page *src, unsigned psize) { + enum { Align = sizeof(pgno_t) }; + indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; + + /* If page isn't full, just copy the used portion. Adjust + * alignment so memcpy may copy words instead of bytes. */ + if ((unused &= -Align) && !IS_LEAF2(src)) { + upper = (upper + PAGEBASE) & -Align; + memcpy(dst, src, (lower + PAGEBASE + (Align - 1)) & -Align); + memcpy((pgno_t *)((char *)dst + upper), (pgno_t *)((char *)src + upper), + psize - upper); + } else { + memcpy(dst, src, psize - unused); + } +} + +/** Pull a page off the txn's spill list, if present. + * If a page being referenced was spilled to disk in this txn, bring + * it back and make it dirty/writable again. + * @param[in] txn the transaction handle. + * @param[in] mp the page being referenced. It must not be dirty. + * @param[out] ret the writable page, if any. ret is unchanged if + * mp wasn't spilled. + */ +static int mdbx_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) { + MDB_env *env = txn->mt_env; + const MDB_txn *tx2; + unsigned x; + pgno_t pgno = mp->mp_pgno, pn = pgno << 1; + + for (tx2 = txn; tx2; tx2 = tx2->mt_parent) { + if (!tx2->mt_spill_pgs) + continue; + x = mdbx_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + MDB_page *np; + int num; + if (txn->mt_dirty_room == 0) + return MDB_TXN_FULL; + if (IS_OVERFLOW(mp)) + num = mp->mp_pages; + else + num = 1; + if (env->me_flags & MDB_WRITEMAP) { + np = mp; + } else { + np = mdbx_page_malloc(txn, num); + if (unlikely(!np)) + return ENOMEM; + if (num > 1) + memcpy(np, mp, num * env->me_psize); + else + mdbx_page_copy(np, mp, env->me_psize); + } + if (tx2 == txn) { + /* If in current txn, this page is no longer spilled. 
+ * If it happens to be the last page, truncate the spill list. + * Otherwise mark it as deleted by setting the LSB. */ + if (x == txn->mt_spill_pgs[0]) + txn->mt_spill_pgs[0]--; + else + txn->mt_spill_pgs[x] |= 1; + } /* otherwise, if belonging to a parent txn, the + * page remains spilled until child commits + */ + + mdbx_page_dirty(txn, np); + np->mp_flags |= P_DIRTY; + *ret = np; + break; + } + } + return MDB_SUCCESS; +} + +/** Touch a page: make it dirty and re-insert into tree with updated pgno. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc cursor pointing to the page to be touched + * @return 0 on success, non-zero on failure. + */ +static int mdbx_page_touch(MDB_cursor *mc) { + MDB_page *mp = mc->mc_pg[mc->mc_top], *np; + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m2, *m3; + pgno_t pgno; + int rc; + + if (!F_ISSET(mp->mp_flags, P_DIRTY)) { + if (txn->mt_flags & MDB_TXN_SPILLS) { + np = NULL; + rc = mdbx_page_unspill(txn, mp, &np); + if (unlikely(rc)) + goto fail; + if (likely(np)) + goto done; + } + if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pgs, 1)) || + (rc = mdbx_page_alloc(mc, 1, &np, MDBX_ALLOC_ALL)))) + goto fail; + pgno = np->mp_pgno; + mdbx_debug("touched db %d page %zu -> %zu", DDBI(mc), mp->mp_pgno, pgno); + mdbx_cassert(mc, mp->mp_pgno != pgno); + mdbx_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + /* Update the parent page, if any, to point to the new page */ + if (mc->mc_top) { + MDB_page *parent = mc->mc_pg[mc->mc_top - 1]; + MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top - 1]); + SETPGNO(node, pgno); + } else { + mc->mc_db->md_root = pgno; + } + } else if (txn->mt_parent && !IS_SUBP(mp)) { + MDB_ID2 mid, *dl = txn->mt_u.dirty_list; + pgno = mp->mp_pgno; + /* If txn has a parent, make sure the page is in our + * dirty list. */ + if (dl[0].mid) { + unsigned x = mdbx_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + if (unlikely(mp != dl[x].mptr)) { /* bad cursor? 
*/ + mc->mc_flags &= ~(C_INITIALIZED | C_EOF); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PROBLEM; + } + return 0; + } + } + mdbx_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); + /* No - copy it */ + np = mdbx_page_malloc(txn, 1); + if (unlikely(!np)) + return ENOMEM; /* NOTE(review): returns without passing through the fail label, so MDB_TXN_ERROR is not set on this path */ + mid.mid = pgno; + mid.mptr = np; + rc = mdbx_mid2l_insert(dl, &mid); + mdbx_cassert(mc, rc == 0); + } else { + return 0; + } + + mdbx_page_copy(np, mp, txn->mt_env->me_psize); + np->mp_pgno = pgno; + np->mp_flags |= P_DIRTY; + +done: + /* Adjust cursors pointing to mp so that they reference np instead */ + mc->mc_pg[mc->mc_top] = np; + m2 = txn->mt_cursors[mc->mc_dbi]; + if (mc->mc_flags & C_SUB) { + /* mc is a sub-cursor: fix up the mx_cursor of every tracked cursor of this DBI */ for (; m2; m2 = m2->mc_next) { + m3 = &m2->mc_xcursor->mx_cursor; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) + m3->mc_pg[mc->mc_top] = np; + } + } else { + for (; m2; m2 = m2->mc_next) { + if (m2->mc_snum < mc->mc_snum) + continue; + if (m2 == mc) + continue; + if (m2->mc_pg[mc->mc_top] == mp) { + m2->mc_pg[mc->mc_top] = np; + if (XCURSOR_INITED(m2) && IS_LEAF(np)) + XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]); + } + } + } + return 0; + +fail: + txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +/** Flush pending changes of the environment to disk. Returns EINVAL for a NULL env, MDB_VERSION_MISMATCH for a bad signature, MDB_PANIC when me_txns is NULL and EACCES when MDB_RDONLY or MDB_FATAL_ERROR is set. Returns MDB_SUCCESS at once when the head meta page is not weak, nothing is pending and the map size matches. Otherwise an optional early msync()/fdatasync() is done without the writer mutex, then the writer mutex is taken and mdbx_env_sync0() is called on a copy of the head meta page if the work is still needed. A non-zero force, a changed map size or a reached me_sync_threshold reduces the local flags to the MDB_WRITEMAP bit only. */ int mdbx_env_sync(MDB_env *env, int force) { + int rc; + pthread_mutex_t *mutex; + MDB_meta *head; + unsigned flags; + + if (unlikely(!env)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!env->me_txns)) + return MDB_PANIC; + + /* Work on a local copy of the env flags with MDB_NOMETASYNC cleared */ flags = env->me_flags & ~MDB_NOMETASYNC; + if (unlikely(flags & (MDB_RDONLY | MDB_FATAL_ERROR))) + return EACCES; + + head = mdbx_meta_head_r(env); + if (!META_IS_WEAK(head) && env->me_sync_pending == 0 && + env->me_mapsize == head->mm_mapsize) + /* LY: nothing to do */ + return MDB_SUCCESS; + + if (force || head->mm_mapsize != env->me_mapsize || + (env->me_sync_threshold && + env->me_sync_pending >= env->me_sync_threshold)) + flags &= MDB_WRITEMAP; /* keeps only the MDB_WRITEMAP bit, which drops MDB_NOSYNC and MDB_MAPASYNC from the local flags */ + + /* LY: early sync before acquiring the mutex to reduce
writer's latency */ + if (env->me_sync_pending > env->me_psize * 16 && (flags & MDB_NOSYNC) == 0) { + if (flags & MDB_WRITEMAP) { + size_t used_size = env->me_psize * (head->mm_last_pg + 1); + rc = msync(env->me_map, used_size, + (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC); + } else { + rc = fdatasync(env->me_fd); + } + if (unlikely(rc)) + return errno; + } + + mutex = MDB_MUTEX(env, w); + rc = mdbx_mutex_lock(env, mutex); + if (unlikely(rc)) + return rc; + + /* LY: head may have changed while the mutex was being acquired. */ + head = mdbx_meta_head_w(env); + rc = MDB_SUCCESS; + if (META_IS_WEAK(head) || env->me_sync_pending != 0 || + env->me_mapsize != head->mm_mapsize) { + /* mdbx_env_sync0() is given a local copy of the head meta page */ MDB_meta meta = *head; + rc = mdbx_env_sync0(env, flags, &meta); + } + + mdbx_mutex_unlock(env, mutex); + return rc; +} + +/** Back up parent txn's cursors, then grab the originals for tracking. Every tracked cursor of src is copied into a heap-allocated backup (reachable through mc_backup), re-pointed at dst and linked into dst->mt_cursors. Returns MDB_SUCCESS, or ENOMEM when a backup cannot be allocated. */ +static int mdbx_cursor_shadow(MDB_txn *src, MDB_txn *dst) { + MDB_cursor *mc, *bk; + MDB_xcursor *mx; + size_t size; + int i; + + for (i = src->mt_numdbs; --i >= 0;) { + if ((mc = src->mt_cursors[i]) != NULL) { + size = sizeof(MDB_cursor); + if (mc->mc_xcursor) + size += sizeof(MDB_xcursor); + for (; mc; mc = bk->mc_next) { + bk = malloc(size); + if (unlikely(!bk)) + return ENOMEM; + *bk = *mc; + mc->mc_backup = bk; + mc->mc_db = &dst->mt_dbs[i]; + /* Kill pointers into src to reduce abuse: The + * user may not use mc until dst ends. But we need a valid + * txn pointer here for cursor fixups to keep working. */ + mc->mc_txn = dst; + mc->mc_dbflag = &dst->mt_dbflags[i]; + if ((mx = mc->mc_xcursor) != NULL) { + /* the xcursor copy is stored directly behind the cursor copy */ *(MDB_xcursor *)(bk + 1) = *mx; + mx->mx_cursor.mc_txn = dst; + } + mc->mc_next = dst->mt_cursors[i]; + dst->mt_cursors[i] = mc; + } + } + } + return MDB_SUCCESS; +} + +/** Close this write txn's cursors, give parent txn's cursors back to parent. + * @param[in] txn the transaction handle. + * @param[in] merge true to keep changes to parent cursors, false to revert.
+ * This function is void and returns nothing. + */ +static void mdbx_cursors_eot(MDB_txn *txn, unsigned merge) { + MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; + MDB_xcursor *mx; + int i; + + for (i = txn->mt_numdbs; --i >= 0;) { + for (mc = cursors[i]; mc; mc = next) { + unsigned stage = mc->mc_signature; + mdbx_ensure(NULL, + stage == MDBX_MC_SIGNATURE || stage == MDBX_MC_WAIT4EOT); + next = mc->mc_next; + if ((bk = mc->mc_backup) != NULL) { + if (merge) { + /* Commit changes to parent txn */ + mc->mc_next = bk->mc_next; + mc->mc_backup = bk->mc_backup; + mc->mc_txn = bk->mc_txn; + mc->mc_db = bk->mc_db; + mc->mc_dbflag = bk->mc_dbflag; + if ((mx = mc->mc_xcursor) != NULL) + mx->mx_cursor.mc_txn = bk->mc_txn; + } else { + /* Abort nested txn */ + *mc = *bk; + if ((mx = mc->mc_xcursor) != NULL) + *mx = *(MDB_xcursor *)(bk + 1); + } +#if MDBX_MODE_ENABLED + bk->mc_signature = 0; + free(bk); + } + /* Cursors in the MDBX_MC_WAIT4EOT stage are freed here, all others are marked MDBX_MC_READY4CLOSE */ if (stage == MDBX_MC_WAIT4EOT) { + mc->mc_signature = 0; + free(mc); + } else { + mc->mc_signature = MDBX_MC_READY4CLOSE; + mc->mc_flags = 0 /* reset C_UNTRACK */; + } +#else + mc = bk; + } + /* Only malloced cursors are permanently tracked. */ + mc->mc_signature = 0; + free(mc); +#endif + } + cursors[i] = NULL; + } +} + +/** Set or check a pid lock. Set returns 0 on success. + * Check returns 0 if the process is certainly dead, nonzero if it may + * be alive (the lock exists or an error happened so we do not know). + */ +static int mdbx_reader_pid(MDB_env *env, int op, pid_t pid) { + for (;;) { + int rc; + struct flock lock_info; + memset(&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_WRLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = pid; /* the lock covers the single byte at offset pid of me_lfd */ + lock_info.l_len = 1; + if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { + if (op == F_GETLK && lock_info.l_type != F_UNLCK) + rc = -1; /* F_GETLK reported an existing lock */ + } else if ((rc = errno) == EINTR) { + continue; /* fcntl() failed with EINTR: retry */ + } + return rc; + } +} + +/** Common code for #mdbx_txn_begin() and #mdbx_txn_renew().
+ * @param[in] txn the transaction handle to initialize + * @return 0 on success, non-zero on failure. + */ +static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { + MDB_env *env = txn->mt_env; + unsigned i, nr; + int rc, new_notls = 0; + + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + + if (flags & MDB_TXN_RDONLY) { + MDBX_rthc *rthc = NULL; + MDB_reader *r = NULL; + + txn->mt_flags = MDB_TXN_RDONLY; + if (likely(env->me_flags & MDB_ENV_TXKEY)) { + mdbx_assert(env, !(env->me_flags & MDB_NOTLS)); + rthc = mdbx_rthc_get(env->me_txkey); + if (unlikely(!rthc)) + return ENOMEM; + if (likely(rthc->rc_reader)) { + r = rthc->rc_reader; + mdbx_assert(env, r->mr_pid == env->me_pid); + mdbx_assert(env, r->mr_tid == pthread_self()); + } + } else { + mdbx_assert(env, env->me_flags & MDB_NOTLS); + r = txn->mt_u.reader; + } + + if (likely(r)) { + if (unlikely(r->mr_pid != env->me_pid || r->mr_txnid != ~(txnid_t)0)) + return MDB_BAD_RSLOT; + } else { + pid_t pid = env->me_pid; + pthread_t tid = pthread_self(); + pthread_mutex_t *rmutex = MDB_MUTEX(env, r); + + rc = mdbx_mutex_lock(env, rmutex); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + + if (unlikely(!env->me_live_reader)) { + rc = mdbx_reader_pid(env, F_SETLK, pid); + if (unlikely(rc != MDB_SUCCESS)) { + mdbx_mutex_unlock(env, rmutex); + return rc; + } + env->me_live_reader = 1; + } + + nr = env->me_txns->mti_numreaders; + for (i = 0; i < nr; i++) + if (env->me_txns->mti_readers[i].mr_pid == 0) + break; + if (unlikely(i == env->me_maxreaders)) { + mdbx_mutex_unlock(env, rmutex); + return MDB_READERS_FULL; + } + r = &env->me_txns->mti_readers[i]; + /* Claim the reader slot, carefully since other code + * uses the reader table un-mutexed: First reset the + * slot, next publish it in mti_numreaders. After + * that, it is safe for mdbx_env_close() to touch it. + * When it will be closed, we can finally claim it. 
*/ + r->mr_pid = 0; + r->mr_txnid = ~(txnid_t)0; + r->mr_tid = tid; + mdbx_coherent_barrier(); +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + if (i == nr) + env->me_txns->mti_numreaders = ++nr; + if (env->me_close_readers < nr) + env->me_close_readers = nr; + r->mr_pid = pid; +#ifdef __SANITIZE_THREAD__ + pthread_mutex_unlock(&tsan_mutex); +#endif + mdbx_mutex_unlock(env, rmutex); + + new_notls = MDB_END_SLOT; + if (likely(rthc)) { + rthc->rc_reader = r; + new_notls = 0; + } + } + + while ((env->me_flags & MDB_FATAL_ERROR) == 0) { + MDB_meta *meta = mdbx_meta_head_r(txn->mt_env); + txnid_t lead = meta->mm_txnid; + r->mr_txnid = lead; + mdbx_coherent_barrier(); + + txnid_t snap = txn->mt_env->me_txns->mti_txnid; + /* LY: Retry on a race, ITS#7970. */ + if (likely(lead == snap)) { + txn->mt_txnid = lead; + txn->mt_next_pgno = meta->mm_last_pg + 1; + /* Copy the DB info and flags */ + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); +#if MDBX_MODE_ENABLED + txn->mt_canary = meta->mm_canary; +#endif + break; + } + } + + txn->mt_u.reader = r; + txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ + } else { + /* Not yet touching txn == env->me_txn0, it may be active */ + rc = mdbx_mutex_lock(env, MDB_MUTEX(env, w)); + if (unlikely(rc)) + return rc; + +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + MDB_meta *meta = mdbx_meta_head_w(env); +#if MDBX_MODE_ENABLED + txn->mt_canary = meta->mm_canary; +#endif + txn->mt_txnid = meta->mm_txnid + 1; + txn->mt_flags = flags; +#ifdef __SANITIZE_THREAD__ + pthread_mutex_unlock(&tsan_mutex); +#endif + +#if MDB_DEBUG + if (unlikely(txn->mt_txnid == mdbx_debug_edge)) { + if (!mdbx_debug_logger) + mdbx_runtime_flags |= + MDBX_DBG_TRACE | MDBX_DBG_EXTRA | MDBX_DBG_AUDIT | MDBX_DBG_ASSERT; + mdbx_debug_log(MDBX_DBG_EDGE, __FUNCTION__, __LINE__, + "on/off edge (txn %zu)", txn->mt_txnid); + } +#endif + txn->mt_child = NULL; + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 
0; + txn->mt_dirty_room = MDB_IDL_UM_MAX; + txn->mt_u.dirty_list = env->me_dirty_list; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_free_pgs = env->me_free_pgs; + txn->mt_free_pgs[0] = 0; + txn->mt_spill_pgs = NULL; + if (txn->mt_lifo_reclaimed) + txn->mt_lifo_reclaimed[0] = 0; + env->me_txn = txn; + memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned)); + /* Copy the DB info and flags */ + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); + /* Moved to here to avoid a data race in read TXNs */ + txn->mt_next_pgno = meta->mm_last_pg + 1; + } + + /* Setup db info */ + txn->mt_numdbs = env->me_numdbs; + for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + unsigned x = env->me_dbflags[i]; + txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; + txn->mt_dbflags[i] = + (x & MDB_VALID) ? DB_VALID | DB_USRVALID | DB_STALE : 0; + } + txn->mt_dbflags[MAIN_DBI] = DB_VALID | DB_USRVALID; + txn->mt_dbflags[FREE_DBI] = DB_VALID; + + if (unlikely(env->me_flags & MDB_FATAL_ERROR)) { + mdbx_debug("environment had fatal error, must shutdown!"); + rc = MDB_PANIC; + } else if (unlikely(env->me_maxpg < txn->mt_next_pgno)) { + rc = MDB_MAP_RESIZED; + } else { + return MDB_SUCCESS; + } + mdbx_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); + return rc; +} + +int mdbx_txn_renew(MDB_txn *txn) { + int rc; + + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY | MDB_TXN_FINISHED))) + return EINVAL; + + rc = mdbx_txn_renew0(txn, MDB_TXN_RDONLY); + if (rc == MDB_SUCCESS) { + mdbx_debug("renew txn %zu%c %p on mdbenv %p, root page %zu", txn->mt_txnid, + (txn->mt_flags & MDB_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, + (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); + } + return rc; +} + +int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, + MDB_txn **ret) { + MDB_txn *txn; + MDB_ntxn *ntxn; + int rc, size, tsize; + + if (unlikely(!env || !ret)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + + flags &= MDB_TXN_BEGIN_FLAGS; + flags |= env->me_flags & MDB_WRITEMAP; + + if (unlikely(env->me_flags & MDB_RDONLY & + ~flags)) /* write txn in RDONLY env */ + return EACCES; + + if (parent) { + if (unlikely(parent->mt_signature != MDBX_MT_SIGNATURE)) + return EINVAL; + + /* Nested transactions: Max 1 child, write txns only, no writemap */ + flags |= parent->mt_flags; + if (unlikely(flags & (MDB_RDONLY | MDB_WRITEMAP | MDB_TXN_BLOCKED))) { + return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; + } + /* Child txns save MDB_pgstate and use own copy of cursors */ + size = env->me_maxdbs * (sizeof(MDB_db) + sizeof(MDB_cursor *) + 1); + size += tsize = sizeof(MDB_ntxn); + } else if (flags & MDB_RDONLY) { + size = env->me_maxdbs * (sizeof(MDB_db) + 1); + size += tsize = sizeof(MDB_txn); + } else { + /* Reuse preallocated write txn. However, do not touch it until + * mdbx_txn_renew0() succeeds, since it currently may be active. 
*/ + txn = env->me_txn0; + goto renew; + } + if (unlikely((txn = calloc(1, size)) == NULL)) { + mdbx_debug("calloc: %s", strerror(errno)); + return ENOMEM; + } + txn->mt_dbxs = env->me_dbxs; /* static */ + txn->mt_dbs = (MDB_db *)((char *)txn + tsize); + txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; + txn->mt_flags = flags; + txn->mt_env = env; + + if (parent) { + unsigned i; + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = parent->mt_dbiseqs; + txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2) * MDB_IDL_UM_SIZE); + if (!txn->mt_u.dirty_list || + !(txn->mt_free_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX))) { + free(txn->mt_u.dirty_list); + free(txn); + return ENOMEM; + } + txn->mt_txnid = parent->mt_txnid; + txn->mt_dirty_room = parent->mt_dirty_room; + txn->mt_u.dirty_list[0].mid = 0; + txn->mt_spill_pgs = NULL; + txn->mt_next_pgno = parent->mt_next_pgno; + parent->mt_flags |= MDB_TXN_HAS_CHILD; + parent->mt_child = txn; + txn->mt_parent = parent; + txn->mt_numdbs = parent->mt_numdbs; + memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + /* Copy parent's mt_dbflags, but clear DB_NEW */ + for (i = 0; i < txn->mt_numdbs; i++) + txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; + rc = 0; + ntxn = (MDB_ntxn *)txn; + ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ + if (env->me_pghead) { + size = MDB_IDL_SIZEOF(env->me_pghead); + env->me_pghead = mdbx_midl_alloc(env->me_pghead[0]); + if (likely(env->me_pghead)) + memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); + else + rc = ENOMEM; + } + if (likely(!rc)) + rc = mdbx_cursor_shadow(parent, txn); + if (unlikely(rc)) + mdbx_txn_end(txn, MDB_END_FAIL_BEGINCHILD); + } else { /* MDB_RDONLY */ + txn->mt_dbiseqs = env->me_dbiseqs; + renew: + rc = mdbx_txn_renew0(txn, flags); + } + if (unlikely(rc)) { + if (txn != env->me_txn0) + free(txn); + } else { + txn->mt_signature = MDBX_MT_SIGNATURE; + *ret = txn; + mdbx_debug("begin 
txn %zu%c %p on mdbenv %p, root page %zu", txn->mt_txnid, + (flags & MDB_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, + txn->mt_dbs[MAIN_DBI].md_root); + } + + return rc; +} + +MDB_env *mdbx_txn_env(MDB_txn *txn) { + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) + return NULL; + return txn->mt_env; +} + +size_t mdbx_txn_id(MDB_txn *txn) { + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) + return 0; + return txn->mt_txnid; +} + +/** Export or close DBI handles opened in this txn. */ +static void mdbx_dbis_update(MDB_txn *txn, int keep) { + int i; + MDB_dbi n = txn->mt_numdbs; + MDB_env *env = txn->mt_env; + unsigned char *tdbflags = txn->mt_dbflags; + + for (i = n; --i >= CORE_DBS;) { + if (tdbflags[i] & DB_NEW) { + if (keep) { + env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; + } else { + char *ptr = env->me_dbxs[i].md_name.mv_data; + if (ptr) { + env->me_dbxs[i].md_name.mv_data = NULL; + env->me_dbxs[i].md_name.mv_size = 0; + env->me_dbflags[i] = 0; + env->me_dbiseqs[i]++; + free(ptr); + } + } + } + } + if (keep && env->me_numdbs < n) + env->me_numdbs = n; +} + +/** End a transaction, except successful commit of a nested transaction. + * May be called twice for readonly txns: First reset it, then abort. + * @param[in] txn the transaction handle to end + * @param[in] mode why and how to end the transaction + */ +static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { + MDB_env *env = txn->mt_env; + static const char *const names[] = MDB_END_NAMES; + + if (unlikely(txn->mt_env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + + /* Export or close DBI handles opened in this txn */ + mdbx_dbis_update(txn, mode & MDB_END_UPDATE); + + mdbx_debug("%s txn %zu%c %p on mdbenv %p, root page %zu", + names[mode & MDB_END_OPMASK], txn->mt_txnid, + (txn->mt_flags & MDB_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, + (void *)env, txn->mt_dbs[MAIN_DBI].md_root); + + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + if (txn->mt_u.reader) { +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + txn->mt_u.reader->mr_txnid = ~(txnid_t)0; + if (!(env->me_flags & MDB_NOTLS)) { + txn->mt_u.reader = NULL; /* txn does not own reader */ + } else if (mode & MDB_END_SLOT) { + txn->mt_u.reader->mr_pid = 0; + txn->mt_u.reader = NULL; + } /* else txn owns the slot until it does MDB_END_SLOT */ +#ifdef __SANITIZE_THREAD__ + pthread_mutex_unlock(&tsan_mutex); +#endif + } + mdbx_coherent_barrier(); + txn->mt_numdbs = 0; /* prevent further DBI activity */ + txn->mt_flags |= MDB_TXN_FINISHED; + + } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { + pgno_t *pghead = env->me_pghead; + + if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ + mdbx_cursors_eot(txn, 0); + if (!(env->me_flags & MDB_WRITEMAP)) { + mdbx_dlist_free(txn); + } + + if (txn->mt_lifo_reclaimed) { + txn->mt_lifo_reclaimed[0] = 0; + if (txn != env->me_txn0) { + mdbx_midl_free(txn->mt_lifo_reclaimed); + txn->mt_lifo_reclaimed = NULL; + } + } + txn->mt_numdbs = 0; + txn->mt_flags = MDB_TXN_FINISHED; + + if (!txn->mt_parent) { + mdbx_midl_shrink(&txn->mt_free_pgs); + env->me_free_pgs = txn->mt_free_pgs; + /* me_pgstate: */ + env->me_pghead = NULL; + env->me_pglast = 0; + + env->me_txn = NULL; + mode = 0; /* txn == env->me_txn0, do not free() it */ + + /* The writer mutex was locked in mdbx_txn_begin. 
*/ + mdbx_mutex_unlock(env, MDB_MUTEX(env, w)); + } else { + txn->mt_parent->mt_child = NULL; + txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; + env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; + mdbx_midl_free(txn->mt_free_pgs); + mdbx_midl_free(txn->mt_spill_pgs); + free(txn->mt_u.dirty_list); + } + + mdbx_midl_free(pghead); + } + + if (mode & MDB_END_FREE) { + txn->mt_signature = 0; + free(txn); + } + + return MDB_SUCCESS; +} + +int mdbx_txn_reset(MDB_txn *txn) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + /* This call is only valid for read-only txns */ + if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY))) + return EINVAL; + +#if MDBX_MODE_ENABLED + /* LY: don't close DBI-handles in MDBX mode */ + return mdbx_txn_end(txn, MDB_END_RESET | MDB_END_UPDATE); +#else + return mdbx_txn_end(txn, MDB_END_RESET); +#endif /* MDBX_MODE_ENABLED */ +} + +int mdbx_txn_abort(MDB_txn *txn) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + +#if MDBX_MODE_ENABLED + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + /* LY: don't close DBI-handles in MDBX mode */ + return mdbx_txn_end(txn, MDB_END_ABORT | MDB_END_UPDATE | MDB_END_SLOT | + MDB_END_FREE); +#endif /* MDBX_MODE_ENABLED */ + + if (txn->mt_child) + mdbx_txn_abort(txn->mt_child); + + return mdbx_txn_end(txn, MDB_END_ABORT | MDB_END_SLOT | MDB_END_FREE); +} + +static MDBX_INLINE int mdbx_backlog_size(MDB_txn *txn) { + int reclaimed = txn->mt_env->me_pghead ? txn->mt_env->me_pghead[0] : 0; + return reclaimed + txn->mt_loose_count; +} + +/* LY: Prepare a backlog of pages to modify FreeDB itself, + * while reclaiming is prohibited. It should be enough to prevent search + * in mdbx_page_alloc() during a deleting, when freeDB tree is unbalanced. 
*/ +static int mdbx_prep_backlog(MDB_txn *txn, MDB_cursor *mc) { + /* LY: extra page(s) for b-tree rebalancing */ + const int extra = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) ? 2 : 1; + + if (mdbx_backlog_size(txn) < mc->mc_db->md_depth + extra) { + int rc = mdbx_cursor_touch(mc); + if (unlikely(rc)) + return rc; + + while (unlikely(mdbx_backlog_size(txn) < extra)) { + rc = mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC); + if (unlikely(rc)) { + if (unlikely(rc != MDB_NOTFOUND)) + return rc; + break; + } + } + } + + return MDB_SUCCESS; +} + +/** Save the freelist as of this transaction to the freeDB. + * This changes the freelist. Keep trying until it stabilizes. + */ +static int mdbx_freelist_save(MDB_txn *txn) { + /* env->me_pghead[] can grow and shrink during this call. + * env->me_pglast and txn->mt_free_pgs[] can only grow. + * Page numbers cannot disappear from txn->mt_free_pgs[]. */ + MDB_cursor mc; + MDB_env *env = txn->mt_env; + int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; + txnid_t pglast = 0, head_id = 0; + pgno_t freecnt = 0, *free_pgs, *mop; + ssize_t head_room = 0, total_room = 0, mop_len, clean_limit; + unsigned cleanup_idx = 0, refill_idx = 0; + const int lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; + + mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); + + /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ + clean_limit = (env->me_flags & (MDB_NOMEMINIT | MDB_WRITEMAP)) ? SSIZE_MAX + : maxfree_1pg; + +again: + for (;;) { + /* Come back here after each Put() in case freelist changed */ + MDB_val key, data; + pgno_t *pgs; + ssize_t j; + + if (!lifo) { + /* If using records from freeDB which we have not yet + * deleted, delete them and any we reserved for me_pghead. 
*/ + while (pglast < env->me_pglast) { + rc = mdbx_cursor_first(&mc, &key, NULL); + if (unlikely(rc)) + goto bailout; + rc = mdbx_prep_backlog(txn, &mc); + if (unlikely(rc)) + goto bailout; + pglast = head_id = *(txnid_t *)key.mv_data; + total_room = head_room = 0; + more = 1; + mdbx_tassert(txn, pglast <= env->me_pglast); + mc.mc_flags |= C_RECLAIMING; + rc = mdbx_cursor_del(&mc, 0); + mc.mc_flags &= ~C_RECLAIMING; + if (unlikely(rc)) + goto bailout; + } + } else if (txn->mt_lifo_reclaimed) { + /* LY: cleanup reclaimed records. */ + while (cleanup_idx < txn->mt_lifo_reclaimed[0]) { + pglast = txn->mt_lifo_reclaimed[++cleanup_idx]; + key.mv_data = &pglast; + key.mv_size = sizeof(pglast); + rc = mdbx_cursor_get(&mc, &key, NULL, MDB_SET); + if (likely(rc != MDB_NOTFOUND)) { + if (unlikely(rc)) + goto bailout; + rc = mdbx_prep_backlog(txn, &mc); + if (unlikely(rc)) + goto bailout; + mc.mc_flags |= C_RECLAIMING; + rc = mdbx_cursor_del(&mc, 0); + mc.mc_flags &= ~C_RECLAIMING; + if (unlikely(rc)) + goto bailout; + } + } + } + + if (unlikely(!env->me_pghead) && txn->mt_loose_pgs) { + /* Put loose page numbers in mt_free_pgs, since + * we may be unable to return them to me_pghead. 
*/ + MDB_page *mp = txn->mt_loose_pgs; + if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pgs, + txn->mt_loose_count)) != 0)) + return rc; + for (; mp; mp = NEXT_LOOSE_PAGE(mp)) + mdbx_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 0; + } + + /* Save the IDL of pages freed by this txn, to a single record */ + if (freecnt < txn->mt_free_pgs[0]) { + if (unlikely(!freecnt)) { + /* Make sure last page of freeDB is touched and on freelist */ + rc = mdbx_page_search(&mc, NULL, MDB_PS_LAST | MDB_PS_MODIFY); + if (unlikely(rc && rc != MDB_NOTFOUND)) + goto bailout; + } + free_pgs = txn->mt_free_pgs; + /* Write to last page of freeDB */ + key.mv_size = sizeof(txn->mt_txnid); + key.mv_data = &txn->mt_txnid; + do { + freecnt = free_pgs[0]; + data.mv_size = MDB_IDL_SIZEOF(free_pgs); + rc = mdbx_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (unlikely(rc)) + goto bailout; + /* Retry if mt_free_pgs[] grew during the Put() */ + free_pgs = txn->mt_free_pgs; + } while (freecnt < free_pgs[0]); + + mdbx_midl_sort(free_pgs); + memcpy(data.mv_data, free_pgs, data.mv_size); + + if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { + unsigned i = free_pgs[0]; + mdbx_debug_extra("IDL write txn %zu root %zu num %u, IDL", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); + for (; i; i--) + mdbx_debug_extra_print(" %zu", free_pgs[i]); + mdbx_debug_extra_print("\n"); + } + continue; + } + + mop = env->me_pghead; + mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; + + if (mop_len && refill_idx == 0) + refill_idx = 1; + + /* Reserve records for me_pghead[]. Split it if multi-page, + * to avoid searching freeDB for a page range. Use keys in + * range [1,me_pglast]: Smaller than txnid of oldest reader. 
*/ + if (total_room >= mop_len) { + if (total_room == mop_len || --more < 0) + break; + } else if (head_room >= maxfree_1pg && head_id > 1) { + /* Keep current record (overflow page), add a new one */ + head_id--; + refill_idx++; + head_room = 0; + } + + if (lifo) { + if (refill_idx > + (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)) { + /* LY: need just a txn-id for save page list. */ + rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); + if (likely(rc == 0)) + /* LY: ok, reclaimed from freedb. */ + continue; + if (unlikely(rc != MDB_NOTFOUND)) + /* LY: other troubles... */ + goto bailout; + + /* LY: freedb is empty, will look any free txn-id in high2low order. + */ + if (unlikely(env->me_pglast < 1)) { + /* LY: not any txn in the past of freedb. */ + rc = MDB_MAP_FULL; + goto bailout; + } + + if (unlikely(!txn->mt_lifo_reclaimed)) { + txn->mt_lifo_reclaimed = mdbx_midl_alloc(env->me_maxfree_1pg); + if (unlikely(!txn->mt_lifo_reclaimed)) { + rc = ENOMEM; + goto bailout; + } + } + /* LY: append the list. */ + rc = mdbx_midl_append(&txn->mt_lifo_reclaimed, env->me_pglast - 1); + if (unlikely(rc)) + goto bailout; + --env->me_pglast; + /* LY: note that freeDB cleanup is not needed. 
*/ + ++cleanup_idx; + } + head_id = txn->mt_lifo_reclaimed[refill_idx]; + } + + /* (Re)write {key = head_id, IDL length = head_room} */ + total_room -= head_room; + head_room = mop_len - total_room; + if (head_room > maxfree_1pg && head_id > 1) { + /* Overflow multi-page for part of me_pghead */ + head_room /= head_id; /* amortize page sizes */ + head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); + } else if (head_room < 0) { + /* Rare case, not bothering to delete this record */ + head_room = 0; + continue; + } + key.mv_size = sizeof(head_id); + key.mv_data = &head_id; + data.mv_size = (head_room + 1) * sizeof(pgno_t); + rc = mdbx_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (unlikely(rc)) + goto bailout; + /* IDL is initially empty, zero out at least the length */ + pgs = (pgno_t *)data.mv_data; + j = head_room > clean_limit ? head_room : 0; + do { + pgs[j] = 0; + } while (--j >= 0); + total_room += head_room; + } + + mdbx_tassert(txn, + cleanup_idx == + (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); + + /* Return loose page numbers to me_pghead, though usually none are + * left at this point. The pages themselves remain in dirty_list. 
*/ + if (txn->mt_loose_pgs) { + MDB_page *mp = txn->mt_loose_pgs; + unsigned count = txn->mt_loose_count; + MDB_IDL loose; + /* Room for loose pages + temp IDL with same */ + if ((rc = mdbx_midl_need(&env->me_pghead, 2 * count + 1)) != 0) + goto bailout; + mop = env->me_pghead; + loose = mop + MDB_IDL_ALLOCLEN(mop) - count; + for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) + loose[++count] = mp->mp_pgno; + loose[0] = count; + mdbx_midl_sort(loose); + mdbx_midl_xmerge(mop, loose); + txn->mt_loose_pgs = NULL; + txn->mt_loose_count = 0; + mop_len = mop[0]; + } + + /* Fill in the reserved me_pghead records */ + rc = MDB_SUCCESS; + if (mop_len) { + MDB_val key, data; + + mop += mop_len; + if (!lifo) { + rc = mdbx_cursor_first(&mc, &key, &data); + if (unlikely(rc)) + goto bailout; + } + + for (;;) { + txnid_t id; + ssize_t len; + MDB_ID save; + + if (!lifo) { + id = *(txnid_t *)key.mv_data; + mdbx_tassert(txn, id <= env->me_pglast); + } else { + mdbx_tassert(txn, + refill_idx > 0 && refill_idx <= txn->mt_lifo_reclaimed[0]); + id = txn->mt_lifo_reclaimed[refill_idx--]; + key.mv_data = &id; + key.mv_size = sizeof(id); + rc = mdbx_cursor_get(&mc, &key, &data, MDB_SET); + if (unlikely(rc)) + goto bailout; + } + mdbx_tassert( + txn, cleanup_idx == + (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); + + len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; + mdbx_tassert(txn, len >= 0); + if (len > mop_len) + len = mop_len; + data.mv_size = (len + 1) * sizeof(MDB_ID); + key.mv_data = &id; + key.mv_size = sizeof(id); + data.mv_data = mop -= len; + + save = mop[0]; + mop[0] = len; + rc = mdbx_cursor_put(&mc, &key, &data, MDB_CURRENT); + mdbx_tassert( + txn, cleanup_idx == + (txn->mt_lifo_reclaimed ? 
txn->mt_lifo_reclaimed[0] : 0)); + mop[0] = save; + if (unlikely(rc || (mop_len -= len) == 0)) + goto bailout; + + if (!lifo) { + rc = mdbx_cursor_next(&mc, &key, &data, MDB_NEXT); + if (unlikely(rc)) + goto bailout; + } + } + } + +bailout: + if (txn->mt_lifo_reclaimed) { + mdbx_tassert(txn, rc || cleanup_idx == txn->mt_lifo_reclaimed[0]); + if (rc == 0 && cleanup_idx != txn->mt_lifo_reclaimed[0]) { + mdbx_tassert(txn, cleanup_idx < txn->mt_lifo_reclaimed[0]); + /* LY: zeroed cleanup_idx to force cleanup & refill created freeDB + * records. */ + cleanup_idx = 0; + /* LY: restart filling */ + refill_idx = total_room = head_room = 0; + more = 1; + goto again; + } + txn->mt_lifo_reclaimed[0] = 0; + if (txn != env->me_txn0) { + mdbx_midl_free(txn->mt_lifo_reclaimed); + txn->mt_lifo_reclaimed = NULL; + } + } + + return rc; +} + +/** Flush (some) dirty pages to the map, after clearing their dirty flag. + * @param[in] txn the transaction that's being committed + * @param[in] keep number of initial pages in dirty_list to keep dirty. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_page_flush(MDB_txn *txn, int keep) { + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned psize = env->me_psize, j; + int i, pagecount = dl[0].mid, rc; + size_t size = 0, pos = 0; + pgno_t pgno = 0; + MDB_page *dp = NULL; + struct iovec iov[MDB_COMMIT_PAGES]; + ssize_t wpos = 0, wsize = 0, wres; + size_t next_pos = 1; /* impossible pos, so pos != next_pos */ + int n = 0; + + j = i = keep; + + if (env->me_flags & MDB_WRITEMAP) { + /* Clear dirty flags */ + while (++i <= pagecount) { + dp = dl[i].mptr; + /* Don't flush this page yet */ + if (dp->mp_flags & (P_LOOSE | P_KEEP)) { + dp->mp_flags &= ~P_KEEP; + dl[++j] = dl[i]; + continue; + } + dp->mp_flags &= ~P_DIRTY; + env->me_sync_pending += IS_OVERFLOW(dp) ? 
psize * dp->mp_pages : psize; + } + goto done; + } + + /* Write the pages */ + for (;;) { + if (++i <= pagecount) { + dp = dl[i].mptr; + /* Don't flush this page yet */ + if (dp->mp_flags & (P_LOOSE | P_KEEP)) { + dp->mp_flags &= ~P_KEEP; + dl[i].mid = 0; + continue; + } + pgno = dl[i].mid; + /* clear dirty flag */ + dp->mp_flags &= ~P_DIRTY; + pos = pgno * psize; + size = psize; + if (IS_OVERFLOW(dp)) + size *= dp->mp_pages; + env->me_sync_pending += size; + } + /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ + if (pos != next_pos || n == MDB_COMMIT_PAGES || wsize + size > MAX_WRITE) { + if (n) { + retry: + /* Write previous page(s) */ + wres = pwritev(env->me_fd, iov, n, wpos); + if (unlikely(wres != wsize)) { + if (wres < 0) { + rc = errno; + if (rc == EINTR) + goto retry; + mdbx_debug("Write error: %s", strerror(rc)); + } else { + rc = EIO; /* TODO: Use which error code? */ + mdbx_debug("short write, filesystem full?"); + } + return rc; + } + n = 0; + } + if (i > pagecount) + break; + wpos = pos; + wsize = 0; + } + mdbx_debug("committing page %zu", pgno); + next_pos = pos + size; + iov[n].iov_len = size; + iov[n].iov_base = (char *)dp; + wsize += size; + n++; + } + + mdbx_invalidate_cache(env->me_map, txn->mt_next_pgno * env->me_psize); + + for (i = keep; ++i <= pagecount;) { + dp = dl[i].mptr; + /* This is a page we skipped above */ + if (!dl[i].mid) { + dl[++j] = dl[i]; + dl[j].mid = dp->mp_pgno; + continue; + } + mdbx_dpage_free(env, dp); + } + +done: + i--; + txn->mt_dirty_room += i - j; + dl[0].mid = j; + return MDB_SUCCESS; +} + +int mdbx_txn_commit(MDB_txn *txn) { + int rc; + unsigned i, end_mode; + MDB_env *env; + + if (unlikely(txn == NULL)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(txn->mt_env->me_pid != getpid())) { + txn->mt_env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + + /* mdbx_txn_end() mode for a commit which writes nothing */ + end_mode = + 
MDB_END_EMPTY_COMMIT | MDB_END_UPDATE | MDB_END_SLOT | MDB_END_FREE; + + if (txn->mt_child) { + rc = mdbx_txn_commit(txn->mt_child); + txn->mt_child = NULL; + if (unlikely(rc != MDB_SUCCESS)) + goto fail; + } + + env = txn->mt_env; + + if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) { + goto done; + } + + if (unlikely(txn->mt_flags & (MDB_TXN_FINISHED | MDB_TXN_ERROR))) { + mdbx_debug("error flag is set, can't commit"); + if (txn->mt_parent) + txn->mt_parent->mt_flags |= MDB_TXN_ERROR; + rc = MDB_BAD_TXN; + goto fail; + } + + if (txn->mt_parent) { + MDB_txn *parent = txn->mt_parent; + MDB_page **lp; + MDB_ID2L dst, src; + MDB_IDL pspill; + unsigned x, y, len, ps_len; + + /* Append our reclaim list to parent's */ + if (txn->mt_lifo_reclaimed) { + if (parent->mt_lifo_reclaimed) { + rc = mdbx_midl_append_list(&parent->mt_lifo_reclaimed, + txn->mt_lifo_reclaimed); + if (unlikely(rc != MDB_SUCCESS)) + goto fail; + mdbx_midl_free(txn->mt_lifo_reclaimed); + } else + parent->mt_lifo_reclaimed = txn->mt_lifo_reclaimed; + txn->mt_lifo_reclaimed = NULL; + } + + /* Append our free list to parent's */ + rc = mdbx_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); + if (unlikely(rc != MDB_SUCCESS)) + goto fail; + mdbx_midl_free(txn->mt_free_pgs); + /* Failures after this must either undo the changes + * to the parent or set MDB_TXN_ERROR in the parent. */ + + parent->mt_next_pgno = txn->mt_next_pgno; + parent->mt_flags = txn->mt_flags; + + /* Merge our cursors into parent's and close them */ + mdbx_cursors_eot(txn, 1); + + /* Update parent's DB table. 
*/ + memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + parent->mt_numdbs = txn->mt_numdbs; + parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; + parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; + for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + /* preserve parent's DB_NEW status */ + x = parent->mt_dbflags[i] & DB_NEW; + parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; + } + + dst = parent->mt_u.dirty_list; + src = txn->mt_u.dirty_list; + /* Remove anything in our dirty list from parent's spill list */ + if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { + x = y = ps_len; + pspill[0] = (pgno_t)-1; + /* Mark our dirty pages as deleted in parent spill list */ + for (i = 0, len = src[0].mid; ++i <= len;) { + MDB_ID pn = src[i].mid << 1; + while (pn > pspill[x]) + x--; + if (pn == pspill[x]) { + pspill[x] = 1; + y = --x; + } + } + /* Squash deleted pagenums if we deleted any */ + for (x = y; ++x <= ps_len;) + if (!(pspill[x] & 1)) + pspill[++y] = pspill[x]; + pspill[0] = y; + } + + /* Remove anything in our spill list from parent's dirty list */ + if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { + for (i = 1; i <= txn->mt_spill_pgs[0]; i++) { + MDB_ID pn = txn->mt_spill_pgs[i]; + if (pn & 1) + continue; /* deleted spillpg */ + pn >>= 1; + y = mdbx_mid2l_search(dst, pn); + if (y <= dst[0].mid && dst[y].mid == pn) { + free(dst[y].mptr); + while (y < dst[0].mid) { + dst[y] = dst[y + 1]; + y++; + } + dst[0].mid--; + } + } + } + + /* Find len = length of merging our dirty list with parent's */ + x = dst[0].mid; + dst[0].mid = 0; /* simplify loops */ + if (parent->mt_parent) { + len = x + src[0].mid; + y = mdbx_mid2l_search(src, dst[x].mid + 1) - 1; + for (i = x; y && i; y--) { + pgno_t yp = src[y].mid; + while (yp < dst[i].mid) + i--; + if (yp == dst[i].mid) { + i--; + len--; + } + } + } else { /* Simplify the above for single-ancestor case */ + len = MDB_IDL_UM_MAX - txn->mt_dirty_room; + } + /* Merge our dirty list with 
parent's */ + y = src[0].mid; + for (i = len; y; dst[i--] = src[y--]) { + pgno_t yp = src[y].mid; + while (yp < dst[x].mid) + dst[i--] = dst[x--]; + if (yp == dst[x].mid) + free(dst[x--].mptr); + } + mdbx_tassert(txn, i == x); + dst[0].mid = len; + free(txn->mt_u.dirty_list); + parent->mt_dirty_room = txn->mt_dirty_room; + if (txn->mt_spill_pgs) { + if (parent->mt_spill_pgs) { + /* TODO: Prevent failure here, so parent does not fail */ + rc = mdbx_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); + if (unlikely(rc != MDB_SUCCESS)) + parent->mt_flags |= MDB_TXN_ERROR; + mdbx_midl_free(txn->mt_spill_pgs); + mdbx_midl_sort(parent->mt_spill_pgs); + } else { + parent->mt_spill_pgs = txn->mt_spill_pgs; + } + } + + /* Append our loose page list to parent's */ + for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) + ; + *lp = txn->mt_loose_pgs; + parent->mt_loose_count += txn->mt_loose_count; + + parent->mt_child = NULL; + mdbx_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); + txn->mt_signature = 0; + free(txn); + return rc; + } + + env = txn->mt_env; + if (unlikely(txn != env->me_txn)) { + mdbx_debug("attempt to commit unknown transaction"); + rc = EINVAL; + goto fail; + } + + mdbx_cursors_eot(txn, 0); + + if (!txn->mt_u.dirty_list[0].mid && + !(txn->mt_flags & (MDB_TXN_DIRTY | MDB_TXN_SPILLS))) + goto done; + + mdbx_debug("committing txn %zu %p on mdbenv %p, root page %zu", txn->mt_txnid, + (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); + + /* Update DB root pointers */ + if (txn->mt_numdbs > CORE_DBS) { + MDB_cursor mc; + MDB_dbi i; + MDB_val data; + data.mv_size = sizeof(MDB_db); + + mdbx_cursor_init(&mc, txn, MAIN_DBI, NULL); + for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + if (unlikely(TXN_DBI_CHANGED(txn, i))) { + rc = MDB_BAD_DBI; + goto fail; + } + data.mv_data = &txn->mt_dbs[i]; + rc = mdbx_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, F_SUBDATA); + if (unlikely(rc != MDB_SUCCESS)) 
+ goto fail; + } + } + } + + rc = mdbx_freelist_save(txn); + if (unlikely(rc != MDB_SUCCESS)) + goto fail; + + mdbx_midl_free(env->me_pghead); + env->me_pghead = NULL; + mdbx_midl_shrink(&txn->mt_free_pgs); + + if (mdbx_audit_enabled()) + mdbx_audit(txn); + + rc = mdbx_page_flush(txn, 0); + if (likely(rc == MDB_SUCCESS)) { + MDB_meta meta; + + meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + meta.mm_last_pg = txn->mt_next_pgno - 1; + meta.mm_txnid = txn->mt_txnid; +#if MDBX_MODE_ENABLED + meta.mm_canary = txn->mt_canary; +#endif + + rc = mdbx_env_sync0(env, env->me_flags | txn->mt_flags, &meta); + } + if (unlikely(rc != MDB_SUCCESS)) + goto fail; + end_mode = MDB_END_COMMITTED | MDB_END_UPDATE; + +done: + return mdbx_txn_end(txn, end_mode); + +fail: + mdbx_txn_abort(txn); + return rc; +} + +/** Read the environment parameters of a DB environment before + * mapping it into memory. + * @param[in] env the environment handle + * @param[out] meta address of where to store the meta information + * @return 0 on success, non-zero on failure. + */ +static int __cold mdbx_env_read_header(MDB_env *env, MDB_meta *meta) { + MDB_metabuf pbuf; + MDB_page *p; + MDB_meta *m; + int i, rc, off; + enum { Size = sizeof(pbuf) }; + + /* We don't know the page size yet, so use a minimum value. + * Read both meta pages so we can use the latest one. + */ + + meta->mm_datasync_sign = MDB_DATASIGN_WEAK; + meta->mm_txnid = 0; + for (i = off = 0; i < NUM_METAS; i++, off += meta->mm_psize) { + rc = pread(env->me_fd, &pbuf, Size, off); + if (rc != Size) { + if (rc == 0 && off == 0) + return ENOENT; + rc = rc < 0 ? 
(int)errno : MDB_INVALID; + mdbx_debug("read: %s", mdbx_strerror(rc)); + return rc; + } + + p = (MDB_page *)&pbuf; + + if (!F_ISSET(p->mp_flags, P_META)) { + mdbx_debug("page %zu not a meta page", p->mp_pgno); + return MDB_INVALID; + } + + m = PAGEDATA(p); + if (m->mm_magic != MDB_MAGIC) { + mdbx_debug("meta has invalid magic"); + return MDB_INVALID; + } + + if (m->mm_version != MDB_DATA_VERSION) { + mdbx_debug("database is version %u, expected version %u", m->mm_version, + MDB_DATA_VERSION); + return MDB_VERSION_MISMATCH; + } + + if (m->mm_datasync_sign > MDB_DATASIGN_WEAK && + m->mm_datasync_sign != mdbx_meta_sign(m)) + continue; + + if (mdbx_meta_lt(meta, m)) + *meta = *m; + } + + if (meta->mm_datasync_sign == MDB_DATASIGN_WEAK) + /* LY: Both meta-pages are weak. */ + return MDB_CORRUPTED; + + return MDB_SUCCESS; +} + +/** Fill in most of the zeroed #MDB_meta for an empty database environment */ +static void __cold mdbx_env_init_meta0(MDB_env *env, MDB_meta *meta) { + meta->mm_magic = MDB_MAGIC; + meta->mm_version = MDB_DATA_VERSION; + meta->mm_mapsize = env->me_mapsize; + meta->mm_psize = env->me_psize; + meta->mm_last_pg = NUM_METAS - 1; + meta->mm_flags = env->me_flags & 0xffff; + meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ + meta->mm_dbs[FREE_DBI].md_root = P_INVALID; + meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; + meta->mm_datasync_sign = mdbx_meta_sign(meta); +} + +/** Write the environment parameters of a freshly created DB environment. + * @param[in] env the environment handle + * @param[in] meta the #MDB_meta to write + * @return 0 on success, non-zero on failure. 
+ */ +static int __cold mdbx_env_init_meta(MDB_env *env, MDB_meta *meta) { + MDB_page *p, *q; + int rc; + unsigned psize; + int len; + + mdbx_debug("writing new meta page"); + + psize = env->me_psize; + + p = calloc(NUM_METAS, psize); + if (!p) + return ENOMEM; + p->mp_pgno = 0; + p->mp_flags = P_META; + *(MDB_meta *)PAGEDATA(p) = *meta; + + q = (MDB_page *)((char *)p + psize); + q->mp_pgno = 1; + q->mp_flags = P_META; + *(MDB_meta *)PAGEDATA(q) = *meta; + + do + len = pwrite(env->me_fd, p, psize * NUM_METAS, 0); + while (len == -1 && errno == EINTR); + + if (len < 0) + rc = errno; + else if ((unsigned)len == psize * NUM_METAS) + rc = MDB_SUCCESS; + else + rc = ENOSPC; + free(p); + return rc; +} + +static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { + int rc; + MDB_meta *head = mdbx_meta_head_w(env); + size_t prev_mapsize = head->mm_mapsize; + size_t used_size = env->me_psize * (pending->mm_last_pg + 1); + + mdbx_assert(env, pending != METAPAGE_1(env) && pending != METAPAGE_2(env)); + mdbx_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0); + mdbx_assert(env, META_IS_WEAK(head) || env->me_sync_pending != 0 || + env->me_mapsize != prev_mapsize); + + pending->mm_mapsize = env->me_mapsize; + mdbx_assert(env, pending->mm_mapsize >= used_size); + if (unlikely(pending->mm_mapsize != prev_mapsize)) { + if (pending->mm_mapsize < prev_mapsize) { + /* LY: currently this can't happen, but force full-sync. */ + flags &= MDB_WRITEMAP; + } else { + /* Persist any increases of mapsize config */ + } + } + + if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold) + flags &= MDB_WRITEMAP; + + /* LY: step#1 - sync previously written/updated data-pages */ + if (env->me_sync_pending && (flags & MDB_NOSYNC) == 0) { + if (env->me_flags & MDB_WRITEMAP) { + int mode = (flags & MDB_MAPASYNC) ? 
MS_ASYNC : MS_SYNC; + if (unlikely(msync(env->me_map, used_size, mode))) { + rc = errno; + /* LY: msync() should never return EINTR */ + goto fail; + } + if ((flags & MDB_MAPASYNC) == 0) + env->me_sync_pending = 0; + } else { + int (*flush)(int fd) = fdatasync; + if (unlikely(prev_mapsize != pending->mm_mapsize)) { + /* LY: It is no reason to use fdatasync() here, even in case + * no such bug in a kernel. Because "no-bug" mean that a kernel + * internally do nearly the same, e.g. fdatasync() == fsync() + * when no-kernel-bug and file size was changed. + * + * So, this code is always safe and without appreciable + * performance degradation. + * + * For more info about of a corresponding fdatasync() bug + * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ + flush = fsync; + } + while (unlikely(flush(env->me_fd) < 0)) { + rc = errno; + if (rc != EINTR) + goto fail; + } + env->me_sync_pending = 0; + } + } + + /* LY: step#2 - update meta-page. */ + if (env->me_sync_pending == 0) { + pending->mm_datasync_sign = mdbx_meta_sign(pending); + } else { + pending->mm_datasync_sign = + (flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC + ? MDB_DATASIGN_NONE + : MDB_DATASIGN_WEAK; + } + + volatile MDB_meta *target = + (pending->mm_txnid == head->mm_txnid || META_IS_WEAK(head)) + ? head + : mdbx_env_meta_flipflop(env, head); + off_t offset = (char *)target - env->me_map; + + MDB_meta *stay = mdbx_env_meta_flipflop(env, (MDB_meta *)target); + mdbx_debug( + "writing meta %d (%s, was %zu/%s, stay %s %zu/%s), root %zu, " + "txn_id %zu, %s", + offset >= env->me_psize, target == head ? "head" : "tail", + target->mm_txnid, + META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" + : "Legacy", + stay == head ? "head" : "tail", stay->mm_txnid, + META_IS_WEAK(stay) ? "Weak" : META_IS_STEADY(stay) ? "Steady" : "Legacy", + pending->mm_dbs[MAIN_DBI].md_root, pending->mm_txnid, + META_IS_WEAK(pending) ? "Weak" : META_IS_STEADY(pending) ? 
"Steady" + : "Legacy"); + + if (env->me_flags & MDB_WRITEMAP) { +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + /* LY: 'invalidate' the meta, + * but mdbx_meta_head_r() will be confused/retired in collision case. */ + target->mm_datasync_sign = MDB_DATASIGN_WEAK; + target->mm_txnid = 0; + /* LY: update info */ + target->mm_mapsize = pending->mm_mapsize; + target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; + target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; + target->mm_last_pg = pending->mm_last_pg; +#if MDBX_MODE_ENABLED + target->mm_canary = pending->mm_canary; +#endif + /* LY: 'commit' the meta */ + target->mm_txnid = pending->mm_txnid; + target->mm_datasync_sign = pending->mm_datasync_sign; + } else { + pending->mm_magic = MDB_MAGIC; + pending->mm_version = MDB_DATA_VERSION; + pending->mm_address = head->mm_address; + retry: + rc = pwrite(env->me_fd, pending, sizeof(MDB_meta), offset); + if (unlikely(rc != sizeof(MDB_meta))) { + rc = (rc < 0) ? errno : EIO; + if (rc == EINTR) + goto retry; + + undo: + mdbx_debug("write failed, disk error?"); + /* On a failure, the pagecache still contains the new data. + * Write some old data back, to prevent it from being used. */ + if (pwrite(env->me_fd, (void *)target, sizeof(MDB_meta), offset) == + sizeof(MDB_meta)) { + /* LY: take a chance, if write succeeds at a magic ;) */ + goto retry; + } + goto fail; + } + mdbx_invalidate_cache(env->me_map + offset, sizeof(MDB_meta)); +#ifdef __SANITIZE_THREAD__ + pthread_mutex_lock(&tsan_mutex); +#endif + } + + /* Memory ordering issues are irrelevant; since the entire writer + * is wrapped by wmutex, all of these changes will become visible + * after the wmutex is unlocked. Since the DB is multi-version, + * readers will get consistent data regardless of how fresh or + * how stale their view of these values is. 
+ */ + env->me_txns->mti_txnid = pending->mm_txnid; +#ifdef __SANITIZE_THREAD__ + pthread_mutex_unlock(&tsan_mutex); +#endif + + /* LY: step#3 - sync meta-pages. */ + if ((flags & (MDB_NOSYNC | MDB_NOMETASYNC)) == 0) { + if (env->me_flags & MDB_WRITEMAP) { + char *ptr = env->me_map + (offset & ~(env->me_os_psize - 1)); + int mode = (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; + if (unlikely(msync(ptr, env->me_os_psize, mode) < 0)) { + rc = errno; + goto fail; + } + } else { + while (unlikely(fdatasync(env->me_fd) < 0)) { + rc = errno; + if (rc != EINTR) + goto undo; + } + } + } + + /* LY: currently this can't happen, but... */ + if (unlikely(pending->mm_mapsize < prev_mapsize)) { + mdbx_assert(env, pending->mm_mapsize == env->me_mapsize); + if (unlikely(mremap(env->me_map, prev_mapsize, pending->mm_mapsize, + MREMAP_FIXED, pending->mm_address) == MAP_FAILED)) { + rc = errno; + goto fail; + } + if (unlikely(ftruncate(env->me_fd, pending->mm_mapsize) < 0)) { + rc = errno; + goto fail; + } + } + + return MDB_SUCCESS; + +fail: + env->me_flags |= MDB_FATAL_ERROR; + return rc; +} + +int __cold mdbx_env_create(MDB_env **env) { + MDB_env *e; + + e = calloc(1, sizeof(MDB_env)); + if (!e) + return ENOMEM; + + e->me_maxreaders = DEFAULT_READERS; + e->me_maxdbs = e->me_numdbs = CORE_DBS; + e->me_fd = INVALID_HANDLE_VALUE; + e->me_lfd = INVALID_HANDLE_VALUE; + e->me_pid = getpid(); + GET_PAGESIZE(e->me_os_psize); + VALGRIND_CREATE_MEMPOOL(e, 0, 0); + e->me_signature = MDBX_ME_SIGNATURE; + *env = e; + return MDB_SUCCESS; +} + +static int __cold mdbx_env_map(MDB_env *env, void *addr, size_t usedsize) { + unsigned flags = env->me_flags; + + int prot = PROT_READ; + if (flags & MDB_WRITEMAP) { + prot |= PROT_WRITE; + if (ftruncate(env->me_fd, env->me_mapsize) < 0) + return errno; + } + + env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, env->me_fd, 0); + if (env->me_map == MAP_FAILED) { + env->me_map = NULL; + return errno; + } + + /* Can happen because the address 
argument to mmap() is just a + * hint. mmap() can pick another, e.g. if the range is in use. + * The MAP_FIXED flag would prevent that, but then mmap could + * instead unmap existing pages to make room for the new map. + */ + if (addr && env->me_map != addr) { + errno = 0; /* LY: clean errno as a hit for this case */ + return EBUSY; /* TODO: Make a new MDB_* error code? */ + } + + if (madvise(env->me_map, env->me_mapsize, MADV_DONTFORK)) + return errno; + +#ifdef MADV_NOHUGEPAGE + (void)madvise(env->me_map, env->me_mapsize, MADV_NOHUGEPAGE); +#endif + +#ifdef MADV_DONTDUMP + if (!(flags & MDBX_PAGEPERTURB)) { + (void)madvise(env->me_map, env->me_mapsize, MADV_DONTDUMP); + } +#endif + +#ifdef MADV_REMOVE + if (flags & MDB_WRITEMAP) { + (void)madvise(env->me_map + usedsize, env->me_mapsize - usedsize, + MADV_REMOVE); + } +#endif + + /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ + if (madvise(env->me_map, env->me_mapsize, + (flags & MDB_NORDAHEAD) ? MADV_RANDOM : MADV_WILLNEED)) + return errno; + + /* Lock meta pages to avoid unexpected write, + * before the data pages would be synchronized. */ + if ((flags & MDB_WRITEMAP) && mlock(env->me_map, env->me_psize * 2)) + return errno; + +#ifdef USE_VALGRIND + env->me_valgrind_handle = + VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "lmdb"); +#endif + + return MDB_SUCCESS; +} + +int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { + if (unlikely(!env)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(size < env->me_psize * 8)) + return EINVAL; + + /* If env is already open, caller is responsible for making + * sure there are no active txns. 
+ */ + if (env->me_map) { + int rc; + MDB_meta *meta; + void *old; + if (env->me_txn) + return EINVAL; + meta = mdbx_meta_head_w(env); + if (!size) + size = meta->mm_mapsize; + /* Silently round up to minimum if the size is too small */ + const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; + if (size < usedsize) + size = usedsize; + munmap(env->me_map, env->me_mapsize); +#ifdef USE_VALGRIND + VALGRIND_DISCARD(env->me_valgrind_handle); + env->me_valgrind_handle = -1; +#endif + env->me_mapsize = size; + old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; + rc = mdbx_env_map(env, old, usedsize); + if (rc) + return rc; + } + env->me_mapsize = size; + if (env->me_psize) + env->me_maxpg = env->me_mapsize / env->me_psize; + return MDB_SUCCESS; +} + +int __cold mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { + if (unlikely(!env)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(env->me_map)) + return EINVAL; + + env->me_maxdbs = dbs + CORE_DBS; + return MDB_SUCCESS; +} + +int __cold mdbx_env_set_maxreaders(MDB_env *env, unsigned readers) { + if (unlikely(!env || readers < 1)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(env->me_map)) + return EINVAL; + + env->me_maxreaders = readers; + return MDB_SUCCESS; +} + +int __cold mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers) { + if (!env || !readers) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + *readers = env->me_maxreaders; + return MDB_SUCCESS; +} + +static int __cold mdbx_fsize(HANDLE fd, size_t *size) { + struct stat st; + + if (fstat(fd, &st)) + return errno; + + *size = st.st_size; + return MDB_SUCCESS; +} + +/** Further setup required for opening an LMDB environment + */ +static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { + unsigned flags = env->me_flags; + int i, 
newenv = 0, rc; + + if ((i = mdbx_env_read_header(env, meta)) != 0) { + if (i != ENOENT) + return i; + mdbx_debug("new mdbenv"); + newenv = 1; + env->me_psize = env->me_os_psize; + if (env->me_psize > MAX_PAGESIZE) + env->me_psize = MAX_PAGESIZE; + memset(meta, 0, sizeof(*meta)); + mdbx_env_init_meta0(env, meta); + meta->mm_mapsize = DEFAULT_MAPSIZE; + } else { + env->me_psize = meta->mm_psize; + } + + /* Was a mapsize configured? */ + if (!env->me_mapsize) { + env->me_mapsize = meta->mm_mapsize; + } + { + /* Make sure mapsize >= committed data size. Even when using + * mm_mapsize, which could be broken in old files (ITS#7789). + */ + size_t minsize = (meta->mm_last_pg + 1) * meta->mm_psize; + if (env->me_mapsize < minsize) + env->me_mapsize = minsize; + } + meta->mm_mapsize = env->me_mapsize; + + if (newenv && !(flags & MDB_FIXEDMAP)) { + /* mdbx_env_map() may grow the datafile. Write the metapages + * first, so the file will be valid if initialization fails. + * Except with FIXEDMAP, since we do not yet know mm_address. + * We could fill in mm_address later, but then a different + * program might end up doing that - one with a memory layout + * and map address which does not suit the main program. + */ + rc = mdbx_env_init_meta(env, meta); + if (rc) + return rc; + newenv = 0; + } + + const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; + rc = mdbx_env_map(env, (flags & MDB_FIXEDMAP) ? 
meta->mm_address : NULL, + usedsize); + if (rc) + return rc; + + if (newenv) { + if (flags & MDB_FIXEDMAP) + meta->mm_address = env->me_map; + i = mdbx_env_init_meta(env, meta); + if (i != MDB_SUCCESS) { + return i; + } + } + + env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + env->me_nodemax = + (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) - sizeof(indx_t); + env->me_maxkey_limit = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); + env->me_maxpg = env->me_mapsize / env->me_psize; + + if (MDB_MAXKEYSIZE > env->me_maxkey_limit) + return MDB_BAD_VALSIZE; + + return MDB_SUCCESS; +} + +/****************************************************************************/ + +#ifndef MDBX_USE_THREAD_ATEXIT +#if __GLIBC_PREREQ(2, 18) +#define MDBX_USE_THREAD_ATEXIT 1 +#else +#define MDBX_USE_THREAD_ATEXIT 0 +#endif +#endif + +static pthread_mutex_t mdbx_rthc_mutex = PTHREAD_MUTEX_INITIALIZER; +static MDBX_rthc *mdbx_rthc_list; +static pthread_key_t mdbx_pthread_crutch_key; + +static __inline void mdbx_rthc_lock(void) { + mdbx_ensure(NULL, pthread_mutex_lock(&mdbx_rthc_mutex) == 0); +} + +static __inline void mdbx_rthc_unlock(void) { + mdbx_ensure(NULL, pthread_mutex_unlock(&mdbx_rthc_mutex) == 0); +} + +/** Release a reader thread's slot in the reader lock table. + * This function is called automatically when a thread exits. + * @param[in] ptr This points to the MDB_rthc of a slot in the reader lock + *table. + */ +static __cold void mdbx_rthc_dtor(void) { + /* LY: Основная задача этого деструктора была и есть в освобождении + * слота таблицы читателей при завершении треда, но тут есть пара + * не очевидных сложностей: + * - Таблица читателей располагается в разделяемой памяти, поэтому + * во избежание segfault деструктор не должен что-либо делать после + * или одновременно с mdbx_env_close(). + * - Действительно, mdbx_env_close() вызовет pthread_key_delete() и + * после этого glibc не будет вызывать деструктор. 
+ * - ОДНАКО, это никак не решает проблему гонок между mdbx_env_close() + * и завершающимися тредами. Грубо говоря, при старте mdbx_env_close() + * деструктор уже может выполняться в некоторых тредах, и завершиться + * эти выполнения могут во время или после окончания mdbx_env_close(). + * - БОЛЕЕ ТОГО, схожая проблема возникает при выгрузке dso/dll, + * так как в текущей glibc (2.24) подсистема ld.so ничего не знает о + * TSD-деструкторах и поэтому может выгрузить lib.so до того как + * отработали все деструкторы. + * - Исходное проявление проблемы было зафиксировано + * в https://github.com/ReOpen/ReOpenLDAP/issues/48 + * + * Предыдущее решение посредством выделяемого динамически MDB_rthc + * было не удачным, так как порождало либо утечку памяти, + * либо вероятностное обращение к уже освобожденной памяти + * из этого деструктора. + * + * Текущее решение достаточно "развесисто", но решает все описанные выше + * проблемы без пенальти по производительности. + */ + + mdbx_rthc_lock(); + + pid_t pid = getpid(); + pthread_t thread = pthread_self(); + for (MDBX_rthc **ref = &mdbx_rthc_list; *ref;) { + MDBX_rthc *rthc = *ref; + if (rthc->rc_thread == thread) { + if (rthc->rc_reader && rthc->rc_reader->mr_pid == pid) { + rthc->rc_reader->mr_pid = 0; + mdbx_coherent_barrier(); + } + *ref = rthc->rc_next; + free(rthc); + } else { + ref = &(*ref)->rc_next; + } + } + + mdbx_rthc_unlock(); +} + +#if MDBX_USE_THREAD_ATEXIT + +extern void *__dso_handle __attribute__((__weak__)); +extern int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj, + void *dso_symbol); + +static __cold void mdbx_rthc__thread_atexit(void *ptr) { + mdbx_ensure(NULL, ptr == pthread_getspecific(mdbx_pthread_crutch_key)); + mdbx_ensure(NULL, pthread_setspecific(mdbx_pthread_crutch_key, NULL) == 0); + mdbx_rthc_dtor(); +} + +static __attribute__((constructor)) __cold void mdbx_pthread_crutch_ctor(void) { + mdbx_ensure(NULL, pthread_key_create(&mdbx_pthread_crutch_key, NULL) == 0); +} + +#else /* 
MDBX_USE_THREAD_ATEXIT */ + +static __cold void mdbx_rthc__thread_key_dtor(void *ptr) { + (void)ptr; + if (mdbx_pthread_crutch_key != (pthread_key_t)-1) + mdbx_rthc_dtor(); +} + +static __attribute__((constructor)) __cold void mdbx_pthread_crutch_ctor(void) { + mdbx_ensure(NULL, pthread_key_create(&mdbx_pthread_crutch_key, + mdbx_rthc__thread_key_dtor) == 0); +} + +static __attribute__((destructor)) __cold void mdbx_pthread_crutch_dtor(void) { + pthread_key_delete(mdbx_pthread_crutch_key); + mdbx_pthread_crutch_key = -1; + + /* LY: Из-за race condition в pthread_key_delete() + * деструкторы уже могли начать выполняться. + * Уступая квант времени сразу после удаления ключа + * мы даем им шанс завершиться. */ + pthread_yield(); + + mdbx_rthc_lock(); + pid_t pid = getpid(); + while (mdbx_rthc_list != NULL) { + MDBX_rthc *rthc = mdbx_rthc_list; + mdbx_rthc_list = mdbx_rthc_list->rc_next; + if (rthc->rc_reader && rthc->rc_reader->mr_pid == pid) { + rthc->rc_reader->mr_pid = 0; + mdbx_coherent_barrier(); + } + free(rthc); + + /* LY: Каждый неудаленный элемент списка - это один + * не отработавший деструктор и потенциальный + * шанс получить segfault после выгрузки lib.so + * Поэтому на каждой итерации уступаем квант времени, + * в надежде что деструкторы успеют отработать. */ + mdbx_rthc_unlock(); + pthread_yield(); + mdbx_rthc_lock(); + } + mdbx_rthc_unlock(); + pthread_yield(); +} +#endif /* MDBX_USE_THREAD_ATEXIT */ + +static __cold MDBX_rthc *mdbx_rthc_add(pthread_key_t key) { + MDBX_rthc *rthc = malloc(sizeof(MDBX_rthc)); + if (unlikely(rthc == NULL)) + goto bailout; + + rthc->rc_next = NULL; + rthc->rc_reader = NULL; + rthc->rc_thread = pthread_self(); + if (unlikely(pthread_setspecific(key, rthc) != 0)) + goto bailout_free; + + mdbx_rthc_lock(); + if (pthread_getspecific(mdbx_pthread_crutch_key) == NULL) { +#if MDBX_USE_THREAD_ATEXIT + void *dso_anchor = + (&__dso_handle && __dso_handle) ? 
__dso_handle : (void *)mdbx_version; + if (unlikely(__cxa_thread_atexit_impl(mdbx_rthc__thread_atexit, rthc, + dso_anchor) != 0)) { + mdbx_rthc_unlock(); + goto bailout_free; + } +#endif /* MDBX_USE_THREAD_ATEXIT */ + mdbx_ensure(NULL, pthread_setspecific(mdbx_pthread_crutch_key, rthc) == 0); + } + rthc->rc_next = mdbx_rthc_list; + mdbx_rthc_list = rthc; + mdbx_rthc_unlock(); + return rthc; + +bailout_free: + free(rthc); +bailout: + return NULL; +} + +static __inline MDBX_rthc *mdbx_rthc_get(pthread_key_t key) { + MDBX_rthc *rthc = pthread_getspecific(key); + if (likely(rthc != NULL)) + return rthc; + return mdbx_rthc_add(key); +} + +static __cold void mdbx_rthc_cleanup(MDB_env *env) { + mdbx_rthc_lock(); + + MDB_reader *begin = env->me_txns->mti_readers; + MDB_reader *end = begin + env->me_close_readers; + for (MDBX_rthc **ref = &mdbx_rthc_list; *ref;) { + MDBX_rthc *rthc = *ref; + if (rthc->rc_reader >= begin && rthc->rc_reader < end) { + if (rthc->rc_reader->mr_pid == env->me_pid) { + rthc->rc_reader->mr_pid = 0; + mdbx_coherent_barrier(); + } + *ref = rthc->rc_next; + free(rthc); + } else { + ref = &(*ref)->rc_next; + } + } + + mdbx_rthc_unlock(); +} + +/****************************************************************************/ + +/** Downgrade the exclusive lock on the region back to shared */ +static __cold int mdbx_env_share_locks(MDB_env *env, int *excl) { + struct flock lock_info; + int rc = 0; + + /* The shared lock replaces the existing lock */ + memset((void *)&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_RDLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = 0; + lock_info.l_len = 1; + while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && + (rc = errno) == EINTR) + ; + *excl = rc ? -1 : 0; /* error may mean we lost the lock */ + + return rc; +} + +/** Try to get exclusive lock, otherwise shared. + * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. 
+ */ +static int __cold mdbx_env_excl_lock(MDB_env *env, int *excl) { + int rc = 0; + struct flock lock_info; + + memset((void *)&lock_info, 0, sizeof(lock_info)); + lock_info.l_type = F_WRLCK; + lock_info.l_whence = SEEK_SET; + lock_info.l_start = 0; + lock_info.l_len = 1; + while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && + (rc = errno) == EINTR) + ; + if (!rc) { + *excl = 1; + } else { + lock_info.l_type = F_RDLCK; + while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && + (rc = errno) == EINTR) + ; + if (rc == 0) + *excl = 0; + } + return rc; +} + +#ifdef MDB_USE_HASH +/* + * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code + * + * @(#) $Revision: 5.1 $ + * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ + * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ + * + * http://www.isthe.com/chongo/tech/comp/fnv/index.html + * + *** + * + * Please do not copyright this code. This code is in the public domain. + * + * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO + * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + * + * By: + * chongo /\oo/\ + * http://www.isthe.com/chongo/ + * + * Share and Enjoy! :-) + */ + +typedef unsigned long long mdbx_hash_t; +#define MDB_HASH_INIT ((mdbx_hash_t)0xcbf29ce484222325ULL) + +/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer + * @param[in] val value to hash + * @param[in] hval initial value for hash + * @return 64 bit hash + * + * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the + * hval arg on the first call. 
+ */ +static mdbx_hash_t mdbx_hash_val(MDB_val *val, mdbx_hash_t hval) { + unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ + unsigned char *end = s + val->mv_size; + /* + * FNV-1a hash each octet of the string + */ + while (s < end) { + /* xor the bottom with the current octet */ + hval ^= (mdbx_hash_t)*s++; + + /* multiply by the 64 bit FNV magic prime mod 2^64 */ + hval += (hval << 1) + (hval << 4) + (hval << 5) + (hval << 7) + + (hval << 8) + (hval << 40); + } + /* return our new hash value */ + return hval; +} + +/** Hash the string and output the encoded hash. + * This uses modified RFC1924 Ascii85 encoding to accommodate systems with + * very short name limits. We don't care about the encoding being reversible, + * we just want to preserve as many bits of the input as possible in a + * small printable string. + * @param[in] str string to hash + * @param[out] encbuf an array of 11 chars to hold the hash + */ +static const char mdbx_a85[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij" + "klmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; + +static void __cold mdbx_pack85(unsigned long l, char *out) { + int i; + + for (i = 0; i < 5; i++) { + *out++ = mdbx_a85[l % 85]; + l /= 85; + } +} + +static void __cold mdbx_hash_enc(MDB_val *val, char *encbuf) { + mdbx_hash_t h = mdbx_hash_val(val, MDB_HASH_INIT); + + mdbx_pack85(h, encbuf); + mdbx_pack85(h >> 32, encbuf + 5); + encbuf[10] = '\0'; +} +#endif + +/** Open and/or initialize the lock region for the environment. + * @param[in] env The LMDB environment. + * @param[in] lpath The pathname of the file used for the lock region. + * @param[in] mode The Unix permissions for the file, if we create it. + * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive + * @return 0 on success, non-zero on failure. 
+ */ +static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, + int *excl) { + int fdflags; + int rc; + off_t size, rsize; + void *m; + + env->me_lfd = open(lpath, O_RDWR | O_CREAT | O_CLOEXEC, mode); + if (env->me_lfd == INVALID_HANDLE_VALUE) { + rc = errno; + if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { + return MDB_SUCCESS; + } + return rc; + } + + /* Lose record locks when exec*() */ + if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) + fcntl(env->me_lfd, F_SETFD, fdflags); + + if (!(env->me_flags & MDB_NOTLS)) { + rc = pthread_key_create(&env->me_txkey, NULL); + if (rc) + return rc; + env->me_flags |= MDB_ENV_TXKEY; + } + + /* Try to get exclusive lock. If we succeed, then + * nobody is using the lock region and we should initialize it. + */ + if ((rc = mdbx_env_excl_lock(env, excl))) + return rc; + + size = lseek(env->me_lfd, 0, SEEK_END); + if (size == -1) + return errno; + rsize = (env->me_maxreaders - 1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); + if (size < rsize && *excl > 0) { + if (ftruncate(env->me_lfd, rsize) != 0) + return errno; + } else { + rsize = size; + size = rsize - sizeof(MDB_txninfo); + env->me_maxreaders = size / sizeof(MDB_reader) + 1; + } + + m = mmap(NULL, rsize, PROT_READ | PROT_WRITE, MAP_SHARED, env->me_lfd, 0); + if (m == MAP_FAILED) + return errno; + env->me_txns = m; + +#ifdef MADV_NOHUGEPAGE + (void)madvise(env->me_txns, rsize, MADV_NOHUGEPAGE); +#endif + +#ifdef MADV_DODUMP + (void)madvise(env->me_txns, rsize, MADV_DODUMP); +#endif + + if (madvise(env->me_txns, rsize, MADV_DONTFORK) < 0) + return errno; + + if (madvise(env->me_txns, rsize, MADV_WILLNEED) < 0) + return errno; + + if (madvise(env->me_txns, rsize, MADV_RANDOM) < 0) + return errno; + + if (*excl > 0) { + /* Solaris needs this before initing a robust mutex. Otherwise + * it may skip the init and return EBUSY "seems someone already + * inited" or EINVAL "it was inited differently".
+ */ + memset(&env->me_txns->mti_rmutex, 0, sizeof(env->me_txns->mti_rmutex)); + memset(&env->me_txns->mti_wmutex, 0, sizeof(env->me_txns->mti_wmutex)); + + pthread_mutexattr_t mattr; + rc = pthread_mutexattr_init(&mattr); + if (rc) + return rc; + + rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); + +#if MDB_USE_ROBUST + if (!rc) + rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); +#endif /* MDB_USE_ROBUST */ + if (!rc) + rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr); + if (!rc) + rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr); + + pthread_mutexattr_destroy(&mattr); + if (rc) + return rc; + + env->me_txns->mti_magic = MDB_MAGIC; + env->me_txns->mti_format = MDB_LOCK_FORMAT; + env->me_txns->mti_txnid = ~0L; + env->me_txns->mti_numreaders = 0; + } else { + if (env->me_txns->mti_magic != MDB_MAGIC) { + mdbx_debug("lock region has invalid magic"); + return MDB_INVALID; + } + if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { + mdbx_debug("lock region has format+version 0x%x, expected 0x%x", + env->me_txns->mti_format, MDB_LOCK_FORMAT); + return MDB_VERSION_MISMATCH; + } + } + + return MDB_SUCCESS; +} + +/** The name of the lock file in the DB environment */ +#define LOCKNAME "/lock.mdb" +/** The name of the data file in the DB environment */ +#define DATANAME "/data.mdb" +/** The suffix of the lock file when no subdir is used */ +#define LOCKSUFF "-lock" +/** Only a subset of the @ref mdbx_env flags can be changed + * at runtime. Changing other flags requires closing the + * environment and re-opening it with the new flags. 
+ */ +#define CHANGEABLE \ + (MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC | MDB_NOMEMINIT | \ + MDBX_COALESCE | MDBX_PAGEPERTURB) +#define CHANGELESS \ + (MDB_FIXEDMAP | MDB_NOSUBDIR | MDB_RDONLY | MDB_WRITEMAP | MDB_NOTLS | \ + MDB_NORDAHEAD | MDBX_LIFORECLAIM) + +#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE | CHANGELESS) +#error "Persistent DB flags & env flags overlap, but both go in mm_flags" +#endif + +int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, + mode_t mode, int *exclusive) { + int oflags, rc, len, excl = -1; + char *lpath, *dpath; + + if (unlikely(!env || !path)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (env->me_fd != INVALID_HANDLE_VALUE || + (flags & ~(CHANGEABLE | CHANGELESS))) + return EINVAL; + + len = strlen(path); + if (flags & MDB_NOSUBDIR) { + rc = len + sizeof(LOCKSUFF) + len + 1; + } else { + rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); + } + lpath = malloc(rc); + if (!lpath) + return ENOMEM; + if (flags & MDB_NOSUBDIR) { + dpath = lpath + len + sizeof(LOCKSUFF); + sprintf(lpath, "%s" LOCKSUFF, path); + strcpy(dpath, path); + } else { + dpath = lpath + len + sizeof(LOCKNAME); + sprintf(lpath, "%s" LOCKNAME, path); + sprintf(dpath, "%s" DATANAME, path); + } + + rc = MDB_SUCCESS; + flags |= env->me_flags; + if (flags & MDB_RDONLY) { + /* LY: silently ignore irrelevant flags when we're only getting read + * access */ + flags &= ~(MDB_WRITEMAP | MDB_MAPASYNC | MDB_NOSYNC | MDB_NOMETASYNC | + MDBX_COALESCE | MDBX_LIFORECLAIM | MDB_NOMEMINIT); + } else { + if (!((env->me_free_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX)) && + (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) + rc = ENOMEM; + } + env->me_flags = flags |= MDB_ENV_ACTIVE; + if (rc) + goto leave; + + env->me_path = strdup(path); + env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); + env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); + env->me_dbiseqs = 
calloc(env->me_maxdbs, sizeof(unsigned)); + if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { + rc = ENOMEM; + goto leave; + } + env->me_dbxs[FREE_DBI].md_cmp = mdbx_cmp_int_ai; /* aligned MDB_INTEGERKEY */ + + /* For RDONLY, get lockfile after we know datafile exists */ + if (!(flags & MDB_RDONLY)) { + rc = mdbx_env_setup_locks(env, lpath, mode, &excl); + if (rc) + goto leave; + } + + if (F_ISSET(flags, MDB_RDONLY)) + oflags = O_RDONLY; + else + oflags = O_RDWR | O_CREAT; + + env->me_fd = open(dpath, oflags | O_CLOEXEC, mode); + if (env->me_fd == INVALID_HANDLE_VALUE) { + rc = errno; + goto leave; + } + + int fdflags; + if ((fdflags = fcntl(env->me_fd, F_GETFD) | FD_CLOEXEC) >= 0) + fcntl(env->me_fd, F_SETFD, fdflags); + + if (flags & MDB_RDONLY) { + rc = mdbx_env_setup_locks(env, lpath, mode, &excl); + if (rc) + goto leave; + } + + MDB_meta meta; + if ((rc = mdbx_env_open2(env, &meta)) == MDB_SUCCESS) { + mdbx_debug("opened dbenv %p", (void *)env); + if (excl > 0) { + env->me_txns->mti_txnid = meta.mm_txnid; + if (exclusive == NULL || *exclusive < 2) { + /* LY: downgrade lock only if exclusive access not requested. + * in case exclusive==1, just leave value as is. */ + rc = mdbx_env_share_locks(env, &excl); + if (rc) + goto leave; + } + } else if (exclusive) { + /* LY: just indicate that is not an exclusive access. 
*/ + *exclusive = 0; + } + if (!(flags & MDB_RDONLY)) { + MDB_txn *txn; + int tsize = sizeof(MDB_txn), + size = tsize + + env->me_maxdbs * (sizeof(MDB_db) + sizeof(MDB_cursor *) + + sizeof(unsigned) + 1); + if ((env->me_pbuf = calloc(1, env->me_psize)) && + (txn = calloc(1, size))) { + txn->mt_dbs = (MDB_db *)((char *)txn + tsize); + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); + txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); + txn->mt_env = env; + txn->mt_dbxs = env->me_dbxs; + txn->mt_flags = MDB_TXN_FINISHED; + env->me_txn0 = txn; + } else { + rc = ENOMEM; + } + } + } + +#if MDB_DEBUG + if (rc == MDB_SUCCESS) { + MDB_meta *meta = mdbx_meta_head_r(env); + MDB_db *db = &meta->mm_dbs[MAIN_DBI]; + int toggle = ((char *)meta == PAGEDATA(env->me_map)) ? 0 : 1; + + mdbx_debug("opened database version %u, pagesize %u", meta->mm_version, + env->me_psize); + mdbx_debug("using meta page %d, txn %zu", toggle, meta->mm_txnid); + mdbx_debug("depth: %u", db->md_depth); + mdbx_debug("entries: %zu", db->md_entries); + mdbx_debug("branch pages: %zu", db->md_branch_pages); + mdbx_debug("leaf pages: %zu", db->md_leaf_pages); + mdbx_debug("overflow pages: %zu", db->md_overflow_pages); + mdbx_debug("root: %zu", db->md_root); + } +#endif + +leave: + if (rc) + mdbx_env_close0(env); + free(lpath); + return rc; +} + +int __cold mdbx_env_open(MDB_env *env, const char *path, unsigned flags, + mode_t mode) { + return mdbx_env_open_ex(env, path, flags, mode, NULL); +} + +/** Destroy resources from mdbx_env_open(), clear our readers & DBIs */ +static void __cold mdbx_env_close0(MDB_env *env) { + int i; + + if (!(env->me_flags & MDB_ENV_ACTIVE)) + return; + env->me_flags &= ~MDB_ENV_ACTIVE; + + /* Doing this here since me_dbxs may not exist during mdbx_env_close */ + if (env->me_dbxs) { + for (i = env->me_maxdbs; --i >= CORE_DBS;) + free(env->me_dbxs[i].md_name.mv_data); + 
free(env->me_dbxs); + } + + free(env->me_pbuf); + free(env->me_dbiseqs); + free(env->me_dbflags); + free(env->me_path); + free(env->me_dirty_list); + if (env->me_txn0) + mdbx_midl_free(env->me_txn0->mt_lifo_reclaimed); + free(env->me_txn0); + mdbx_midl_free(env->me_free_pgs); + + if (env->me_flags & MDB_ENV_TXKEY) { + mdbx_ensure(env, pthread_key_delete(env->me_txkey) == 0); + env->me_flags &= ~MDB_ENV_TXKEY; + } + + if (env->me_map) { + munmap(env->me_map, env->me_mapsize); +#ifdef USE_VALGRIND + VALGRIND_DISCARD(env->me_valgrind_handle); + env->me_valgrind_handle = -1; +#endif + } + if (env->me_fd != INVALID_HANDLE_VALUE) + (void)close(env->me_fd); + + /* Clearing readers is done in this function because + * me_txkey with its destructor must be disabled first. + * + * We skip the the reader mutex, so we touch only + * data owned by this process (me_close_readers and + * our readers), and clear each reader atomically. + */ + if (env->me_pid == getpid()) + mdbx_rthc_cleanup(env); + + munmap((void *)env->me_txns, + (env->me_maxreaders - 1) * sizeof(MDB_reader) + sizeof(MDB_txninfo)); + env->me_txns = NULL; + env->me_pid = 0; + + if (env->me_lfd != INVALID_HANDLE_VALUE) { + (void)close(env->me_lfd); + } +} + +int __cold mdbx_env_close_ex(MDB_env *env, int dont_sync) { + MDB_page *dp; + int rc = MDB_SUCCESS; + + if (unlikely(!env)) + return EINVAL; + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (!dont_sync && env->me_txns) + rc = mdbx_env_sync(env, 1); + + VALGRIND_DESTROY_MEMPOOL(env); + while ((dp = env->me_dpages) != NULL) { + ASAN_UNPOISON_MEMORY_REGION(&dp->mp_next, sizeof(dp->mp_next)); + VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); + env->me_dpages = dp->mp_next; + free(dp); + } + + mdbx_env_close0(env); + env->me_signature = 0; + free(env); + + return rc; +} + +void __cold mdbx_env_close(MDB_env *env) { mdbx_env_close_ex(env, 0); } + +/* LY: fast enough on most arches + * + * / + * | -1, a < b + * 
cmp2int(a,b) = < 0, a == b + * | 1, a > b + * \ + */ +#if 1 +#define mdbx_cmp2int(a, b) (((b) > (a)) ? -1 : (a) > (b)) +#else +#define mdbx_cmp2int(a, b) (((a) > (b)) - ((b) > (a))) +#endif + +/** Compare two items pointing at aligned unsigned int's. */ +static int __hot mdbx_cmp_int_ai(const MDB_val *a, const MDB_val *b) { + mdbx_assert(NULL, a->mv_size == b->mv_size); + mdbx_assert(NULL, 0 == (uintptr_t)a->mv_data % sizeof(int) && + 0 == (uintptr_t)b->mv_data % sizeof(int)); + + if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) + return mdbx_cmp2int(*(size_t *)a->mv_data, *(size_t *)b->mv_data); + + mdbx_assert(NULL, a->mv_size == sizeof(int)); + return mdbx_cmp2int(*(unsigned *)a->mv_data, *(unsigned *)b->mv_data); +} + +/** Compare two items pointing at 2-byte aligned unsigned int's. */ +static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { + mdbx_assert(NULL, a->mv_size == b->mv_size); + mdbx_assert(NULL, 0 == (uintptr_t)a->mv_data % sizeof(uint16_t) && + 0 == (uintptr_t)b->mv_data % sizeof(uint16_t)); +#ifdef MISALIGNED_OK + if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) + return mdbx_cmp2int(*(size_t *)a->mv_data, *(size_t *)b->mv_data); + + mdbx_assert(NULL, a->mv_size == sizeof(int)); + return mdbx_cmp2int(*(unsigned *)a->mv_data, *(unsigned *)b->mv_data); +#else + mdbx_assert(NULL, 0 == a->mv_size % sizeof(uint16_t)); + { + int diff; + const uint16_t *pa, *pb, *end; + +#if BYTE_ORDER == LITTLE_ENDIAN + end = (const uint16_t *)a->mv_data; + pa = (const uint16_t *)((char *)a->mv_data + a->mv_size); + pb = (const uint16_t *)((char *)b->mv_data + a->mv_size); + do { + diff = *--pa - *--pb; +#else /* BYTE_ORDER */ + end = (const uint16_t *)((char *)a->mv_data + a->mv_size); + pa = (const uint16_t *)a->mv_data; + pb = (const uint16_t *)b->mv_data; + do { + diff = *pa++ - *pb++; +#endif /* BYTE_ORDER */ + if (likely(diff != 0)) + break; + } while (pa != end); + return diff; + } +#endif /* 
MISALIGNED_OK */ +} + +/** Compare two items pointing at unsigneds of unknown alignment. + * + * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp. + */ +static int __hot mdbx_cmp_int_ua(const MDB_val *a, const MDB_val *b) { + mdbx_assert(NULL, a->mv_size == b->mv_size); +#if MISALIGNED_OK + if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) + return mdbx_cmp2int(*(size_t *)a->mv_data, *(size_t *)b->mv_data); + + mdbx_assert(NULL, a->mv_size == sizeof(int)); + return mdbx_cmp2int(*(unsigned *)a->mv_data, *(unsigned *)b->mv_data); +#else + mdbx_assert(NULL, a->mv_size == sizeof(int) || a->mv_size == sizeof(size_t)); +#if BYTE_ORDER == LITTLE_ENDIAN + { + int diff; + const uint8_t *pa, *pb; + + pa = (const uint8_t *)a->mv_data + a->mv_size; + pb = (const uint8_t *)b->mv_data + a->mv_size; + + do { + diff = *--pa - *--pb; + if (likely(diff != 0)) + break; + } while (pa != a->mv_data); + return diff; + } +#else /* BYTE_ORDER */ + return memcmp(a->mv_data, b->mv_data, a->mv_size); +#endif /* BYTE_ORDER */ +#endif /* MISALIGNED_OK */ +} + +/** Compare two items lexically */ +static int __hot mdbx_cmp_memn(const MDB_val *a, const MDB_val *b) { +/* LY: assumes that length of keys are NOT equal for most cases, + * if no then branch-prediction should mitigate the problem */ +#if 0 + /* LY: without branch instructions on x86, + * but isn't best for equal length of keys */ + int diff_len = mdbx_cmp2int(a->mv_size, b->mv_size); +#else + /* LY: best when length of keys are equal, + * but got a branch-penalty otherwise */ + if (unlikely(a->mv_size == b->mv_size)) + return memcmp(a->mv_data, b->mv_data, a->mv_size); + int diff_len = (a->mv_size < b->mv_size) ? -1 : 1; +#endif + size_t shortest = (a->mv_size < b->mv_size) ? a->mv_size : b->mv_size; + int diff_data = memcmp(a->mv_data, b->mv_data, shortest); + return likely(diff_data) ? 
diff_data : diff_len; +} + +/** Compare two items in reverse byte order */ +static int __hot mdbx_cmp_memnr(const MDB_val *a, const MDB_val *b) { + const uint8_t *pa, *pb, *end; + + pa = (const uint8_t *)a->mv_data + a->mv_size; + pb = (const uint8_t *)b->mv_data + b->mv_size; + size_t minlen = (a->mv_size < b->mv_size) ? a->mv_size : b->mv_size; + end = pa - minlen; + + while (pa != end) { + int diff = *--pa - *--pb; + if (likely(diff)) + return diff; + } + return mdbx_cmp2int(a->mv_size, b->mv_size); +} + +/** Search for key within a page, using binary search. + * Returns the smallest entry larger or equal to the key. + * If exactp is non-null, stores whether the found entry was an exact match + * in *exactp (1 or 0). + * Updates the cursor index with the index of the found entry. + * If no entry larger or equal to the key is found, returns NULL. + */ +static MDB_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, + int *exactp) { + unsigned i = 0, nkeys; + int low, high; + int rc = 0; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NULL; + MDB_val nodekey; + MDB_cmp_func *cmp; + DKBUF; + + nkeys = NUMKEYS(mp); + + mdbx_debug("searching %u keys in %s %spage %zu", nkeys, + IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", + mdbx_dbg_pgno(mp)); + + low = IS_LEAF(mp) ? 0 : 1; + high = nkeys - 1; + cmp = mc->mc_dbx->md_cmp; + + /* Branch pages have no data, so if using integer keys, + * alignment is guaranteed. Use faster mdbx_cmp_int_ai. 
+ */ + if (cmp == mdbx_cmp_int_a2 && IS_BRANCH(mp)) + cmp = mdbx_cmp_int_ai; + + if (IS_LEAF2(mp)) { + nodekey.mv_size = mc->mc_db->md_xsize; + node = NODEPTR(mp, 0); /* fake */ + while (low <= high) { + i = (low + high) >> 1; + nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); + rc = cmp(key, &nodekey); + mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY(&nodekey), rc); + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + } else { + while (low <= high) { + i = (low + high) >> 1; + + node = NODEPTR(mp, i); + nodekey.mv_size = NODEKSZ(node); + nodekey.mv_data = NODEKEY(node); + + rc = cmp(key, &nodekey); + if (IS_LEAF(mp)) + mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY(&nodekey), rc); + else + mdbx_debug("found branch index %u [%s -> %zu], rc = %i", i, + DKEY(&nodekey), NODEPGNO(node), rc); + if (rc == 0) + break; + if (rc > 0) + low = i + 1; + else + high = i - 1; + } + } + + if (rc > 0) { /* Found entry is less than the key. */ + i++; /* Skip to get the smallest entry larger than key. */ + if (!IS_LEAF2(mp)) + node = NODEPTR(mp, i); + } + if (exactp) + *exactp = (rc == 0 && nkeys > 0); + /* store the key index */ + mc->mc_ki[mc->mc_top] = i; + if (i >= nkeys) + /* There is no entry larger or equal to the key. */ + return NULL; + + /* nodeptr is fake for LEAF2 */ + return node; +} + +#if 0 +static void +mdbx_cursor_adjust(MDB_cursor *mc, func) { - if (unlikely(mc == NULL)) - return EINVAL; + MDB_cursor *m2; - if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { + func(mc, m2); + } + } +} +#endif - if ((mc->mc_flags & C_INITIALIZED) == 0) - return MDBX_RESULT_TRUE; +/** Pop a page off the top of the cursor's stack. 
*/ +static void mdbx_cursor_pop(MDB_cursor *mc) { + if (mc->mc_snum) { + mdbx_debug("popped page %zu off db %d cursor %p", + mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); - if (mc->mc_snum == 0) - return MDBX_RESULT_TRUE; - - if ((mc->mc_flags & C_EOF) - && mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) - return MDBX_RESULT_TRUE; - - return MDBX_RESULT_FALSE; + mc->mc_snum--; + if (mc->mc_snum) { + mc->mc_top--; + } else { + mc->mc_flags &= ~C_INITIALIZED; + } + } } -static int mdbx_is_samedata(const MDB_val* a, const MDB_val* b) { - return a->iov_len == b->iov_len - && memcmp(a->iov_base, b->iov_base, a->iov_len) == 0; +/** Push a page onto the top of the cursor's stack. + * Set #MDB_TXN_ERROR on failure. + */ +static int mdbx_cursor_push(MDB_cursor *mc, MDB_page *mp) { + mdbx_debug("pushing page %zu on db %d cursor %p", mp->mp_pgno, DDBI(mc), + (void *)mc); + + if (unlikely(mc->mc_snum >= CURSOR_STACK)) { + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CURSOR_FULL; + } + + mc->mc_top = mc->mc_snum++; + mc->mc_pg[mc->mc_top] = mp; + mc->mc_ki[mc->mc_top] = 0; + + return MDB_SUCCESS; +} + +/** Find the address of the page corresponding to a given page number. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc the cursor accessing the page. + * @param[in] pgno the page number for the page to retrieve. + * @param[out] ret address of a pointer where the page's address will be + * stored. + * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, + * 0=mapped page. + * @return 0 on success, non-zero on failure. 
+ */ +static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, + int *lvl) { + MDB_txn *txn = mc->mc_txn; + MDB_env *env = txn->mt_env; + MDB_page *p = NULL; + int level; + + if (!(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_WRITEMAP))) { + MDB_txn *tx2 = txn; + level = 1; + do { + MDB_ID2L dl = tx2->mt_u.dirty_list; + unsigned x; + /* Spilled pages were dirtied in this txn and flushed + * because the dirty list got full. Bring this page + * back in from the map (but don't unspill it here, + * leave that unless page_touch happens again). */ + if (tx2->mt_spill_pgs) { + MDB_ID pn = pgno << 1; + x = mdbx_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) + goto mapped; + } + if (dl[0].mid) { + unsigned x = mdbx_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + p = dl[x].mptr; + goto done; + } + } + level++; + } while ((tx2 = tx2->mt_parent) != NULL); + } + + if (unlikely(pgno >= txn->mt_next_pgno)) { + mdbx_debug("page %zu not found", pgno); + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PAGE_NOTFOUND; + } + level = 0; + +mapped: + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + +done: + *ret = p; + if (lvl) + *lvl = level; + return MDB_SUCCESS; +} + +/** Finish #mdbx_page_search() / #mdbx_page_search_lowest(). + * The cursor is at the root page, set up the rest of it. + */ +static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int rc; + DKBUF; + + while (IS_BRANCH(mp)) { + MDB_node *node; + indx_t i; + + mdbx_debug("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp)); + /* Don't assert on branch pages in the FreeDB. We can get here + * while in the process of rebalancing a FreeDB branch page; we must + * let that proceed. 
ITS#8336 + */ + mdbx_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); + mdbx_debug("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0))); + + if (flags & (MDB_PS_FIRST | MDB_PS_LAST)) { + i = 0; + if (flags & MDB_PS_LAST) { + i = NUMKEYS(mp) - 1; + /* if already init'd, see if we're already in right place */ + if (mc->mc_flags & C_INITIALIZED) { + if (mc->mc_ki[mc->mc_top] == i) { + mc->mc_top = mc->mc_snum++; + mp = mc->mc_pg[mc->mc_top]; + goto ready; + } + } + } + } else { + int exact; + node = mdbx_node_search(mc, key, &exact); + if (node == NULL) + i = NUMKEYS(mp) - 1; + else { + i = mc->mc_ki[mc->mc_top]; + if (!exact) { + mdbx_cassert(mc, i > 0); + i--; + } + } + mdbx_debug("following index %u for key [%s]", i, DKEY(key)); + } + + mdbx_cassert(mc, i < NUMKEYS(mp)); + node = NODEPTR(mp, i); + + if (unlikely((rc = mdbx_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) + return rc; + + mc->mc_ki[mc->mc_top] = i; + if (unlikely(rc = mdbx_cursor_push(mc, mp))) + return rc; + + ready: + if (flags & MDB_PS_MODIFY) { + if (unlikely((rc = mdbx_page_touch(mc)) != 0)) + return rc; + mp = mc->mc_pg[mc->mc_top]; + } + } + + if (unlikely(!IS_LEAF(mp))) { + mdbx_debug("internal error, index points to a %02X page!?", mp->mp_flags); + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return MDB_CORRUPTED; + } + + mdbx_debug("found leaf page %zu for key [%s]", mp->mp_pgno, + key ? DKEY(key) : "null"); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + return MDB_SUCCESS; +} + +/** Search for the lowest key under the current branch page. + * This just bypasses a NUMKEYS check in the current page + * before calling mdbx_page_search_root(), because the callers + * are all in situations where the current page is known to + * be underfilled. 
+ */ +static int mdbx_page_search_lowest(MDB_cursor *mc) { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_node *node = NODEPTR(mp, 0); + int rc; + + if (unlikely((rc = mdbx_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) + return rc; + + mc->mc_ki[mc->mc_top] = 0; + if (unlikely(rc = mdbx_cursor_push(mc, mp))) + return rc; + return mdbx_page_search_root(mc, NULL, MDB_PS_FIRST); +} + +/** Search for the page a given key should be in. + * Push it and its parent pages on the cursor stack. + * @param[in,out] mc the cursor for this operation. + * @param[in] key the key to search for, or NULL for first/last page. + * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB + * are touched (updated with new page numbers). + * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. + * This is used by #mdbx_cursor_first() and #mdbx_cursor_last(). + * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { + int rc; + pgno_t root; + + /* Make sure the txn is still viable, then find the root from + * the txn's db table and set it as the root of the cursor's stack. 
+ */ + if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) { + mdbx_debug("transaction has failed, must abort"); + return MDB_BAD_TXN; + } else { + /* Make sure we're using an up-to-date root */ + if (unlikely(*mc->mc_dbflag & DB_STALE)) { + MDB_cursor mc2; + if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) + return MDB_BAD_DBI; + mdbx_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); + rc = mdbx_page_search(&mc2, &mc->mc_dbx->md_name, 0); + if (rc) + return rc; + { + MDB_val data; + int exact = 0; + uint16_t flags; + MDB_node *leaf = mdbx_node_search(&mc2, &mc->mc_dbx->md_name, &exact); + if (!exact) + return MDB_NOTFOUND; + if (unlikely((leaf->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) + return MDB_INCOMPATIBLE; /* not a named DB */ + rc = mdbx_node_read(&mc2, leaf, &data); + if (rc) + return rc; + memcpy(&flags, ((char *)data.mv_data + offsetof(MDB_db, md_flags)), + sizeof(uint16_t)); + /* The txn may not know this DBI, or another process may + * have dropped and recreated the DB with other flags. + */ + if (unlikely((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)) + return MDB_INCOMPATIBLE; + memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); + } + *mc->mc_dbflag &= ~DB_STALE; + } + root = mc->mc_db->md_root; + + if (unlikely(root == P_INVALID)) { /* Tree is empty. 
*/ + mdbx_debug("tree is empty"); + return MDB_NOTFOUND; + } + } + + mdbx_cassert(mc, root > 1); + if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) + if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)) + return rc; + + mc->mc_snum = 1; + mc->mc_top = 0; + + mdbx_debug("db %d root page %zu has flags 0x%X", DDBI(mc), root, + mc->mc_pg[0]->mp_flags); + + if (flags & MDB_PS_MODIFY) { + if (unlikely(rc = mdbx_page_touch(mc))) + return rc; + } + + if (flags & MDB_PS_ROOTONLY) + return MDB_SUCCESS; + + return mdbx_page_search_root(mc, key, flags); +} + +static int mdbx_ovpage_free(MDB_cursor *mc, MDB_page *mp) { + MDB_txn *txn = mc->mc_txn; + pgno_t pg = mp->mp_pgno; + unsigned x = 0, ovpages = mp->mp_pages; + MDB_env *env = txn->mt_env; + MDB_IDL sl = txn->mt_spill_pgs; + MDB_ID pn = pg << 1; + int rc; + + mdbx_debug("free ov page %zu (%u)", pg, ovpages); + /* If the page is dirty or on the spill list we just acquired it, + * so we should give it back to our current free list, if any. + * Otherwise put it onto the list of pages we freed in this txn. + * + * Won't create me_pghead: me_pglast must be inited along with it. + * Unsupported in nested txns: They would need to hide the page + * range in ancestor txns' dirty and spilled lists. + */ + if (env->me_pghead && !txn->mt_parent && + ((mp->mp_flags & P_DIRTY) || + (sl && (x = mdbx_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) { + unsigned i, j; + pgno_t *mop; + MDB_ID2 *dl, ix, iy; + rc = mdbx_midl_need(&env->me_pghead, ovpages); + if (unlikely(rc)) + return rc; + if (!(mp->mp_flags & P_DIRTY)) { + /* This page is no longer spilled */ + if (x == sl[0]) + sl[0]--; + else + sl[x] |= 1; + goto release; + } + /* Remove from dirty list */ + dl = txn->mt_u.dirty_list; + x = dl[0].mid--; + for (ix = dl[x]; ix.mptr != mp; ix = iy) { + if (likely(x > 1)) { + x--; + iy = dl[x]; + dl[x] = ix; + } else { + mdbx_cassert(mc, x > 1); + j = ++(dl[0].mid); + dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. 
*/ + txn->mt_flags |= MDB_TXN_ERROR; + return MDB_PROBLEM; + } + } + txn->mt_dirty_room++; + if (!(env->me_flags & MDB_WRITEMAP)) + mdbx_dpage_free(env, mp); + release: + /* Insert in me_pghead */ + mop = env->me_pghead; + j = mop[0] + ovpages; + for (i = mop[0]; i && mop[i] < pg; i--) + mop[j--] = mop[i]; + while (j > i) + mop[j--] = pg++; + mop[0] += ovpages; + } else { + rc = mdbx_midl_append_range(&txn->mt_free_pgs, pg, ovpages); + if (unlikely(rc)) + return rc; + } + mc->mc_db->md_overflow_pages -= ovpages; + return 0; +} + +/** Return the data associated with a given node. + * @param[in] mc The cursor for this operation. + * @param[in] leaf The node being read. + * @param[out] data Updated to point to the node's data. + * @return 0 on success, non-zero on failure. + */ +static MDBX_INLINE int mdbx_node_read(MDB_cursor *mc, MDB_node *leaf, + MDB_val *data) { + MDB_page *omp; /* overflow page */ + pgno_t pgno; + int rc; + + if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { + data->mv_size = NODEDSZ(leaf); + data->mv_data = NODEDATA(leaf); + return MDB_SUCCESS; + } + + /* Read overflow data. 
+ */ + data->mv_size = NODEDSZ(leaf); + memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); + if (unlikely((rc = mdbx_page_get(mc, pgno, &omp, NULL)) != 0)) { + mdbx_debug("read overflow page %zu failed", pgno); + return rc; + } + data->mv_data = PAGEDATA(omp); + + return MDB_SUCCESS; +} + +int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { + MDB_cursor mc; + MDB_xcursor mx; + int exact = 0; + DKBUF; + + mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); + + if (unlikely(!key || !data || !txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + mdbx_cursor_init(&mc, txn, dbi, &mx); + return mdbx_cursor_set(&mc, key, data, MDB_SET, &exact); +} + +/** Find a sibling for a page. + * Replaces the page at the top of the cursor's stack with the + * specified sibling, if one exists. + * @param[in] mc The cursor for this operation. + * @param[in] move_right Non-zero if the right sibling is requested, + * otherwise the left sibling. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { + int rc; + MDB_node *indx; + MDB_page *mp; + + if (unlikely(mc->mc_snum < 2)) { + return MDB_NOTFOUND; /* root has no siblings */ + } + + mdbx_cursor_pop(mc); + mdbx_debug("parent page is page %zu, index %u", + mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); + + if (move_right + ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) + : (mc->mc_ki[mc->mc_top] == 0)) { + mdbx_debug("no more keys left, moving to %s sibling", + move_right ? 
"right" : "left"); + if (unlikely((rc = mdbx_cursor_sibling(mc, move_right)) != MDB_SUCCESS)) { + /* undo cursor_pop before returning */ + mc->mc_top++; + mc->mc_snum++; + return rc; + } + } else { + if (move_right) + mc->mc_ki[mc->mc_top]++; + else + mc->mc_ki[mc->mc_top]--; + mdbx_debug("just moving to %s index key %u", move_right ? "right" : "left", + mc->mc_ki[mc->mc_top]); + } + mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + + indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (unlikely((rc = mdbx_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0)) { + /* mc will be inconsistent if caller does mc_snum++ as above */ + mc->mc_flags &= ~(C_INITIALIZED | C_EOF); + return rc; + } + + mdbx_cursor_push(mc, mp); + if (!move_right) + mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; + + return MDB_SUCCESS; +} + +/** Move the cursor to the next data item. */ +static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op) { + MDB_page *mp; + MDB_node *leaf; + int rc; + + if ((mc->mc_flags & C_DEL) && op == MDB_NEXT_DUP) + return MDB_NOTFOUND; + + if (!(mc->mc_flags & C_INITIALIZED)) + return mdbx_cursor_first(mc, key, data); + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_flags & C_EOF) { + if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp) - 1) + return MDB_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_NEXT || op == MDB_NEXT_DUP) { + rc = mdbx_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); + if (op != MDB_NEXT || rc != MDB_NOTFOUND) { + if (likely(rc == MDB_SUCCESS)) + MDB_GET_KEY(leaf, key); + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + if (op == MDB_NEXT_DUP) + return MDB_NOTFOUND; + } + } + + mdbx_debug("cursor_next: top page is %zu in cursor %p", mdbx_dbg_pgno(mp), + (void *)mc); + if (mc->mc_flags & C_DEL) { + mc->mc_flags ^= 
C_DEL; + goto skip; + } + + if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { + mdbx_debug("=====> move to next sibling page"); + if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDB_SUCCESS)) { + mc->mc_flags |= C_EOF; + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + mdbx_debug("next page is %zu, key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); + } else + mc->mc_ki[mc->mc_top]++; + +skip: + mdbx_debug("==> cursor points to page %zu with %u keys, key index %u", + mdbx_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdbx_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_xcursor_init1(mc, leaf); + } + if (data) { + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + return rc; + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Move the cursor to the previous data item. 
*/ +static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op) { + MDB_page *mp; + MDB_node *leaf; + int rc; + + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdbx_cursor_last(mc, key, data); + if (unlikely(rc)) + return rc; + mc->mc_ki[mc->mc_top]++; + } + + mp = mc->mc_pg[mc->mc_top]; + + if (mc->mc_db->md_flags & MDB_DUPSORT) { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_PREV || op == MDB_PREV_DUP) { + rc = mdbx_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); + if (op != MDB_PREV || rc != MDB_NOTFOUND) { + if (likely(rc == MDB_SUCCESS)) { + MDB_GET_KEY(leaf, key); + mc->mc_flags &= ~C_EOF; + } + return rc; + } + } + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + if (op == MDB_PREV_DUP) + return MDB_NOTFOUND; + } + } + + mdbx_debug("cursor_prev: top page is %zu in cursor %p", mdbx_dbg_pgno(mp), + (void *)mc); + + mc->mc_flags &= ~(C_EOF | C_DEL); + + if (mc->mc_ki[mc->mc_top] == 0) { + mdbx_debug("=====> move to prev sibling page"); + if ((rc = mdbx_cursor_sibling(mc, 0)) != MDB_SUCCESS) { + return rc; + } + mp = mc->mc_pg[mc->mc_top]; + mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; + mdbx_debug("prev page is %zu, key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); + } else + mc->mc_ki[mc->mc_top]--; + + mdbx_debug("==> cursor points to page %zu with %u keys, key index %u", + mdbx_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); + + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + mdbx_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_xcursor_init1(mc, leaf); + } + if (data) { + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + return rc; + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + rc = 
mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Set the cursor on a specific data item. */ +static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op, int *exactp) { + int rc; + MDB_page *mp; + MDB_node *leaf = NULL; + DKBUF; + + if ((mc->mc_db->md_flags & MDB_INTEGERKEY) && + unlikely(key->mv_size != sizeof(unsigned) && + key->mv_size != sizeof(size_t))) { + mdbx_cassert(mc, !"key-size is invalid for MDB_INTEGERKEY"); + return MDB_BAD_VALSIZE; + } + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + + /* See if we're already on the right page */ + if (mc->mc_flags & C_INITIALIZED) { + MDB_val nodekey; + + mp = mc->mc_pg[mc->mc_top]; + if (!NUMKEYS(mp)) { + mc->mc_ki[mc->mc_top] = 0; + return MDB_NOTFOUND; + } + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_size = mc->mc_db->md_xsize; + nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, 0); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* Probably happens rarely, but first node on the page + * was the one we wanted. 
+ */ + mc->mc_ki[mc->mc_top] = 0; + if (exactp) + *exactp = 1; + goto set1; + } + if (rc > 0) { + unsigned i; + unsigned nkeys = NUMKEYS(mp); + if (nkeys > 1) { + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_data = LEAF2KEY(mp, nkeys - 1, nodekey.mv_size); + } else { + leaf = NODEPTR(mp, nkeys - 1); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* last node was the one we wanted */ + mc->mc_ki[mc->mc_top] = nkeys - 1; + if (exactp) + *exactp = 1; + goto set1; + } + if (rc < 0) { + if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { + /* This is definitely the right page, skip search_page */ + if (mp->mp_flags & P_LEAF2) { + nodekey.mv_data = + LEAF2KEY(mp, mc->mc_ki[mc->mc_top], nodekey.mv_size); + } else { + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY2(leaf, nodekey); + } + rc = mc->mc_dbx->md_cmp(key, &nodekey); + if (rc == 0) { + /* current node was the one we wanted */ + if (exactp) + *exactp = 1; + goto set1; + } + } + rc = 0; + mc->mc_flags &= ~C_EOF; + goto set2; + } + } + /* If any parents have right-sibs, search. + * Otherwise, there's nothing further. */ + for (i = 0; i < mc->mc_top; i++) + if (mc->mc_ki[i] < NUMKEYS(mc->mc_pg[i]) - 1) + break; + if (i == mc->mc_top) { + /* There are no other pages */ + mc->mc_ki[mc->mc_top] = nkeys; + return MDB_NOTFOUND; + } + } + if (!mc->mc_top) { + /* There are no other pages */ + mc->mc_ki[mc->mc_top] = 0; + if (op == MDB_SET_RANGE && !exactp) { + rc = 0; + goto set1; + } else + return MDB_NOTFOUND; + } + } else { + mc->mc_pg[0] = 0; + } + + rc = mdbx_page_search(mc, key, 0); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + mdbx_cassert(mc, IS_LEAF(mp)); + +set2: + leaf = mdbx_node_search(mc, key, exactp); + if (exactp != NULL && !*exactp) { + /* MDB_SET specified and not an exact match. 
*/ + return MDB_NOTFOUND; + } + + if (leaf == NULL) { + mdbx_debug("===> inexact leaf not found, goto sibling"); + if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDB_SUCCESS)) { + mc->mc_flags |= C_EOF; + return rc; /* no entries matched */ + } + mp = mc->mc_pg[mc->mc_top]; + mdbx_cassert(mc, IS_LEAF(mp)); + leaf = NODEPTR(mp, 0); + } + +set1: + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + if (IS_LEAF2(mp)) { + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + } + return MDB_SUCCESS; + } + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_xcursor_init1(mc, leaf); + } + if (likely(data)) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + } else { + int ex2, *ex2p; + if (op == MDB_GET_BOTH) { + ex2p = &ex2; + ex2 = 0; + } else { + ex2p = NULL; + } + rc = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, + MDB_SET_RANGE, ex2p); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { + MDB_val olddata; + if (unlikely((rc = mdbx_node_read(mc, leaf, &olddata)) != MDB_SUCCESS)) + return rc; + rc = mc->mc_dbx->md_dcmp(data, &olddata); + if (rc) { + if (op == MDB_GET_BOTH || rc > 0) + return MDB_NOTFOUND; + rc = 0; + } + *data = olddata; + } else { + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + return rc; + } + } + + /* The key already matches in all other cases */ + if (op == MDB_SET_RANGE || op == MDB_SET_KEY) + MDB_GET_KEY(leaf, key); + mdbx_debug("==> cursor placed on key [%s]", DKEY(key)); + + return rc; +} + +/** Move the cursor to the first item in the database. 
*/ +static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) { + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdbx_page_search(mc, NULL, MDB_PS_FIRST); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + + mc->mc_ki[mc->mc_top] = 0; + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); + return MDB_SUCCESS; + } + + if (likely(data)) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_xcursor_init1(mc, leaf); + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else { + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + return rc; + } + } + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +/** Move the cursor to the last item in the database. 
*/ +static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) { + int rc; + MDB_node *leaf; + + if (mc->mc_xcursor) + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + + if (likely(!(mc->mc_flags & C_EOF))) { + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdbx_page_search(mc, NULL, MDB_PS_LAST); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + } + + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; + mc->mc_flags |= C_INITIALIZED | C_EOF; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = + LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); + return MDB_SUCCESS; + } + + if (likely(data)) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_xcursor_init1(mc, leaf); + rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else { + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + return rc; + } + } + + MDB_GET_KEY(leaf, key); + return MDB_SUCCESS; +} + +int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, + MDB_cursor_op op) { + int rc; + int exact = 0; + int (*mfunc)(MDB_cursor * mc, MDB_val * key, MDB_val * data); + + if (unlikely(mc == NULL)) + return EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + switch (op) { + case MDB_GET_CURRENT: + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { + rc = EINVAL; + } else { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int nkeys = NUMKEYS(mp); + if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { + mc->mc_ki[mc->mc_top] = nkeys; + rc = MDB_NOTFOUND; + break; + } + rc = MDB_SUCCESS; + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = LEAF2KEY(mp, 
mc->mc_ki[mc->mc_top], key->mv_size); + } else { + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY(leaf, key); + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (unlikely( + !(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { + mdbx_xcursor_init1(mc, leaf); + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + break; + } + rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, + MDB_GET_CURRENT); + } else { + rc = mdbx_node_read(mc, leaf, data); + } + } + } + } + break; + case MDB_GET_BOTH: + case MDB_GET_BOTH_RANGE: + if (unlikely(data == NULL)) { + rc = EINVAL; + break; + } + if (unlikely(mc->mc_xcursor == NULL)) { + rc = MDB_INCOMPATIBLE; + break; + } + /* FALLTHRU */ + case MDB_SET: + case MDB_SET_KEY: + case MDB_SET_RANGE: + if (unlikely(key == NULL)) { + rc = EINVAL; + } else { + rc = mdbx_cursor_set(mc, key, data, op, + op == MDB_SET_RANGE ? NULL : &exact); + } + break; + case MDB_GET_MULTIPLE: + if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) { + rc = EINVAL; + break; + } + if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = MDB_SUCCESS; + if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || + (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) + break; + goto fetchm; + case MDB_NEXT_MULTIPLE: + if (unlikely(data == NULL)) { + rc = EINVAL; + break; + } + if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) { + rc = MDB_INCOMPATIBLE; + break; + } + rc = mdbx_cursor_next(mc, key, data, MDB_NEXT_DUP); + if (rc == MDB_SUCCESS) { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + MDB_cursor *mx; + fetchm: + mx = &mc->mc_xcursor->mx_cursor; + data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * mx->mc_db->md_xsize; + data->mv_data = PAGEDATA(mx->mc_pg[mx->mc_top]); + mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top]) - 1; + } else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_PREV_MULTIPLE: + if (data == 
NULL) { + rc = EINVAL; + break; + } + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + rc = MDB_INCOMPATIBLE; + break; + } + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdbx_cursor_last(mc, key, data); + else + rc = MDB_SUCCESS; + if (rc == MDB_SUCCESS) { + MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; + if (mx->mc_flags & C_INITIALIZED) { + rc = mdbx_cursor_sibling(mx, 0); + if (rc == MDB_SUCCESS) + goto fetchm; + } else { + rc = MDB_NOTFOUND; + } + } + break; + case MDB_NEXT: + case MDB_NEXT_DUP: + case MDB_NEXT_NODUP: + rc = mdbx_cursor_next(mc, key, data, op); + break; + case MDB_PREV: + case MDB_PREV_DUP: + case MDB_PREV_NODUP: + rc = mdbx_cursor_prev(mc, key, data, op); + break; + case MDB_FIRST: + rc = mdbx_cursor_first(mc, key, data); + break; + case MDB_FIRST_DUP: + mfunc = mdbx_cursor_first; + mmove: + if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) { + rc = EINVAL; + break; + } + if (unlikely(mc->mc_xcursor == NULL)) { + rc = MDB_INCOMPATIBLE; + break; + } + { + MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + MDB_GET_KEY(leaf, key); + rc = mdbx_node_read(mc, leaf, data); + break; + } + } + if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { + rc = EINVAL; + break; + } + rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); + break; + case MDB_LAST: + rc = mdbx_cursor_last(mc, key, data); + break; + case MDB_LAST_DUP: + mfunc = mdbx_cursor_last; + goto mmove; + default: + mdbx_debug("unhandled/unimplemented cursor operation %u", op); + rc = EINVAL; + break; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + return rc; +} + +/** Touch all the pages in the cursor stack. Set mc_top. + * Makes sure all the pages are writable, before attempting a write + *operation. + * @param[in] mc The cursor to operate on. 
+ */ +static int mdbx_cursor_touch(MDB_cursor *mc) { + int rc = MDB_SUCCESS; + + if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY | DB_DUPDATA))) { + /* Touch DB record of named DB */ + MDB_cursor mc2; + MDB_xcursor mcx; + if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) + return MDB_BAD_DBI; + mdbx_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); + rc = mdbx_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); + if (unlikely(rc)) + return rc; + *mc->mc_dbflag |= DB_DIRTY; + } + mc->mc_top = 0; + if (mc->mc_snum) { + do { + rc = mdbx_page_touch(mc); + } while (!rc && ++(mc->mc_top) < mc->mc_snum); + mc->mc_top = mc->mc_snum - 1; + } + return rc; +} + +/** Do not spill pages to disk if txn is getting full, may fail instead */ +#define MDB_NOSPILL 0x8000 + +int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, + unsigned flags) { + MDB_env *env; + MDB_node *leaf = NULL; + MDB_page *fp, *mp, *sub_root = NULL; + uint16_t fp_flags; + MDB_val xdata, *rdata, dkey, olddata; + MDB_db dummy; + int do_sub = 0, insert_key, insert_data; + unsigned mcount = 0, dcount = 0, nospill; + size_t nsize; + int rc, rc2; + unsigned nflags; + DKBUF; + + if (unlikely(mc == NULL || key == NULL)) + return EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + env = mc->mc_txn->mt_env; + + /* Check this first so counter will always be zero on any + * early failures. + */ + if (flags & MDB_MULTIPLE) { + dcount = data[1].mv_size; + data[1].mv_size = 0; + if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED))) + return MDB_INCOMPATIBLE; + } + + if (flags & MDB_RESERVE) { + if (unlikely(mc->mc_db->md_flags & (MDB_DUPSORT | MDB_REVERSEDUP))) + return MDB_INCOMPATIBLE; + } + + nospill = flags & MDB_NOSPILL; + flags &= ~MDB_NOSPILL; + + if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; + + if (unlikely(key->mv_size > ENV_MAXKEY(env))) + return MDB_BAD_VALSIZE; + +#if SIZE_MAX > MAXDATASIZE + if (unlikely(data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) + ? ENV_MAXKEY(env) + : MAXDATASIZE))) + return MDB_BAD_VALSIZE; +#else + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + unlikely(data->mv_size > ENV_MAXKEY(env))) + return MDB_BAD_VALSIZE; +#endif + + if ((mc->mc_db->md_flags & MDB_INTEGERKEY) && + unlikely(key->mv_size != sizeof(unsigned) && + key->mv_size != sizeof(size_t))) { + mdbx_cassert(mc, !"key-size is invalid for MDB_INTEGERKEY"); + return MDB_BAD_VALSIZE; + } + + if ((mc->mc_db->md_flags & MDB_INTEGERDUP) && + unlikely(data->mv_size != sizeof(unsigned) && + data->mv_size != sizeof(size_t))) { + mdbx_cassert(mc, !"data-size is invalid MDB_INTEGERDUP"); + return MDB_BAD_VALSIZE; + } + + mdbx_debug("==> put db %d key [%s], size %zu, data size %zu", DDBI(mc), + DKEY(key), key ? key->mv_size : 0, data->mv_size); + + int dupdata_flag = 0; + if (flags & MDB_CURRENT) { + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) + return EINVAL; +#if MDBX_MODE_ENABLED + if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { + MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_cassert(mc, + mc->mc_xcursor != NULL && + (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); + if (mc->mc_xcursor->mx_db.md_entries > 1) { + rc = mdbx_cursor_del(mc, 0); + if (rc != MDB_SUCCESS) + return rc; + flags -= MDB_CURRENT; + } + } + } +#endif /* MDBX_MODE_ENABLED */ + rc = MDB_SUCCESS; + } else if (mc->mc_db->md_root == P_INVALID) { + /* new database, cursor has nothing to point to */ + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + rc = MDB_NO_ROOT; + } else { + int exact = 0; + MDB_val d2; + if (flags & MDB_APPEND) { + MDB_val k2; + rc = mdbx_cursor_last(mc, &k2, &d2); + if (rc == 0) { + rc = mc->mc_dbx->md_cmp(key, &k2); + if (rc > 0) { + rc = MDB_NOTFOUND; 
+ mc->mc_ki[mc->mc_top]++; + } else { + /* new key is <= last key */ + rc = MDB_KEYEXIST; + } + } + } else { + rc = mdbx_cursor_set(mc, key, &d2, MDB_SET, &exact); + } + if ((flags & MDB_NOOVERWRITE) && rc == 0) { + mdbx_debug("duplicate key [%s]", DKEY(key)); + *data = d2; + return MDB_KEYEXIST; + } + if (rc && unlikely(rc != MDB_NOTFOUND)) + return rc; + } + + if (mc->mc_flags & C_DEL) + mc->mc_flags ^= C_DEL; + + /* Cursor is positioned, check for room in the dirty list */ + if (!nospill) { + if (flags & MDB_MULTIPLE) { + rdata = &xdata; + xdata.mv_size = data->mv_size * dcount; + } else { + rdata = data; + } + if (unlikely(rc2 = mdbx_page_spill(mc, key, rdata))) + return rc2; + } + + if (rc == MDB_NO_ROOT) { + MDB_page *np; + /* new database, write a root leaf page */ + mdbx_debug("allocating new root leaf page"); + if (unlikely(rc2 = mdbx_page_new(mc, P_LEAF, 1, &np))) { + return rc2; + } + mdbx_cursor_push(mc, np); + mc->mc_db->md_root = np->mp_pgno; + mc->mc_db->md_depth++; + *mc->mc_dbflag |= DB_DIRTY; + if ((mc->mc_db->md_flags & (MDB_DUPSORT | MDB_DUPFIXED)) == MDB_DUPFIXED) + np->mp_flags |= P_LEAF2; + mc->mc_flags |= C_INITIALIZED; + } else { + /* make sure all cursor pages are writable */ + rc2 = mdbx_cursor_touch(mc); + if (unlikely(rc2)) + return rc2; + } + + insert_key = insert_data = rc; + if (insert_key) { + /* The key does not exist */ + mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + LEAFSIZE(key, data) > env->me_nodemax) { + /* Too big for a node, insert in sub-DB. Set up an empty + * "old sub-page" for prep_subDB to expand to a full page. 
+ */ + fp_flags = P_LEAF | P_DIRTY; + fp = env->me_pbuf; + fp->mp_leaf2_ksize = data->mv_size; /* used if MDB_DUPFIXED */ + fp->mp_lower = fp->mp_upper = (PAGEHDRSZ - PAGEBASE); + olddata.mv_size = PAGEHDRSZ; + goto prep_subDB; + } + } else { + /* there's only a key anyway, so this is a no-op */ + if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { + char *ptr; + unsigned ksize = mc->mc_db->md_xsize; + if (key->mv_size != ksize) + return MDB_BAD_VALSIZE; + ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + memcpy(ptr, key->mv_data, ksize); + fix_parent: + /* if overwriting slot 0 of leaf, need to + * update branch key if there is a parent page + */ + if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { + unsigned short dtop = 1; + mc->mc_top--; + /* slot 0 is always an empty key, find real slot */ + while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { + mc->mc_top--; + dtop++; + } + if (mc->mc_ki[mc->mc_top]) + rc2 = mdbx_update_key(mc, key); + else + rc2 = MDB_SUCCESS; + mc->mc_top += dtop; + if (rc2) + return rc2; + } + return MDB_SUCCESS; + } + + more: + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + olddata.mv_size = NODEDSZ(leaf); + olddata.mv_data = NODEDATA(leaf); + + /* DB has dups? */ + if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { + /* Prepare (sub-)page/sub-DB to accept the new item, + * if needed. fp: old sub-page or a header faking + * it. mp: new (sub-)page. offset: growth in page + * size. xdata: node data with new page or DB. + */ + unsigned i, offset = 0; + mp = fp = xdata.mv_data = env->me_pbuf; + mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; + + /* Was a single item before, must convert now */ + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + /* Just overwrite the current item */ + if (flags & MDB_CURRENT) { + if ((flags & MDB_NODUPDATA) && !mc->mc_dbx->md_dcmp(data, &olddata)) + return MDB_KEYEXIST; + goto current; + } + + /* does data match? 
*/ + if (!mc->mc_dbx->md_dcmp(data, &olddata)) { + if (unlikely(flags & (MDB_NODUPDATA | MDB_APPENDDUP))) + return MDB_KEYEXIST; + /* overwrite it */ + goto current; + } + + /* Back up original data item */ + dupdata_flag = 1; + dkey.mv_size = olddata.mv_size; + dkey.mv_data = memcpy(fp + 1, olddata.mv_data, olddata.mv_size); + + /* Make sub-page header for the dup items, with dummy body */ + fp->mp_flags = P_LEAF | P_DIRTY | P_SUBP; + fp->mp_lower = (PAGEHDRSZ - PAGEBASE); + xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + fp->mp_flags |= P_LEAF2; + fp->mp_leaf2_ksize = data->mv_size; + xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ + } else { + xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + + (dkey.mv_size & 1) + (data->mv_size & 1); + } + fp->mp_upper = xdata.mv_size - PAGEBASE; + olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ + } else if (leaf->mn_flags & F_SUBDATA) { + /* Data is on sub-DB, just store it */ + flags |= F_DUPDATA | F_SUBDATA; + goto put_sub; + } else { + /* Data is on sub-page */ + fp = olddata.mv_data; + switch (flags) { + default: + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + offset = EVEN(NODESIZE + sizeof(indx_t) + data->mv_size); + break; + } + offset = fp->mp_leaf2_ksize; + if (SIZELEFT(fp) < offset) { + offset *= 4; /* space for 4 more */ + break; + } + /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */ + case MDB_CURRENT | MDB_NODUPDATA: + case MDB_CURRENT: + fp->mp_flags |= P_DIRTY; + COPY_PGNO(fp->mp_pgno, mp->mp_pgno); + mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; + flags |= F_DUPDATA; + goto put_sub; + } + xdata.mv_size = olddata.mv_size + offset; + } + + fp_flags = fp->mp_flags; + if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { + /* Too big for a sub-page, convert to sub-DB */ + fp_flags &= ~P_SUBP; + prep_subDB: + if (mc->mc_db->md_flags & MDB_DUPFIXED) { + fp_flags |= P_LEAF2; + dummy.md_xsize = fp->mp_leaf2_ksize; + 
dummy.md_flags = MDB_DUPFIXED; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + dummy.md_flags |= MDB_INTEGERKEY; + } else { + dummy.md_xsize = 0; + dummy.md_flags = 0; + } + dummy.md_depth = 1; + dummy.md_branch_pages = 0; + dummy.md_leaf_pages = 1; + dummy.md_overflow_pages = 0; + dummy.md_entries = NUMKEYS(fp); + xdata.mv_size = sizeof(MDB_db); + xdata.mv_data = &dummy; + if ((rc = mdbx_page_alloc(mc, 1, &mp, MDBX_ALLOC_ALL))) + return rc; + offset = env->me_psize - olddata.mv_size; + flags |= F_DUPDATA | F_SUBDATA; + dummy.md_root = mp->mp_pgno; + sub_root = mp; + } + if (mp != fp) { + mp->mp_flags = fp_flags | P_DIRTY; + mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; + mp->mp_lower = fp->mp_lower; + mp->mp_upper = fp->mp_upper + offset; + if (fp_flags & P_LEAF2) { + memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize); + } else { + memcpy((char *)mp + mp->mp_upper + PAGEBASE, + (char *)fp + fp->mp_upper + PAGEBASE, + olddata.mv_size - fp->mp_upper - PAGEBASE); + for (i = 0; i < NUMKEYS(fp); i++) + mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; + } + } + + rdata = &xdata; + flags |= F_DUPDATA; + do_sub = 1; + if (!insert_key) + mdbx_node_del(mc, 0); + goto new_sub; + } + current: + /* LMDB passes F_SUBDATA in 'flags' to write a DB record */ + if (unlikely((leaf->mn_flags ^ flags) & F_SUBDATA)) + return MDB_INCOMPATIBLE; + /* overflow page overwrites need special handling */ + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); + + memcpy(&pg, olddata.mv_data, sizeof(pg)); + if (unlikely((rc2 = mdbx_page_get(mc, pg, &omp, &level)) != 0)) + return rc2; + ovpages = omp->mp_pages; + + /* Is the ov page large enough? */ + if (ovpages >= dpages) { + if (!(omp->mp_flags & P_DIRTY) && + (level || (env->me_flags & MDB_WRITEMAP))) { + rc = mdbx_page_unspill(mc->mc_txn, omp, &omp); + if (unlikely(rc)) + return rc; + level = 0; /* dirty in this txn or clean */ + } + /* Is it dirty? 
*/ + if (omp->mp_flags & P_DIRTY) { + /* yes, overwrite it. Note in this case we don't + * bother to try shrinking the page if the new data + * is smaller than the overflow threshold. + */ + if (unlikely(level > 1)) { + /* It is writable only in a parent txn */ + MDB_page *np = mdbx_page_malloc(mc->mc_txn, ovpages); + MDB_ID2 id2; + if (unlikely(!np)) + return ENOMEM; + id2.mid = pg; + id2.mptr = np; + /* Note - this page is already counted in parent's dirty_room */ + rc2 = mdbx_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); + mdbx_cassert(mc, rc2 == 0); + /* Currently we make the page look as with put() in the + * parent txn, in case the user peeks at MDB_RESERVEd + * or unused parts. Some users treat ovpages specially. + */ + size_t sz = (size_t)env->me_psize * ovpages, off; + if (MDBX_MODE_ENABLED || !(flags & MDB_RESERVE)) { + /* Skip the part where LMDB will put *data. + * Copy end of page, adjusting alignment so + * compiler may copy words instead of bytes. + */ + off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); + memcpy((size_t *)((char *)np + off), + (size_t *)((char *)omp + off), sz - off); + sz = PAGEHDRSZ; + } + memcpy(np, omp, sz); /* Copy whole or header of page */ + omp = np; + } + SETDSZ(leaf, data->mv_size); + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = PAGEDATA(omp); + else + memcpy(PAGEDATA(omp), data->mv_data, data->mv_size); + return MDB_SUCCESS; + } + } + if ((rc2 = mdbx_ovpage_free(mc, omp)) != MDB_SUCCESS) + return rc2; + } else if (data->mv_size == olddata.mv_size) { + /* same size, just replace it. Note that we could + * also reuse this node if the new data is smaller, + * but instead we opt to shrink the node in that case. 
+ */ + if (F_ISSET(flags, MDB_RESERVE)) + data->mv_data = olddata.mv_data; + else if (!(mc->mc_flags & C_SUB)) + memcpy(olddata.mv_data, data->mv_data, data->mv_size); + else { + memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); + goto fix_parent; + } + return MDB_SUCCESS; + } + mdbx_node_del(mc, 0); + } + + rdata = data; + +new_sub: + nflags = flags & NODE_ADD_FLAGS; + nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size + : mdbx_leaf_size(env, key, rdata); + if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { + if ((flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) + nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ + if (!insert_key) + nflags |= MDB_SPLIT_REPLACE; + rc = mdbx_page_split(mc, key, rdata, P_INVALID, nflags); + } else { + /* There is room already in this leaf page. */ + rc = mdbx_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); + if (likely(rc == 0)) { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) + continue; + if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { + m3->mc_ki[i]++; + } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); + } + } + } + + if (likely(rc == MDB_SUCCESS)) { + /* Now store the actual data in the child DB. Note that we're + * storing the user data in the keys field, so there are strict + * size limits on dupdata. The actual data fields of the child + * DB are all zero size. */ + if (do_sub) { + int xflags; + size_t ecount; + put_sub: + xdata.mv_size = 0; + xdata.mv_data = ""; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (flags & MDB_CURRENT) { + xflags = (flags & MDB_NODUPDATA) + ? 
MDB_CURRENT | MDB_NOOVERWRITE | MDB_NOSPILL + : MDB_CURRENT | MDB_NOSPILL; + } else { + mdbx_xcursor_init1(mc, leaf); + xflags = (flags & MDB_NODUPDATA) ? MDB_NOOVERWRITE | MDB_NOSPILL + : MDB_NOSPILL; + } + if (sub_root) + mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; + /* converted, write the original data first */ + if (dupdata_flag) { + rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); + if (unlikely(rc)) + goto bad_sub; + /* we've done our job */ + dkey.mv_size = 0; + } + if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2; + MDB_xcursor *mx = mc->mc_xcursor; + unsigned i = mc->mc_top; + MDB_page *mp = mc->mc_pg[i]; + int nkeys = NUMKEYS(mp); + + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) + continue; + if (!(m2->mc_flags & C_INITIALIZED)) + continue; + if (m2->mc_pg[i] == mp) { + if (m2->mc_ki[i] == mc->mc_ki[i]) { + mdbx_xcursor_init2(m2, mx, dupdata_flag); + } else if (!insert_key && m2->mc_ki[i] < nkeys) { + XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]); + } + } + } + } + ecount = mc->mc_xcursor->mx_db.md_entries; + if (flags & MDB_APPENDDUP) + xflags |= MDB_APPEND; + rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); + if (flags & F_SUBDATA) { + void *db = NODEDATA(leaf); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } + insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; + } + /* Increment count unless we just replaced an existing item. */ + if (insert_data) + mc->mc_db->md_entries++; + if (insert_key) { + /* Invalidate txn if we created an empty sub-DB */ + if (unlikely(rc)) + goto bad_sub; + /* If we succeeded and the key didn't exist before, + * make sure the cursor is marked valid. 
*/ + mc->mc_flags |= C_INITIALIZED; + } + if (flags & MDB_MULTIPLE) { + if (!rc) { + mcount++; + /* let caller know how many succeeded, if any */ + data[1].mv_size = mcount; + if (mcount < dcount) { + data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; + insert_key = insert_data = 0; + goto more; + } + } + } + return rc; + bad_sub: + if (unlikely(rc == + MDB_KEYEXIST)) /* should not happen, we deleted that item */ + rc = MDB_PROBLEM; + } + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { + MDB_node *leaf; + MDB_page *mp; + int rc; + + if (unlikely(!mc)) + return EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) + return EINVAL; + + if (unlikely(mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))) + return MDB_NOTFOUND; + + if (unlikely(!(flags & MDB_NOSPILL) && + (rc = mdbx_page_spill(mc, NULL, NULL)))) + return rc; + + rc = mdbx_cursor_touch(mc); + if (unlikely(rc)) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + if (IS_LEAF2(mp)) + goto del_key; + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (flags & MDB_NODUPDATA) { + /* mdbx_cursor_del0() will subtract the final entry */ + mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; + mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + } else { + if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); + if (unlikely(rc)) + return rc; + /* If sub-DB still has entries, we're done */ + if (mc->mc_xcursor->mx_db.md_entries) { + if (leaf->mn_flags & F_SUBDATA) { + /* update subDB info */ + void *db = NODEDATA(leaf); + memcpy(db, 
&mc->mc_xcursor->mx_db, sizeof(MDB_db)); + } else { + MDB_cursor *m2; + /* shrink fake page */ + mdbx_node_shrink(mp, mc->mc_ki[mc->mc_top]); + leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + /* fix other sub-DB cursors pointed at fake pages on this page */ + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) + continue; + if (!(m2->mc_flags & C_INITIALIZED)) + continue; + if (m2->mc_pg[mc->mc_top] == mp) { + MDB_node *n2 = leaf; + if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) { + n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); + if (n2->mn_flags & F_SUBDATA) + continue; + } + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + } + } + } + mc->mc_db->md_entries--; + return rc; + } else { + mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + } + /* otherwise fall thru and delete the sub-DB */ + } + + if (leaf->mn_flags & F_SUBDATA) { + /* add all the child DB's pages to the free list */ + rc = mdbx_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (unlikely(rc)) + goto fail; + } + } + /* LMDB passes F_SUBDATA in 'flags' to delete a DB record */ + else if (unlikely((leaf->mn_flags ^ flags) & F_SUBDATA)) { + rc = MDB_INCOMPATIBLE; + goto fail; + } + + /* add overflow pages to free list */ + if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { + MDB_page *omp; + pgno_t pg; + + memcpy(&pg, NODEDATA(leaf), sizeof(pg)); + if (unlikely((rc = mdbx_page_get(mc, pg, &omp, NULL)) || + (rc = mdbx_ovpage_free(mc, omp)))) + goto fail; + } + +del_key: + return mdbx_cursor_del0(mc); + +fail: + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +/** Allocate and initialize new pages for a database. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc a cursor on the database being added to. + * @param[in] flags flags defining what type of page is being allocated. + * @param[in] num the number of pages to allocate. 
This is usually 1, + * unless allocating overflow pages for a large record. + * @param[out] mp Address of a page, or NULL on failure. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, + MDB_page **mp) { + MDB_page *np; + int rc; + + if (unlikely((rc = mdbx_page_alloc(mc, num, &np, MDBX_ALLOC_ALL)))) + return rc; + mdbx_debug("allocated new mpage %zu, page size %u", np->mp_pgno, + mc->mc_txn->mt_env->me_psize); + np->mp_flags = flags | P_DIRTY; + np->mp_lower = (PAGEHDRSZ - PAGEBASE); + np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; + + if (IS_BRANCH(np)) + mc->mc_db->md_branch_pages++; + else if (IS_LEAF(np)) + mc->mc_db->md_leaf_pages++; + else if (IS_OVERFLOW(np)) { + mc->mc_db->md_overflow_pages += num; + np->mp_pages = num; + } + *mp = np; + + return 0; +} + +/** Calculate the size of a leaf node. + * The size depends on the environment's page size; if a data item + * is too large it will be put onto an overflow page and the node + * size will only include the key and not the data. Sizes are always + * rounded up to an even number of bytes, to guarantee 2-byte alignment + * of the #MDB_node headers. + * @param[in] env The environment handle. + * @param[in] key The key for the node. + * @param[in] data The data for the node. + * @return The number of bytes needed to store the node. + */ +static MDBX_INLINE size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, + MDB_val *data) { + size_t sz; + + sz = LEAFSIZE(key, data); + if (sz > env->me_nodemax) { + /* put on overflow page */ + sz -= data->mv_size - sizeof(pgno_t); + } + + return EVEN(sz + sizeof(indx_t)); +} + +/** Calculate the size of a branch node. + * The size should depend on the environment's page size but since + * we currently don't support spilling large keys onto overflow + * pages, it's simply the size of the #MDB_node header plus the + * size of the key. 
Sizes are always rounded up to an even number + * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. + * @param[in] env The environment handle. + * @param[in] key The key for the node. + * @return The number of bytes needed to store the node. + */ +static MDBX_INLINE size_t mdbx_branch_size(MDB_env *env, MDB_val *key) { + size_t sz; + + sz = INDXSIZE(key); + if (unlikely(sz > env->me_nodemax)) { + /* put on overflow page */ + /* not implemented */ + mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __FUNCTION__, + __LINE__); + sz -= key->mv_size - sizeof(pgno_t); + } + + return sz + sizeof(indx_t); +} + +/** Add a node to the page pointed to by the cursor. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc The cursor for this operation. + * @param[in] indx The index on the page where the new node should be added. + * @param[in] key The key for the new node. + * @param[in] data The data for the new node, if any. + * @param[in] pgno The page number, if adding a branch node. + * @param[in] flags Flags for the node. + * @return 0 on success, non-zero on failure. Possible errors are: + *
<ul>
+ * <li>ENOMEM - failed to allocate overflow pages for the node.
+ * <li>MDB_PAGE_FULL - there is insufficient room in the page. This error
+ * should never happen since all callers already calculate the
+ * page's free space before calling this function.
+ * </ul>
+ */
+static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key,
+                         MDB_val *data, pgno_t pgno, unsigned flags) {
+  unsigned i;
+  size_t node_size = NODESIZE;
+  ssize_t room;
+  indx_t ofs;
+  MDB_node *node;
+  MDB_page *mp = mc->mc_pg[mc->mc_top];
+  MDB_page *ofp = NULL; /* overflow page */
+  void *ndata;
+  DKBUF;
+
+  mdbx_cassert(mc, mp->mp_upper >= mp->mp_lower);
+
+  mdbx_debug("add to %s %spage %zu index %i, data size %zu key size %zu [%s]",
+             IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
+             mdbx_dbg_pgno(mp), indx, data ? data->mv_size : 0,
+             key ? key->mv_size : 0, key ? DKEY(key) : "null");
+
+  /* LEAF2 pages hold only fixed-size keys (size taken from md_xsize),
+   * packed back to back without node headers. */
+  if (IS_LEAF2(mp)) {
+    mdbx_cassert(mc, key);
+    /* Move higher keys up one slot. */
+    int ksize = mc->mc_db->md_xsize, dif;
+    char *ptr = LEAF2KEY(mp, indx, ksize);
+    dif = NUMKEYS(mp) - indx;
+    if (dif > 0)
+      memmove(ptr + ksize, ptr, dif * ksize);
+    /* insert new key */
+    memcpy(ptr, key->mv_data, ksize);
+
+    /* Just using these for counting */
+    mp->mp_lower += sizeof(indx_t);
+    mp->mp_upper -= ksize - sizeof(indx_t);
+    return MDB_SUCCESS;
+  }
+
+  /* Free space left once the new slot in the pointer array is paid for. */
+  room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
+  if (key != NULL)
+    node_size += key->mv_size;
+  if (IS_LEAF(mp)) {
+    mdbx_cassert(mc, key && data);
+    if (unlikely(F_ISSET(flags, F_BIGDATA))) {
+      /* Data already on overflow page. */
+      node_size += sizeof(pgno_t);
+    } else if (unlikely(node_size + data->mv_size >
+                        mc->mc_txn->mt_env->me_nodemax)) {
+      int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
+      int rc;
+      /* Put data on overflow page. */
+      mdbx_debug(
+          "data size is %zu, node would be %zu, put data on overflow page",
+          data->mv_size, node_size + data->mv_size);
+      /* The node itself then only stores the overflow page number. */
+      node_size = EVEN(node_size + sizeof(pgno_t));
+      if ((ssize_t)node_size > room)
+        goto full;
+      if ((rc = mdbx_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
+        return rc;
+      mdbx_debug("allocated overflow page %zu", ofp->mp_pgno);
+      flags |= F_BIGDATA;
+      goto update;
+    } else {
+      node_size += data->mv_size;
+    }
+  }
+  node_size = EVEN(node_size);
+  if (unlikely((ssize_t)node_size > room))
+    goto full;
+
+update:
+  /* Move higher pointers up one slot. */
+  for (i = NUMKEYS(mp); i > indx; i--)
+    mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
+
+  /* Adjust free space offsets. */
+  ofs = mp->mp_upper - node_size;
+  mdbx_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t));
+  mp->mp_ptrs[indx] = ofs;
+  mp->mp_upper = ofs;
+  mp->mp_lower += sizeof(indx_t);
+
+  /* Write the node data. */
+  node = NODEPTR(mp, indx);
+  node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
+  node->mn_flags = flags;
+  /* Leaf nodes record the data size, branch nodes the child page number. */
+  if (IS_LEAF(mp))
+    SETDSZ(node, data->mv_size);
+  else
+    SETPGNO(node, pgno);
+
+  if (key)
+    memcpy(NODEKEY(node), key->mv_data, key->mv_size);
+
+  if (IS_LEAF(mp)) {
+    ndata = NODEDATA(node);
+    if (unlikely(ofp == NULL)) {
+      /* No overflow page was allocated by this call. */
+      if (unlikely(F_ISSET(flags, F_BIGDATA)))
+        /* Caller passed F_BIGDATA: only a pgno_t is copied from data. */
+        memcpy(ndata, data->mv_data, sizeof(pgno_t));
+      else if (F_ISSET(flags, MDB_RESERVE))
+        /* Hand the reserved space back to the caller instead of copying. */
+        data->mv_data = ndata;
+      else if (likely(ndata != data->mv_data))
+        memcpy(ndata, data->mv_data, data->mv_size);
+    } else {
+      /* Node stores the overflow page number, the payload goes into the
+       * data area of the overflow page. */
+      memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t));
+      ndata = PAGEDATA(ofp);
+      if (F_ISSET(flags, MDB_RESERVE))
+        data->mv_data = ndata;
+      else if (likely(ndata != data->mv_data))
+        memcpy(ndata, data->mv_data, data->mv_size);
+    }
+  }
+
+  return MDB_SUCCESS;
+
+full:
+  /* Not enough room: log the details and mark the transaction as failed. */
+  mdbx_debug("not enough room in page %zu, got %u ptrs", mdbx_dbg_pgno(mp),
+             NUMKEYS(mp));
+  mdbx_debug("upper-lower = %u - %u = %zd", mp->mp_upper, mp->mp_lower, room);
+  mdbx_debug("node size = %zu", node_size);
+  mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
+  return MDB_PAGE_FULL;
+}
+
+/** Delete the specified node from a page.
+ * The node removed is the one at the cursor's current index on the page
+ * at the top of the cursor's page stack.
+ * @param[in] mc Cursor pointing to the node to delete.
+ * @param[in] ksize The size of a node. Only used if the page is
+ * part of a #MDB_DUPFIXED database.
+ */
+static void mdbx_node_del(MDB_cursor *mc, int ksize) {
+  MDB_page *mp = mc->mc_pg[mc->mc_top];
+  indx_t indx = mc->mc_ki[mc->mc_top];
+  unsigned sz;
+  indx_t i, j, numkeys, ptr;
+  MDB_node *node;
+  char *base;
+
+  mdbx_debug("delete node %u on %s page %zu", indx,
+             IS_LEAF(mp) ? "leaf" : "branch", mdbx_dbg_pgno(mp));
+  numkeys = NUMKEYS(mp);
+  mdbx_cassert(mc, indx < numkeys);
+
+  if (IS_LEAF2(mp)) {
+    /* Fixed-size keys: close the gap by moving the following keys down. */
+    int x = numkeys - 1 - indx;
+    base = LEAF2KEY(mp, indx, ksize);
+    if (x)
+      memmove(base, base + ksize, x * ksize);
+    mp->mp_lower -= sizeof(indx_t);
+    mp->mp_upper += ksize - sizeof(indx_t);
+    return;
+  }
+
+  /* Size of the node being removed: header + key + (for leaves) either the
+   * inline data or the overflow page number, rounded up to even. */
+  node = NODEPTR(mp, indx);
+  sz = NODESIZE + node->mn_ksize;
+  if (IS_LEAF(mp)) {
+    if (F_ISSET(node->mn_flags, F_BIGDATA))
+      sz += sizeof(pgno_t);
+    else
+      sz += NODEDSZ(node);
+  }
+  sz = EVEN(sz);
+
+  /* Drop the slot from the pointer array; nodes stored at lower offsets
+   * than the removed one are about to move up by sz bytes. */
+  ptr = mp->mp_ptrs[indx];
+  for (i = j = 0; i < numkeys; i++) {
+    if (i != indx) {
+      mp->mp_ptrs[j] = mp->mp_ptrs[i];
+      if (mp->mp_ptrs[i] < ptr)
+        mp->mp_ptrs[j] += sz;
+      j++;
+    }
+  }
+
+  /* Slide the node data located below the removed node over it. */
+  base = (char *)mp + mp->mp_upper + PAGEBASE;
+  memmove(base + sz, base, ptr - mp->mp_upper);
+
+  mp->mp_lower -= sizeof(indx_t);
+  mp->mp_upper += sz;
+}
+
+/** Compact the main page after deleting a node on a subpage.
+ * @param[in] mp The main page to operate on.
+ * @param[in] indx The index of the subpage on the main page.
+ */
+static void mdbx_node_shrink(MDB_page *mp, indx_t indx) {
+  MDB_node *node;
+  MDB_page *sp, *xp;
+  char *base;
+  indx_t delta, nsize, len, ptr;
+  int i;
+
+  /* sp: the sub-page stored as the node's data. delta: its unused bytes,
+   * i.e. how much the node can shrink. nsize: the node data size after
+   * shrinking. */
+  node = NODEPTR(mp, indx);
+  sp = (MDB_page *)NODEDATA(node);
+  delta = SIZELEFT(sp);
+  nsize = NODEDSZ(node) - delta;
+
+  /* Prepare to shift upward, set len = length(subpage part to shift) */
+  if (IS_LEAF2(sp)) {
+    len = nsize;
+    if (nsize & 1)
+      return; /* do not make the node uneven-sized */
+  } else {
+    xp = (MDB_page *)((char *)sp + delta); /* destination subpage */
+    /* Pre-adjust the node offsets as seen from the sub-page's new position
+     * (delta bytes further up). */
+    for (i = NUMKEYS(sp); --i >= 0;)
+      xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
+    len = PAGEHDRSZ;
+  }
+  /* The sub-page has no free space left once compacted. */
+  sp->mp_upper = sp->mp_lower;
+  COPY_PGNO(sp->mp_pgno, mp->mp_pgno);
+  SETDSZ(node, nsize);
+
+  /* Shift upward */
+  base = (char *)mp + mp->mp_upper + PAGEBASE;
+  memmove(base + delta, base, (char *)sp + len - base);
+
+  /* Every node at or below the shrunk node's offset moved by delta. */
+  ptr = mp->mp_ptrs[indx];
+  for (i = NUMKEYS(mp); --i >= 0;) {
+    if (mp->mp_ptrs[i] <= ptr)
+      mp->mp_ptrs[i] += delta;
+  }
+  mp->mp_upper += delta;
+}
+
+/** Initial setup of a sorted-dups cursor.
+ * Sorted duplicates are implemented as a sub-database for the given key.
+ * The duplicate data items are actually keys of the sub-database.
+ * Operations on the duplicate data items are performed using a sub-cursor
+ * initialized when the sub-database is first accessed. This function does
+ * the preliminary setup of the sub-cursor, filling in the fields that
+ * depend only on the parent DB.
+ * @param[in] mc The main cursor whose sorted-dups cursor is to be
+ * initialized.
+ */ +static void mdbx_xcursor_init0(MDB_cursor *mc) { + MDB_xcursor *mx = mc->mc_xcursor; + + mx->mx_cursor.mc_xcursor = NULL; /* the sub-cursor gets no nested xcursor of its own */ + mx->mx_cursor.mc_txn = mc->mc_txn; + mx->mx_cursor.mc_db = &mx->mx_db; /* db, dbx and dbflag point at storage inside the xcursor itself */ + mx->mx_cursor.mc_dbx = &mx->mx_dbx; + mx->mx_cursor.mc_dbi = mc->mc_dbi; + mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + mx->mx_dbx.md_name.mv_size = 0; + mx->mx_dbx.md_name.mv_data = NULL; + mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; /* keys of the sub-DB are compared with the md_dcmp of the main DB */ + mx->mx_dbx.md_dcmp = NULL; + mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; +} + +/** Final setup of a sorted-dups cursor. + * Sets up the fields that depend on the data from the main cursor. + * @param[in] mc The main cursor whose sorted-dups cursor is to be + * initialized. + * @param[in] node The data containing the #MDB_db record for the + * sorted-dup database. + */ +static void mdbx_xcursor_init1(MDB_cursor *mc, MDB_node *node) { + MDB_xcursor *mx = mc->mc_xcursor; + + if (node->mn_flags & F_SUBDATA) { /* node data is an MDB_db record: copy it and leave the sub-cursor unpositioned */ + memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); + mx->mx_cursor.mc_pg[0] = 0; + mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_SUB; + } else { /* otherwise the node data is used as a page (fp), described below as a depth-1 tree with one leaf page */ + MDB_page *fp = NODEDATA(node); + mx->mx_db.md_xsize = 0; + mx->mx_db.md_flags = 0; + mx->mx_db.md_depth = 1; + mx->mx_db.md_branch_pages = 0; + mx->mx_db.md_leaf_pages = 1; + mx->mx_db.md_overflow_pages = 0; + mx->mx_db.md_entries = NUMKEYS(fp); + COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); + mx->mx_cursor.mc_snum = 1; /* the sub-cursor is positioned on index 0 of fp right away */ + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_INITIALIZED | C_SUB; + mx->mx_cursor.mc_pg[0] = fp; + mx->mx_cursor.mc_ki[0] = 0; + if (mc->mc_db->md_flags & MDB_DUPFIXED) { /* fixed-size dups: the item size is taken from mp_leaf2_ksize of fp */ + mx->mx_db.md_flags = MDB_DUPFIXED; + mx->mx_db.md_xsize = fp->mp_leaf2_ksize; + if (mc->mc_db->md_flags & MDB_INTEGERDUP) + mx->mx_db.md_flags |= MDB_INTEGERKEY; + } + } + mdbx_debug("Sub-db -%u root page %zu", mx->mx_cursor.mc_dbi, + mx->mx_db.md_root); + mx->mx_dbflag = 
DB_VALID | DB_USRVALID | DB_DUPDATA; + /* #if UINT_MAX < SIZE_MAX + if (mx->mx_dbx.md_cmp == mdbx_cmp_int && mx->mx_db.md_pad == + sizeof(size_t)) + mx->mx_dbx.md_cmp = mdbx_cmp_clong; + #endif */ +} + +/** Fixup a sorted-dups cursor due to underlying update. + * Sets up some fields that depend on the data from the main cursor. + * Almost the same as init1, but skips initialization steps if the + * xcursor had already been used. + * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. + * @param[in] src_mx The xcursor of an up-to-date cursor. + * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. + */ +static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, + int new_dupdata) { + MDB_xcursor *mx = mc->mc_xcursor; + + if (new_dupdata) { /* start the sub-cursor at index 0 of a one-level page stack */ + mx->mx_cursor.mc_snum = 1; + mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags |= C_INITIALIZED; + mx->mx_cursor.mc_ki[0] = 0; + mx->mx_dbflag = DB_VALID | DB_USRVALID | DB_DUPDATA; + mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; + } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) { + return; /* sub-cursor was never positioned: nothing to refresh */ + } + mx->mx_db = src_mx->mx_db; /* adopt the up-to-date sub-DB record and page pointer from src_mx */ + mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; + mdbx_debug("Sub-db -%u root page %zu", mx->mx_cursor.mc_dbi, + mx->mx_db.md_root); +} + +/** Initialize a cursor for a given transaction and database. 
+ */ +static void mdbx_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, + MDB_xcursor *mx) { + mc->mc_signature = MDBX_MC_SIGNATURE; + mc->mc_next = NULL; + mc->mc_backup = NULL; + mc->mc_dbi = dbi; + mc->mc_txn = txn; + mc->mc_db = &txn->mt_dbs[dbi]; + mc->mc_dbx = &txn->mt_dbxs[dbi]; + mc->mc_dbflag = &txn->mt_dbflags[dbi]; + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_pg[0] = 0; + mc->mc_flags = 0; + mc->mc_ki[0] = 0; + mc->mc_xcursor = NULL; + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { /* only DUPSORT databases get the xcursor attached */ + mdbx_tassert(txn, mx != NULL); + mx->mx_cursor.mc_signature = MDBX_MC_SIGNATURE; + mc->mc_xcursor = mx; + mdbx_xcursor_init0(mc); + } + if (unlikely(*mc->mc_dbflag & DB_STALE)) { + mdbx_page_search(mc, NULL, MDB_PS_ROOTONLY); /* presumably refreshes the stale DB record; the return code is ignored here */ + } +} +/** Allocate a cursor on \b dbi within \b txn and store it in \b *ret. + * The cursor comes from malloc(); when the DB has #MDB_DUPSORT the same + * allocation also holds the MDB_xcursor, placed right after the MDB_cursor. + * If the txn has an mt_cursors array the cursor is linked at the head of + * mt_cursors[dbi] and flagged #C_UNTRACK. + * @return #MDB_SUCCESS, or EINVAL / #MDB_VERSION_MISMATCH / #MDB_BAD_TXN / + * ENOMEM from the checks below; \b *ret is written only on success. + */ +int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { + MDB_cursor *mc; + size_t size = sizeof(MDB_cursor); + + if (unlikely(!ret || !txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) + return EINVAL; + + if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + /* FREE_DBI may only be opened from a read-only txn */ if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) + return EINVAL; + + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) + size += sizeof(MDB_xcursor); + + if (likely((mc = malloc(size)) != NULL)) { + mdbx_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); /* mc + 1 is only used as the xcursor for DUPSORT databases */ + if (txn->mt_cursors) { + mc->mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = mc; + mc->mc_flags |= C_UNTRACK; + } + } else { + return ENOMEM; + } + + *ret = mc; + + return MDB_SUCCESS; +} +/** Re-initialize the existing cursor \b mc for use with \b txn, keeping its dbi. + * A cursor with a pending mc_backup is rejected with EINVAL. If the cursor is + * flagged #C_UNTRACK or \b txn has an mt_cursors array, the call fails with + * EINVAL unless MDBX_MODE_ENABLED, in which case the cursor is first unlinked + * from the cursor list of the txn it currently belongs to. + * NOTE(review): that unlink walks mc->mc_txn->mt_cursors even when only + * txn->mt_cursors triggered the branch, and the re-initialized cursor is not + * linked into txn->mt_cursors here - confirm both are intended. + * @return #MDB_SUCCESS, or EINVAL / #MDB_VERSION_MISMATCH / #MDB_BAD_TXN. + */ +int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { + if (unlikely(!mc || !txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE && + mc->mc_signature != MDBX_MC_READY4CLOSE)) + return EINVAL; + + if (unlikely(!TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID))) + 
return EINVAL; + + if (unlikely(mc->mc_backup)) + return EINVAL; + + if (unlikely((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)) { +#if MDBX_MODE_ENABLED + MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + while (*prev && *prev != mc) + prev = &(*prev)->mc_next; + if (*prev == mc) + *prev = mc->mc_next; + mc->mc_signature = MDBX_MC_READY4CLOSE; /* stays this way if the MDB_BAD_TXN check below fails; otherwise mdbx_cursor_init restores MDBX_MC_SIGNATURE */ +#else + return EINVAL; +#endif + } + + if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + mdbx_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); + return MDB_SUCCESS; +} + +/* Return the count of duplicate data items for the current key. + * With MDBX_MODE_ENABLED a cursor without an xcursor, or an item without + * #F_DUPDATA, reports 1, and \b *countp is zeroed when #MDB_NOTFOUND is + * returned; otherwise a cursor without an xcursor gets #MDB_INCOMPATIBLE. + * An uninitialized cursor yields EINVAL in both modes. */ +int mdbx_cursor_count(MDB_cursor *mc, size_t *countp) { + if (unlikely(mc == NULL || countp == NULL)) + return EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) + return EINVAL; + +#if MDBX_MODE_ENABLED + if (!mc->mc_snum) { + *countp = 0; + return MDB_NOTFOUND; + } + + MDB_page *mp = mc->mc_pg[mc->mc_top]; + if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)) { + *countp = 0; + return MDB_NOTFOUND; + } + + *countp = 1; /* default for items that carry no duplicates */ + if (mc->mc_xcursor != NULL) { + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & + C_INITIALIZED)); + *countp = mc->mc_xcursor->mx_db.md_entries; + } + } +#else + if (unlikely(mc->mc_xcursor == NULL)) + return MDB_INCOMPATIBLE; + + if (!mc->mc_snum) + return MDB_NOTFOUND; + + MDB_page *mp = mc->mc_pg[mc->mc_top]; + if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)) + return MDB_NOTFOUND; + + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { + *countp = 1; + } else { + if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) + return EINVAL; + *countp = 
mc->mc_xcursor->mx_db.md_entries; + } +#endif /* MDBX_MODE_ENABLED */ + return MDB_SUCCESS; +} +/** Close a cursor; a NULL \b mc is ignored. + * Without a pending mc_backup the cursor is unlinked from the list of its + * txn (when tracked), its signature is cleared and the memory is freed. + * With mc_backup set it is only marked #MDBX_MC_WAIT4EOT and is not + * freed here. + */ +void mdbx_cursor_close(MDB_cursor *mc) { + if (mc) { + mdbx_ensure(NULL, mc->mc_signature == MDBX_MC_SIGNATURE || + mc->mc_signature == MDBX_MC_READY4CLOSE); + if (!mc->mc_backup) { + /* Remove from txn, if tracked. + * A read-only txn (!C_UNTRACK) may have been freed already, + * so do not peek inside it. Only write txns track cursors. */ + if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { + MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + while (*prev && *prev != mc) + prev = &(*prev)->mc_next; + if (*prev == mc) + *prev = mc->mc_next; + } + mc->mc_signature = 0; /* invalidate the signature before the memory is released */ + free(mc); + } else { + /* cursor closed before nested txn ends */ + mdbx_cassert(mc, mc->mc_signature == MDBX_MC_SIGNATURE); + mc->mc_signature = MDBX_MC_WAIT4EOT; + } + } +} +/** Return the txn of \b mc, or NULL if \b mc is NULL or its signature is + * not #MDBX_MC_SIGNATURE. + */ +MDB_txn *mdbx_cursor_txn(MDB_cursor *mc) { + if (unlikely(!mc || mc->mc_signature != MDBX_MC_SIGNATURE)) + return NULL; + return mc->mc_txn; +} +/** Return the dbi of \b mc; INT_MIN (converted to MDB_dbi) is returned if + * \b mc is NULL or its signature is not #MDBX_MC_SIGNATURE. + */ +MDB_dbi mdbx_cursor_dbi(MDB_cursor *mc) { + if (unlikely(!mc || mc->mc_signature != MDBX_MC_SIGNATURE)) + return INT_MIN; + return mc->mc_dbi; +} + +/** Replace the key for a branch node with a new key. + * Set #MDB_TXN_ERROR on failure. + * @param[in] mc Cursor pointing to the node to operate on. + * @param[in] key The new key to use. + * @return 0 on success, non-zero on failure. 
+ */ +static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { + MDB_page *mp; + MDB_node *node; + char *base; + size_t len; + int delta, ksize, oksize; + indx_t ptr, i, numkeys, indx; + DKBUF; + + indx = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + node = NODEPTR(mp, indx); + ptr = mp->mp_ptrs[indx]; /* page offset of the node being re-keyed */ + { + MDB_val k2; + char kbuf2[DKBUF_MAXKEYSIZE * 2 + 1]; + k2.mv_data = NODEKEY(node); + k2.mv_size = node->mn_ksize; + mdbx_debug("update key %u (ofs %u) [%s] to [%s] on page %zu", indx, ptr, + mdbx_dkey(&k2, kbuf2), DKEY(key), mp->mp_pgno); + } + + /* Sizes must be 2-byte aligned. */ + ksize = EVEN(key->mv_size); + oksize = EVEN(node->mn_ksize); + delta = ksize - oksize; + + /* Shift node contents if EVEN(key length) changed. */ + if (delta) { + if (delta > 0 && SIZELEFT(mp) < delta) { + pgno_t pgno; + /* not enough space left, do a delete and split */ + mdbx_debug("Not enough room, delta = %d, splitting...", delta); + pgno = NODEPGNO(node); /* saved before the node is deleted; the split re-inserts it under the new key */ + mdbx_node_del(mc, 0); + return mdbx_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); + } + + numkeys = NUMKEYS(mp); + for (i = 0; i < numkeys; i++) { + if (mp->mp_ptrs[i] <= ptr) + mp->mp_ptrs[i] -= delta; /* nodes at offsets up to ptr lie in the region moved by the memmove below */ + } + + base = (char *)mp + mp->mp_upper + PAGEBASE; + len = ptr - mp->mp_upper + NODESIZE; + memmove(base - delta, base, len); + mp->mp_upper -= delta; + + node = NODEPTR(mp, indx); /* the node itself moved: fetch it again */ + } + + /* But even if no shift was needed, update ksize */ + if (node->mn_ksize != key->mv_size) + node->mn_ksize = key->mv_size; + + if (key->mv_size) + memcpy(NODEKEY(node), key->mv_data, key->mv_size); + + return MDB_SUCCESS; +} + +static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); + +/** Perform \b act while tracking temporary cursor \b mn. + * \b mn - or, for a #C_SUB cursor, a dummy main cursor whose mc_xcursor + * points at \b mn - is pushed on the head of mt_cursors[dbi] of its txn + * before \b act and popped again afterwards. + * NOTE(review): mn.mc_dbi is the one use of the argument without + * parentheses. */ +#define WITH_CURSOR_TRACKING(mn, act) \ + do { \ + MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ + if ((mn).mc_flags & C_SUB) { \ + dummy.mc_flags = C_INITIALIZED; \ + dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ + tracked = &dummy; \ + } 
else { \ + tracked = &(mn); \ + } \ + tracked->mc_next = *tp; \ + *tp = tracked; \ + { act; } \ + *tp = tracked->mc_next; \ + } while (0) + +/** Move a node from csrc to cdst. + * The node at the current index of csrc is added at the current index of + * cdst and removed from csrc; other tracked cursors and the parent + * separator keys are adjusted afterwards. + * @param[in] csrc Cursor positioned on the node to move. + * @param[in] cdst Cursor positioned on the insertion slot. + * @param[in] fromleft Non-zero selects the "adding on the left" cursor + * fixups, zero the "adding on the right" ones. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { + MDB_node *srcnode; + MDB_val key, data; + pgno_t srcpg; + MDB_cursor mn; + int rc; + unsigned short flags; + + DKBUF; + + /* Mark src and dst as dirty. */ + if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) + return rc; + + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { /* LEAF2 pages hold bare fixed-size keys: no data, page number or flags to carry over */ + key.mv_size = csrc->mc_db->md_xsize; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], + key.mv_size); + data.mv_size = 0; + data.mv_data = NULL; + srcpg = 0; + flags = 0; + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]); + mdbx_cassert(csrc, !((size_t)srcnode & 1)); + srcpg = NODEPGNO(srcnode); + flags = srcnode->mn_flags; + if (csrc->mc_ki[csrc->mc_top] == 0 && + IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + unsigned snum = csrc->mc_snum; + MDB_node *s2; + /* must find the lowest key below src */ + rc = mdbx_page_search_lowest(csrc); + if (unlikely(rc)) + return rc; + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_size = csrc->mc_db->md_xsize; + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + csrc->mc_snum = snum--; /* restore the stack height that the descent above changed */ + csrc->mc_top = snum; + } else { + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + } + mn.mc_xcursor = NULL; /* mdbx_cursor_copy does not set mc_xcursor, so it is cleared once here */ + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { + unsigned snum = cdst->mc_snum; + MDB_node *s2; + MDB_val bkey; + /* must find the lowest key below dst */ + mdbx_cursor_copy(cdst, &mn); + rc = mdbx_page_search_lowest(&mn); + if (unlikely(rc)) + return rc; + if 
(IS_LEAF2(mn.mc_pg[mn.mc_top])) { + bkey.mv_size = mn.mc_db->md_xsize; + bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); + } else { + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); + bkey.mv_size = NODEKSZ(s2); + bkey.mv_data = NODEKEY(s2); + } + mn.mc_snum = snum--; + mn.mc_top = snum; + mn.mc_ki[snum] = 0; + rc = mdbx_update_key(&mn, &bkey); /* give the index-0 branch node of dst the explicit key found above */ + if (unlikely(rc)) + return rc; + } + + mdbx_debug("moving %s node %u [%s] on page %zu to node %u on page %zu", + IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", + csrc->mc_ki[csrc->mc_top], DKEY(&key), + csrc->mc_pg[csrc->mc_top]->mp_pgno, cdst->mc_ki[cdst->mc_top], + cdst->mc_pg[cdst->mc_top]->mp_pgno); + + /* Add the node to the destination page. */ + rc = + mdbx_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + + /* Delete the node from the source page. */ + mdbx_node_del(csrc, key.mv_size); + + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + MDB_page *mpd, *mps; + + mps = csrc->mc_pg[csrc->mc_top]; + /* If we're adding on the left, bump others up */ + if (fromleft) { + mpd = cdst->mc_pg[csrc->mc_top]; + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; + if (m3 != cdst && m3->mc_pg[csrc->mc_top] == mpd && + m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { + m3->mc_ki[csrc->mc_top]++; /* on dst at or after the insertion slot: shift right by one */ + } + if (m3 != csrc && m3->mc_pg[csrc->mc_top] == mps && + m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; /* was on the moved node: follow it to dst */ + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + m3->mc_ki[csrc->mc_top - 1]++; + } + if (XCURSOR_INITED(m3) && IS_LEAF(mps)) + XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); + } + } else + /* Adding on the right, bump others 
down */ + { + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) + continue; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; + if (m3->mc_pg[csrc->mc_top] == mps) { + if (!m3->mc_ki[csrc->mc_top]) { /* index 0 of the source page: presumably the moved node, so follow it to dst */ + m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; + m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; + m3->mc_ki[csrc->mc_top - 1]--; + } else { + m3->mc_ki[csrc->mc_top]--; + } + if (XCURSOR_INITED(m3) && IS_LEAF(mps)) + XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], + m3->mc_ki[csrc->mc_top]); + } + } + } + } + + /* Update the parent separators. */ + if (csrc->mc_ki[csrc->mc_top] == 0) { + if (csrc->mc_ki[csrc->mc_top - 1] != 0) { + if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); + } + mdbx_debug("update separator for source page %zu to [%s]", + csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)); + mdbx_cursor_copy(csrc, &mn); + mn.mc_snum--; /* step the copy up to the parent level */ + mn.mc_top--; + /* We want mdbx_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { + MDB_val nullkey; + indx_t ix = csrc->mc_ki[csrc->mc_top]; + nullkey.mv_size = 0; + csrc->mc_ki[csrc->mc_top] = 0; + rc = mdbx_update_key(csrc, &nullkey); /* the index-0 node of a branch page gets a zero-length key */ + csrc->mc_ki[csrc->mc_top] = ix; + mdbx_cassert(csrc, rc == MDB_SUCCESS); /* rc is only asserted here, not returned */ + } + } + + if (cdst->mc_ki[cdst->mc_top] == 0) { + if (cdst->mc_ki[cdst->mc_top - 1] != 0) { + /* NOTE(review): the test below looks at the csrc page while the key is read from cdst - confirm both pages always share the LEAF2 property */ if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { + key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); + } else { + srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); + key.mv_size = NODEKSZ(srcnode); + key.mv_data = NODEKEY(srcnode); 
+ } + mdbx_debug("update separator for destination page %zu to [%s]", + cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)); + mdbx_cursor_copy(cdst, &mn); + mn.mc_snum--; /* step the copy up to the parent level */ + mn.mc_top--; + /* We want mdbx_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { + MDB_val nullkey; + indx_t ix = cdst->mc_ki[cdst->mc_top]; + nullkey.mv_size = 0; + cdst->mc_ki[cdst->mc_top] = 0; + rc = mdbx_update_key(cdst, &nullkey); /* the index-0 node of a branch page gets a zero-length key */ + cdst->mc_ki[cdst->mc_top] = ix; + mdbx_cassert(cdst, rc == MDB_SUCCESS); /* rc is only asserted here, not returned */ + } + } + + return MDB_SUCCESS; +} + +/** Merge one page into another. + * The nodes from the page pointed to by \b csrc will + * be copied to the page pointed to by \b cdst and then + * the \b csrc page will be freed. + * @param[in] csrc Cursor pointing to the source page. + * @param[in] cdst Cursor pointing to the destination page. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { + MDB_page *psrc, *pdst; + MDB_node *srcnode; + MDB_val key, data; + unsigned nkeys; + int rc; + indx_t i, j; + + psrc = csrc->mc_pg[csrc->mc_top]; + pdst = cdst->mc_pg[cdst->mc_top]; + + mdbx_debug("merging page %zu into %zu", psrc->mp_pgno, pdst->mp_pgno); + + mdbx_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ + mdbx_cassert(csrc, cdst->mc_snum > 1); + + /* Mark dst as dirty. */ + if (unlikely(rc = mdbx_page_touch(cdst))) + return rc; + + /* get dst page again now that we've touched it. */ + pdst = cdst->mc_pg[cdst->mc_top]; + + /* Move all nodes from src to dst. 
+ */ + j = nkeys = NUMKEYS(pdst); /* nkeys: key count of dst before the merge; source nodes are appended from index j */ + if (IS_LEAF2(psrc)) { + key.mv_size = csrc->mc_db->md_xsize; + key.mv_data = PAGEDATA(psrc); + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + rc = mdbx_node_add(cdst, j, &key, NULL, 0, 0); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + key.mv_data = (char *)key.mv_data + key.mv_size; /* advance to the next fixed-size key */ + } + } else { + for (i = 0; i < NUMKEYS(psrc); i++, j++) { + srcnode = NODEPTR(psrc, i); + if (i == 0 && IS_BRANCH(psrc)) { + MDB_cursor mn; + MDB_node *s2; + mdbx_cursor_copy(csrc, &mn); + mn.mc_xcursor = NULL; + /* must find the lowest key below src */ + rc = mdbx_page_search_lowest(&mn); + if (unlikely(rc)) + return rc; + if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { + key.mv_size = mn.mc_db->md_xsize; + key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); + } else { + s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); + key.mv_size = NODEKSZ(s2); + key.mv_data = NODEKEY(s2); + } + } else { + key.mv_size = srcnode->mn_ksize; + key.mv_data = NODEKEY(srcnode); + } + + data.mv_size = NODEDSZ(srcnode); + data.mv_data = NODEDATA(srcnode); + rc = mdbx_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), + srcnode->mn_flags); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } + } + + mdbx_debug("dst page %zu now has %u keys (%.1f%% filled)", pdst->mp_pgno, + NUMKEYS(pdst), (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10); + + /* Unlink the src page from parent and add to free list. + */ + csrc->mc_top--; /* temporarily operate on the parent level; restored by the mc_top++ below */ + mdbx_node_del(csrc, 0); + if (csrc->mc_ki[csrc->mc_top] == 0) { + key.mv_size = 0; + rc = mdbx_update_key(csrc, &key); + if (unlikely(rc)) { + csrc->mc_top++; + return rc; + } + } + csrc->mc_top++; + + psrc = csrc->mc_pg[csrc->mc_top]; + /* If not operating on FreeDB, allow this page to be reused + * in this txn. Otherwise just add to free list. 
+ */ + rc = mdbx_page_loose(csrc, psrc); + if (unlikely(rc)) + return rc; + if (IS_LEAF(psrc)) + csrc->mc_db->md_leaf_pages--; + else + csrc->mc_db->md_branch_pages--; + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = csrc->mc_dbi; + unsigned top = csrc->mc_top; + + for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + if (csrc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == csrc) + continue; + if (m3->mc_snum < csrc->mc_snum) + continue; + if (m3->mc_pg[top] == psrc) { + m3->mc_pg[top] = pdst; + m3->mc_ki[top] += nkeys; /* merged nodes were appended after the original nkeys entries of dst */ + m3->mc_ki[top - 1] = cdst->mc_ki[top - 1]; + } else if (m3->mc_pg[top - 1] == csrc->mc_pg[top - 1] && + m3->mc_ki[top - 1] > csrc->mc_ki[top - 1]) { + m3->mc_ki[top - 1]--; /* same parent, to the right of the removed slot: shift left by one */ + } + if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) + XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]); + } + } + { + unsigned snum = cdst->mc_snum; + uint16_t depth = cdst->mc_db->md_depth; + mdbx_cursor_pop(cdst); /* presumably moves cdst up to the parent level, which just lost a node */ + rc = mdbx_rebalance(cdst); + /* Did the tree height change? */ + if (depth != cdst->mc_db->md_depth) + snum += cdst->mc_db->md_depth - depth; + cdst->mc_snum = snum; /* put cdst back on its original level, adjusted for the depth change */ + cdst->mc_top = snum - 1; + } + return rc; +} + +/** Copy the contents of a cursor. + * Only txn, dbi, db, dbx, snum, top, flags and the first mc_snum page/index + * entries are copied; every other field of \b cdst is left untouched. + * @param[in] csrc The cursor to copy from. + * @param[out] cdst The cursor to copy to. + */ +static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) { + unsigned i; + + cdst->mc_txn = csrc->mc_txn; + cdst->mc_dbi = csrc->mc_dbi; + cdst->mc_db = csrc->mc_db; + cdst->mc_dbx = csrc->mc_dbx; + cdst->mc_snum = csrc->mc_snum; + cdst->mc_top = csrc->mc_top; + cdst->mc_flags = csrc->mc_flags; + + for (i = 0; i < csrc->mc_snum; i++) { + cdst->mc_pg[i] = csrc->mc_pg[i]; + cdst->mc_ki[i] = csrc->mc_ki[i]; + } +} + +/** Rebalance the tree after a delete operation. + * @param[in] mc Cursor pointing to the page where rebalancing + * should begin. + * @return 0 on success, non-zero on failure. 
+ */ +static int mdbx_rebalance(MDB_cursor *mc) { + MDB_node *node; + int rc, fromleft; + unsigned ptop, minkeys, thresh; + MDB_cursor mn; + indx_t oldki; + + if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { + minkeys = 2; + thresh = 1; + } else { + minkeys = 1; + thresh = FILL_THRESHOLD; + } + mdbx_debug("rebalancing %s page %zu (has %u keys, %.1f%% full)", + IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", + mdbx_dbg_pgno(mc->mc_pg[mc->mc_top]), + NUMKEYS(mc->mc_pg[mc->mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10); + + if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && + NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { + mdbx_debug("no need to rebalance page %zu, above fill threshold", + mdbx_dbg_pgno(mc->mc_pg[mc->mc_top])); + return MDB_SUCCESS; + } + + if (mc->mc_snum < 2) { /* the cursor stack holds a single page: root (or sub-page) handling */ + MDB_page *mp = mc->mc_pg[0]; + if (IS_SUBP(mp)) { + mdbx_debug("Can't rebalance a subpage, ignoring"); + return MDB_SUCCESS; + } + if (NUMKEYS(mp) == 0) { + mdbx_debug("tree is completely empty"); + mc->mc_db->md_root = P_INVALID; + mc->mc_db->md_depth = 0; + mc->mc_db->md_leaf_pages = 0; + rc = mdbx_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (unlikely(rc)) + return rc; + /* Adjust cursors pointing to mp */ + mc->mc_snum = 0; + mc->mc_top = 0; + mc->mc_flags &= ~C_INITIALIZED; + { + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) + continue; /* NOTE(review): mc->mc_snum was zeroed just above, so the mc_snum comparison presumably never filters anything */ + if (m3->mc_pg[0] == mp) { + m3->mc_snum = 0; + m3->mc_top = 0; + m3->mc_flags &= ~C_INITIALIZED; + } + } + } + } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { + int i; + mdbx_debug("collapsing root page!"); + rc = mdbx_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (unlikely(rc)) + return rc; + mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); /* the only child becomes the new root */ + rc = mdbx_page_get(mc, 
mc->mc_db->md_root, &mc->mc_pg[0], NULL); + if (unlikely(rc)) + return rc; + mc->mc_db->md_depth--; + mc->mc_db->md_branch_pages--; /* the statements below shift the page stack of mc down by one level */ + mc->mc_ki[0] = mc->mc_ki[1]; + for (i = 1; i < mc->mc_db->md_depth; i++) { + mc->mc_pg[i] = mc->mc_pg[i + 1]; + mc->mc_ki[i] = mc->mc_ki[i + 1]; + } + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc) + continue; + if (!(m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_pg[0] == mp) { + for (i = 0; i < mc->mc_db->md_depth; i++) { + m3->mc_pg[i] = m3->mc_pg[i + 1]; + m3->mc_ki[i] = m3->mc_ki[i + 1]; + } + m3->mc_snum--; /* unlike mc itself, other cursors also get their stack height reduced here */ + m3->mc_top--; + } + } + } + } else + mdbx_debug("root page doesn't need rebalancing"); + return MDB_SUCCESS; + } + + /* The parent (branch page) must have at least 2 pointers, + * otherwise the tree is invalid. + */ + ptop = mc->mc_top - 1; + mdbx_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); + + /* Leaf page fill factor is below the threshold. + * Try to move keys from left or right neighbor, or + * merge with a neighbor page. + */ + + /* Find neighbors. + */ + mdbx_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; + + oldki = mc->mc_ki[mc->mc_top]; /* remembered so the index of mc can be restored, adjusted, at the end */ + if (mc->mc_ki[ptop] == 0) { + /* We're the leftmost leaf in our parent. + */ + mdbx_debug("reading right neighbor"); + mn.mc_ki[ptop]++; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdbx_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + if (unlikely(rc)) + return rc; + mn.mc_ki[mn.mc_top] = 0; /* source slot: first node of the right neighbour */ + mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); /* destination slot: append to the page of mc */ + fromleft = 0; + } else { + /* There is at least one neighbor to the left. 
+ */ + mdbx_debug("reading left neighbor"); + mn.mc_ki[ptop]--; + node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); + rc = mdbx_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); + if (unlikely(rc)) + return rc; + mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; /* source slot: last node of the left neighbour */ + mc->mc_ki[mc->mc_top] = 0; /* destination slot: front of the page of mc */ + fromleft = 1; + } + + mdbx_debug("found neighbor page %zu (%u keys, %.1f%% full)", + mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), + (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10); + + /* If the neighbor page is above threshold and has enough keys, + * move one key from it. Otherwise we should try to merge them. + * (A branch page must never have less than 2 keys.) + */ + if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && + NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { + rc = mdbx_node_move(&mn, mc, fromleft); + if (fromleft) { + /* if we inserted on left, bump position up */ + oldki++; + } + } else { + if (!fromleft) { + rc = mdbx_page_merge(&mn, mc); /* the right neighbour is merged into the page of mc */ + } else { + oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); /* the nodes of mc land after the existing ones of the left neighbour */ + mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; + /* We want mdbx_rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); + mdbx_cursor_copy(&mn, mc); /* mc takes over the position of mn on the merged page */ + } + mc->mc_flags &= ~C_EOF; + } + mc->mc_ki[mc->mc_top] = oldki; + return rc; +} + +/** Complete a delete operation started by #mdbx_cursor_del(). + * Deletes the node at the cursor, fixes up other tracked cursors on the + * same page, rebalances, and flags the cursor #C_DEL. On failure + * #MDB_TXN_ERROR is set on the txn. + * @param[in] mc Cursor positioned on the node to delete. + * @return 0 on success, non-zero on failure. */ +static int mdbx_cursor_del0(MDB_cursor *mc) { + int rc; + MDB_page *mp; + indx_t ki; + unsigned nkeys; + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + mdbx_node_del(mc, mc->mc_db->md_xsize); + mc->mc_db->md_entries--; + { + /* Adjust other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3 == mc || m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] == ki) { /* this cursor was sitting on the deleted node */ + m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDB_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + } + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; /* nodes after the deleted one moved down by one */ + } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + } + } + } + rc = mdbx_rebalance(mc); + + if (likely(rc == MDB_SUCCESS)) { + /* DB is totally empty now, just bail out. + * Other cursors adjustments were already done + * by mdbx_rebalance and aren't needed here. + */ + if (!mc->mc_snum) + return rc; + + mp = mc->mc_pg[mc->mc_top]; + nkeys = NUMKEYS(mp); + + /* Adjust other cursors pointing to mp */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2 = m2->mc_next) { + m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdbx_cursor_sibling(m3, 1); /* NOTE(review): any error other than MDB_NOTFOUND falls through to the node access below; only the loop condition stops on it - confirm */ + if (rc == MDB_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDB_SUCCESS; + continue; + } + } + if (mc->mc_db->md_flags & MDB_DUPSORT) { + MDB_node *node = + NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node is a fake page, it needs to be reinited + * because its data has moved. But just reset mc_pg[0] + * if the xcursor is already live. 
+ */ + if ((node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + else + mdbx_xcursor_init1(m3, node); + } + } + } + } + } + mc->mc_flags |= C_DEL; + } + + if (unlikely(rc)) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} +/** Delete \b key from database \b dbi; with non-NULL \b data the item is + * looked up by both key and data before it is deleted. + * When MDBX_MODE_ENABLED is off, \b data is ignored for databases without + * #MDB_DUPSORT. + * @return 0 on success; EINVAL, #MDB_VERSION_MISMATCH, EACCES (read-only + * txn) or #MDB_BAD_TXN from the checks below, else the result of mdbx_del0(). + */ +int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { + if (unlikely(!key || !txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + +#if !MDBX_MODE_ENABLED + if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { + /* must ignore any data */ + data = NULL; + } +#endif + + return mdbx_del0(txn, dbi, key, data, 0); +} +/** Worker for mdbx_del(): look the item up with a stack cursor and delete it. + * With \b data the lookup uses #MDB_GET_BOTH on a private copy of \b *data; + * without it #MDB_SET is used and #MDB_NODUPDATA is added to \b flags. + * The stack cursor is linked into txn->mt_cursors[dbi] only for the + * duration of the delete. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + unsigned flags) { + MDB_cursor mc; + MDB_xcursor mx; + MDB_cursor_op op; + MDB_val rdata; + int rc, exact = 0; + DKBUF; + + mdbx_debug("====> delete db %u key [%s]", dbi, DKEY(key)); + + mdbx_cursor_init(&mc, txn, dbi, &mx); + + if (data) { + op = MDB_GET_BOTH; + rdata = *data; /* work on a copy, presumably because mdbx_cursor_set may overwrite it */ + data = &rdata; + } else { + op = MDB_SET; + flags |= MDB_NODUPDATA; + } + rc = mdbx_cursor_set(&mc, key, data, op, &exact); + if (likely(rc == 0)) { + /* let mdbx_page_split know about this cursor if needed: + * delete will trigger a rebalance; if it needs to move + * a node from one page to another, it will have to + * update the parent's separator key(s). If the new sepkey + * is larger than the current one, the parent page may + * run out of space, triggering a split. We need this + * cursor to be consistent until the end of the rebalance. 
+ */ + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdbx_cursor_del(&mc, flags); + txn->mt_cursors[dbi] = mc.mc_next; + } + return rc; +} + +/** Split a page and insert a new node. + * Set #MDB_TXN_ERROR on failure. + * @param[in,out] mc Cursor pointing to the page and desired insertion index. + * The cursor will be updated to point to the actual page and index where + * the node got inserted after the split. + * @param[in] newkey The key for the newly inserted node. + * @param[in] newdata The data for the newly inserted node. + * @param[in] newpgno The page number, if the new node is a branch node. + * @param[in] nflags The #NODE_ADD_FLAGS for the new node. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, + pgno_t newpgno, unsigned nflags) { + unsigned flags; + int rc = MDB_SUCCESS, new_root = 0, did_split = 0; + indx_t newindx; + pgno_t pgno = 0; + int i, j, split_indx, nkeys, pmax; + MDB_env *env = mc->mc_txn->mt_env; + MDB_node *node; + MDB_val sepkey, rkey, xdata, *rdata = &xdata; + MDB_page *copy = NULL; + MDB_page *mp, *rp, *pp; + int ptop; + MDB_cursor mn; + DKBUF; + + mp = mc->mc_pg[mc->mc_top]; + newindx = mc->mc_ki[mc->mc_top]; + nkeys = NUMKEYS(mp); + + mdbx_debug("-----> splitting %s page %zu and adding [%s] at index %i/%i", + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, DKEY(newkey), + mc->mc_ki[mc->mc_top], nkeys); + + /* Create a right sibling. */ + if ((rc = mdbx_page_new(mc, mp->mp_flags, 1, &rp))) + return rc; + rp->mp_leaf2_ksize = mp->mp_leaf2_ksize; + mdbx_debug("new right sibling: page %zu", rp->mp_pgno); + + /* Usually when splitting the root page, the cursor + * height is 1. But when called from mdbx_update_key, + * the cursor height may be greater because it walks + * up the stack while finding the branch slot to update. 
+ */ + if (mc->mc_top < 1) { + if ((rc = mdbx_page_new(mc, P_BRANCH, 1, &pp))) + goto done; + /* shift current top to make room for new parent */ + for (i = mc->mc_snum; i > 0; i--) { + mc->mc_pg[i] = mc->mc_pg[i - 1]; + mc->mc_ki[i] = mc->mc_ki[i - 1]; + } + mc->mc_pg[0] = pp; + mc->mc_ki[0] = 0; + mc->mc_db->md_root = pp->mp_pgno; + mdbx_debug("root split! new root = %zu", pp->mp_pgno); + new_root = mc->mc_db->md_depth++; + + /* Add left (implicit) pointer. */ + if (unlikely((rc = mdbx_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != + MDB_SUCCESS)) { + /* undo the pre-push */ + mc->mc_pg[0] = mc->mc_pg[1]; + mc->mc_ki[0] = mc->mc_ki[1]; + mc->mc_db->md_root = mp->mp_pgno; + mc->mc_db->md_depth--; + goto done; + } + mc->mc_snum++; + mc->mc_top++; + ptop = 0; + } else { + ptop = mc->mc_top - 1; + mdbx_debug("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno); + } + + mdbx_cursor_copy(mc, &mn); + mn.mc_xcursor = NULL; + mn.mc_pg[mn.mc_top] = rp; + mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1; + + if (nflags & MDB_APPEND) { + mn.mc_ki[mn.mc_top] = 0; + sepkey = *newkey; + split_indx = newindx; + nkeys = 0; + } else { + split_indx = (nkeys + 1) / 2; + + if (IS_LEAF2(rp)) { + char *split, *ins; + int x; + unsigned lsize, rsize, ksize; + /* Move half of the keys to the right sibling */ + x = mc->mc_ki[mc->mc_top] - split_indx; + ksize = mc->mc_db->md_xsize; + split = LEAF2KEY(mp, split_indx, ksize); + rsize = (nkeys - split_indx) * ksize; + lsize = (nkeys - split_indx) * sizeof(indx_t); + mp->mp_lower -= lsize; + rp->mp_lower += lsize; + mp->mp_upper += rsize - lsize; + rp->mp_upper -= rsize - lsize; + sepkey.mv_size = ksize; + if (newindx == split_indx) { + sepkey.mv_data = newkey->mv_data; + } else { + sepkey.mv_data = split; + } + if (x < 0) { + ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); + memcpy(rp->mp_ptrs, split, rsize); + sepkey.mv_data = rp->mp_ptrs; + memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); + memcpy(ins, newkey->mv_data, 
ksize); + mp->mp_lower += sizeof(indx_t); + mp->mp_upper -= ksize - sizeof(indx_t); + } else { + if (x) + memcpy(rp->mp_ptrs, split, x * ksize); + ins = LEAF2KEY(rp, x, ksize); + memcpy(ins, newkey->mv_data, ksize); + memcpy(ins + ksize, split + x * ksize, rsize - x * ksize); + rp->mp_lower += sizeof(indx_t); + rp->mp_upper -= ksize - sizeof(indx_t); + mc->mc_ki[mc->mc_top] = x; + } + } else { + int psize, nsize, k; + /* Maximum free space in an empty page */ + pmax = env->me_psize - PAGEHDRSZ; + if (IS_LEAF(mp)) + nsize = mdbx_leaf_size(env, newkey, newdata); + else + nsize = mdbx_branch_size(env, newkey); + nsize = EVEN(nsize); + + /* grab a page to hold a temporary copy */ + copy = mdbx_page_malloc(mc->mc_txn, 1); + if (unlikely(copy == NULL)) { + rc = ENOMEM; + goto done; + } + copy->mp_pgno = mp->mp_pgno; + copy->mp_flags = mp->mp_flags; + copy->mp_lower = (PAGEHDRSZ - PAGEBASE); + copy->mp_upper = env->me_psize - PAGEBASE; + + /* prepare to insert */ + for (i = 0, j = 0; i < nkeys; i++) { + if (i == newindx) { + copy->mp_ptrs[j++] = 0; + } + copy->mp_ptrs[j++] = mp->mp_ptrs[i]; + } + + /* When items are relatively large the split point needs + * to be checked, because being off-by-one will make the + * difference between success or failure in mdbx_node_add. + * + * It's also relevant if a page happens to be laid out + * such that one half of its nodes are all "small" and + * the other half of its nodes are "large." If the new + * item is also "large" and falls on the half with + * "large" nodes, it also may not fit. + * + * As a final tweak, if the new item goes on the last + * spot on the page (and thus, onto the new page), bias + * the split so the new page is emptier than the old page. + * This yields better packing during sequential inserts. + */ + if (nkeys < 20 || nsize > pmax / 16 || newindx >= nkeys) { + /* Find split point */ + psize = 0; + if (newindx <= split_indx || newindx >= nkeys) { + i = 0; + j = 1; + k = newindx >= nkeys ? 
nkeys : split_indx + 1 + IS_LEAF(mp); + } else { + i = nkeys; + j = -1; + k = split_indx - 1; + } + for (; i != k; i += j) { + if (i == newindx) { + psize += nsize; + node = NULL; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); + if (IS_LEAF(mp)) { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + psize += sizeof(pgno_t); + else + psize += NODEDSZ(node); + } + psize = EVEN(psize); + } + if (psize > pmax || i == k - j) { + split_indx = i + (j < 0); + break; + } + } + } + if (split_indx == newindx) { + sepkey.mv_size = newkey->mv_size; + sepkey.mv_data = newkey->mv_data; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); + sepkey.mv_size = node->mn_ksize; + sepkey.mv_data = NODEKEY(node); + } + } + } + + mdbx_debug("separator is %d [%s]", split_indx, DKEY(&sepkey)); + + /* Copy separator key to the parent. */ + if (SIZELEFT(mn.mc_pg[ptop]) < mdbx_branch_size(env, &sepkey)) { + int snum = mc->mc_snum; + mn.mc_snum--; + mn.mc_top--; + did_split = 1; + /* We want other splits to find mn when doing fixups */ + WITH_CURSOR_TRACKING( + mn, rc = mdbx_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); + if (unlikely(rc != MDB_SUCCESS)) + goto done; + + /* root split? */ + if (mc->mc_snum > snum) { + ptop++; + } + /* Right page might now have changed parent. + * Check if left page also changed parent. 
+ */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i = 0; i < ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + mc->mc_pg[ptop] = mn.mc_pg[ptop]; + if (mn.mc_ki[ptop]) { + mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; + } else { + /* find right page's left sibling */ + mc->mc_ki[ptop] = mn.mc_ki[ptop]; + rc = mdbx_cursor_sibling(mc, 0); + } + } + } else { + mn.mc_top--; + rc = mdbx_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); + mn.mc_top++; + } + if (unlikely(rc != MDB_SUCCESS)) { + if (rc == MDB_NOTFOUND) /* improper mdbx_cursor_sibling() result */ + rc = MDB_PROBLEM; + goto done; + } + if (nflags & MDB_APPEND) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[mc->mc_top] = 0; + rc = mdbx_node_add(mc, 0, newkey, newdata, newpgno, nflags); + if (rc) + goto done; + for (i = 0; i < mc->mc_top; i++) + mc->mc_ki[i] = mn.mc_ki[i]; + } else if (!IS_LEAF2(mp)) { + /* Move nodes */ + mc->mc_pg[mc->mc_top] = rp; + i = split_indx; + j = 0; + do { + if (i == newindx) { + rkey.mv_data = newkey->mv_data; + rkey.mv_size = newkey->mv_size; + if (IS_LEAF(mp)) { + rdata = newdata; + } else + pgno = newpgno; + flags = nflags; + /* Update index for the new key. */ + mc->mc_ki[mc->mc_top] = j; + } else { + node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + rkey.mv_data = NODEKEY(node); + rkey.mv_size = node->mn_ksize; + if (IS_LEAF(mp)) { + xdata.mv_data = NODEDATA(node); + xdata.mv_size = NODEDSZ(node); + rdata = &xdata; + } else + pgno = NODEPGNO(node); + flags = node->mn_flags; + } + + if (!IS_LEAF(mp) && j == 0) { + /* First branch index doesn't need key data. 
*/ + rkey.mv_size = 0; + } + + rc = mdbx_node_add(mc, j, &rkey, rdata, pgno, flags); + if (rc) + goto done; + if (i == nkeys) { + i = 0; + j = 0; + mc->mc_pg[mc->mc_top] = copy; + } else { + i++; + j++; + } + } while (i != split_indx); + + nkeys = NUMKEYS(copy); + for (i = 0; i < nkeys; i++) + mp->mp_ptrs[i] = copy->mp_ptrs[i]; + mp->mp_lower = copy->mp_lower; + mp->mp_upper = copy->mp_upper; + memcpy(NODEPTR(mp, nkeys - 1), NODEPTR(copy, nkeys - 1), + env->me_psize - copy->mp_upper - PAGEBASE); + + /* reset back to original page */ + if (newindx < split_indx) { + mc->mc_pg[mc->mc_top] = mp; + } else { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. + */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i = 0; i <= ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } + if (nflags & MDB_RESERVE) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_BIGDATA)) + newdata->mv_data = NODEDATA(node); + } + } else { + if (newindx >= split_indx) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. 
+ */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i = 0; i <= ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } + } + + { + /* Adjust other cursors pointing to mp */ + MDB_cursor *m2, *m3; + MDB_dbi dbi = mc->mc_dbi; + nkeys = NUMKEYS(mp); + + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + if (mc->mc_flags & C_SUB) + m3 = &m2->mc_xcursor->mx_cursor; + else + m3 = m2; + if (m3 == mc) + continue; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (new_root) { + int k; + /* sub cursors may be on different DB */ + if (m3->mc_pg[0] != mp) + continue; + /* root split */ + for (k = new_root; k >= 0; k--) { + m3->mc_ki[k + 1] = m3->mc_ki[k]; + m3->mc_pg[k + 1] = m3->mc_pg[k]; + } + if (m3->mc_ki[0] >= nkeys) { + m3->mc_ki[0] = 1; + } else { + m3->mc_ki[0] = 0; + } + m3->mc_pg[0] = mc->mc_pg[0]; + m3->mc_snum++; + m3->mc_top++; + } + if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) + m3->mc_ki[mc->mc_top]++; + if (m3->mc_ki[mc->mc_top] >= nkeys) { + m3->mc_pg[mc->mc_top] = rp; + m3->mc_ki[mc->mc_top] -= nkeys; + for (i = 0; i < mc->mc_top; i++) { + m3->mc_ki[i] = mn.mc_ki[i]; + m3->mc_pg[i] = mn.mc_pg[i]; + } + } + } else if (!did_split && m3->mc_top >= ptop && + m3->mc_pg[ptop] == mc->mc_pg[ptop] && + m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { + m3->mc_ki[ptop]++; + } + if (XCURSOR_INITED(m3) && IS_LEAF(mp)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + } + } + mdbx_debug("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)); + +done: + if (copy) /* tmp page */ + mdbx_page_free(env, copy); + if (unlikely(rc)) + mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + return rc; +} + +int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + unsigned flags) { + MDB_cursor mc; + MDB_xcursor mx; + + if (unlikely(!key || !data || !txn)) + return 
EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + if (unlikely(flags & + ~(MDB_NOOVERWRITE | MDB_NODUPDATA | MDB_RESERVE | MDB_APPEND | + MDB_APPENDDUP + /* LY: MDB_CURRENT indicates explicit overwrite (update) + for MDBX */ + | (MDBX_MODE_ENABLED ? MDB_CURRENT : 0)))) + return EINVAL; + + if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) + return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + + mdbx_cursor_init(&mc, txn, dbi, &mx); + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + int rc = MDB_SUCCESS; +#if MDBX_MODE_ENABLED + /* LY: support for update (explicit overwrite) */ + if (flags & MDB_CURRENT) { + rc = mdbx_cursor_get(&mc, key, NULL, MDB_SET); + if (likely(rc == MDB_SUCCESS) && + (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)) { + /* LY: allows update (explicit overwrite) only for unique keys */ + MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_tassert(txn, XCURSOR_INITED(&mc) && + mc.mc_xcursor->mx_db.md_entries > 1); + rc = MDBX_EMULTIVAL; + } + } + } +#endif /* MDBX_MODE_ENABLED */ + if (likely(rc == MDB_SUCCESS)) + rc = mdbx_cursor_put(&mc, key, data, flags); + txn->mt_cursors[dbi] = mc.mc_next; + + return rc; +} + +#ifndef MDB_WBUF +#define MDB_WBUF (1024 * 1024) +#endif +#define MDB_EOF 0x10 /**< #mdbx_env_copyfd1() is done reading */ + +/** State needed for a double-buffering compacting copy. */ +typedef struct mdbx_copy { + MDB_env *mc_env; + MDB_txn *mc_txn; + pthread_mutex_t mc_mutex; + pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ + char *mc_wbuf[2]; + char *mc_over[2]; + int mc_wlen[2]; + int mc_olen[2]; + pgno_t mc_next_pgno; + HANDLE mc_fd; + int mc_toggle; /**< Buffer number in provider */ + int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ + /** Error code. 
Never cleared if set. Both threads can set nonzero + * to fail the copy. Not mutex-protected, LMDB expects atomic int. + */ + volatile int mc_error; +} mdbx_copy; + +/** Dedicated writer thread for compacting copy. */ +static void *__cold mdbx_env_copythr(void *arg) { + mdbx_copy *my = arg; + char *ptr; + int toggle = 0, wsize, rc = 0; + int len; + +#ifdef SIGPIPE + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGPIPE); + if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) + my->mc_error = rc; +#endif + + pthread_mutex_lock(&my->mc_mutex); + for (;;) { + while (!my->mc_new) + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ + break; + wsize = my->mc_wlen[toggle]; + ptr = my->mc_wbuf[toggle]; + again: + rc = MDB_SUCCESS; + while (wsize > 0 && !my->mc_error) { + len = write(my->mc_fd, ptr, wsize); + if (len < 0) { + rc = errno; +#ifdef SIGPIPE + if (rc == EPIPE) { + /* Collect the pending SIGPIPE, otherwise at least OS X + * gives it to the process on thread-exit (ITS#8504). + */ + int tmp; + sigwait(&set, &tmp); + } +#endif + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + if (rc) { + my->mc_error = rc; + } + /* If there's an overflow page tail, write it too */ + if (my->mc_olen[toggle]) { + wsize = my->mc_olen[toggle]; + ptr = my->mc_over[toggle]; + my->mc_olen[toggle] = 0; + goto again; + } + my->mc_wlen[toggle] = 0; + toggle ^= 1; + /* Return the empty buffer to provider */ + my->mc_new--; + pthread_cond_signal(&my->mc_cond); + } + pthread_mutex_unlock(&my->mc_mutex); + return NULL; +} + +/** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. + * + * @param[in] my control structure. + * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). 
+ */
+static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) {
+  pthread_mutex_lock(&my->mc_mutex);
+  my->mc_new += adjust; /* hand over a buffer and/or raise MDB_EOF */
+  pthread_cond_signal(&my->mc_cond);
+  while (my->mc_new & 2) /* both buffers in use */
+    pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
+  pthread_mutex_unlock(&my->mc_mutex);
+
+  my->mc_toggle ^= (adjust & 1); /* flip only when a buffer was handed off */
+  /* Both threads reset mc_wlen, to be safe from threading errors */
+  my->mc_wlen[my->mc_toggle] = 0;
+  return my->mc_error; /* current error state of the copy, 0 if none */
+}
+
+/** Depth-first tree traversal for compacting copy.
+ * Every visited page is copied into the current write buffer and renumbered
+ * from my->mc_next_pgno. Parent nodes, overflow references and sub-DB records
+ * are patched in private copies of the pages so that they carry the new
+ * page numbers.
+ * @param[in] my control structure.
+ * @param[in,out] pg database root; replaced by the number the root page
+ * received in the copy.
+ * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB.
+ * @return 0 on success, non-zero on failure.
+ */
+static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) {
+  MDB_cursor mc;
+  MDB_node *ni;
+  MDB_page *mo, *mp, *leaf;
+  char *buf, *ptr;
+  int rc, toggle;
+  unsigned i;
+
+  /* Empty DB, nothing to do */
+  if (*pg == P_INVALID)
+    return MDB_SUCCESS;
+
+  memset(&mc, 0, sizeof(mc));
+  mc.mc_snum = 1; /* scratch cursor with only the root on its stack */
+  mc.mc_txn = my->mc_txn;
+
+  rc = mdbx_page_get(&mc, *pg, &mc.mc_pg[0], NULL);
+  if (rc)
+    return rc;
+  rc = mdbx_page_search_root(&mc, NULL, MDB_PS_FIRST);
+  if (rc)
+    return rc;
+
+  /* Make cursor pages writable */
+  buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); /* one page-sized
+                                                          * slot per level */
+  if (buf == NULL)
+    return ENOMEM;
+
+  for (i = 0; i < mc.mc_top; i++) { /* every level above the top one */
+    mdbx_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
+    mc.mc_pg[i] = (MDB_page *)ptr;
+    ptr += my->mc_env->me_psize;
+  }
+
+  /* This is writable space for a leaf page. Usually not needed.
+   * It is the slot of buf that follows the pages copied above and is only
+   * filled when a node of the current leaf has to be patched. */
+  leaf = (MDB_page *)ptr;
+
+  toggle = my->mc_toggle;
+  while (mc.mc_snum > 0) {
+    unsigned n;
+    mp = mc.mc_pg[mc.mc_top];
+    n = NUMKEYS(mp);
+
+    if (IS_LEAF(mp)) {
+      if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
+        for (i = 0; i < n; i++) {
+          ni = NODEPTR(mp, i);
+          if (ni->mn_flags & F_BIGDATA) {
+            MDB_page *omp;
+            pgno_t pg;
+
+            /* Need writable leaf */
+            if (mp != leaf) {
+              mc.mc_pg[mc.mc_top] = leaf;
+              mdbx_page_copy(leaf, mp, my->mc_env->me_psize);
+              mp = leaf;
+              ni = NODEPTR(mp, i);
+            }
+
+            memcpy(&pg, NODEDATA(ni), sizeof(pg)); /* page number kept in the
+                                                    * node */
+            memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); /* replace
+                              * it with the number the page gets in the copy */
+            rc = mdbx_page_get(&mc, pg, &omp, NULL);
+            if (rc)
+              goto done;
+            if (my->mc_wlen[toggle] >= MDB_WBUF) { /* buffer full: hand it to
+                                                    * the writer thread */
+              rc = mdbx_env_cthr_toggle(my, 1);
+              if (rc)
+                goto done;
+              toggle = my->mc_toggle;
+            }
+            mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
+            memcpy(mo, omp, my->mc_env->me_psize);
+            mo->mp_pgno = my->mc_next_pgno;
+            my->mc_next_pgno += omp->mp_pages; /* reserve numbers for the
+                                                * whole run of pages */
+            my->mc_wlen[toggle] += my->mc_env->me_psize;
+            if (omp->mp_pages > 1) { /* pages after the first are not copied
+                       * here: the writer gets them via mc_over/mc_olen */
+              my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
+              my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
+              rc = mdbx_env_cthr_toggle(my, 1);
+              if (rc)
+                goto done;
+              toggle = my->mc_toggle;
+            }
+          } else if (ni->mn_flags & F_SUBDATA) {
+            MDB_db db;
+
+            /* Need writable leaf */
+            if (mp != leaf) {
+              mc.mc_pg[mc.mc_top] = leaf;
+              mdbx_page_copy(leaf, mp, my->mc_env->me_psize);
+              mp = leaf;
+              ni = NODEPTR(mp, i);
+            }
+
+            memcpy(&db, NODEDATA(ni), sizeof(db));
+            my->mc_toggle = toggle; /* the nested walk starts from
+                                     * my->mc_toggle */
+            rc = mdbx_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA);
+            if (rc)
+              goto done;
+            toggle = my->mc_toggle;
+            memcpy(NODEDATA(ni), &db, sizeof(db)); /* store the record back
+                                                    * with its new md_root */
+          }
+        }
+      }
+    } else {
+      mc.mc_ki[mc.mc_top]++; /* non-leaf page: move on to its next child */
+      if (mc.mc_ki[mc.mc_top] < n) { /* otherwise every child is done and the
+                                      * page itself is written out below */
+        pgno_t pg;
+      again:
+        ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
+        pg = NODEPGNO(ni);
+        rc = mdbx_page_get(&mc, pg, &mp, NULL);
+        if (rc)
+          goto done;
+        mc.mc_top++;
+        mc.mc_snum++;
+        mc.mc_ki[mc.mc_top] = 0;
+        if (IS_BRANCH(mp)) {
+          /* Whenever we advance to a sibling branch page,
+           * we must proceed all the way down to its first leaf.
+           */
+          mdbx_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
+          goto again;
+        } else
+          mc.mc_pg[mc.mc_top] = mp;
+        continue;
+      }
+    }
+    if (my->mc_wlen[toggle] >= MDB_WBUF) {
+      rc = mdbx_env_cthr_toggle(my, 1);
+      if (rc)
+        goto done;
+      toggle = my->mc_toggle;
+    }
+    mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
+    mdbx_page_copy(mo, mp, my->mc_env->me_psize);
+    mo->mp_pgno = my->mc_next_pgno++; /* number of this page in the copy */
+    my->mc_wlen[toggle] += my->mc_env->me_psize;
+    if (mc.mc_top) {
+      /* Update parent if there is one */
+      ni = NODEPTR(mc.mc_pg[mc.mc_top - 1], mc.mc_ki[mc.mc_top - 1]);
+      SETPGNO(ni, mo->mp_pgno);
+      mdbx_cursor_pop(&mc);
+    } else {
+      /* Otherwise we're done */
+      *pg = mo->mp_pgno;
+      break;
+    }
+  }
+done:
+  free(buf);
+  return rc;
+}
+
+/** Copy environment with compaction.
+ * Starts #mdbx_env_copythr() as the writer thread, builds two fresh meta
+ * pages at the start of the first write buffer, and walks the main DB with
+ * #mdbx_env_cwalk() inside a read-only transaction.
+ * @param[in] env the environment to copy.
+ * @param[in] fd the handle the writer thread writes to.
+ * @return 0 on success, non-zero on failure.
+ */
+static int __cold mdbx_env_copyfd1(MDB_env *env, HANDLE fd) {
+  MDB_meta *mm;
+  MDB_page *mp;
+  mdbx_copy my;
+  MDB_txn *txn = NULL;
+  pthread_t thr;
+  pgno_t root, new_root;
+  int rc = MDB_SUCCESS;
+
+  memset(&my, 0, sizeof(my));
+  if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0)
+    return rc;
+  if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0)
+    goto done2;
+  my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF * 2);
+  if (my.mc_wbuf[0] == NULL) {
+    rc = errno;
+    goto done;
+  }
+  memset(my.mc_wbuf[0], 0, MDB_WBUF * 2);
+  my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; /* second half of the allocation */
+  my.mc_next_pgno = NUM_METAS; /* first page number after the meta pages */
+  my.mc_env = env;
+  my.mc_fd = fd;
+  rc = pthread_create(&thr, NULL, mdbx_env_copythr, &my);
+  if (rc)
+    goto done;
+
+  rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn);
+  if (rc)
+    goto finish;
+
+  mp = (MDB_page *)my.mc_wbuf[0];
+  memset(mp, 0, NUM_METAS * env->me_psize);
+  mp->mp_pgno = 0;
+  mp->mp_flags = P_META;
+  mm = (MDB_meta *)PAGEDATA(mp);
+  mdbx_env_init_meta0(env, mm);
+  mm->mm_address = METAPAGE_1(env)->mm_address;
+
+  mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize);
+  mp->mp_pgno = 1;
+  mp->mp_flags = P_META;
+  *(MDB_meta *)PAGEDATA(mp) = *mm; /* meta page 1 starts as a copy of page 0 */
+  mm = (MDB_meta *)PAGEDATA(mp);
+
+  /* Set metapage 1 with current main DB */
+  root = new_root = txn->mt_dbs[MAIN_DBI].md_root;
+  if (root != P_INVALID) {
+    /* Count free pages + freeDB pages. Subtract from last_pg
+     * to find the new last_pg, which also becomes the new root.
+     */
+    MDB_ID freecount = 0;
+    MDB_cursor mc;
+    MDB_val key, data;
+    mdbx_cursor_init(&mc, txn, FREE_DBI, NULL);
+    while ((rc = mdbx_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
+      freecount += *(MDB_ID *)data.mv_data;
+    if (rc != MDB_NOTFOUND)
+      goto finish;
+    freecount += txn->mt_dbs[FREE_DBI].md_branch_pages +
+                 txn->mt_dbs[FREE_DBI].md_leaf_pages +
+                 txn->mt_dbs[FREE_DBI].md_overflow_pages;
+
+    new_root = txn->mt_next_pgno - 1 - freecount;
+    mm->mm_last_pg = new_root;
+    mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
+    mm->mm_dbs[MAIN_DBI].md_root = new_root;
+  } else {
+    /* When the DB is empty, handle it specially to
+     * fix any breakage like page leaks from ITS#8174.
+     */
+    mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags;
+  }
+  if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) {
+    mm->mm_txnid = 1; /* use metapage 1 */
+  }
+
+  my.mc_wlen[0] = env->me_psize * NUM_METAS; /* the meta pages already occupy
+                                              * the start of buffer 0 */
+  my.mc_txn = txn;
+  rc = mdbx_env_cwalk(&my, &root, 0);
+  if (rc == MDB_SUCCESS && root != new_root) {
+    rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */
+  }
+
+finish:
+  if (rc)
+    my.mc_error = rc;
+  mdbx_env_cthr_toggle(&my, 1 | MDB_EOF); /* hand over the last buffer
+                                           * together with MDB_EOF */
+  rc = pthread_join(thr, NULL); /* earlier failures stay in my.mc_error */
+  mdbx_txn_abort(txn);
+
+done:
+  free(my.mc_wbuf[0]);
+  pthread_cond_destroy(&my.mc_cond);
+done2:
+  pthread_mutex_destroy(&my.mc_mutex);
+  return rc ? rc : my.mc_error;
+}
+
+/** Copy environment as-is. 
*/ +static int __cold mdbx_env_copyfd0(MDB_env *env, HANDLE fd) { + MDB_txn *txn = NULL; + pthread_mutex_t *wmutex = NULL; + int rc; + size_t wsize; + char *ptr; + ssize_t len; + size_t w2; + + /* Do the lock/unlock of the reader mutex before starting the + * write txn. Otherwise other read txns could block writers. + */ + rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) + return rc; + + /* We must start the actual read txn after blocking writers */ + rc = mdbx_txn_end(txn, MDB_END_RESET_TMP); + if (rc) + return rc; + + /* Temporarily block writers until we snapshot the meta pages */ + wmutex = MDB_MUTEX(env, w); + rc = mdbx_mutex_lock(env, wmutex); + if (unlikely(rc)) + goto leave; + + rc = mdbx_txn_renew0(txn, MDB_RDONLY); + if (rc) { + mdbx_mutex_unlock(env, wmutex); + goto leave; + } + + wsize = env->me_psize * NUM_METAS; + ptr = env->me_map; + w2 = wsize; + while (w2 > 0) { + len = write(fd, ptr, w2); + if (len < 0) { + rc = errno; + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + w2 -= len; + continue; + } else { + /* Non-blocking or async handles are not supported */ + rc = EIO; + break; + } + } + mdbx_mutex_unlock(env, wmutex); + + if (rc) + goto leave; + + w2 = txn->mt_next_pgno * env->me_psize; + { + size_t fsize = 0; + if ((rc = mdbx_fsize(env->me_fd, &fsize))) + goto leave; + if (w2 > fsize) + w2 = fsize; + } + wsize = w2 - wsize; + while (wsize > 0) { + if (wsize > MAX_WRITE) + w2 = MAX_WRITE; + else + w2 = wsize; + len = write(fd, ptr, w2); + if (len < 0) { + rc = errno; + break; + } else if (len > 0) { + rc = MDB_SUCCESS; + ptr += len; + wsize -= len; + continue; + } else { + rc = EIO; + break; + } + } + +leave: + mdbx_txn_abort(txn); + return rc; +} + +int __cold mdbx_env_copyfd2(MDB_env *env, HANDLE fd, unsigned flags) { + if (flags & MDB_CP_COMPACT) + return mdbx_env_copyfd1(env, fd); + else + return mdbx_env_copyfd0(env, fd); +} + +int __cold mdbx_env_copyfd(MDB_env *env, HANDLE fd) { + return mdbx_env_copyfd2(env, 
fd, 0); +} + +int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) { + int rc, len; + char *lpath; + HANDLE newfd = INVALID_HANDLE_VALUE; + + if (env->me_flags & MDB_NOSUBDIR) { + lpath = (char *)path; + } else { + len = strlen(path); + len += sizeof(DATANAME); + lpath = malloc(len); + if (!lpath) + return ENOMEM; + sprintf(lpath, "%s" DATANAME, path); + } + + /* The destination path must exist, but the destination file must not. + * We don't want the OS to cache the writes, since the source data is + * already in the OS cache. + */ + newfd = open(lpath, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0666); + if (newfd == INVALID_HANDLE_VALUE) { + rc = errno; + goto leave; + } + + int fdflags; + if ((fdflags = fcntl(newfd, F_GETFD) | FD_CLOEXEC) >= 0) + fcntl(newfd, F_SETFD, fdflags); + + if (env->me_psize >= env->me_os_psize) { +#ifdef F_NOCACHE /* __APPLE__ */ + (void)fcntl(newfd, F_NOCACHE, 1); +#elif defined O_DIRECT + /* Set O_DIRECT if the file system supports it */ + if ((rc = fcntl(newfd, F_GETFL)) != -1) + (void)fcntl(newfd, F_SETFL, rc | O_DIRECT); +#endif + } + + rc = mdbx_env_copyfd2(env, newfd, flags); + +leave: + if (!(env->me_flags & MDB_NOSUBDIR)) + free(lpath); + if (newfd != INVALID_HANDLE_VALUE) + if (close(newfd) < 0 && rc == MDB_SUCCESS) + rc = errno; + + return rc; +} + +int __cold mdbx_env_copy(MDB_env *env, const char *path) { + return mdbx_env_copy2(env, path, 0); +} + +int __cold mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff) { + if (unlikely(flags & ~CHANGEABLE)) + return EINVAL; + + pthread_mutex_t *mutex = MDB_MUTEX(env, w); + int rc = mdbx_mutex_lock(env, mutex); + if (unlikely(rc)) + return rc; + + if (onoff) + env->me_flags |= flags; + else + env->me_flags &= ~flags; + + mdbx_mutex_unlock(env, mutex); + return MDB_SUCCESS; +} + +int __cold mdbx_env_get_flags(MDB_env *env, unsigned *arg) { + if (unlikely(!env || !arg)) + return EINVAL; + + *arg = env->me_flags & (CHANGEABLE | CHANGELESS); + return 
MDB_SUCCESS; +} + +int __cold mdbx_env_set_userctx(MDB_env *env, void *ctx) { + if (unlikely(!env)) + return EINVAL; + env->me_userctx = ctx; + return MDB_SUCCESS; +} + +void *__cold mdbx_env_get_userctx(MDB_env *env) { + return env ? env->me_userctx : NULL; +} + +int __cold mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func) { + if (unlikely(!env)) + return EINVAL; +#if MDB_DEBUG + env->me_assert_func = func; + return MDB_SUCCESS; +#else + (void)func; + return ENOSYS; +#endif +} + +int __cold mdbx_env_get_path(MDB_env *env, const char **arg) { + if (unlikely(!env || !arg)) + return EINVAL; + + *arg = env->me_path; + return MDB_SUCCESS; +} + +int __cold mdbx_env_get_fd(MDB_env *env, int *arg) { + if (unlikely(!env || !arg)) + return EINVAL; + + *arg = env->me_fd; + return MDB_SUCCESS; +} + +/** Common code for #mdbx_stat() and #mdbx_env_stat(). + * @param[in] env the environment to operate in. + * @param[in] db the #MDB_db record containing the stats to return. + * @param[out] arg the address of an #MDB_stat structure to receive the stats. + * @return 0, this function always succeeds. 
+ */ +static int __cold mdbx_stat0(MDB_env *env, MDB_db *db, MDBX_stat *arg) { + arg->ms_psize = env->me_psize; + arg->ms_depth = db->md_depth; + arg->ms_branch_pages = db->md_branch_pages; + arg->ms_leaf_pages = db->md_leaf_pages; + arg->ms_overflow_pages = db->md_overflow_pages; + arg->ms_entries = db->md_entries; + + return MDB_SUCCESS; +} + +int __cold mdbx_env_stat(MDB_env *env, MDBX_stat *arg, size_t bytes) { + MDB_meta *meta; + + if (unlikely(env == NULL || arg == NULL)) + return EINVAL; + if (unlikely(bytes != sizeof(MDBX_stat))) + return EINVAL; + + meta = mdbx_meta_head_r(env); + return mdbx_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); +} + +int __cold mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) { + MDB_meta *meta; + + if (unlikely(env == NULL || arg == NULL)) + return EINVAL; + + if (bytes != sizeof(MDBX_envinfo)) + return EINVAL; + + MDB_meta *m1, *m2; + MDB_reader *r; + unsigned i; + + m1 = METAPAGE_1(env); + m2 = METAPAGE_2(env); + + do { + meta = mdbx_meta_head_r(env); + arg->me_last_txnid = meta->mm_txnid; + arg->me_last_pgno = meta->mm_last_pg; + arg->me_meta1_txnid = m1->mm_txnid; + arg->me_meta1_sign = m1->mm_datasync_sign; + arg->me_meta2_txnid = m2->mm_txnid; + arg->me_meta2_sign = m2->mm_datasync_sign; + } while (unlikely(arg->me_last_txnid != env->me_txns->mti_txnid || + arg->me_meta1_sign != m1->mm_datasync_sign || + arg->me_meta2_sign != m2->mm_datasync_sign)); + + arg->me_mapaddr = meta->mm_address; + arg->me_mapsize = env->me_mapsize; + arg->me_maxreaders = env->me_maxreaders; + arg->me_numreaders = env->me_txns->mti_numreaders; + arg->me_tail_txnid = 0; + + r = env->me_txns->mti_readers; + arg->me_tail_txnid = arg->me_last_txnid; + for (i = 0; i < arg->me_numreaders; ++i) { + if (r[i].mr_pid) { + txnid_t mr = r[i].mr_txnid; + if (arg->me_tail_txnid > mr) + arg->me_tail_txnid = mr; + } + } + + return MDB_SUCCESS; +} + +static MDB_cmp_func *mdbx_default_keycmp(unsigned flags) { + return (flags & MDB_REVERSEKEY) ? 
mdbx_cmp_memnr : (flags & MDB_INTEGERKEY) + ? mdbx_cmp_int_a2 + : mdbx_cmp_memn; +} + +static MDB_cmp_func *mdbx_default_datacmp(unsigned flags) { + return !(flags & MDB_DUPSORT) + ? 0 + : ((flags & MDB_INTEGERDUP) + ? mdbx_cmp_int_ua + : ((flags & MDB_REVERSEDUP) ? mdbx_cmp_memnr + : mdbx_cmp_memn)); +} + +/** Set the default comparison functions for a database. + * Called immediately after a database is opened to set the defaults. + * The user can then override them with #mdbx_set_compare() or + * #mdbx_set_dupsort(). + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + */ +static void mdbx_default_cmp(MDB_txn *txn, MDB_dbi dbi) { + unsigned flags = txn->mt_dbs[dbi].md_flags; + txn->mt_dbxs[dbi].md_cmp = mdbx_default_keycmp(flags); + txn->mt_dbxs[dbi].md_dcmp = mdbx_default_datacmp(flags); +} + +int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, + MDB_dbi *dbi) { + MDB_val key, data; + MDB_dbi i; + MDB_cursor mc; + MDB_db dummy; + int rc, dbflag, exact; + unsigned unused = 0, seq; + char *namedup; + size_t len; + + if (unlikely(!txn || !dbi)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(flags & ~VALID_FLAGS)) + return EINVAL; + + if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + /* main DB? */ + if (!name) { + *dbi = MAIN_DBI; + if (flags & PERSISTENT_FLAGS) { + uint16_t f2 = flags & PERSISTENT_FLAGS; + /* make sure flag changes get committed */ + if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != + txn->mt_dbs[MAIN_DBI].md_flags) { + txn->mt_dbs[MAIN_DBI].md_flags |= f2; + txn->mt_flags |= MDB_TXN_DIRTY; + } + } + mdbx_default_cmp(txn, MAIN_DBI); + return MDB_SUCCESS; + } + + if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { + mdbx_default_cmp(txn, MAIN_DBI); + } + + /* Is the DB already open? 
*/ + len = strlen(name); + for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + if (!txn->mt_dbxs[i].md_name.mv_size) { + /* Remember this free slot */ + if (!unused) + unused = i; + continue; + } + if (len == txn->mt_dbxs[i].md_name.mv_size && + !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { + *dbi = i; + return MDB_SUCCESS; + } + } + + /* If no free slot and max hit, fail */ + if (!unused && unlikely(txn->mt_numdbs >= txn->mt_env->me_maxdbs)) + return MDB_DBS_FULL; + + /* Cannot mix named databases with some mainDB flags */ + if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT | MDB_INTEGERKEY))) + return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; + + /* Find the DB info */ + dbflag = DB_NEW | DB_VALID | DB_USRVALID; + exact = 0; + key.mv_size = len; + key.mv_data = (void *)name; + mdbx_cursor_init(&mc, txn, MAIN_DBI, NULL); + rc = mdbx_cursor_set(&mc, &key, &data, MDB_SET, &exact); + if (likely(rc == MDB_SUCCESS)) { + /* make sure this is actually a DB */ + MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if (unlikely((node->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) + return MDB_INCOMPATIBLE; + } else if (!(rc == MDB_NOTFOUND && (flags & MDB_CREATE))) { + return rc; + } + + /* Done here so we cannot fail after creating a new DB */ + if (unlikely((namedup = strdup(name)) == NULL)) + return ENOMEM; + + if (unlikely(rc)) { + /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ + data.mv_size = sizeof(MDB_db); + data.mv_data = &dummy; + memset(&dummy, 0, sizeof(dummy)); + dummy.md_root = P_INVALID; + dummy.md_flags = flags & PERSISTENT_FLAGS; + WITH_CURSOR_TRACKING(mc, rc = mdbx_cursor_put(&mc, &key, &data, F_SUBDATA)); + dbflag |= DB_DIRTY; + } + + if (unlikely(rc)) { + free(namedup); + } else { + /* Got info, register DBI in this txn */ + unsigned slot = unused ? 
unused : txn->mt_numdbs; + txn->mt_dbxs[slot].md_name.mv_data = namedup; + txn->mt_dbxs[slot].md_name.mv_size = len; + txn->mt_dbxs[slot].md_rel = NULL; + txn->mt_dbflags[slot] = dbflag; + /* txn-> and env-> are the same in read txns, use + * tmp variable to avoid undefined assignment + */ + seq = ++txn->mt_env->me_dbiseqs[slot]; + txn->mt_dbiseqs[slot] = seq; + + memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); + *dbi = slot; + mdbx_default_cmp(txn, slot); + if (!unused) { + txn->mt_numdbs++; + } + } + + return rc; +} + +int __cold mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *arg, size_t bytes) { + if (unlikely(!arg || !txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) + return EINVAL; + + if (unlikely(bytes != sizeof(MDBX_stat))) + return EINVAL; + + if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + if (unlikely(txn->mt_dbflags[dbi] & DB_STALE)) { + MDB_cursor mc; + MDB_xcursor mx; + /* Stale, must read the DB's root. cursor_init does it for us. 
*/ + mdbx_cursor_init(&mc, txn, dbi, &mx); + } + return mdbx_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); +} + +void mdbx_dbi_close(MDB_env *env, MDB_dbi dbi) { + char *ptr; + if (dbi < CORE_DBS || dbi >= env->me_maxdbs) + return; + ptr = env->me_dbxs[dbi].md_name.mv_data; + /* If there was no name, this was already closed */ + if (ptr) { + env->me_dbxs[dbi].md_name.mv_data = NULL; + env->me_dbxs[dbi].md_name.mv_size = 0; + env->me_dbflags[dbi] = 0; + env->me_dbiseqs[dbi]++; + free(ptr); + } +} + +int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags) { + if (unlikely(!txn || !flags)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) + return EINVAL; + + *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; + return MDB_SUCCESS; +} + +/** Add all the DB's pages to the free list. + * @param[in] mc Cursor on the DB to free. + * @param[in] subs non-Zero to check for sub-DBs in this DB. + * @return 0 on success, non-zero on failure. + */ +static int mdbx_drop0(MDB_cursor *mc, int subs) { + int rc; + + rc = mdbx_page_search(mc, NULL, MDB_PS_FIRST); + if (likely(rc == MDB_SUCCESS)) { + MDB_txn *txn = mc->mc_txn; + MDB_node *ni; + MDB_cursor mx; + unsigned i; + + /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. + * This also avoids any P_LEAF2 pages, which have no nodes. + * Also if the DB doesn't have sub-DBs and has no overflow + * pages, omit scanning leaves. 
+ */ + if ((mc->mc_flags & C_SUB) || (!subs && !mc->mc_db->md_overflow_pages)) + mdbx_cursor_pop(mc); + + mdbx_cursor_copy(mc, &mx); + while (mc->mc_snum > 0) { + MDB_page *mp = mc->mc_pg[mc->mc_top]; + unsigned n = NUMKEYS(mp); + if (IS_LEAF(mp)) { + for (i = 0; i < n; i++) { + ni = NODEPTR(mp, i); + if (ni->mn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t pg; + memcpy(&pg, NODEDATA(ni), sizeof(pg)); + rc = mdbx_page_get(mc, pg, &omp, NULL); + if (unlikely(rc)) + goto done; + mdbx_cassert(mc, IS_OVERFLOW(omp)); + rc = mdbx_midl_append_range(&txn->mt_free_pgs, pg, omp->mp_pages); + if (unlikely(rc)) + goto done; + mc->mc_db->md_overflow_pages -= omp->mp_pages; + if (!mc->mc_db->md_overflow_pages && !subs) + break; + } else if (subs && (ni->mn_flags & F_SUBDATA)) { + mdbx_xcursor_init1(mc, ni); + rc = mdbx_drop0(&mc->mc_xcursor->mx_cursor, 0); + if (unlikely(rc)) + goto done; + } + } + if (!subs && !mc->mc_db->md_overflow_pages) + goto pop; + } else { + if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pgs, n)) != 0)) + goto done; + for (i = 0; i < n; i++) { + pgno_t pg; + ni = NODEPTR(mp, i); + pg = NODEPGNO(ni); + /* free it */ + mdbx_midl_xappend(txn->mt_free_pgs, pg); + } + } + if (!mc->mc_top) + break; + mc->mc_ki[mc->mc_top] = i; + rc = mdbx_cursor_sibling(mc, 1); + if (rc) { + if (unlikely(rc != MDB_NOTFOUND)) + goto done; + /* no more siblings, go back to beginning + * of previous level. 
+ */ + pop: + mdbx_cursor_pop(mc); + mc->mc_ki[0] = 0; + for (i = 1; i < mc->mc_snum; i++) { + mc->mc_ki[i] = 0; + mc->mc_pg[i] = mx.mc_pg[i]; + } + } + } + /* free it */ + rc = mdbx_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); + done: + if (unlikely(rc)) + txn->mt_flags |= MDB_TXN_ERROR; + } else if (rc == MDB_NOTFOUND) { + rc = MDB_SUCCESS; + } + mc->mc_flags &= ~C_INITIALIZED; + return rc; +} + +int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del) { + MDB_cursor *mc, *m2; + int rc; + + if (unlikely(1 < (unsigned)del || !txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + if (unlikely(TXN_DBI_CHANGED(txn, dbi))) + return MDB_BAD_DBI; + + if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) + return EACCES; + + rc = mdbx_cursor_open(txn, dbi, &mc); + if (unlikely(rc)) + return rc; + + rc = mdbx_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); + /* Invalidate the dropped DB's cursors */ + for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + m2->mc_flags &= ~(C_INITIALIZED | C_EOF); + if (unlikely(rc)) + goto leave; + + /* Can't delete the main DB */ + if (del && dbi >= CORE_DBS) { + rc = mdbx_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); + if (likely(!rc)) { + txn->mt_dbflags[dbi] = DB_STALE; + mdbx_dbi_close(txn->mt_env, dbi); + } else { + txn->mt_flags |= MDB_TXN_ERROR; + } + } else { + /* reset the DB record, mark it dirty */ + txn->mt_dbflags[dbi] |= DB_DIRTY; + txn->mt_dbs[dbi].md_depth = 0; + txn->mt_dbs[dbi].md_branch_pages = 0; + txn->mt_dbs[dbi].md_leaf_pages = 0; + txn->mt_dbs[dbi].md_overflow_pages = 0; + txn->mt_dbs[dbi].md_entries = 0; + txn->mt_dbs[dbi].md_root = P_INVALID; + + txn->mt_flags |= MDB_TXN_DIRTY; + } +leave: + mdbx_cursor_close(mc); + return rc; +} + +int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { + if (unlikely(!txn)) + return EINVAL; + + if 
(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + txn->mt_dbxs[dbi].md_cmp = cmp; + return MDB_SUCCESS; +} + +int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + txn->mt_dbxs[dbi].md_dcmp = cmp; + return MDB_SUCCESS; +} + +int mdbx_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + txn->mt_dbxs[dbi].md_rel = rel; + return MDB_SUCCESS; +} + +int mdbx_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + txn->mt_dbxs[dbi].md_relctx = ctx; + return MDB_SUCCESS; +} + +int __cold mdbx_env_get_maxkeysize(MDB_env *env) { + if (!env || env->me_signature != MDBX_ME_SIGNATURE) + return EINVAL; + return ENV_MAXKEY(env); +} + +int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { + unsigned i, rdrs; + MDB_reader *mr; + char buf[64]; + int rc = 0, first = 1; + + if (unlikely(!env || !func)) + return -EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + rdrs = env->me_txns->mti_numreaders; + mr = env->me_txns->mti_readers; + for (i = 0; i < rdrs; i++) { + if (mr[i].mr_pid) { + txnid_t txnid = mr[i].mr_txnid; + if (txnid == ~(txnid_t)0) + sprintf(buf, "%10d %zx -\n", (int)mr[i].mr_pid, (size_t)mr[i].mr_tid); + else + sprintf(buf, "%10d %zx %zu\n", (int)mr[i].mr_pid, 
(size_t)mr[i].mr_tid, + txnid); + + if (first) { + first = 0; + rc = func(" pid thread txnid\n", ctx); + if (rc < 0) + break; + } + rc = func(buf, ctx); + if (rc < 0) + break; + } + } + if (first) { + rc = func("(no active readers)\n", ctx); + } + return rc; +} + +/** Insert pid into list if not already present. + * return -1 if already present. + */ +static int __cold mdbx_pid_insert(pid_t *ids, pid_t pid) { + /* binary search of pid in list */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = ids[0]; + + while (0 < n) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = pid - ids[cursor]; + + if (val < 0) { + n = pivot; + } else if (val > 0) { + base = cursor; + n -= pivot + 1; + } else { + /* found, so it's a duplicate */ + return -1; + } + } + + if (val > 0) { + ++cursor; + } + ids[0]++; + for (n = ids[0]; n > cursor; n--) + ids[n] = ids[n - 1]; + ids[n] = pid; + return 0; +} + +int __cold mdbx_reader_check(MDB_env *env, int *dead) { + if (unlikely(!env || env->me_signature != MDBX_ME_SIGNATURE)) + return EINVAL; + if (dead) + *dead = 0; + return mdbx_reader_check0(env, 0, dead); +} + +/** As #mdbx_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ +static int __cold mdbx_reader_check0(MDB_env *env, int rlocked, int *dead) { + pthread_mutex_t *rmutex = rlocked ? 
NULL : MDB_MUTEX(env, r); + unsigned i, j, rdrs; + MDB_reader *mr; + pid_t *pids, pid; + int rc = MDB_SUCCESS, count = 0; + + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + + rdrs = env->me_txns->mti_numreaders; + pids = malloc((rdrs + 1) * sizeof(pid_t)); + if (!pids) + return ENOMEM; + pids[0] = 0; + mr = env->me_txns->mti_readers; + for (i = 0; i < rdrs; i++) { + pid = mr[i].mr_pid; + if (pid && pid != env->me_pid) { + if (mdbx_pid_insert(pids, pid) == 0) { + if (!mdbx_reader_pid(env, F_GETLK, pid)) { + /* Stale reader found */ + j = i; + if (rmutex) { + if ((rc = pthread_mutex_lock(rmutex)) != 0) { + if ((rc = mdbx_mutex_failed(env, rmutex, rc))) + break; + rdrs = 0; /* the above checked all readers */ + } else { + /* Recheck, a new process may have reused pid */ + if (mdbx_reader_pid(env, F_GETLK, pid)) + j = rdrs; + } + } + for (; j < rdrs; j++) { + if (mr[j].mr_pid == pid) { + mdbx_debug("clear stale reader pid %u txn %zd", (unsigned)pid, + mr[j].mr_txnid); + mr[j].mr_pid = 0; + count++; + } + } + if (rmutex) + mdbx_mutex_unlock(env, rmutex); + } + } + } + } + free(pids); + if (dead) + *dead = count; + return rc; +} + +static int __cold mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, + int rc) { +#if MDB_USE_ROBUST + if (unlikely(rc == EOWNERDEAD)) { + int rlocked, rc2; + + /* We own the mutex. Clean up after dead previous owner. */ + rc = MDB_SUCCESS; + rlocked = (mutex == MDB_MUTEX(env, r)); + if (!rlocked) { + /* Keep mti_txnid updated, otherwise next writer can + * overwrite data which latest meta page refers to. + * + * LY: Hm, how this can happen, if the mti_txnid + * is updating only at the finish of a successful commit ? 
+ */ + + MDB_meta *meta = mdbx_meta_head_w(env); + assert(env->me_txns->mti_txnid == meta->mm_txnid); + (void)meta; + /* env is hosed if the dead thread was ours */ + if (env->me_txn) { + env->me_flags |= MDB_FATAL_ERROR; + env->me_txn = NULL; + rc = MDB_PANIC; + } + } + mdbx_debug("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? "this process' env is hosed" : "recovering")); + rc2 = mdbx_reader_check0(env, rlocked, NULL); + if (rc2 == 0) + rc2 = pthread_mutex_consistent(mutex); + if (rc || (rc = rc2)) { + mdbx_debug("mutex recovery failed, %s", mdbx_strerror(rc)); + pthread_mutex_unlock(mutex); + } + } +#endif /* MDB_USE_ROBUST */ + if (unlikely(rc)) { + mdbx_debug("lock mutex failed, %s", mdbx_strerror(rc)); + if (rc != EDEADLK) { + env->me_flags |= MDB_FATAL_ERROR; + rc = MDB_PANIC; + } + } + + return rc; +} + +static int mdbx_mutex_lock(MDB_env *env, pthread_mutex_t *mutex) { + int rc = pthread_mutex_lock(mutex); + if (unlikely(rc)) + rc = mdbx_mutex_failed(env, mutex, rc); + return rc; +} + +static void mdbx_mutex_unlock(MDB_env *env, pthread_mutex_t *mutex) { + int rc = pthread_mutex_unlock(mutex); + mdbx_assert(env, rc == 0); + (void)env; + (void)rc; +} + +static unsigned __hot mdbx_midl_search(MDB_IDL ids, MDB_ID id) { + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = ids[0]; + + while (0 < n) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = mdbx_cmp2int(ids[cursor], id); + + if (val < 0) { + n = pivot; + + } else if (val > 0) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if (val > 0) { + ++cursor; + } + return cursor; +} + +static MDB_IDL mdbx_midl_alloc(int num) { + MDB_IDL ids = malloc((num + 2) * sizeof(MDB_ID)); + if (ids) { + *ids++ = num; + *ids = 0; + } + return ids; +} + +static void mdbx_midl_free(MDB_IDL ids) { + if (ids) + free(ids 
- 1); +} + +static void mdbx_midl_shrink(MDB_IDL *idp) { + MDB_IDL ids = *idp; + if (*(--ids) > MDB_IDL_UM_MAX && + (ids = realloc(ids, (MDB_IDL_UM_MAX + 2) * sizeof(MDB_ID)))) { + *ids++ = MDB_IDL_UM_MAX; + *idp = ids; + } +} + +static int mdbx_midl_grow(MDB_IDL *idp, int num) { + MDB_IDL idn = *idp - 1; + /* grow it */ + idn = realloc(idn, (*idn + num + 2) * sizeof(MDB_ID)); + if (!idn) + return ENOMEM; + *idn++ += num; + *idp = idn; + return 0; +} + +static int mdbx_midl_need(MDB_IDL *idp, unsigned num) { + MDB_IDL ids = *idp; + num += ids[0]; + if (num > ids[-1]) { + num = (num + num / 4 + (256 + 2)) & -256; + if (!(ids = realloc(ids - 1, num * sizeof(MDB_ID)))) + return ENOMEM; + *ids++ = num - 2; + *idp = ids; + } + return 0; +} + +static int mdbx_midl_append(MDB_IDL *idp, MDB_ID id) { + MDB_IDL ids = *idp; + /* Too big? */ + if (ids[0] >= ids[-1]) { + if (mdbx_midl_grow(idp, MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0]++; + ids[ids[0]] = id; + return 0; +} + +static int mdbx_midl_append_list(MDB_IDL *idp, MDB_IDL app) { + MDB_IDL ids = *idp; + /* Too big? */ + if (ids[0] + app[0] >= ids[-1]) { + if (mdbx_midl_grow(idp, app[0])) + return ENOMEM; + ids = *idp; + } + memcpy(&ids[ids[0] + 1], &app[1], app[0] * sizeof(MDB_ID)); + ids[0] += app[0]; + return 0; +} + +static int mdbx_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n) { + MDB_ID *ids = *idp, len = ids[0]; + /* Too big? 
*/ + if (len + n > ids[-1]) { + if (mdbx_midl_grow(idp, n | MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0] = len + n; + ids += len; + while (n) + ids[n--] = id++; + return 0; +} + +static void __hot mdbx_midl_xmerge(MDB_IDL idl, MDB_IDL merge) { + MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i + j, total = k; + idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */ + old_id = idl[j]; + while (i) { + merge_id = merge[i--]; + for (; old_id < merge_id; old_id = idl[--j]) + idl[k--] = old_id; + idl[k--] = merge_id; + } + idl[0] = total; +} + +/* Quicksort + Insertion sort for small arrays */ + +#define SMALL 8 +#define MIDL_SWAP(a, b) \ + { \ + itmp = (a); \ + (a) = (b); \ + (b) = itmp; \ + } + +static void __hot mdbx_midl_sort(MDB_IDL ids) { + /* Max possible depth of int-indexed tree * 2 items/level */ + int istack[sizeof(int) * CHAR_BIT * 2]; + int i, j, k, l, ir, jstack; + MDB_ID a, itmp; + + ir = (int)ids[0]; + l = 1; + jstack = 0; + for (;;) { + if (ir - l < SMALL) { /* Insertion sort */ + for (j = l + 1; j <= ir; j++) { + a = ids[j]; + for (i = j - 1; i >= 1; i--) { + if (ids[i] >= a) + break; + ids[i + 1] = ids[i]; + } + ids[i + 1] = a; + } + if (jstack == 0) + break; + ir = istack[jstack--]; + l = istack[jstack--]; + } else { + k = (l + ir) >> 1; /* Choose median of left, center, right */ + MIDL_SWAP(ids[k], ids[l + 1]); + if (ids[l] < ids[ir]) { + MIDL_SWAP(ids[l], ids[ir]); + } + if (ids[l + 1] < ids[ir]) { + MIDL_SWAP(ids[l + 1], ids[ir]); + } + if (ids[l] < ids[l + 1]) { + MIDL_SWAP(ids[l], ids[l + 1]); + } + i = l + 1; + j = ir; + a = ids[l + 1]; + for (;;) { + do + i++; + while (ids[i] > a); + do + j--; + while (ids[j] < a); + if (j < i) + break; + MIDL_SWAP(ids[i], ids[j]); + } + ids[l + 1] = ids[j]; + ids[j] = a; + jstack += 2; + if (ir - i + 1 >= j - l) { + istack[jstack] = ir; + istack[jstack - 1] = i; + ir = j - 1; + } else { + istack[jstack] = j - 1; + istack[jstack - 1] = l; + l = i; + } + } + } +} + +static unsigned __hot 
mdbx_mid2l_search(MDB_ID2L ids, MDB_ID id) { + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = (unsigned)ids[0].mid; + + while (0 < n) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = mdbx_cmp2int(id, ids[cursor].mid); + + if (val < 0) { + n = pivot; + + } else if (val > 0) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if (val > 0) { + ++cursor; + } + return cursor; +} + +static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id) { + unsigned x, i; + + x = mdbx_mid2l_search(ids, id->mid); + + if (x < 1) { + /* internal error */ + return -2; + } + + if (x <= ids[0].mid && ids[x].mid == id->mid) { + /* duplicate */ + return -1; + } + + if (ids[0].mid >= MDB_IDL_UM_MAX) { + /* too big */ + return -2; + + } else { + /* insert id */ + ids[0].mid++; + for (i = (unsigned)ids[0].mid; i > x; i--) + ids[i] = ids[i - 1]; + ids[x] = *id; + } + + return 0; +} + +static int mdbx_mid2l_append(MDB_ID2L ids, MDB_ID2 *id) { + /* Too big? 
*/ + if (ids[0].mid >= MDB_IDL_UM_MAX) { + return -2; + } + ids[0].mid++; + ids[ids[0].mid] = *id; + return 0; +} + +int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn) { + unsigned ret = mdbx_runtime_flags; + if (flags != (int)MDBX_DBG_DNT) + mdbx_runtime_flags = flags; + if (logger != (MDBX_debug_func *)MDBX_DBG_DNT) + mdbx_debug_logger = logger; + if (edge_txn != (long)MDBX_DBG_DNT) { +#if MDB_DEBUG + mdbx_debug_edge = edge_txn; +#endif + } + return ret; +} + +static txnid_t __cold mdbx_oomkick(MDB_env *env, txnid_t oldest) { + int retry; + txnid_t snap; + mdbx_debug("DB size maxed out"); + + for (retry = 0;; ++retry) { + int reader; + + if (mdbx_reader_check(env, NULL)) + break; + + snap = mdbx_find_oldest(env, &reader); + if (oldest < snap || reader < 0) { + if (retry && env->me_oom_func) { + /* LY: notify end of oom-loop */ + env->me_oom_func(env, 0, 0, oldest, snap - oldest, -retry); + } + return snap; + } + + MDB_reader *r; + pthread_t tid; + pid_t pid; + int rc; + + if (!env->me_oom_func) + break; + + r = &env->me_txns->mti_readers[reader]; + pid = r->mr_pid; + tid = r->mr_tid; + if (r->mr_txnid != oldest || pid <= 0) + continue; + + rc = env->me_oom_func(env, pid, (void *)tid, oldest, + mdbx_meta_head_w(env)->mm_txnid - oldest, retry); + if (rc < 0) + break; + + if (rc) { + r->mr_txnid = ~(txnid_t)0; + if (rc > 1) { + r->mr_tid = 0; + r->mr_pid = 0; + mdbx_coherent_barrier(); + } + } + } + + if (retry && env->me_oom_func) { + /* LY: notify end of oom-loop */ + env->me_oom_func(env, 0, 0, oldest, 0, -retry); + } + return mdbx_find_oldest(env, NULL); +} + +int __cold mdbx_env_set_syncbytes(MDB_env *env, size_t bytes) { + if (unlikely(!env)) + return EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + env->me_sync_threshold = bytes; + return env->me_map ? 
mdbx_env_sync(env, 0) : MDB_SUCCESS; +} + +void __cold mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oomfunc) { + if (likely(env && env->me_signature == MDBX_ME_SIGNATURE)) + env->me_oom_func = oomfunc; +} + +MDBX_oom_func *__cold mdbx_env_get_oomfunc(MDB_env *env) { + return likely(env && env->me_signature == MDBX_ME_SIGNATURE) + ? env->me_oom_func + : NULL; +} + +ATTRIBUTE_NO_SANITIZE_THREAD /* LY: avoid tsan-trap by me_txn, mm_last_pg and + mt_next_pgno */ + int + mdbx_txn_straggler(MDB_txn *txn, int *percent) { + MDB_env *env; + MDB_meta *meta; + txnid_t lag; + + if (unlikely(!txn)) + return -EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(!txn->mt_u.reader)) + return -1; + + env = txn->mt_env; + meta = mdbx_meta_head_r(env); + if (percent) { + size_t maxpg = env->me_maxpg; + size_t last = meta->mm_last_pg + 1; + if (env->me_txn) + last = env->me_txn0->mt_next_pgno; + *percent = (last * 100ull + maxpg / 2) / maxpg; + } + lag = meta->mm_txnid - txn->mt_u.reader->mr_txnid; + return (0 > (long)lag) ? ~0u >> 1 : lag; +} + +typedef struct mdbx_walk_ctx { + MDB_txn *mw_txn; + void *mw_user; + MDBX_pgvisitor_func *mw_visitor; +} mdbx_walk_ctx_t; + +/** Depth-first tree traversal. */ +static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, + pgno_t pg, int deep) { + MDB_page *mp; + int rc, i, nkeys; + unsigned header_size, unused_size, payload_size, align_bytes; + const char *type; + + if (pg == P_INVALID) + return MDB_SUCCESS; /* empty db */ + + MDB_cursor mc; + memset(&mc, 0, sizeof(mc)); + mc.mc_snum = 1; + mc.mc_txn = ctx->mw_txn; + + rc = mdbx_page_get(&mc, pg, &mp, NULL); + if (rc) + return rc; + if (pg != mp->mp_p.p_pgno) + return MDB_CORRUPTED; + + nkeys = NUMKEYS(mp); + header_size = IS_LEAF2(mp) ? 
PAGEHDRSZ : PAGEBASE + mp->mp_lower; + unused_size = SIZELEFT(mp); + payload_size = 0; + + /* LY: Don't use mask here, e.g bitwise + * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). + * Pages should not me marked dirty/loose or otherwise. */ + switch (mp->mp_flags) { + case P_BRANCH: + type = "branch"; + if (nkeys < 1) + return MDB_CORRUPTED; + break; + case P_LEAF: + type = "leaf"; + break; + case P_LEAF | P_SUBP: + type = "dupsort-subleaf"; + break; + case P_LEAF | P_LEAF2: + type = "dupfixed-leaf"; + break; + case P_LEAF | P_LEAF2 | P_SUBP: + type = "dupsort-dupfixed-subleaf"; + break; + case P_META: + case P_OVERFLOW: + default: + return MDB_CORRUPTED; + } + + for (align_bytes = i = 0; i < nkeys; + align_bytes += ((payload_size + align_bytes) & 1), i++) { + MDB_node *node; + + if (IS_LEAF2(mp)) { + /* LEAF2 pages have no mp_ptrs[] or node headers */ + payload_size += mp->mp_leaf2_ksize; + continue; + } + + node = NODEPTR(mp, i); + payload_size += NODESIZE + node->mn_ksize; + + if (IS_BRANCH(mp)) { + rc = mdbx_env_walk(ctx, dbi, NODEPGNO(node), deep); + if (rc) + return rc; + continue; + } + + assert(IS_LEAF(mp)); + if (node->mn_flags & F_BIGDATA) { + MDB_page *omp; + pgno_t *opg; + size_t over_header, over_payload, over_unused; + + payload_size += sizeof(pgno_t); + opg = NODEDATA(node); + rc = mdbx_page_get(&mc, *opg, &omp, NULL); + if (rc) + return rc; + if (*opg != omp->mp_p.p_pgno) + return MDB_CORRUPTED; + /* LY: Don't use mask here, e.g bitwise + * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). + * Pages should not me marked dirty/loose or otherwise. 
*/ + if (P_OVERFLOW != omp->mp_flags) + return MDB_CORRUPTED; + + over_header = PAGEHDRSZ; + over_payload = NODEDSZ(node); + over_unused = omp->mp_pages * ctx->mw_txn->mt_env->me_psize - + over_payload - over_header; + + rc = ctx->mw_visitor(*opg, omp->mp_pages, ctx->mw_user, dbi, + "overflow-data", 1, over_payload, over_header, + over_unused); + if (rc) + return rc; + continue; + } + + payload_size += NODEDSZ(node); + if (node->mn_flags & F_SUBDATA) { + MDB_db *db = NODEDATA(node); + char *name = NULL; + + if (!(node->mn_flags & F_DUPDATA)) { + name = NODEKEY(node); + int namelen = (char *)db - name; + name = memcpy(alloca(namelen + 1), name, namelen); + name[namelen] = 0; + } + rc = mdbx_env_walk(ctx, (name && name[0]) ? name : dbi, db->md_root, + deep + 1); + if (rc) + return rc; + } + } + + return ctx->mw_visitor(mp->mp_p.p_pgno, 1, ctx->mw_user, dbi, type, nkeys, + payload_size, header_size, unused_size + align_bytes); +} + +int __cold mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, + void *user) { + mdbx_walk_ctx_t ctx; + int rc; + + if (unlikely(!txn)) + return MDB_BAD_TXN; + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + ctx.mw_txn = txn; + ctx.mw_user = user; + ctx.mw_visitor = visitor; + + rc = visitor(0, 2, user, "lmdb", "meta", 2, sizeof(MDB_meta) * 2, + PAGEHDRSZ * 2, + (txn->mt_env->me_psize - sizeof(MDB_meta) - PAGEHDRSZ) * 2); + if (!rc) + rc = mdbx_env_walk(&ctx, "free", txn->mt_dbs[FREE_DBI].md_root, 0); + if (!rc) + rc = mdbx_env_walk(&ctx, "main", txn->mt_dbs[MAIN_DBI].md_root, 0); + if (!rc) + rc = visitor(P_INVALID, 0, user, NULL, NULL, 0, 0, 0, 0); + return rc; +} + +int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) + return EACCES; + + if (likely(canary)) { + txn->mt_canary.x = canary->x; + 
txn->mt_canary.y = canary->y; + txn->mt_canary.z = canary->z; + } + txn->mt_canary.v = txn->mt_txnid; + + return MDB_SUCCESS; +} + +size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary) { + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) + return 0; + + if (likely(canary)) + *canary = txn->mt_canary; + + return txn->mt_txnid; +} + +int mdbx_cursor_on_first(MDB_cursor *mc) { + if (unlikely(mc == NULL)) + return EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (!(mc->mc_flags & C_INITIALIZED)) + return MDBX_RESULT_FALSE; + + unsigned i; + for (i = 0; i < mc->mc_snum; ++i) { + if (mc->mc_ki[i]) + return MDBX_RESULT_FALSE; + } + + return MDBX_RESULT_TRUE; +} + +int mdbx_cursor_on_last(MDB_cursor *mc) { + if (unlikely(mc == NULL)) + return EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if (!(mc->mc_flags & C_INITIALIZED)) + return MDBX_RESULT_FALSE; + + unsigned i; + for (i = 0; i < mc->mc_snum; ++i) { + unsigned nkeys = NUMKEYS(mc->mc_pg[i]); + if (mc->mc_ki[i] < nkeys - 1) + return MDBX_RESULT_FALSE; + } + + return MDBX_RESULT_TRUE; +} + +int mdbx_cursor_eof(MDB_cursor *mc) { + if (unlikely(mc == NULL)) + return EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) + return MDB_VERSION_MISMATCH; + + if ((mc->mc_flags & C_INITIALIZED) == 0) + return MDBX_RESULT_TRUE; + + if (mc->mc_snum == 0) + return MDBX_RESULT_TRUE; + + if ((mc->mc_flags & C_EOF) && + mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) + return MDBX_RESULT_TRUE; + + return MDBX_RESULT_FALSE; +} + +static int mdbx_is_samedata(const MDB_val *a, const MDB_val *b) { + return a->iov_len == b->iov_len && + memcmp(a->iov_base, b->iov_base, a->iov_len) == 0; } /* Позволяет обновить или удалить существующую запись с получением @@ -444,58 +11300,59 @@ static int mdbx_is_samedata(const MDB_val* a, const MDB_val* b) { * - внешняя аллокация курсоров, в том числе на стеке 
(без malloc). * - получения статуса страницы по адресу (знать о P_DIRTY). */ -int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *new_data, MDB_val *old_data, unsigned flags) -{ - MDB_cursor mc; - MDB_xcursor mx; +int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, + MDB_val *old_data, unsigned flags) { + MDB_cursor mc; + MDB_xcursor mx; - if (unlikely(!key || !old_data || !txn || old_data == new_data)) - return EINVAL; + if (unlikely(!key || !old_data || !txn || old_data == new_data)) + return EINVAL; - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; - if (unlikely(old_data->iov_base == NULL && old_data->iov_len)) - return EINVAL; + if (unlikely(old_data->iov_base == NULL && old_data->iov_len)) + return EINVAL; - if (unlikely(new_data == NULL && !(flags & MDB_CURRENT))) - return EINVAL; + if (unlikely(new_data == NULL && !(flags & MDB_CURRENT))) + return EINVAL; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; - if (unlikely(flags & ~(MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP|MDB_CURRENT))) - return EINVAL; + if (unlikely(flags & + ~(MDB_NOOVERWRITE | MDB_NODUPDATA | MDB_RESERVE | MDB_APPEND | + MDB_APPENDDUP | MDB_CURRENT))) + return EINVAL; - if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_BLOCKED))) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) + return (txn->mt_flags & MDB_TXN_RDONLY) ? 
EACCES : MDB_BAD_TXN; - mdb_cursor_init(&mc, txn, dbi, &mx); - mc.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &mc; + mdbx_cursor_init(&mc, txn, dbi, &mx); + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; - int rc; - MDB_val present_key = *key; - if (F_ISSET(flags, MDB_CURRENT | MDB_NOOVERWRITE)) { - /* в old_data значение для выбора конкретного дубликата */ - if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDB_DUPSORT))) { - rc = EINVAL; - goto bailout; - } + int rc; + MDB_val present_key = *key; + if (F_ISSET(flags, MDB_CURRENT | MDB_NOOVERWRITE)) { + /* в old_data значение для выбора конкретного дубликата */ + if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDB_DUPSORT))) { + rc = EINVAL; + goto bailout; + } - /* убираем лишний бит, он был признаком запрошенного режима */ - flags -= MDB_NOOVERWRITE; + /* убираем лишний бит, он был признаком запрошенного режима */ + flags -= MDB_NOOVERWRITE; - rc = mdbx_cursor_get(&mc, &present_key, old_data, MDB_GET_BOTH); - if (rc != MDB_SUCCESS) - goto bailout; + rc = mdbx_cursor_get(&mc, &present_key, old_data, MDB_GET_BOTH); + if (rc != MDB_SUCCESS) + goto bailout; - if (new_data) { - /* обновление конкретного дубликата */ - if (mdbx_is_samedata(old_data, new_data)) - /* если данные совпадают, то ничего делать не надо */ - goto bailout; + if (new_data) { + /* обновление конкретного дубликата */ + if (mdbx_is_samedata(old_data, new_data)) + /* если данные совпадают, то ничего делать не надо */ + goto bailout; #if 0 /* LY: исправлено в mdbx_cursor_put(), здесь в качестве памятки */ MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA) @@ -516,126 +11373,127 @@ int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, flags -= MDB_CURRENT; } #endif - } - } else { - /* в old_data буфер для сохранения предыдущего значения */ - if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) - return EINVAL; - MDB_val present_data; - rc = mdbx_cursor_get(&mc, 
&present_key, &present_data, MDB_SET_KEY); - if (unlikely(rc != MDB_SUCCESS)) { - old_data->iov_base = NULL; - old_data->iov_len = rc; - if (rc != MDB_NOTFOUND || (flags & MDB_CURRENT)) - goto bailout; - } else if (flags & MDB_NOOVERWRITE) { - rc = MDB_KEYEXIST; - *old_data = present_data; - goto bailout; - } else { - MDB_page *page = mc.mc_pg[mc.mc_top]; - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { - if (flags & MDB_CURRENT) { - /* для не-уникальных ключей позволяем update/delete только если ключ один */ - MDB_node *leaf = NODEPTR(page, mc.mc_ki[mc.mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_tassert(txn, XCURSOR_INITED(&mc) && mc.mc_xcursor->mx_db.md_entries > 1); - if (mc.mc_xcursor->mx_db.md_entries > 1) { - rc = MDBX_EMULTIVAL; - goto bailout; - } - } - /* если данные совпадают, то ничего делать не надо */ - if (new_data && mdbx_is_samedata(&present_data, new_data)) { - *old_data = *new_data; - goto bailout; - } - /* В оригинальной LMDB фладок MDB_CURRENT здесь приведет - * к замене данных без учета MDB_DUPSORT сортировки, - * но здесь это в любом случае допустимо, так как мы - * проверили что для ключа есть только одно значение. 
*/ - } else if ((flags & MDB_NODUPDATA) && mdbx_is_samedata(&present_data, new_data)) { - /* если данные совпадают и установлен MDB_NODUPDATA */ - rc = MDB_KEYEXIST; - goto bailout; - } - } else { - /* если данные совпадают, то ничего делать не надо */ - if (new_data && mdbx_is_samedata(&present_data, new_data)) { - *old_data = *new_data; - goto bailout; - } - flags |= MDB_CURRENT; - } + } + } else { + /* в old_data буфер для сохранения предыдущего значения */ + if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) + return EINVAL; + MDB_val present_data; + rc = mdbx_cursor_get(&mc, &present_key, &present_data, MDB_SET_KEY); + if (unlikely(rc != MDB_SUCCESS)) { + old_data->iov_base = NULL; + old_data->iov_len = rc; + if (rc != MDB_NOTFOUND || (flags & MDB_CURRENT)) + goto bailout; + } else if (flags & MDB_NOOVERWRITE) { + rc = MDB_KEYEXIST; + *old_data = present_data; + goto bailout; + } else { + MDB_page *page = mc.mc_pg[mc.mc_top]; + if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { + if (flags & MDB_CURRENT) { + /* для не-уникальных ключей позволяем update/delete только если ключ + * один */ + MDB_node *leaf = NODEPTR(page, mc.mc_ki[mc.mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_tassert(txn, XCURSOR_INITED(&mc) && + mc.mc_xcursor->mx_db.md_entries > 1); + if (mc.mc_xcursor->mx_db.md_entries > 1) { + rc = MDBX_EMULTIVAL; + goto bailout; + } + } + /* если данные совпадают, то ничего делать не надо */ + if (new_data && mdbx_is_samedata(&present_data, new_data)) { + *old_data = *new_data; + goto bailout; + } + /* В оригинальной LMDB фладок MDB_CURRENT здесь приведет + * к замене данных без учета MDB_DUPSORT сортировки, + * но здесь это в любом случае допустимо, так как мы + * проверили что для ключа есть только одно значение. 
*/ + } else if ((flags & MDB_NODUPDATA) && + mdbx_is_samedata(&present_data, new_data)) { + /* если данные совпадают и установлен MDB_NODUPDATA */ + rc = MDB_KEYEXIST; + goto bailout; + } + } else { + /* если данные совпадают, то ничего делать не надо */ + if (new_data && mdbx_is_samedata(&present_data, new_data)) { + *old_data = *new_data; + goto bailout; + } + flags |= MDB_CURRENT; + } - if (page->mp_flags & P_DIRTY) { - if (unlikely(old_data->iov_len < present_data.iov_len)) { - old_data->iov_base = NULL; - old_data->iov_len = present_data.iov_len; - rc = MDBX_RESULT_TRUE; - goto bailout; - } - memcpy(old_data->iov_base, present_data.iov_base, present_data.iov_len); - old_data->iov_len = present_data.iov_len; - } else { - *old_data = present_data; - } - } - } + if (page->mp_flags & P_DIRTY) { + if (unlikely(old_data->iov_len < present_data.iov_len)) { + old_data->iov_base = NULL; + old_data->iov_len = present_data.iov_len; + rc = MDBX_RESULT_TRUE; + goto bailout; + } + memcpy(old_data->iov_base, present_data.iov_base, present_data.iov_len); + old_data->iov_len = present_data.iov_len; + } else { + *old_data = present_data; + } + } + } - if (likely(new_data)) - rc = mdbx_cursor_put(&mc, key, new_data, flags); - else - rc = mdbx_cursor_del(&mc, 0); + if (likely(new_data)) + rc = mdbx_cursor_put(&mc, key, new_data, flags); + else + rc = mdbx_cursor_del(&mc, 0); bailout: - txn->mt_cursors[dbi] = mc.mc_next; - return rc; + txn->mt_cursors[dbi] = mc.mc_next; + return rc; } -int -mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *data, int* values_count) -{ - DKBUF; - mdb_debug("===> get db %u key [%s]", dbi, DKEY(key)); +int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + int *values_count) { + DKBUF; + mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); - if (unlikely(!key || !data || !txn)) - return EINVAL; + if (unlikely(!key || !data || !txn)) + return EINVAL; - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return 
MDB_VERSION_MISMATCH; + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; + if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; - MDB_cursor mc; - MDB_xcursor mx; - mdb_cursor_init(&mc, txn, dbi, &mx); + MDB_cursor mc; + MDB_xcursor mx; + mdbx_cursor_init(&mc, txn, dbi, &mx); - int exact = 0; - int rc = mdb_cursor_set(&mc, key, data, MDB_SET_KEY, &exact); - if (unlikely(rc != MDB_SUCCESS)) { - if (rc == MDB_NOTFOUND && values_count) - *values_count = 0; - return rc; - } + int exact = 0; + int rc = mdbx_cursor_set(&mc, key, data, MDB_SET_KEY, &exact); + if (unlikely(rc != MDB_SUCCESS)) { + if (rc == MDB_NOTFOUND && values_count) + *values_count = 0; + return rc; + } - if (values_count) { - *values_count = 1; - if (mc.mc_xcursor != NULL) { - MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - mdb_tassert(txn, mc.mc_xcursor == &mx - && (mx.mx_cursor.mc_flags & C_INITIALIZED)); - *values_count = mx.mx_db.md_entries; - } - } - } - return MDB_SUCCESS; + if (values_count) { + *values_count = 1; + if (mc.mc_xcursor != NULL) { + MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_tassert(txn, mc.mc_xcursor == &mx && + (mx.mx_cursor.mc_flags & C_INITIALIZED)); + *values_count = mx.mx_db.md_entries; + } + } + } + return MDB_SUCCESS; } /* Функция сообщает находится ли указанный адрес в "грязной" странице у @@ -660,80 +11518,79 @@ mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, * так гарантируется что актуальный заголовок страницы будет физически * расположен в той-же странице памяти, в том числе для многостраничных * P_OVERFLOW страниц с длинными данными. 
*/ -int mdbx_is_dirty(const MDB_txn *txn, const void* ptr) -{ - if (unlikely(!txn)) - return EINVAL; +int mdbx_is_dirty(const MDB_txn *txn, const void *ptr) { + if (unlikely(!txn)) + return EINVAL; - if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDB_VERSION_MISMATCH; - if (unlikely(txn->mt_flags & MDB_TXN_RDONLY)) - return MDB_BAD_TXN; + if (unlikely(txn->mt_flags & MDB_TXN_RDONLY)) + return MDB_BAD_TXN; - const MDB_env *env = txn->mt_env; - const uintptr_t mask = ~(uintptr_t) (env->me_psize - 1); - const MDB_page *page = (const MDB_page *) ((uintptr_t) ptr & mask); + const MDB_env *env = txn->mt_env; + const uintptr_t mask = ~(uintptr_t)(env->me_psize - 1); + const MDB_page *page = (const MDB_page *)((uintptr_t)ptr & mask); - /* LY: Тут не всё хорошо с абсолютной достоверностью результата, - * так как флажок P_DIRTY в LMDB может означать не совсем то, - * что было исходно задумано, детали см в логике кода mdb_page_touch(). - * - * Более того, в режиме БЕЗ WRITEMAP грязные страницы выделяются через - * malloc(), т.е. находятся вне mmap-диаппазона. - * - * Тем не менее, однозначно страница "не грязная" если: - * - адрес находится внутри mmap-диаппазона и в заголовке страницы - * нет флажка P_DIRTY, то однозначно страница "не грязная". - * - адрес вне mmap-диаппазона и его нет среди списка "грязных" страниц. - */ - if (env->me_map < (char*) page) { - const size_t used_size = env->me_psize * txn->mt_next_pgno; - if (env->me_map + used_size > (char*) page) { - /* страница внутри диапазона */ - if (page->mp_flags & P_DIRTY) - return MDBX_RESULT_TRUE; - return MDBX_RESULT_FALSE; - } - /* Гипотетически здесь возможна ситуация, когда указатель адресует что-то - * в пределах mmap, но за границей распределенных страниц. Это тяжелая - * ошибка, которой не возможно добиться без каких-то мега-нарушений. - * Поэтому не проверяем этот случай кроме как assert-ом, ибо бестолку. 
*/ - mdb_tassert(txn, env->me_map + env->me_mapsize > (char*) page); - } - /* Страница вне mmap-диаппазона */ + /* LY: Тут не всё хорошо с абсолютной достоверностью результата, + * так как флажок P_DIRTY в LMDB может означать не совсем то, + * что было исходно задумано, детали см в логике кода mdbx_page_touch(). + * + * Более того, в режиме БЕЗ WRITEMAP грязные страницы выделяются через + * malloc(), т.е. находятся вне mmap-диаппазона. + * + * Тем не менее, однозначно страница "не грязная" если: + * - адрес находится внутри mmap-диаппазона и в заголовке страницы + * нет флажка P_DIRTY, то однозначно страница "не грязная". + * - адрес вне mmap-диаппазона и его нет среди списка "грязных" страниц. + */ + if (env->me_map < (char *)page) { + const size_t used_size = env->me_psize * txn->mt_next_pgno; + if (env->me_map + used_size > (char *)page) { + /* страница внутри диапазона */ + if (page->mp_flags & P_DIRTY) + return MDBX_RESULT_TRUE; + return MDBX_RESULT_FALSE; + } + /* Гипотетически здесь возможна ситуация, когда указатель адресует что-то + * в пределах mmap, но за границей распределенных страниц. Это тяжелая + * ошибка, которой не возможно добиться без каких-то мега-нарушений. + * Поэтому не проверяем этот случай кроме как assert-ом, ибо бестолку. */ + mdbx_tassert(txn, env->me_map + env->me_mapsize > (char *)page); + } + /* Страница вне mmap-диаппазона */ - if (env->me_flags & MDB_WRITEMAP) - /* Если MDB_WRITEMAP, то результат уже ясен. */ - return MDBX_RESULT_FALSE; + if (env->me_flags & MDB_WRITEMAP) + /* Если MDB_WRITEMAP, то результат уже ясен. */ + return MDBX_RESULT_FALSE; - /* Смотрим список грязных страниц у заданной транзакции. */ - MDB_ID2 *list = txn->mt_u.dirty_list; - if (list) { - unsigned i, n = list[0].mid; - for (i = 1; i <= n; i++) { - const MDB_page *dirty = list[i].mptr; - if (dirty == page) - return MDBX_RESULT_TRUE; - } - } + /* Смотрим список грязных страниц у заданной транзакции. 
*/ + MDB_ID2 *list = txn->mt_u.dirty_list; + if (list) { + unsigned i, n = list[0].mid; + for (i = 1; i <= n; i++) { + const MDB_page *dirty = list[i].mptr; + if (dirty == page) + return MDBX_RESULT_TRUE; + } + } - /* При вложенных транзакциях, страница может быть в dirty-списке - * родительской транзакции, но в этом случае она будет скопирована перед - * изменением в текущей транзакции, т.е. относительно заданной транзакции - * проверяемый адрес "не грязный". */ - return MDBX_RESULT_FALSE; + /* При вложенных транзакциях, страница может быть в dirty-списке + * родительской транзакции, но в этом случае она будет скопирована перед + * изменением в текущей транзакции, т.е. относительно заданной транзакции + * проверяемый адрес "не грязный". */ + return MDBX_RESULT_FALSE; } int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, - MDB_dbi *pdbi, MDB_cmp_func *keycmp, MDB_cmp_func *datacmp) -{ - int rc = mdbx_dbi_open(txn, name, flags, pdbi); - if (likely(rc == MDB_SUCCESS)) { - MDB_dbi dbi = *pdbi; - unsigned flags = txn->mt_dbs[dbi].md_flags; - txn->mt_dbxs[dbi].md_cmp = keycmp ? keycmp : mdbx_default_keycmp(flags); - txn->mt_dbxs[dbi].md_dcmp = datacmp ? datacmp : mdbx_default_datacmp(flags); - } - return rc; + MDB_dbi *pdbi, MDB_cmp_func *keycmp, + MDB_cmp_func *datacmp) { + int rc = mdbx_dbi_open(txn, name, flags, pdbi); + if (likely(rc == MDB_SUCCESS)) { + MDB_dbi dbi = *pdbi; + unsigned flags = txn->mt_dbs[dbi].md_flags; + txn->mt_dbxs[dbi].md_cmp = keycmp ? keycmp : mdbx_default_keycmp(flags); + txn->mt_dbxs[dbi].md_dcmp = datacmp ? datacmp : mdbx_default_datacmp(flags); + } + return rc; } diff --git a/mdbx.h b/mdbx.h index 515e819e..b86136a4 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1,7 +1,13 @@ /* * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. 
+ * + * This code is derived from "LMDB engine" written by + * Howard Chu (Symas Corporation), which itself derived from btree.c + * written by Martin Hedenfalk. + * + * --- + * + * Portions Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -10,214 +16,1740 @@ * A copy of this license is available in the file LICENSE in the * top-level directory of the distribution or, alternatively, at * . + * + * --- + * + * Portions Copyright (c) 2009, 2010 Martin Hedenfalk + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -/* - This is solution to provide flexible compatibility with the original liblmdb. - Yeah, this way is partially ugly and madness... - - But, on the other hand, only this way allows provide both API with - minimal changes the source code of an applications, and the source - code of the library itself. Anyway, ideas are welcome! - - So, - - When needed drop-in replacement for liblmdb you should: - - 'make lmdb' to build liblmdb.so and liblmdb.a; - - #include and use mdb_* functions; - - linking with liblmdb.so or liblmdb.a; - - = This provides nearly full compatibility with - original LMDB from Symas Corp. 
- But you should be noted - such compatibility - is not a goal for MDBX. - - When exactly the libmdbx is needed, you should: - - 'make mdbx' to build libmdbx.so and libmdbx.a; - - #include and use mdbx_* functions; - - linking with libmdbx.so or libmdbx.a; - - = This allows using (linking) both MDBX and LMDB - simultaneously in the one application, for instance - to benchmarking and/or comparison. -*/ - +#pragma once #ifndef _MDBX_H_ #define _MDBX_H_ #define MDBX_MODE_ENABLED 1 -#ifndef _GNU_SOURCE -# define _GNU_SOURCE -#endif - -/** @defgroup mdbx MDBX API - * @{ - * @brief libmdbx - Extended version of LMDB - */ - -#define mdb_version mdbx_version -#define mdb_strerror mdbx_strerror -#define mdb_env_create mdbx_env_create -#define mdb_env_open mdbx_env_open -#define mdb_env_open_ex mdbx_env_open_ex -#define mdb_env_copy mdbx_env_copy -#define mdb_env_copyfd mdbx_env_copyfd -#define mdb_env_copy2 mdbx_env_copy2 -#define mdb_env_copyfd2 mdbx_env_copyfd2 -#define mdb_env_sync mdbx_env_sync -#define mdb_env_close mdbx_env_close -#define mdb_env_set_flags mdbx_env_set_flags -#define mdb_env_get_flags mdbx_env_get_flags -#define mdb_env_get_path mdbx_env_get_path -#define mdb_env_get_fd mdbx_env_get_fd -#define mdb_env_set_mapsize mdbx_env_set_mapsize -#define mdb_env_set_maxreaders mdbx_env_set_maxreaders -#define mdb_env_get_maxreaders mdbx_env_get_maxreaders -#define mdb_env_set_maxdbs mdbx_env_set_maxdbs -#define mdb_env_get_maxkeysize mdbx_env_get_maxkeysize -#define mdb_env_set_userctx mdbx_env_set_userctx -#define mdb_env_get_userctx mdbx_env_get_userctx -#define mdb_env_set_assert mdbx_env_set_assert -#define mdb_txn_begin mdbx_txn_begin -#define mdb_txn_env mdbx_txn_env -#define mdb_txn_id mdbx_txn_id -#define mdb_txn_commit mdbx_txn_commit -#define mdb_txn_abort mdbx_txn_abort -#define mdb_txn_reset mdbx_txn_reset -#define mdb_txn_renew mdbx_txn_renew -#define mdb_dbi_open mdbx_dbi_open -#define mdb_dbi_flags mdbx_dbi_flags -#define mdb_dbi_close 
mdbx_dbi_close -#define mdb_drop mdbx_drop -#define mdb_set_compare mdbx_set_compare -#define mdb_set_dupsort mdbx_set_dupsort -#define mdb_set_relfunc mdbx_set_relfunc -#define mdb_set_relctx mdbx_set_relctx -#define mdb_get mdbx_get -#define mdb_put mdbx_put -#define mdb_del mdbx_del -#define mdb_cursor_open mdbx_cursor_open -#define mdb_cursor_close mdbx_cursor_close -#define mdb_cursor_renew mdbx_cursor_renew -#define mdb_cursor_txn mdbx_cursor_txn -#define mdb_cursor_dbi mdbx_cursor_dbi -#define mdb_cursor_get mdbx_cursor_get -#define mdb_cursor_put mdbx_cursor_put -#define mdb_cursor_del mdbx_cursor_del -#define mdb_cursor_count mdbx_cursor_count -#define mdb_cmp mdbx_cmp -#define mdb_dcmp mdbx_dcmp -#define mdb_reader_list mdbx_reader_list -#define mdb_reader_check mdbx_reader_check -#define mdb_dkey mdbx_dkey - -/** Compat with version <= 0.9.4, avoid clash with libmdb from MDB Tools project */ -#define mdbx_open(txn,name,flags,dbi) mdbx_dbi_open(txn,name,flags,dbi) -#define mdbx_close(env,dbi) mdbx_dbi_close(env,dbi) - -#include "./lmdb.h" +#include +#include +#include +#include +#include #ifdef __cplusplus extern "C" { #endif -int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, mode_t mode, int *exclusive); +/** Library major version */ +#define MDB_VERSION_MAJOR 0 +/** Library minor version */ +#define MDB_VERSION_MINOR 9 +/** Library patch version */ +#define MDB_VERSION_PATCH 19 + +/** Combine args a,b,c into a single integer for easy version comparisons */ +#define MDB_VERINT(a, b, c) (((a) << 24) | ((b) << 16) | (c)) + +/** The full library version as a single integer */ +#define MDB_VERSION_FULL \ + MDB_VERINT(MDB_VERSION_MAJOR, MDB_VERSION_MINOR, MDB_VERSION_PATCH) + +/** The release date of this library version */ +#define MDB_VERSION_DATE "DEVEL" + +/** A stringifier for the version info */ +#define MDB_VERSTR(a, b, c, d) \ + "MDBX " #a "." #b "." 
#c ": (" d ", https://github.com/ReOpen/libmdbx)" + +/** A helper for the stringifier macro */ +#define MDB_VERFOO(a, b, c, d) MDB_VERSTR(a, b, c, d) + +/** The full library version as a C string */ +#define MDB_VERSION_STRING \ + MDB_VERFOO(MDB_VERSION_MAJOR, MDB_VERSION_MINOR, MDB_VERSION_PATCH, \ + MDB_VERSION_DATE) +/** @} */ + +/** @brief Opaque structure for a database environment. + * + * A DB environment supports multiple databases, all residing in the same + * shared-memory map. + */ +typedef struct MDB_env MDB_env; + +/** @brief Opaque structure for a transaction handle. + * + * All database operations require a transaction handle. Transactions may be + * read-only or read-write. + */ +typedef struct MDB_txn MDB_txn; + +/** @brief A handle for an individual database in the DB environment. */ +typedef unsigned MDB_dbi; + +/** @brief Opaque structure for navigating through a database */ +typedef struct MDB_cursor MDB_cursor; + +/** @brief Generic structure used for passing keys and data in and out + * of the database. + * + * Values returned from the database are valid only until a subsequent + * update operation, or the end of the transaction. Do not modify or + * free them, they commonly point into the database itself. + * + * Key sizes must be between 1 and #mdbx_env_get_maxkeysize() inclusive. + * The same applies to data sizes in databases with the #MDB_DUPSORT flag. + * Other data items can in theory be from 0 to 0xffffffff bytes long. + */ +typedef struct iovec MDB_val; +#define mv_size iov_len +#define mv_data iov_base + +/** @brief A callback function used to compare two keys in a database */ +typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); + +/** @brief A callback function used to relocate a position-dependent data item + * in a fixed-address database. + * + * The \b newptr gives the item's desired address in + * the memory map, and \b oldptr gives its previous address. The item's actual + * data resides at the address in \b item. 
This callback is expected to walk + * through the fields of the record in \b item and modify any + * values based at the \b oldptr address to be relative to the \b newptr + * address. + * @param[in,out] item The item that is to be relocated. + * @param[in] oldptr The previous address. + * @param[in] newptr The new address to relocate to. + * @param[in] relctx An application-provided context, set by + * #mdbx_set_relctx(). + * @todo This feature is currently unimplemented. + */ +typedef void(MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, + void *relctx); + +/** @defgroup mdbx_env Environment Flags + * @{ + */ +/** mmap at a fixed address (experimental) */ +#define MDB_FIXEDMAP 0x01 +/** no environment directory */ +#define MDB_NOSUBDIR 0x4000 +/** don't fsync after commit */ +#define MDB_NOSYNC 0x10000 +/** read only */ +#define MDB_RDONLY 0x20000 +/** don't fsync metapage after commit */ +#define MDB_NOMETASYNC 0x40000 +/** use writable mmap */ +#define MDB_WRITEMAP 0x80000 +/** use asynchronous msync when #MDB_WRITEMAP is used */ +#define MDB_MAPASYNC 0x100000 +/** tie reader locktable slots to #MDB_txn objects instead of to threads */ +#define MDB_NOTLS 0x200000 +/** don't do any locking, caller must manage their own locks + * WARNING: libmdbx don't support this mode. 
*/ +#define MDB_NOLOCK__UNSUPPORTED 0x400000 +/** don't do readahead */ +#define MDB_NORDAHEAD 0x800000 +/** don't initialize malloc'd memory before writing to datafile */ +#define MDB_NOMEMINIT 0x1000000 + +#if MDBX_MODE_ENABLED +/** aim to coalesce FreeDB records */ +#define MDBX_COALESCE 0x2000000 +/** LIFO policy for reclaiming FreeDB records */ +#define MDBX_LIFORECLAIM 0x4000000 +#endif /* MDBX_MODE_ENABLED */ + +/** make a steady-sync only on close and explicit env-sync */ +#define MDBX_UTTERLY_NOSYNC (MDB_NOSYNC | MDB_MAPASYNC) +/** debugging option, fill/perturb released pages */ +#define MDBX_PAGEPERTURB 0x8000000 +/** @} */ + +/** @defgroup mdbx_dbi_open Database Flags + * @{ + */ +/** use reverse string keys */ +#define MDB_REVERSEKEY 0x02 +/** use sorted duplicates */ +#define MDB_DUPSORT 0x04 +/** numeric keys in native byte order, either unsigned int or #mdbx_size_t. + * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdbx_size_t.) + * The keys must all be of the same size. */ +#define MDB_INTEGERKEY 0x08 +/** with #MDB_DUPSORT, sorted dup items have fixed size */ +#define MDB_DUPFIXED 0x10 +/** with #MDB_DUPSORT, dups are #MDB_INTEGERKEY-style integers */ +#define MDB_INTEGERDUP 0x20 +/** with #MDB_DUPSORT, use reverse string dups */ +#define MDB_REVERSEDUP 0x40 +/** create DB if not already existing */ +#define MDB_CREATE 0x40000 +/** @} */ + +/** @defgroup mdbx_put Write Flags + * @{ + */ +/** For put: Don't write if the key already exists. */ +#define MDB_NOOVERWRITE 0x10 +/** Only for #MDB_DUPSORT
+ * For put: don't write if the key and data pair already exist.
+ * For mdbx_cursor_del: remove all duplicate data items. + */ +#define MDB_NODUPDATA 0x20 +/** For mdbx_cursor_put: overwrite the current key/data pair + * MDBX allows this flag for mdbx_put() for explicit overwrite/update without + * insertion. */ +#define MDB_CURRENT 0x40 +/** For put: Just reserve space for data, don't copy it. Return a + * pointer to the reserved space. + */ +#define MDB_RESERVE 0x10000 +/** Data is being appended, don't split full pages. */ +#define MDB_APPEND 0x20000 +/** Duplicate data is being appended, don't split full pages. */ +#define MDB_APPENDDUP 0x40000 +/** Store multiple data items in one call. Only for #MDB_DUPFIXED. */ +#define MDB_MULTIPLE 0x80000 +/* @} */ + +/** @defgroup mdbx_copy Copy Flags + * @{ + */ +/** Compacting copy: Omit free space from copy, and renumber all + * pages sequentially. + */ +#define MDB_CP_COMPACT 0x01 +/* @} */ + +/** @brief Cursor Get operations. + * + * This is the set of all operations for retrieving data + * using a cursor. + */ +typedef enum MDB_cursor_op { + MDB_FIRST, /**< Position at first key/data item */ + MDB_FIRST_DUP, /**< Position at first data item of current key. + Only for #MDB_DUPSORT */ + MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ + MDB_GET_BOTH_RANGE, /**< position at key, nearest data. Only for + #MDB_DUPSORT */ + MDB_GET_CURRENT, /**< Return key/data at current cursor position */ + MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items + from current cursor position. Move + cursor to prepare + for #MDB_NEXT_MULTIPLE. Only for + #MDB_DUPFIXED */ + MDB_LAST, /**< Position at last key/data item */ + MDB_LAST_DUP, /**< Position at last data item of current key. + Only for #MDB_DUPSORT */ + MDB_NEXT, /**< Position at next data item */ + MDB_NEXT_DUP, /**< Position at next data item of current key. + Only for #MDB_DUPSORT */ + MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items + from next cursor position. 
Move + cursor to prepare + for #MDB_NEXT_MULTIPLE. Only for + #MDB_DUPFIXED */ + MDB_NEXT_NODUP, /**< Position at first data item of next key */ + MDB_PREV, /**< Position at previous data item */ + MDB_PREV_DUP, /**< Position at previous data item of current key. + Only for #MDB_DUPSORT */ + MDB_PREV_NODUP, /**< Position at last data item of previous key */ + MDB_SET, /**< Position at specified key */ + MDB_SET_KEY, /**< Position at specified key, return key + data */ + MDB_SET_RANGE, /**< Position at first key greater than or equal to specified + key. */ + MDB_PREV_MULTIPLE /**< Position at previous page and return key and up to + a page of duplicate data items. + Only for #MDB_DUPFIXED */ +} MDB_cursor_op; + +/** @defgroup errors Return Codes + * + * BerkeleyDB uses -30800 to -30999, we'll go under them + * @{ + */ +/** Successful result */ +#define MDB_SUCCESS 0 +/** key/data pair already exists */ +#define MDB_KEYEXIST (-30799) +/** key/data pair not found (EOF) */ +#define MDB_NOTFOUND (-30798) +/** Requested page not found - this usually indicates corruption */ +#define MDB_PAGE_NOTFOUND (-30797) +/** Located page was wrong type */ +#define MDB_CORRUPTED (-30796) +/** Update of meta page failed or environment had fatal error */ +#define MDB_PANIC (-30795) +/** Environment version mismatch */ +#define MDB_VERSION_MISMATCH (-30794) +/** File is not a valid LMDB file */ +#define MDB_INVALID (-30793) +/** Environment mapsize reached */ +#define MDB_MAP_FULL (-30792) +/** Environment maxdbs reached */ +#define MDB_DBS_FULL (-30791) +/** Environment maxreaders reached */ +#define MDB_READERS_FULL (-30790) +/** Txn has too many dirty pages */ +#define MDB_TXN_FULL (-30788) +/** Cursor stack too deep - internal error */ +#define MDB_CURSOR_FULL (-30787) +/** Page has not enough space - internal error */ +#define MDB_PAGE_FULL (-30786) +/** Database contents grew beyond environment mapsize */ +#define MDB_MAP_RESIZED (-30785) +/** Operation and DB incompatible, or DB 
type changed. This can mean: + *
    + *
  • The operation expects an #MDB_DUPSORT / #MDB_DUPFIXED database. + *
  • Opening a named DB when the unnamed DB has #MDB_DUPSORT / + *#MDB_INTEGERKEY. + *
  • Accessing a data record as a database, or vice versa. + *
  • The database was dropped and recreated with different flags. + *
+ */ +#define MDB_INCOMPATIBLE (-30784) +/** Invalid reuse of reader locktable slot */ +#define MDB_BAD_RSLOT (-30783) +/** Transaction must abort, has a child, or is invalid */ +#define MDB_BAD_TXN (-30782) +/** Unsupported size of key/DB name/data, or wrong DUPFIXED size */ +#define MDB_BAD_VALSIZE (-30781) +/** The specified DBI was changed unexpectedly */ +#define MDB_BAD_DBI (-30780) +/** Unexpected problem - txn should abort */ +#define MDB_PROBLEM (-30779) +/** The last defined error code */ +#define MDB_LAST_ERRCODE MDB_PROBLEM +/** @} */ + +/** @brief Statistics for a database in the environment */ +typedef struct MDBX_stat { + unsigned ms_psize; /**< Size of a database page. + This is currently the + same for all databases. */ + unsigned ms_depth; /**< Depth (height) of the B-tree */ + size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ + size_t ms_leaf_pages; /**< Number of leaf pages */ + size_t ms_overflow_pages; /**< Number of overflow pages */ + size_t ms_entries; /**< Number of data items */ +} MDBX_stat; + +/** @brief Information about the environment */ +typedef struct MDBX_envinfo { + void *me_mapaddr; /**< Address of map, if fixed */ + size_t me_mapsize; /**< Size of the data memory map */ + size_t me_last_pgno; /**< ID of the last used page */ + size_t me_last_txnid; /**< ID of the last committed transaction */ + unsigned me_maxreaders; /**< max reader slots in the environment */ + unsigned me_numreaders; /**< max reader slots used in the environment */ + size_t me_tail_txnid; /**< ID of the last reader transaction */ + size_t me_meta1_txnid, me_meta1_sign; + size_t me_meta2_txnid, me_meta2_sign; +} MDBX_envinfo; + +/** @brief Return the LMDB library version information. 
+ * + * @param[out] major if non-NULL, the library major version number is copied + * here + * @param[out] minor if non-NULL, the library minor version number is copied + * here + * @param[out] patch if non-NULL, the library patch version number is copied + * here + * @retval "version string" The library version as a string + */ +char *mdbx_version(int *major, int *minor, int *patch); + +/** @brief Return a string describing a given error code. + * + * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) + * function. If the error code is greater than or equal to 0, then the string + * returned by the system function strerror(3) is returned. If the error code + * is less than 0, an error string corresponding to the LMDB library error is + * returned. See @ref errors for a list of LMDB-specific error codes. + * @param[in] err The error code + * @retval "error message" The description of the error + */ +char *mdbx_strerror(int err); + +/** @brief Create an LMDB environment handle. + * + * This function allocates memory for a #MDB_env structure. To release + * the allocated memory and discard the handle, call #mdbx_env_close(). + * Before the handle may be used, it must be opened using #mdbx_env_open(). + * Various other options may also need to be set before opening the handle, + * e.g. #mdbx_env_set_mapsize(), #mdbx_env_set_maxreaders(), + * #mdbx_env_set_maxdbs(), + * depending on usage requirements. + * @param[out] env The address where the new handle will be stored + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_create(MDB_env **env); + +/** @brief Open an environment handle. + * + * If this function fails, #mdbx_env_close() must be called to discard the + *#MDB_env handle. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] path The directory in which the database files reside. This + * directory must already exist and be writable. 
+ * @param[in] flags Special options for this environment. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + * Flags set by mdbx_env_set_flags() are also used. + *
    + *
  • #MDB_FIXEDMAP + * use a fixed address for the mmap region. This flag must be specified + * when creating the environment, and is stored persistently in the + *environment. + * If successful, the memory map will always reside at the same + *virtual address + * and pointers used to reference data items in the database will + *be constant + * across multiple invocations. This option may not always work, + *depending on + * how the operating system has allocated memory to shared + *libraries and other uses. + * The feature is highly experimental. + *
  • #MDB_NOSUBDIR + * By default, LMDB creates its environment in a directory whose + * pathname is given in \b path, and creates its data and lock + *files + * under that directory. With this option, \b path is used as-is + *for + * the database main data file. The database lock file is the \b + *path + * with "-lock" appended. + *
  • #MDB_RDONLY + * Open the environment in read-only mode. No write operations will + *be + * allowed. LMDB will still modify the lock file - except on + *read-only + * filesystems, where LMDB does not use locks. + *
  • #MDB_WRITEMAP + * Use a writeable memory map unless MDB_RDONLY is set. This uses + * fewer mallocs but loses protection from application bugs + * like wild pointer writes and other bad updates into the + *database. + * This may be slightly faster for DBs that fit entirely in RAM, + *but + * is slower for DBs larger than RAM. + * Incompatible with nested transactions. + * Do not mix processes with and without MDB_WRITEMAP on the same + * environment. This can defeat durability (#mdbx_env_sync etc). + *
  • #MDB_NOMETASYNC + * Flush system buffers to disk only once per transaction, omit + *the + * metadata flush. Defer that until the system flushes files to + *disk, + * or next non-MDB_RDONLY commit or #mdbx_env_sync(). This + *optimization + * maintains database integrity, but a system crash may undo the + *last + * committed transaction. I.e. it preserves the ACI (atomicity, + * consistency, isolation) but not D (durability) database + *property. + * This flag may be changed at any time using + *#mdbx_env_set_flags(). + *
  • #MDB_NOSYNC + * Don't flush system buffers to disk when committing a + *transaction. + * This optimization means a system crash can corrupt the database + *or + * lose the last transactions if buffers are not yet flushed to + *disk. + * The risk is governed by how often the system flushes dirty + *buffers + * to disk and how often #mdbx_env_sync() is called. However, if + *the + * filesystem preserves write order and the #MDB_WRITEMAP flag is + *not + * used, transactions exhibit ACI (atomicity, consistency, + *isolation) + * properties and only lose D (durability). I.e. database + *integrity + * is maintained, but a system crash may undo the final + *transactions. + * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with + *no + * hint for when to write transactions to disk, unless + *#mdbx_env_sync() + * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. + * This flag may be changed at any time using + *#mdbx_env_set_flags(). + *
  • #MDB_MAPASYNC + * When using #MDB_WRITEMAP, use asynchronous flushes to disk. + * As with #MDB_NOSYNC, a system crash can then corrupt the + * database or lose the last transactions. Calling + *#mdbx_env_sync() + * ensures on-disk database integrity until next commit. + * This flag may be changed at any time using + *#mdbx_env_set_flags(). + *
  • #MDB_NOTLS + * Don't use Thread-Local Storage. Tie reader locktable slots to + * #MDB_txn objects instead of to threads. I.e. #mdbx_txn_reset() + *keeps + * the slot reserved for the #MDB_txn object. A thread may use + *parallel + * read-only transactions. A read-only transaction may span threads + *if + * the user synchronizes its use. Applications that multiplex + *many + * user threads over individual OS threads need this option. Such + *an + * application must also serialize the write transactions in an + *OS + * thread, since LMDB's write locking is unaware of the user + *threads. + *
  • #MDB_NOLOCK + * Don't do any locking. If concurrent access is anticipated, the + * caller must manage all concurrency itself. For proper + *operation + * the caller must enforce single-writer semantics, and must + *ensure + * that no readers are using old transactions while a writer is + * active. The simplest approach is to use an exclusive lock so + *that + * no readers may be active at all when a writer begins. + *
  • #MDB_NORDAHEAD + * Turn off readahead. Most operating systems perform readahead + *on + * read requests by default. This option turns it off if the OS + * supports it. Turning it off may help random read performance + * when the DB is larger than RAM and system RAM is full. + *
  • #MDB_NOMEMINIT + * Don't initialize malloc'd memory before writing to unused + *spaces + * in the data file. By default, memory for pages written to the + *data + * file is obtained using malloc. While these pages may be reused + *in + * subsequent transactions, freshly malloc'd pages will be + *initialized + * to zeroes before use. This avoids persisting leftover data from + *other + * code (that used the heap and subsequently freed the memory) into + *the + * data file. Note that many other system libraries may allocate + * and free memory from the heap for arbitrary uses. E.g., stdio + *may + * use the heap for file I/O buffers. This initialization step has + *a + * modest performance cost so some applications may want to + *disable + * it using this flag. This option can be a problem for + *applications + * which handle sensitive data like passwords, and it makes + *memory + * checkers like Valgrind noisy. This flag is not needed with + *#MDB_WRITEMAP, + * which writes directly to the mmap instead of using malloc for + *pages. The + * initialization is also skipped if #MDB_RESERVE is used; the + * caller is expected to overwrite all of the memory that was + * reserved in that case. + * This flag may be changed at any time using + *#mdbx_env_set_flags(). + *
  • #MDBX_COALESCE + * Aim to coalesce records while reclaiming FreeDB. + * This flag may be changed at any time using + *#mdbx_env_set_flags(). + *
  • #MDBX_LIFORECLAIM + * LIFO policy for reclaiming FreeDB records. This significantly + *reduces + * write IOPS in case MDB_NOSYNC with periodic checkpoints. + *
+ * @param[in] mode The UNIX permissions to set on created files and + *semaphores. + * This parameter is ignored on Windows. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_VERSION_MISMATCH - the version of the LMDB library doesn't + *match the + * version that created the database environment. + *
  • #MDB_INVALID - the environment file headers are corrupted. + *
  • ENOENT - the directory specified by the path parameter doesn't + *exist. + *
  • EACCES - the user didn't have permission to access the environment + *files. + *
  • EAGAIN - the environment was locked by another process. + *
+ */ +int mdbx_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode); +int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, + mode_t mode, int *exclusive); + +/** @brief Copy an LMDB environment to the specified path. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdbx_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_copy(MDB_env *env, const char *path); + +/** @brief Copy an LMDB environment to the specified file descriptor. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdbx_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_copyfd(MDB_env *env, int fd); + +/** @brief Copy an LMDB environment to the specified path, with options. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. 
+ * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdbx_env_create(). It + * must have already been opened successfully. + * @param[in] path The directory in which the copy will reside. This + * directory must already exist and be writable but must otherwise be + * empty. + * @param[in] flags Special options for this operation. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_CP_COMPACT - Perform compaction while copying: omit free + * pages and sequentially renumber all pages in output. This + *option + * consumes more CPU and runs more slowly than the default. + * Currently it fails if the environment has suffered a page + *leak. + *
+ * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags); + +/** @brief Copy an LMDB environment to the specified file descriptor, + * with options. + * + * This function may be used to make a backup of an existing environment. + * No lockfile is created, since it gets recreated at need. See + * #mdbx_env_copy2() for further details. + * @note This call can trigger significant file size growth if run in + * parallel with write transactions, because it employs a read-only + * transaction. See long-lived transactions under @ref caveats_sec. + * @param[in] env An environment handle returned by #mdbx_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @param[in] flags Special options for this operation. + * See #mdbx_env_copy2() for options. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_copyfd2(MDB_env *env, int fd, unsigned flags); + +/** @brief Return statistics about the LMDB environment. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] stat The address of an #MDB_stat structure + * where the statistics will be copied + */ int mdbx_env_stat(MDB_env *env, MDBX_stat *stat, size_t bytes); -int mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, size_t bytes); + +/** @brief Return information about the LMDB environment. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] stat The address of an #MDB_envinfo structure + * where the information will be copied + */ int mdbx_env_info(MDB_env *env, MDBX_envinfo *info, size_t bytes); + +/** @brief Flush the data buffers to disk. + * + * Data is always written to disk when #mdbx_txn_commit() is called, + * but the operating system may keep it buffered. 
LMDB always flushes + * the OS buffers upon commit as well, unless the environment was + * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. This call is + * not valid if the environment was opened with #MDB_RDONLY. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] force If non-zero, force a synchronous flush. Otherwise + * if the environment has the #MDB_NOSYNC flag set the flushes + * will be omitted, and with #MDB_MAPASYNC they will be asynchronous. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EACCES - the environment is read-only. + *
  • EINVAL - an invalid parameter was specified. + *
  • EIO - an error occurred during synchronization. + *
+ */ +int mdbx_env_sync(MDB_env *env, int force); + +/** @brief Close the environment and release the memory map. + * + * Only a single thread may call this function. All transactions, databases, + * and cursors must already be closed before calling this function. Attempts + * to + * use any such handles after calling this function will cause a SIGSEGV. + * The environment handle will be freed and must not be used again after this + * call. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] dont_sync A don't-sync flag; if non-zero the last checkpoint + * (meta-page update) will be kept "as is" and may still be "weak" + * in NOSYNC/MAPASYNC modes. Such "weak" checkpoint will be ignored + * on opening next time, and transactions since the last non-weak + * checkpoint (meta-page update) will be rolled back to guarantee consistency. + */ +void mdbx_env_close(MDB_env *env); + +/** @brief Set environment flags. + * + * This may be used to set some flags in addition to those from + * #mdbx_env_open(), or to unset these flags. If several threads + * change the flags at the same time, the result is undefined. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] flags The flags to change, bitwise OR'ed together + * @param[in] onoff A non-zero value sets the flags, zero clears them. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff); + +/** @brief Get environment flags. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] flags The address of an integer to store the flags + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_env_get_flags(MDB_env *env, unsigned *flags); + +/** @brief Return the path that was used in #mdbx_env_open(). + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] path Address of a string pointer to contain the path. This + * is the actual string in the environment, not a copy. It should not be + * altered in any way. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_env_get_path(MDB_env *env, const char **path); + +/** @brief Return the filedescriptor for the given environment. + * + * This function may be called after fork(), so the descriptor can be + * closed before exec*(). Other LMDB file descriptors have FD_CLOEXEC. + * (Until LMDB 0.9.18, only the lockfile had that.) + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] fd Address of a int to contain the descriptor. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_env_get_fd(MDB_env *env, int *fd); + +/** @brief Set the size of the memory map to use for this environment. + * + * The size should be a multiple of the OS page size. The default is + * 10485760 bytes. The size of the memory map is also the maximum size + * of the database. The value should be chosen as large as possible, + * to accommodate future growth of the database. + * This function should be called after #mdbx_env_create() and before + *#mdbx_env_open(). + * It may be called at later times if no transactions are active in + * this process. Note that the library does not check for this condition, + * the caller must ensure it explicitly. + * + * The new size takes effect immediately for the current process but + * will not be persisted to any others until a write transaction has been + * committed by the current process. Also, only mapsize increases are + * persisted into the environment. + * + * If the mapsize is increased by another process, and data has grown + * beyond the range of the current mapsize, #mdbx_txn_begin() will + * return #MDB_MAP_RESIZED. This function may be called with a size + * of zero to adopt the new size. + * + * Any attempt to set a size smaller than the space already consumed + * by the environment will be silently changed to the current size of the used + *space. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] size The size in bytes + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment + *has + * an active write transaction. + *
+ */ +int mdbx_env_set_mapsize(MDB_env *env, size_t size); + +/** @brief Set the maximum number of threads/reader slots for the environment. + * + * This defines the number of slots in the lock table that is used to track + *readers in + * the environment. The default is 126. + * Starting a read-only transaction normally ties a lock table slot to the + * current thread until the environment closes or the thread exits. If + * MDB_NOTLS is in use, #mdbx_txn_begin() instead ties the slot to the + * MDB_txn object until it or the #MDB_env object is destroyed. + * This function may only be called after #mdbx_env_create() and before + *#mdbx_env_open(). + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] readers The maximum number of reader lock table slots + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment is + *already open. + *
+ */ +int mdbx_env_set_maxreaders(MDB_env *env, unsigned readers); + +/** @brief Get the maximum number of threads/reader slots for the environment. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] readers Address of an integer to store the number of readers + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers); + +/** @brief Set the maximum number of named databases for the environment. + * + * This function is only needed if multiple databases will be used in the + * environment. Simpler applications that use the environment as a single + * unnamed database can ignore this option. + * This function may only be called after #mdbx_env_create() and before + *#mdbx_env_open(). + * + * Currently a moderate number of slots are cheap but a huge number gets + * expensive: 7-120 words per transaction, and every #mdbx_dbi_open() + * does a linear search of the opened slots. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] dbs The maximum number of databases + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified, or the environment is + *already open. + *
+ */ +int mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); + +/** @brief Get the maximum size of keys and #MDB_DUPSORT data we can write. + * + * Depends on the compile-time constant #MDB_MAXKEYSIZE. Default 511. + * See @ref MDB_val. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @return The maximum size of a key we can write + */ +int mdbx_env_get_maxkeysize(MDB_env *env); + +/** @brief Set application information associated with the #MDB_env. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] ctx An arbitrary pointer for whatever the application needs. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_set_userctx(MDB_env *env, void *ctx); + +/** @brief Get the application information associated with the #MDB_env. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @return The pointer set by #mdbx_env_set_userctx(). + */ +void *mdbx_env_get_userctx(MDB_env *env); + +/** @brief A callback function for most LMDB assert() failures, + * called before printing the message and aborting. + * + * @param[in] env An environment handle returned by #mdbx_env_create(). + * @param[in] msg The assertion message, not including newline. + */ +typedef void MDB_assert_func(MDB_env *env, const char *msg, + const char *function, unsigned line); + +/** Set or reset the assert() callback of the environment. + * Disabled if liblmdb is buillt with MDB_DEBUG=0. + * @note This hack should become obsolete as lmdb's error handling matures. + * @param[in] env An environment handle returned by #mdbx_env_create(). + * @param[in] func An #MDB_assert_func function, or 0. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); + +/** @brief Create a transaction for use with the environment. + * + * The transaction handle may be discarded using #mdbx_txn_abort() or + *#mdbx_txn_commit(). 
+ * @note A transaction and its cursors must only be used by a single + * thread, and a thread may only have a single transaction at a time. + * If #MDB_NOTLS is in use, this does not apply to read-only transactions. + * @note Cursors may not span transactions. + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] parent If this parameter is non-NULL, the new transaction + * will be a nested transaction, with the transaction indicated by \b parent + * as its parent. Transactions may be nested to any level. A parent + * transaction and its cursors may not issue any other operations than + * mdbx_txn_commit and mdbx_txn_abort while it has active child transactions. + * @param[in] flags Special options for this transaction. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_RDONLY + * This transaction will not perform any write operations. + *
+ * @param[out] txn Address where the new #MDB_txn handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + *
  • #MDB_MAP_RESIZED - another process wrote data beyond this + *MDB_env's + * mapsize and this environment's map must be resized as well. + * See #mdbx_env_set_mapsize(). + *
  • #MDB_READERS_FULL - a read-only transaction was requested and + * the reader lock table is full. See #mdbx_env_set_maxreaders(). + *
  • ENOMEM - out of memory. + *
+ */ +int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, + MDB_txn **txn); + +/** @brief Returns the transaction's #MDB_env + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + */ +MDB_env *mdbx_txn_env(MDB_txn *txn); + +/** @brief Return the transaction's ID. + * + * This returns the identifier associated with this transaction. For a + * read-only transaction, this corresponds to the snapshot being read; + * concurrent readers will frequently have the same transaction ID. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @return A transaction ID, valid if input is an active transaction. + */ +size_t mdbx_txn_id(MDB_txn *txn); + +/** @brief Commit all the operations of a transaction into the database. + * + * The transaction handle is freed. It and its cursors must not be used + * again after this call, except with #mdbx_cursor_renew(). + * + * @note MDBX-mode: + * A cursor must be closed explicitly always, before + * or after its transaction ends. It can be reused with + * #mdbx_cursor_renew() before finally closing it. + * + * @note LMDB-compatible mode: + * Earlier documentation incorrectly said all cursors would be freed. + * Only write-transactions free cursors. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
  • ENOSPC - no more disk space. + *
  • EIO - a low-level I/O error occurred while writing. + *
  • ENOMEM - out of memory. + *
+ */ +int mdbx_txn_commit(MDB_txn *txn); + +/** @brief Abandon all the operations of the transaction instead of saving + * them. + * + * The transaction handle is freed. It and its cursors must not be used + * again after this call, except with #mdbx_cursor_renew(). + * + * @note MDBX-mode: + * A cursor must be closed explicitly always, before + * or after its transaction ends. It can be reused with + * #mdbx_cursor_renew() before finally closing it. + * + * @note LMDB-compatible mode: + * Earlier documentation incorrectly said all cursors would be freed. + * Only write-transactions free cursors. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + */ +int mdbx_txn_abort(MDB_txn *txn); + +/** @brief Reset a read-only transaction. + * + * Abort the transaction like #mdbx_txn_abort(), but keep the transaction + * handle. #mdbx_txn_renew() may reuse the handle. This saves allocation + * overhead if the process will start a new read-only transaction soon, + * and also locking overhead if #MDB_NOTLS is in use. The reader table + * lock is released, but the table slot stays tied to its thread or + * #MDB_txn. Use mdbx_txn_abort() to discard a reset handle, and to free + * its lock table slot if MDB_NOTLS is in use. + * Cursors opened within the transaction must not be used + * again after this call, except with #mdbx_cursor_renew(). + * Reader locks generally don't interfere with writers, but they keep old + * versions of database pages allocated. Thus they prevent the old pages + * from being reused when writers commit new data, and so under heavy load + * the database size may grow much more rapidly than otherwise. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + */ +int mdbx_txn_reset(MDB_txn *txn); + +/** @brief Renew a read-only transaction. + * + * This acquires a new reader lock for a transaction handle that had been + * released by #mdbx_txn_reset(). It must be called before a reset transaction + * may be used again. 
+ * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_txn_renew(MDB_txn *txn); + +/** @brief Open a database in the environment. + * A database handle denotes the name and parameters of a database, + * independently of whether such a database exists. + * The database handle may be discarded by calling #mdbx_dbi_close(). + * The old database handle is returned if the database was already open. + * The handle may only be closed once. + * + * The database handle will be private to the current transaction until + * the transaction is successfully committed. If the transaction is + * aborted the handle will be closed automatically. + * After a successful commit the handle will reside in the shared + * environment, and may be used by other transactions. + * + * This function must not be called from multiple concurrent + * transactions in the same process. A transaction that uses + * this function must finish (either commit or abort) before + * any other transaction in the process may use this function. + * + * To use named databases (with name != NULL), #mdbx_env_set_maxdbs() + * must be called before opening the environment. Database names are + * keys in the unnamed database, and may be read but not written. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] name The name of the database to open. If only a single + * database is needed in the environment, this value may be NULL. + * @param[in] flags Special options for this database. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_REVERSEKEY + * Keys are strings to be compared in reverse order, from the end + * of the strings to the beginning. By default, Keys are treated as + *strings and + * compared from beginning to end. + *
  • #MDB_DUPSORT + * Duplicate keys may be used in the database. (Or, from another + *perspective, + * keys may have multiple data items, stored in sorted order.) By + *default + * keys must be unique and may have only a single data item. + *
  • #MDB_INTEGERKEY + * Keys are binary integers in native byte order, either unsigned + *int + * or #mdbx_size_t, and will be sorted as such. + * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdbx_size_t.) + * The keys must all be of the same size. + *
  • #MDB_DUPFIXED + * This flag may only be used in combination with #MDB_DUPSORT. + *This option + * tells the library that the data items for this database are all + *the same + * size, which allows further optimizations in storage and + *retrieval. When + * all data items are the same size, the #MDB_GET_MULTIPLE, + *#MDB_NEXT_MULTIPLE + * and #MDB_PREV_MULTIPLE cursor operations may be used to retrieve + *multiple + * items at once. + *
  • #MDB_INTEGERDUP + * This option specifies that duplicate data items are binary + *integers, + * similar to #MDB_INTEGERKEY keys. + *
  • #MDB_REVERSEDUP + * This option specifies that duplicate data items should be + *compared as + * strings in reverse order. + *
  • #MDB_CREATE + * Create the named database if it doesn't exist. This option is + *not + * allowed in a read-only transaction or a read-only environment. + *
+ * @param[out] dbi Address where the new #MDB_dbi handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - the specified database doesn't exist in the + *environment + * and #MDB_CREATE was not specified. + *
  • #MDB_DBS_FULL - too many databases have been opened. See + *#mdbx_env_set_maxdbs(). + *
+ */ +int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi); + +/** @brief Retrieve statistics for a database. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[out] stat The address of an #MDB_stat structure + * where the statistics will be copied + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, size_t bytes); + +/** @brief Retrieve the DB flags for a database handle. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[out] flags Address where the flags will be returned. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); + +/** @brief Close a database handle. Normally unnecessary. Use with care: + * + * This call is not mutex protected. Handles should only be closed by + * a single thread, and only if no other threads are going to reference + * the database handle or one of its cursors any further. Do not close + * a handle if an existing transaction has modified its database. + * Doing so can cause misbehavior from database corruption to errors + * like MDB_BAD_VALSIZE (since the DB name is gone). + * + * Closing a database handle is not necessary, but lets #mdbx_dbi_open() + * reuse the handle value. Usually it's better to set a bigger + * #mdbx_env_set_maxdbs(), unless that value would be large. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + */ +void mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); + +/** @brief Empty or delete+close a database. + * + * See #mdbx_dbi_close() for restrictions about closing the DB handle. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] del 0 to empty the DB, 1 to delete it from the + * environment and close the DB handle. + * @return A non-zero error value on failure and 0 on success. + */ +int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); + +/** @brief Set a custom key comparison function for a database. 
+ * + * The comparison function is called whenever it is necessary to compare a + * key specified by the application with a key currently stored in the + *database. + * If no comparison function is specified, and no special key flags were + *specified + * with #mdbx_dbi_open(), the keys are compared lexically, with shorter keys + *collating + * before longer keys. + * @warning This function must be called before any data access functions are + *used, + * otherwise data corruption may occur. The same comparison function must be + *used by every + * program accessing the database, every time the database is used. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] cmp A #MDB_cmp_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); + +/** @brief Set a custom data comparison function for a #MDB_DUPSORT database. + * + * This comparison function is called whenever it is necessary to compare a + *data + * item specified by the application with a data item currently stored in the + *database. + * This function only takes effect if the database was opened with the + *#MDB_DUPSORT + * flag. + * If no comparison function is specified, and no special key flags were + *specified + * with #mdbx_dbi_open(), the data items are compared lexically, with shorter + *items collating + * before longer items. + * @warning This function must be called before any data access functions are + *used, + * otherwise data corruption may occur. The same comparison function must be + *used by every + * program accessing the database, every time the database is used. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] cmp A #MDB_cmp_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); + +/** @brief Set a relocation function for a #MDB_FIXEDMAP database. + * + * @todo The relocation function is called whenever it is necessary to move + *the data + * of an item to a different position in the database (e.g. through tree + * balancing operations, shifts as a result of adds or deletes, etc.). It is + * intended to allow address/position-dependent data items to be stored in + * a database in an environment opened with the #MDB_FIXEDMAP option. + * Currently the relocation feature is unimplemented and setting + * this function has no effect. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] rel A #MDB_rel_func function + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel); + +/** @brief Set a context pointer for a #MDB_FIXEDMAP database's relocation + *function. + * + * See #mdbx_set_relfunc and #MDB_rel_func for more details. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] ctx An arbitrary pointer for whatever the application needs. + * It will be passed to the callback function set by #mdbx_set_relfunc + * as its \b relctx parameter whenever the callback is invoked. + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx); + +/** @brief Get items from a database. + * + * This function retrieves key/data pairs from the database. The address + * and length of the data associated with the specified \b key are returned + * in the structure to which \b data refers. + * If the database supports duplicate keys (#MDB_DUPSORT) then the + * first data item for the key will be returned. Retrieval of other + * items requires the use of #mdbx_cursor_get(). + * + * @note The memory pointed to by the returned values is owned by the + * database. The caller need not dispose of the memory, and may not + * modify it in any way. For values returned in a read-only transaction + * any modification attempts will cause a SIGSEGV. + * @note Values returned from the database are valid only until a + * subsequent update operation, or the end of the transaction. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] key The key to search for in the database + * @param[out] data The data corresponding to the key + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - the key was not in the database. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); + +/** @brief Store items into a database. + * + * This function stores key/data pairs in the database. The default behavior + * is to enter the new key/data pair, replacing any previously existing key + * if duplicates are disallowed, or adding a duplicate data item if + * duplicates are allowed (#MDB_DUPSORT). + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] key The key to store in the database + * @param[in,out] data The data to store + * @param[in] flags Special options for this operation. This parameter + * must be set to 0 or by bitwise OR'ing together one or more of the + * values described here. + *
    + *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be + *specified + * if the database was opened with #MDB_DUPSORT. The function + *will + * return #MDB_KEYEXIST if the key/data pair already appears in + *the + * database. + *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will + *return + * #MDB_KEYEXIST if the key already appears in the database, even + *if + * the database supports duplicates (#MDB_DUPSORT). The \b data + * parameter will be set to point to the existing item. + *
  • #MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. + * LMDB does nothing else with this memory, the caller is + *expected + * to modify all of the space requested. This flag must not be + * specified if the database was opened with #MDB_DUPSORT. + *
  • #MDB_APPEND - append the given key/data pair to the end of the + * database. This option allows fast bulk loading when keys are + * already known to be in the correct order. Loading unsorted + *keys + * with this flag will cause a #MDB_KEYEXIST error. + *
  • #MDB_APPENDDUP - as above, but for sorted dup data. + *
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_MAP_FULL - the database is full, see #mdbx_env_set_mapsize(). + *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + unsigned flags); + +/** @brief Delete items from a database. + * + * This function removes key/data pairs from the database. + * + * MDBX-mode: + * The data parameter is NOT ignored, regardless of whether the database + * supports sorted duplicate data items or not. If the data parameter + * is non-NULL only the matching data item will be deleted. + * + * LMDB-compatible mode: + * If the database does not support sorted duplicate data items + * (#MDB_DUPSORT) the data parameter is ignored. + * If the database supports sorted duplicates and the data parameter + * is NULL, all of the duplicate data items for the key will be + * deleted. Otherwise, if the data parameter is non-NULL + * only the matching data item will be deleted. + * + * This function will return #MDB_NOTFOUND if the specified key/data + * pair is not in the database. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] key The key to delete from the database + * @param[in] data The data to delete + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); + +/** @brief Create a cursor handle. + * + * A cursor is associated with a specific transaction and database. + * A cursor cannot be used when its database handle is closed. Nor + * when its transaction has ended, except with #mdbx_cursor_renew(). + * It can be discarded with #mdbx_cursor_close(). + * + * MDBX-mode: + * A cursor must be closed explicitly always, before + * or after its transaction ends. It can be reused with + * #mdbx_cursor_renew() before finally closing it. + * + * LMDB-compatible mode: + * A cursor in a write-transaction can be closed before its transaction + * ends, and will otherwise be closed when its transaction ends. + * A cursor in a read-only transaction must be closed explicitly, before + * or after its transaction ends. It can be reused with + * #mdbx_cursor_renew() before finally closing it. + * @note Earlier documentation said that cursors in every transaction + * were closed when the transaction committed or aborted. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[out] cursor Address where the new #MDB_cursor handle will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); + +/** @brief Close a cursor handle. + * + * The cursor handle will be freed and must not be used again after this call. + * Its transaction must still be live if it is a write-transaction. + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + */ +void mdbx_cursor_close(MDB_cursor *cursor); + +/** @brief Renew a cursor handle. + * + * A cursor is associated with a specific transaction and database. + * Cursors that are only used in read-only + * transactions may be re-used, to avoid unnecessary malloc/free overhead. + * The cursor may be associated with a new read-only transaction, and + * referencing the same database handle as it was created with. + * This may be done whether the previous transaction is live or dead. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); + +/** @brief Return the cursor's transaction handle. + * + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + */ +MDB_txn *mdbx_cursor_txn(MDB_cursor *cursor); + +/** @brief Return the cursor's database handle. + * + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + */ +MDB_dbi mdbx_cursor_dbi(MDB_cursor *cursor); + +/** @brief Retrieve by cursor. + * + * This function retrieves key/data pairs from the database. The address and + *length + * of the key are returned in the object to which \b key refers (except for + *the + * case of the #MDB_SET option, in which the \b key object is unchanged), and + * the address and length of the data are returned in the object to which \b + *data + * refers. + * See #mdbx_get() for restrictions on using the output values. + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * @param[in,out] key The key for a retrieved item + * @param[in,out] data The data of a retrieved item + * @param[in] op A cursor operation #MDB_cursor_op + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_NOTFOUND - no matching key found. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + MDB_cursor_op op); + +/** @brief Store by cursor. + * + * This function stores key/data pairs into the database. + * The cursor is positioned at the new item, or on failure usually near it. + * @note Earlier documentation incorrectly said errors would leave the + * state of the cursor unchanged. + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * @param[in] key The key operated on. + * @param[in] data The data operated on. + * @param[in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + *
    + *
  • #MDB_CURRENT - replace the item at the current cursor position. + * The \b key parameter must still be provided, and must match + *it. + * If using sorted duplicates (#MDB_DUPSORT) the data item must + *still + * sort into the same place. This is intended to be used when the + * new data is the same size as the old. Otherwise it will simply + * perform a delete of the old record followed by an insert. + *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be + *specified + * if the database was opened with #MDB_DUPSORT. The function + *will + * return #MDB_KEYEXIST if the key/data pair already appears in + *the + * database. + *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will + *return + * #MDB_KEYEXIST if the key already appears in the database, even + *if + * the database supports duplicates (#MDB_DUPSORT). + *
  • #MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. This + *flag + * must not be specified if the database was opened with + *#MDB_DUPSORT. + *
  • #MDB_APPEND - append the given key/data pair to the end of the + * database. No key comparisons are performed. This option allows + * fast bulk loading when keys are already known to be in the + * correct order. Loading unsorted keys with this flag will cause + * a #MDB_KEYEXIST error. + *
  • #MDB_APPENDDUP - as above, but for sorted dup data. + *
  • #MDB_MULTIPLE - store multiple contiguous data elements in a + * single request. This flag may only be specified if the + *database + * was opened with #MDB_DUPFIXED. The \b data argument must be an + * array of two MDB_vals. The mv_size of the first MDB_val must + *be + * the size of a single data element. The mv_data of the first + *MDB_val + * must point to the beginning of the array of contiguous data + *elements. + * The mv_size of the second MDB_val must be the count of the + *number + * of data elements to store. On return this field will be set to + * the count of the number of elements actually written. The + *mv_data + * of the second MDB_val is unused. + *
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • #MDB_MAP_FULL - the database is full, see #mdbx_env_set_mapsize(). + *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + unsigned flags); + +/** @brief Delete current key/data pair + * + * This function deletes the key/data pair to which the cursor refers. + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * @param[in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + *
    + *
  • #MDB_NODUPDATA - delete all of the data items for the current key. + * This flag may only be specified if the database was opened with + *#MDB_DUPSORT. + *
+ * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EACCES - an attempt was made to write in a read-only transaction. + *
  • EINVAL - an invalid parameter was specified. + *
+ */ +int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); + +/** @brief Return count of duplicates for current key. + * + * This call is only valid on databases that support sorted duplicate + * data items #MDB_DUPSORT. + * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * @param[out] countp Address where the count will be stored + * @return A non-zero error value on failure and 0 on success. Some possible + * errors are: + *
    + *
  • EINVAL - cursor is not initialized, or an invalid parameter was + *specified. + *
+ */ +int mdbx_cursor_count(MDB_cursor *cursor, size_t *countp); + +/** @brief Compare two data items according to a particular database. + * + * This returns a comparison as if the two data items were keys in the + * specified database. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] a The first item to compare + * @param[in] b The second item to compare + * @return < 0 if a < b, 0 if a == b, > 0 if a > b + */ +int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); + +/** @brief Compare two data items according to a particular database. + * + * This returns a comparison as if the two items were data items of + * the specified database. The database must have the #MDB_DUPSORT flag. + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * @param[in] a The first item to compare + * @param[in] b The second item to compare + * @return < 0 if a < b, 0 if a == b, > 0 if a > b + */ +int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); + +/** @brief A callback function used to print a message from the library. + * + * @param[in] msg The string to be printed. + * @param[in] ctx An arbitrary context pointer for the callback. + * @return < 0 on failure, >= 0 on success. + */ +typedef int(MDB_msg_func)(const char *msg, void *ctx); + +/** @brief Dump the entries in the reader lock table. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] func A #MDB_msg_func function + * @param[in] ctx Anything the message function needs + * @return < 0 on failure, >= 0 on success. + */ +int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); + +/** @brief Check for stale entries in the reader lock table. 
+ * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[out] dead Number of stale slots that were cleared + * @return 0 on success, non-zero on failure. + */ +int mdbx_reader_check(MDB_env *env, int *dead); + +char *mdbx_dkey(MDB_val *key, char *buf); + int mdbx_env_close_ex(MDB_env *env, int dont_sync); - /** @brief Set threshold to force flush the data buffers to disk, - * even of #MDB_NOSYNC, #MDB_NOMETASYNC and #MDB_MAPASYNC flags - * in the environment. - * - * Data is always written to disk when #mdb_txn_commit() is called, - * but the operating system may keep it buffered. LMDB always flushes - * the OS buffers upon commit as well, unless the environment was - * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. - * - * The default is 0, than mean no any threshold checked, - * and no additional flush will be made. - * - * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] bytes The size in bytes of summary changes - * when a synchronous flush would be made. - * @return A non-zero error value on failure and 0 on success. - */ +/** @brief Set threshold to force flush the data buffers to disk, + * even if the #MDB_NOSYNC, #MDB_NOMETASYNC or #MDB_MAPASYNC flags + * are set in the environment. + * + * Data is always written to disk when #mdbx_txn_commit() is called, + * but the operating system may keep it buffered. LMDB always flushes + * the OS buffers upon commit as well, unless the environment was + * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. + * + * The default is 0, which means that no threshold is checked + * and no additional flush will be made. + * + * @param[in] env An environment handle returned by #mdbx_env_create() + * @param[in] bytes The total size in bytes of accumulated changes + * at which a synchronous flush is made. + * @return A non-zero error value on failure and 0 on success. + */ int mdbx_env_set_syncbytes(MDB_env *env, size_t bytes); - /** @brief Returns a lag of the reading. 
- * - * Returns an information for estimate how much given read-only - * transaction is lagging relative the to actual head. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[out] percent Percentage of page allocation in the database. - * @return Number of transactions committed after the given was started for read, or -1 on failure. - */ +/** @brief Returns the lag of a reading transaction. + * + * Returns information for estimating how far the given read-only + * transaction is lagging relative to the actual head. + * + * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * @param[out] percent Percentage of page allocation in the database. + * @return Number of transactions committed after the given one was started for + * reading, or -1 on failure. + */ int mdbx_txn_straggler(MDB_txn *txn, int *percent); - /** @brief A callback function for killing a laggard readers, - * but also could waiting ones. Called in case of MDB_MAP_FULL error. - * - * @param[in] env An environment handle returned by #mdb_env_create(). - * @param[in] pid pid of the reader process. - * @param[in] thread_id thread_id of the reader thread. - * @param[in] txn Transaction number on which stalled. - * @param[in] gap a lag from the last commited txn. - * @param[in] retry a retry number, less that zero for notify end of OOM-loop. - * @return -1 on failure (reader is not killed), - * 0 on a race condition (no such reader), - * 1 on success (reader was killed), - * >1 on success (reader was SURE killed). - */ -typedef int (MDBX_oom_func)(MDB_env *env, int pid, void* thread_id, size_t txn, unsigned gap, int retry); +/** @brief A callback function for killing laggard readers, + * or alternatively for waiting on them. Called in case of MDB_MAP_FULL error. + * + * @param[in] env An environment handle returned by #mdbx_env_create(). + * @param[in] pid pid of the reader process. + * @param[in] thread_id thread_id of the reader thread. 
+ * @param[in] txn Transaction number on which the reader is stalled. + * @param[in] gap the lag from the last committed txn. + * @param[in] retry a retry number, less than zero to notify the end of the OOM-loop. + * @return -1 on failure (reader is not killed), + * 0 on a race condition (no such reader), + * 1 on success (reader was killed), + * >1 on success (reader was SURE killed). + */ +typedef int(MDBX_oom_func)(MDB_env *env, int pid, void *thread_id, size_t txn, + unsigned gap, int retry); - /** @brief Set the OOM callback. - * - * Callback will be called only on out-of-pages case for killing - * a laggard readers to allowing reclaiming of freeDB. - * - * @param[in] env An environment handle returned by #mdb_env_create(). - * @param[in] oomfunc A #MDBX_oom_func function or NULL to disable. - */ +/** @brief Set the OOM callback. + * + * The callback will be called only in the out-of-pages case, for killing + * laggard readers to allow reclaiming of freeDB. + * + * @param[in] env An environment handle returned by #mdbx_env_create(). + * @param[in] oom_func A #MDBX_oom_func function or NULL to disable. + */ void mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); - /** @brief Get the current oom_func callback. - * - * Callback will be called only on out-of-pages case for killing - * a laggard readers to allowing reclaiming of freeDB. - * - * @param[in] env An environment handle returned by #mdb_env_create(). - * @return A #MDBX_oom_func function or NULL if disabled. - */ -MDBX_oom_func* mdbx_env_get_oomfunc(MDB_env *env); +/** @brief Get the current oom_func callback. + * + * The callback will be called only in the out-of-pages case, for killing + * laggard readers to allow reclaiming of freeDB. + * + * @param[in] env An environment handle returned by #mdbx_env_create(). + * @return A #MDBX_oom_func function or NULL if disabled. 
+ */ +MDBX_oom_func *mdbx_env_get_oomfunc(MDB_env *env); -#define MDBX_DBG_ASSERT 1 -#define MDBX_DBG_PRINT 2 -#define MDBX_DBG_TRACE 4 -#define MDBX_DBG_EXTRA 8 -#define MDBX_DBG_AUDIT 16 -#define MDBX_DBG_EDGE 32 +#define MDBX_DBG_ASSERT 1 +#define MDBX_DBG_PRINT 2 +#define MDBX_DBG_TRACE 4 +#define MDBX_DBG_EXTRA 8 +#define MDBX_DBG_AUDIT 16 +#define MDBX_DBG_EDGE 32 /* LY: a "don't touch" value */ -#define MDBX_DBG_DNT (-1L) +#define MDBX_DBG_DNT (-1L) typedef void MDBX_debug_func(int type, const char *function, int line, - const char *msg, va_list args); + const char *msg, va_list args); -int mdbx_setup_debug(int flags, MDBX_debug_func* logger, long edge_txn); +int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn); -typedef int MDBX_pgvisitor_func(size_t pgno, unsigned pgnumber, void* ctx, - const char* dbi, const char *type, int nentries, - int payload_bytes, int header_bytes, int unused_bytes); -int mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func* visitor, void* ctx); +typedef int MDBX_pgvisitor_func(size_t pgno, unsigned pgnumber, void *ctx, + const char *dbi, const char *type, int nentries, + int payload_bytes, int header_bytes, + int unused_bytes); +int mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, void *ctx); -typedef struct mdbx_canary { - size_t x, y, z, v; -} mdbx_canary; +typedef struct mdbx_canary { size_t x, y, z, v; } mdbx_canary; -int mdbx_canary_put(MDB_txn *txn, const mdbx_canary* canary); -size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary* canary); +int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary); +size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary); /* Returns: * - MDBX_RESULT_TRUE when no more data available @@ -236,20 +1768,19 @@ int mdbx_cursor_on_last(MDB_cursor *mc); #define MDBX_RESULT_FALSE MDB_SUCCESS #define MDBX_RESULT_TRUE (-1) -int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, - MDB_val *key, MDB_val *new_data, MDB_val *old_data, unsigned flags); +int mdbx_replace(MDB_txn *txn, MDB_dbi 
dbi, MDB_val *key, MDB_val *new_data, + MDB_val *old_data, unsigned flags); /* Same as mdbx_get(), but: * 1) if values_count is not NULL, then returns the count * of multi-values/duplicates for a given key. * 2) updates the key for pointing to the actual key's data inside DB. */ -int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, int* values_count); +int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + int *values_count); -int mdbx_is_dirty(const MDB_txn *txn, const void* ptr); +int mdbx_is_dirty(const MDB_txn *txn, const void *ptr); int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, - MDB_dbi *dbi, MDB_cmp_func *keycmp, MDB_cmp_func *datacmp); - -/** @} */ + MDB_dbi *dbi, MDB_cmp_func *keycmp, MDB_cmp_func *datacmp); #ifdef __cplusplus } diff --git a/mdbx_chk.c b/mdbx_chk.c new file mode 100644 index 00000000..6c1f6454 --- /dev/null +++ b/mdbx_chk.c @@ -0,0 +1,979 @@ +/* mdbx_chk.c - memory-mapped database check tool */ + +/* + * Copyright 2015-2017 Leonid Yuriev . + * Copyright 2015,2016 Peter-Service R&D LLC. + * + * This file is part of libmdbx. + * + * libmdbx is free software; you can redistribute it and/or modify it under + * the terms of the GNU Affero General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * libmdbx is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mdbx.h" +#include "midl.h" + +typedef struct flagbit { + int bit; + char *name; +} flagbit; + +flagbit dbflags[] = {{MDB_DUPSORT, "dupsort"}, + {MDB_INTEGERKEY, "integerkey"}, + {MDB_REVERSEKEY, "reversekey"}, + {MDB_DUPFIXED, "dupfixed"}, + {MDB_REVERSEDUP, "reversedup"}, + {MDB_INTEGERDUP, "integerdup"}, + {0, NULL}}; + +static volatile sig_atomic_t gotsignal; + +static void signal_handler(int sig) { + (void)sig; + gotsignal = 1; +} + +#define MAX_DBI 32768 + +#define EXIT_INTERRUPTED (EXIT_FAILURE + 4) +#define EXIT_FAILURE_SYS (EXIT_FAILURE + 3) +#define EXIT_FAILURE_MDB (EXIT_FAILURE + 2) +#define EXIT_FAILURE_CHECK_MAJOR (EXIT_FAILURE + 1) +#define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE + +struct { + const char *dbi_names[MAX_DBI]; + size_t dbi_pages[MAX_DBI]; + size_t dbi_empty_pages[MAX_DBI]; + size_t dbi_payload_bytes[MAX_DBI]; + size_t dbi_lost_bytes[MAX_DBI]; + short *pagemap; + size_t total_payload_bytes; + size_t pgcount; +} walk; + +static __attribute__((constructor)) void init_walk(void) { + walk.dbi_names[0] = "@gc"; +} + +size_t total_unused_bytes; +int exclusive = 2; + +MDB_env *env; +MDB_txn *txn, *locktxn; +MDBX_envinfo info; +MDBX_stat stat; +size_t maxkeysize, reclaimable_pages, freedb_pages, lastpgno; +size_t userdb_count, skipped_subdb; +unsigned verbose, quiet; +const char *only_subdb; + +struct problem { + struct problem *pr_next; + size_t count; + const char *caption; +}; + +struct problem *problems_list; +size_t total_problems; + +static void __attribute__((format(printf, 1, 2))) print(const char *msg, ...) { + if (!quiet) { + va_list args; + + fflush(stderr); + va_start(args, msg); + vfprintf(stdout, msg, args); + va_end(args); + } +} + +static void __attribute__((format(printf, 1, 2))) error(const char *msg, ...) 
{ + total_problems++; + + if (!quiet) { + va_list args; + + fflush(stdout); + va_start(args, msg); + vfprintf(stderr, msg, args); + va_end(args); + fflush(NULL); + } +} + +static void pagemap_cleanup(void) { + int i; + + for (i = 1; i < MAX_DBI; ++i) { + if (walk.dbi_names[i]) { + free((void *)walk.dbi_names[i]); + walk.dbi_names[i] = NULL; + } + } + + free(walk.pagemap); + walk.pagemap = NULL; +} + +static int pagemap_lookup_dbi(const char *dbi) { + static int last; + int i; + + if (last > 0 && strcmp(walk.dbi_names[last], dbi) == 0) + return last; + + for (i = 1; walk.dbi_names[i] && last < MAX_DBI; ++i) + if (strcmp(walk.dbi_names[i], dbi) == 0) + return last = i; + + if (i == MAX_DBI) + return -1; + + walk.dbi_names[i] = strdup(dbi); + + if (verbose > 1) { + print(" - found '%s' area\n", dbi); + fflush(NULL); + } + + return last = i; +} + +static void problem_add(const char *object, size_t entry_number, + const char *msg, const char *extra, ...) { + total_problems++; + + if (!quiet) { + int need_fflush = 0; + struct problem *p; + + for (p = problems_list; p; p = p->pr_next) + if (p->caption == msg) + break; + + if (!p) { + p = calloc(1, sizeof(*p)); + p->caption = msg; + p->pr_next = problems_list; + problems_list = p; + need_fflush = 1; + } + + p->count++; + if (verbose > 1) { + print(" %s #%zu: %s", object, entry_number, msg); + if (extra) { + va_list args; + printf(" ("); + va_start(args, extra); + vfprintf(stdout, extra, args); + va_end(args); + printf(")"); + } + printf("\n"); + if (need_fflush) + fflush(NULL); + } + } +} + +static struct problem *problems_push() { + struct problem *p = problems_list; + problems_list = NULL; + return p; +} + +static size_t problems_pop(struct problem *list) { + size_t count = 0; + + if (problems_list) { + int i; + + print(" - problems: "); + for (i = 0; problems_list; ++i) { + struct problem *p = problems_list->pr_next; + count += problems_list->count; + print("%s%s (%zu)", i ? 
", " : "", problems_list->caption, + problems_list->count); + free(problems_list); + problems_list = p; + } + print("\n"); + fflush(NULL); + } + + problems_list = list; + return count; +} + +static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, + const char *type, int nentries, int payload_bytes, + int header_bytes, int unused_bytes) { + (void)ctx; + + if (type) { + size_t page_bytes = payload_bytes + header_bytes + unused_bytes; + size_t page_size = pgnumber * stat.ms_psize; + int index = pagemap_lookup_dbi(dbi); + if (index < 0) + return ENOMEM; + + if (verbose > 2 && (!only_subdb || strcmp(only_subdb, dbi) == 0)) { + if (pgnumber == 1) + print(" %s-page %zu", type, pgno); + else + print(" %s-span %zu[%u]", type, pgno, pgnumber); + print(" of %s: header %i, payload %i, unused %i\n", dbi, header_bytes, + payload_bytes, unused_bytes); + } + + walk.pgcount += pgnumber; + + if (unused_bytes < 0 || (size_t)unused_bytes > page_size) + problem_add("page", pgno, "illegal unused-bytes", "%zu < %i < %zu", 0, + unused_bytes, stat.ms_psize); + + if (header_bytes < (int)sizeof(long) || + (size_t)header_bytes >= stat.ms_psize - sizeof(long)) + problem_add("page", pgno, "illegal header-length", "%zu < %i < %zu", + sizeof(long), header_bytes, stat.ms_psize - sizeof(long)); + if (payload_bytes < 1) { + if (nentries > 1) { + problem_add("page", pgno, "zero size-of-entry", + "payload %i bytes, %i entries", payload_bytes, nentries); + if ((size_t)header_bytes + unused_bytes < page_size) { + /* LY: hush a misuse error */ + page_bytes = page_size; + } + } else { + problem_add("page", pgno, "empty", "payload %i bytes, %i entries", + payload_bytes, nentries); + walk.dbi_empty_pages[index] += 1; + } + } + + if (page_bytes != page_size) { + problem_add("page", pgno, "misused", "%zu != %zu (%ih + %ip + %iu)", + page_size, page_bytes, header_bytes, payload_bytes, + unused_bytes); + if (page_size > page_bytes) + walk.dbi_lost_bytes[index] += page_size - page_bytes; 
+ } else { + walk.dbi_payload_bytes[index] += payload_bytes + header_bytes; + walk.total_payload_bytes += payload_bytes + header_bytes; + } + + if (pgnumber) { + do { + if (pgno >= lastpgno) + problem_add("page", pgno, "wrong page-no", "%zu > %zi", pgno, + lastpgno); + else if (walk.pagemap[pgno]) + problem_add("page", pgno, "already used", "in %s", + walk.dbi_names[walk.pagemap[pgno]]); + else { + walk.pagemap[pgno] = index; + walk.dbi_pages[index] += 1; + } + ++pgno; + } while (--pgnumber); + } + } + + return gotsignal ? EINTR : MDB_SUCCESS; +} + +typedef int(visitor)(size_t record_number, MDB_val *key, MDB_val *data); +static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent); + +static int handle_userdb(size_t record_number, MDB_val *key, MDB_val *data) { + (void)record_number; + (void)key; + (void)data; + return MDB_SUCCESS; +} + +static int handle_freedb(size_t record_number, MDB_val *key, MDB_val *data) { + char *bad = ""; + size_t pg, prev; + ssize_t i, number, span = 0; + size_t *iptr = data->mv_data, txnid = *(size_t *)key->mv_data; + + if (key->mv_size != sizeof(txnid)) + problem_add("entry", record_number, "wrong txn-id size", "key-size %zi", + key->mv_size); + else if (txnid < 1 || txnid > info.me_last_txnid) + problem_add("entry", record_number, "wrong txn-id", "%zu", txnid); + + if (data->mv_size < sizeof(size_t) || data->mv_size % sizeof(size_t)) + problem_add("entry", record_number, "wrong idl size", "%zu", data->mv_size); + else { + number = *iptr++; + if (number >= MDB_IDL_UM_MAX) + problem_add("entry", record_number, "wrong idl length", "%zi", number); + else if ((number + 1) * sizeof(size_t) != data->mv_size) + problem_add("entry", record_number, "mismatch idl length", "%zi != %zu", + number * sizeof(size_t), data->mv_size); + else { + freedb_pages += number; + if (info.me_tail_txnid > txnid) + reclaimable_pages += number; + for (i = number, prev = 1; --i >= 0;) { + pg = iptr[i]; + if (pg < 2 /* META_PAGE */ || pg > 
info.me_last_pgno) + problem_add("entry", record_number, "wrong idl entry", + "2 < %zi < %zi", pg, info.me_last_pgno); + else if (pg <= prev) { + bad = " [bad sequence]"; + problem_add("entry", record_number, "bad sequence", "%zi <= %zi", pg, + prev); + } + prev = pg; + pg += span; + for (; i >= span && iptr[i - span] == pg; span++, pg++) + ; + } + if (verbose > 2 && !only_subdb) { + print(" transaction %zu, %zd pages, maxspan %zd%s\n", + *(size_t *)key->mv_data, number, span, bad); + if (verbose > 3) { + int j = number - 1; + while (j >= 0) { + pg = iptr[j]; + for (span = 1; --j >= 0 && iptr[j] == pg + span; span++) + ; + if (span > 1) + print(" %9zu[%zd]\n", pg, span); + else + print(" %9zu\n", pg); + } + } + } + } + } + + return MDB_SUCCESS; +} + +static int handle_maindb(size_t record_number, MDB_val *key, MDB_val *data) { + char *name; + int rc; + size_t i; + + name = key->mv_data; + for (i = 0; i < key->mv_size; ++i) { + if (name[i] < ' ') + return handle_userdb(record_number, key, data); + } + + name = malloc(key->mv_size + 1); + memcpy(name, key->mv_data, key->mv_size); + name[key->mv_size] = '\0'; + userdb_count++; + + rc = process_db(-1, name, handle_userdb, 0); + free(name); + if (rc != MDB_INCOMPATIBLE) + return rc; + + return handle_userdb(record_number, key, data); +} + +static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { + MDB_cursor *mc; + MDBX_stat ms; + MDB_val key, data; + MDB_val prev_key, prev_data; + unsigned flags; + int rc, i; + struct problem *saved_list; + size_t problems_count; + + unsigned record_count = 0, dups = 0; + size_t key_bytes = 0, data_bytes = 0; + + if (0 > (int)dbi) { + rc = mdbx_dbi_open(txn, name, 0, &dbi); + if (rc) { + if (!name || + rc != + MDB_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ { + error(" - mdbx_open '%s' failed, error %d %s\n", name ? 
name : "main", + rc, mdbx_strerror(rc)); + } + return rc; + } + } + + if (dbi >= 2 /* CORE_DBS */ && name && only_subdb && + strcmp(only_subdb, name)) { + if (verbose) { + print("Skip processing '%s'...\n", name); + fflush(NULL); + } + skipped_subdb++; + return MDB_SUCCESS; + } + + if (!silent && verbose) { + print("Processing '%s'...\n", name ? name : "main"); + fflush(NULL); + } + + rc = mdbx_dbi_flags(txn, dbi, &flags); + if (rc) { + error(" - mdbx_dbi_flags failed, error %d %s\n", rc, mdbx_strerror(rc)); + return rc; + } + + rc = mdbx_stat(txn, dbi, &ms, sizeof(ms)); + if (rc) { + error(" - mdbx_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); + return rc; + } + + if (!silent && verbose) { + print(" - dbi-id %d, flags:", dbi); + if (!flags) + print(" none"); + else { + for (i = 0; dbflags[i].bit; i++) + if (flags & dbflags[i].bit) + print(" %s", dbflags[i].name); + } + print(" (0x%02X)\n", flags); + if (verbose > 1) { + print(" - page size %u, entries %zu\n", ms.ms_psize, ms.ms_entries); + print(" - b-tree depth %u, pages: branch %zu, leaf %zu, overflow %zu\n", + ms.ms_depth, ms.ms_branch_pages, ms.ms_leaf_pages, + ms.ms_overflow_pages); + } + } + + rc = mdbx_cursor_open(txn, dbi, &mc); + if (rc) { + error(" - mdbx_cursor_open failed, error %d %s\n", rc, mdbx_strerror(rc)); + return rc; + } + + saved_list = problems_push(); + prev_key.mv_data = NULL; + prev_data.mv_size = 0; + rc = mdbx_cursor_get(mc, &key, &data, MDB_FIRST); + while (rc == MDB_SUCCESS) { + if (gotsignal) { + print(" - interrupted by signal\n"); + fflush(NULL); + rc = EINTR; + goto bailout; + } + + if (key.mv_size > maxkeysize) { + problem_add("entry", record_count, "key length exceeds max-key-size", + "%zu > %zu", key.mv_size, maxkeysize); + } else if ((flags & MDB_INTEGERKEY) && key.mv_size != sizeof(size_t) && + key.mv_size != sizeof(int)) { + problem_add("entry", record_count, "wrong key length", "%zu != %zu", + key.mv_size, sizeof(size_t)); + } + + if ((flags & MDB_INTEGERDUP) && 
data.mv_size != sizeof(size_t) && + data.mv_size != sizeof(int)) { + problem_add("entry", record_count, "wrong data length", "%zu != %zu", + data.mv_size, sizeof(size_t)); + } + + if (prev_key.mv_data) { + if ((flags & MDB_DUPFIXED) && prev_data.mv_size != data.mv_size) { + problem_add("entry", record_count, "different data length", + "%zu != %zu", prev_data.mv_size, data.mv_size); + } + + int cmp = mdbx_cmp(txn, dbi, &prev_key, &key); + if (cmp > 0) { + problem_add("entry", record_count, "broken ordering of entries", NULL); + } else if (cmp == 0) { + ++dups; + if (!(flags & MDB_DUPSORT)) + problem_add("entry", record_count, "duplicated entries", NULL); + else if (flags & MDB_INTEGERDUP) { + cmp = mdbx_dcmp(txn, dbi, &prev_data, &data); + if (cmp > 0) + problem_add("entry", record_count, + "broken ordering of multi-values", NULL); + } + } + } else if (verbose) { + if (flags & MDB_INTEGERKEY) + print(" - fixed key-size %zu\n", key.mv_size); + if (flags & (MDB_INTEGERDUP | MDB_DUPFIXED)) + print(" - fixed data-size %zu\n", data.mv_size); + } + + if (handler) { + rc = handler(record_count, &key, &data); + if (rc) + goto bailout; + } + + record_count++; + key_bytes += key.mv_size; + data_bytes += data.mv_size; + + prev_key = key; + prev_data = data; + rc = mdbx_cursor_get(mc, &key, &data, MDB_NEXT); + } + if (rc != MDB_NOTFOUND) + error(" - mdbx_cursor_get failed, error %d %s\n", rc, mdbx_strerror(rc)); + else + rc = 0; + + if (record_count != ms.ms_entries) + problem_add("entry", record_count, "differentent number of entries", + "%zu != %zu", record_count, ms.ms_entries); +bailout: + problems_count = problems_pop(saved_list); + if (!silent && verbose) { + print(" - summary: %u records, %u dups, %zu key's bytes, %zu data's " + "bytes, %zu problems\n", + record_count, dups, key_bytes, data_bytes, problems_count); + fflush(NULL); + } + + mdbx_cursor_close(mc); + return rc || problems_count; +} + +static void usage(char *prog) { + fprintf(stderr, + "usage: %s dbpath [-V] 
[-v] [-n] [-q] [-w] [-c] [-d] [-s subdb]\n" + " -V\t\tshow version\n" + " -v\t\tmore verbose, could be used multiple times\n" + " -n\t\tNOSUBDIR mode for open\n" + " -q\t\tbe quiet\n" + " -w\t\tlock DB for writing while checking\n" + " -d\t\tdisable page-by-page traversal of b-tree\n" + " -s subdb\tprocess a specific subdatabase only\n" + " -c\t\tforce cooperative mode (don't try exclusive)\n", + prog); + exit(EXIT_INTERRUPTED); +} + +const char *meta_synctype(size_t sign) { + switch (sign) { + case 0: + return "no-sync/legacy"; + case 1: + return "weak"; + default: + return "steady"; + } +} + +int meta_lt(size_t txn1, size_t sign1, size_t txn2, size_t sign2) { + return ((sign1 > 1) == (sign2 > 1)) ? txn1 < txn2 : txn2 && sign2 > 1; +} + +int main(int argc, char *argv[]) { + int i, rc; + char *prog = argv[0]; + char *envname; + int envflags = MDB_RDONLY; + int problems_maindb = 0, problems_freedb = 0, problems_meta = 0; + int dont_traversal = 0; + size_t n; + struct timespec timestamp_start, timestamp_finish; + double elapsed; + + atexit(pagemap_cleanup); + + if (clock_gettime(CLOCK_MONOTONIC, &timestamp_start)) { + rc = errno; + error("clock_gettime failed, error %d %s\n", rc, mdbx_strerror(rc)); + return EXIT_FAILURE_SYS; + } + + if (argc < 2) { + usage(prog); + } + + while ((i = getopt(argc, argv, "Vvqnwcds:")) != EOF) { + switch (i) { + case 'V': + printf("%s\n", MDB_VERSION_STRING); + exit(EXIT_SUCCESS); + break; + case 'v': + verbose++; + break; + case 'q': + quiet = 1; + break; + case 'n': + envflags |= MDB_NOSUBDIR; + break; + case 'w': + envflags &= ~MDB_RDONLY; + break; + case 'c': + exclusive = 0; + break; + case 'd': + dont_traversal = 1; + break; + case 's': + if (only_subdb && strcmp(only_subdb, optarg)) + usage(prog); + only_subdb = optarg; + break; + default: + usage(prog); + } + } + + if (optind != argc - 1) + usage(prog); + +#ifdef SIGPIPE + signal(SIGPIPE, signal_handler); +#endif +#ifdef SIGHUP + signal(SIGHUP, signal_handler); +#endif +
signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); + + envname = argv[optind]; + print("Running mdbx_chk for '%s' in %s mode...\n", envname, + (envflags & MDB_RDONLY) ? "read-only" : "write-lock"); + fflush(NULL); + + rc = mdbx_env_create(&env); + if (rc) { + error("mdbx_env_create failed, error %d %s\n", rc, mdbx_strerror(rc)); + return rc < 0 ? EXIT_FAILURE_MDB : EXIT_FAILURE_SYS; + } + + rc = mdbx_env_get_maxkeysize(env); + if (rc < 0) { + error("mdbx_env_get_maxkeysize failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto bailout; + } + maxkeysize = rc; + + rc = mdbx_env_set_maxdbs(env, MAX_DBI); + if (rc < 0) { + error("mdbx_env_set_maxdbs failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto bailout; + } + + rc = mdbx_env_open_ex(env, envname, envflags, 0664, &exclusive); + if (rc) { + error("mdbx_env_open failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto bailout; + } + if (verbose) + print(" - %s mode\n", exclusive ? "monopolistic" : "cooperative"); + + if (!(envflags & MDB_RDONLY)) { + rc = mdbx_txn_begin(env, NULL, 0, &locktxn); + if (rc) { + error("mdbx_txn_begin(lock-write) failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto bailout; + } + } + + rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) { + error("mdbx_txn_begin(read-only) failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto bailout; + } + + rc = mdbx_env_info(env, &info, sizeof(info)); + if (rc) { + error("mdbx_env_info failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto bailout; + } + + rc = mdbx_env_stat(env, &stat, sizeof(stat)); + if (rc) { + error("mdbx_env_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto bailout; + } + + lastpgno = info.me_last_pgno + 1; + errno = 0; + + if (verbose) { + double k = 1024.0; + const char sf[] = + "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! 
*/ + for (i = 0; sf[i + 1] && info.me_mapsize / k > 1000.0; ++i) + k *= 1024; + print(" - map size %zu (%.2f %cb)\n", info.me_mapsize, info.me_mapsize / k, + sf[i]); + if (info.me_mapaddr) + print(" - mapaddr %p\n", info.me_mapaddr); + print(" - pagesize %u, max keysize %zu (%s), max readers %u\n", + stat.ms_psize, maxkeysize, + (maxkeysize == 511) ? "default" : (maxkeysize == 0) ? "devel" + : "custom", + info.me_maxreaders); + print(" - transactions: last %zu, bottom %zu, lag reading %zi\n", + info.me_last_txnid, info.me_tail_txnid, + info.me_last_txnid - info.me_tail_txnid); + + print(" - meta-1: %s %zu, %s", meta_synctype(info.me_meta1_sign), + info.me_meta1_txnid, meta_lt(info.me_meta1_txnid, info.me_meta1_sign, + info.me_meta2_txnid, info.me_meta2_sign) + ? "tail" + : "head"); + if (info.me_meta1_txnid > info.me_last_txnid) + print(", rolled-back %zu (%zu >>> %zu)", + info.me_meta1_txnid - info.me_last_txnid, info.me_meta1_txnid, + info.me_last_txnid); + print("\n"); + + print(" - meta-2: %s %zu, %s", meta_synctype(info.me_meta2_sign), + info.me_meta2_txnid, meta_lt(info.me_meta2_txnid, info.me_meta2_sign, + info.me_meta1_txnid, info.me_meta1_sign) + ? 
"tail" + : "head"); + if (info.me_meta2_txnid > info.me_last_txnid) + print(", rolled-back %zu (%zu >>> %zu)", + info.me_meta2_txnid - info.me_last_txnid, info.me_meta2_txnid, + info.me_last_txnid); + print("\n"); + } + + if (exclusive > 1) { + if (verbose) + print(" - perform full check last-txn-id with meta-pages\n"); + + if (!meta_lt(info.me_meta1_txnid, info.me_meta1_sign, info.me_meta2_txnid, + info.me_meta2_sign) && + info.me_meta1_txnid != info.me_last_txnid) { + print(" - meta-1 txn-id mismatch last-txn-id (%zi != %zi)\n", + info.me_meta1_txnid, info.me_last_txnid); + ++problems_meta; + } + + if (!meta_lt(info.me_meta2_txnid, info.me_meta2_sign, info.me_meta1_txnid, + info.me_meta1_sign) && + info.me_meta2_txnid != info.me_last_txnid) { + print(" - meta-2 txn-id mismatch last-txn-id (%zi != %zi)\n", + info.me_meta2_txnid, info.me_last_txnid); + ++problems_meta; + } + } else if (locktxn) { + if (verbose) + print(" - perform lite check last-txn-id with meta-pages (not a " + "monopolistic mode)\n"); + size_t last = (info.me_meta2_txnid > info.me_meta1_txnid) + ? info.me_meta2_txnid + : info.me_meta1_txnid; + if (last != info.me_last_txnid) { + print(" - last-meta mismatch last-txn-id (%zi != %zi)\n", last, + info.me_last_txnid); + ++problems_meta; + } + } else if (verbose) { + print(" - skip check last-txn-id with meta-pages (monopolistic or " + "write-lock mode only)\n"); + } + + if (!dont_traversal) { + struct problem *saved_list; + size_t traversal_problems; + size_t empty_pages, lost_bytes; + + print("Traversal b-tree...\n"); + fflush(NULL); + walk.pagemap = calloc(lastpgno, sizeof(*walk.pagemap)); + if (!walk.pagemap) { + rc = errno ? 
errno : ENOMEM; + error("calloc failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto bailout; + } + + saved_list = problems_push(); + rc = mdbx_env_pgwalk(txn, pgvisitor, NULL); + traversal_problems = problems_pop(saved_list); + + if (rc) { + if (rc == EINTR && gotsignal) { + print(" - interrupted by signal\n"); + fflush(NULL); + } else { + error("mdbx_env_pgwalk failed, error %d %s\n", rc, mdbx_strerror(rc)); + } + goto bailout; + } + + for (n = 0; n < lastpgno; ++n) + if (!walk.pagemap[n]) + walk.dbi_pages[0] += 1; + + empty_pages = lost_bytes = 0; + for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { + empty_pages += walk.dbi_empty_pages[i]; + lost_bytes += walk.dbi_lost_bytes[i]; + } + + if (verbose) { + size_t total_page_bytes = walk.pgcount * stat.ms_psize; + print(" - dbi pages: %zu total", walk.pgcount); + if (verbose > 1) + for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) + print(", %s %zu", walk.dbi_names[i], walk.dbi_pages[i]); + print(", %s %zu\n", walk.dbi_names[0], walk.dbi_pages[0]); + if (verbose > 1) { + print(" - space info: total %zu bytes, payload %zu (%.1f%%), unused " + "%zu (%.1f%%)\n", + total_page_bytes, walk.total_payload_bytes, + walk.total_payload_bytes * 100.0 / total_page_bytes, + total_page_bytes - walk.total_payload_bytes, + (total_page_bytes - walk.total_payload_bytes) * 100.0 / + total_page_bytes); + for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { + size_t dbi_bytes = walk.dbi_pages[i] * stat.ms_psize; + print(" %s: subtotal %zu bytes (%.1f%%), payload %zu (%.1f%%), " + "unused %zu (%.1f%%)", + walk.dbi_names[i], dbi_bytes, + dbi_bytes * 100.0 / total_page_bytes, walk.dbi_payload_bytes[i], + walk.dbi_payload_bytes[i] * 100.0 / dbi_bytes, + dbi_bytes - walk.dbi_payload_bytes[i], + (dbi_bytes - walk.dbi_payload_bytes[i]) * 100.0 / dbi_bytes); + if (walk.dbi_empty_pages[i]) + print(", %zu empty pages", walk.dbi_empty_pages[i]); + if (walk.dbi_lost_bytes[i]) + print(", %zu bytes lost", walk.dbi_lost_bytes[i]); + print("\n"); 
+ } + } + print(" - summary: average fill %.1f%%", + walk.total_payload_bytes * 100.0 / total_page_bytes); + if (empty_pages) + print(", %zu empty pages", empty_pages); + if (lost_bytes) + print(", %zu bytes lost", lost_bytes); + print(", %zu problems\n", traversal_problems); + } + } else if (verbose) { + print("Skipping b-tree walk...\n"); + fflush(NULL); + } + + if (!verbose) + print("Iterating DBIs...\n"); + problems_maindb = process_db(-1, /* MAIN_DBI */ NULL, NULL, 0); + problems_freedb = process_db(0 /* FREE_DBI */, "free", handle_freedb, 0); + + if (verbose) { + size_t value = info.me_mapsize / stat.ms_psize; + double percent = value / 100.0; + print(" - pages info: %zu total", value); + print(", allocated %zu (%.1f%%)", lastpgno, lastpgno / percent); + + if (verbose > 1) { + value = info.me_mapsize / stat.ms_psize - lastpgno; + print(", remained %zu (%.1f%%)", value, value / percent); + + value = lastpgno - freedb_pages; + print(", used %zu (%.1f%%)", value, value / percent); + + print(", gc %zu (%.1f%%)", freedb_pages, freedb_pages / percent); + + value = freedb_pages - reclaimable_pages; + print(", detained %zu (%.1f%%)", value, value / percent); + + print(", reclaimable %zu (%.1f%%)", reclaimable_pages, + reclaimable_pages / percent); + } + + value = info.me_mapsize / stat.ms_psize - lastpgno + reclaimable_pages; + print(", available %zu (%.1f%%)\n", value, value / percent); + } + + if (problems_maindb == 0 && problems_freedb == 0) { + if (!dont_traversal && (exclusive || locktxn)) { + if (walk.pgcount != lastpgno - freedb_pages) { + error("used pages mismatch (%zu != %zu)\n", walk.pgcount, + lastpgno - freedb_pages); + } + if (walk.dbi_pages[0] != freedb_pages) { + error("gc pages mismatch (%zu != %zu)\n", walk.dbi_pages[0], + freedb_pages); + } + } else if (verbose) { + print(" - skip check used and gc pages (btree-traversal with " + "monopolistic or write-lock mode only)\n"); + } + + if (!process_db(-1, NULL, handle_maindb, 1)) { + if (!userdb_count 
&& verbose) + print(" - does not contain multiple databases\n"); + } + } + +bailout: + if (txn) + mdbx_txn_abort(txn); + if (locktxn) + mdbx_txn_abort(locktxn); + if (env) + mdbx_env_close(env); + fflush(NULL); + if (rc) { + if (rc < 0) + return gotsignal ? EXIT_INTERRUPTED : EXIT_FAILURE_SYS; + return EXIT_FAILURE_MDB; + } + + if (clock_gettime(CLOCK_MONOTONIC, &timestamp_finish)) { + rc = errno; + error("clock_gettime failed, error %d %s\n", rc, mdbx_strerror(rc)); + return EXIT_FAILURE_SYS; + } + + elapsed = timestamp_finish.tv_sec - timestamp_start.tv_sec + + (timestamp_finish.tv_nsec - timestamp_start.tv_nsec) * 1e-9; + + total_problems += problems_meta; + if (total_problems || problems_maindb || problems_freedb) { + print("Total %zu error(s) is detected, elapsed %.3f seconds.\n", + total_problems, elapsed); + if (problems_meta || problems_maindb || problems_freedb) + return EXIT_FAILURE_CHECK_MAJOR; + return EXIT_FAILURE_CHECK_MINOR; + } + print("No error is detected, elapsed %.3f seconds\n", elapsed); + return EXIT_SUCCESS; +} diff --git a/mdb_copy.1 b/mdbx_copy.1 similarity index 94% rename from mdb_copy.1 rename to mdbx_copy.1 index 157e741d..06a620fd 100644 --- a/mdb_copy.1 +++ b/mdbx_copy.1 @@ -4,9 +4,9 @@ .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .TH MDB_COPY 1 "2014/06/20" "LMDB 0.9.14" .SH NAME -mdb_copy \- LMDB environment copy tool +mdbx_copy \- LMDB environment copy tool .SH SYNOPSIS -.B mdb_copy +.B mdbx_copy [\c .BR \-V ] [\c @@ -18,7 +18,7 @@ mdb_copy \- LMDB environment copy tool .BR dstpath ] .SH DESCRIPTION The -.B mdb_copy +.B mdbx_copy utility copies an LMDB environment. The environment can be copied regardless of whether it is currently in use. No lockfile is created, since it gets recreated at need. @@ -52,6 +52,6 @@ This utility can trigger significant file size growth if run in parallel with write transactions, because pages which they free during copying cannot be reused until the copy is done.
.SH "SEE ALSO" -.BR mdb_stat (1) +.BR mdbx_stat (1) .SH AUTHOR Howard Chu of Symas Corporation diff --git a/mdbx_copy.c b/mdbx_copy.c new file mode 100644 index 00000000..b80b70a5 --- /dev/null +++ b/mdbx_copy.c @@ -0,0 +1,76 @@ +/* mdbx_copy.c - memory-mapped database backup tool */ + +/* + * Copyright 2015-2017 Leonid Yuriev . + * Copyright 2012-2017 Howard Chu, Symas Corp. + * Copyright 2015,2016 Peter-Service R&D LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "mdbx.h" +#include +#include +#include + +static void sighandle(int sig) { (void)sig; } + +int main(int argc, char *argv[]) { + int rc; + MDB_env *env = NULL; + const char *progname = argv[0], *act; + unsigned flags = MDB_RDONLY; + unsigned cpflags = 0; + + for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) { + if (argv[1][1] == 'n' && argv[1][2] == '\0') + flags |= MDB_NOSUBDIR; + else if (argv[1][1] == 'c' && argv[1][2] == '\0') + cpflags |= MDB_CP_COMPACT; + else if (argv[1][1] == 'V' && argv[1][2] == '\0') { + printf("%s\n", MDB_VERSION_STRING); + exit(0); + } else + argc = 0; + } + + if (argc < 2 || argc > 3) { + fprintf(stderr, "usage: %s [-V] [-c] [-n] srcpath [dstpath]\n", progname); + exit(EXIT_FAILURE); + } + +#ifdef SIGPIPE + signal(SIGPIPE, sighandle); +#endif +#ifdef SIGHUP + signal(SIGHUP, sighandle); +#endif + signal(SIGINT, sighandle); + signal(SIGTERM, sighandle); + + act = "opening environment"; + rc = mdbx_env_create(&env); + if (rc == MDB_SUCCESS) { + rc = mdbx_env_open(env, argv[1], flags, 0640); + } + if (rc == MDB_SUCCESS) { + act = "copying"; + if (argc == 2) + rc = mdbx_env_copyfd2(env, STDOUT_FILENO, cpflags); + else + rc = mdbx_env_copy2(env, argv[2], cpflags); + } + if (rc) + 
fprintf(stderr, "%s: %s failed, error %d (%s)\n", progname, act, rc, + mdbx_strerror(rc)); + mdbx_env_close(env); + + return rc ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/mdb_dump.1 b/mdbx_dump.1 similarity index 94% rename from mdb_dump.1 rename to mdbx_dump.1 index 4c4553ce..80718bb0 100644 --- a/mdb_dump.1 +++ b/mdbx_dump.1 @@ -4,9 +4,9 @@ .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .TH MDB_DUMP 1 "2014/06/20" "LMDB 0.9.14" .SH NAME -mdb_dump \- LMDB environment export tool +mdbx_dump \- LMDB environment export tool .SH SYNOPSIS -.B mdb_dump +.B mdbx_dump [\c .BR \-V ] [\c @@ -23,11 +23,11 @@ mdb_dump \- LMDB environment export tool .BR \ envpath .SH DESCRIPTION The -.B mdb_dump +.B mdbx_dump utility reads a database and writes its contents to the standard output using a portable flat-text format understood by the -.BR mdb_load (1) +.BR mdbx_load (1) utility. .SH OPTIONS .TP @@ -69,9 +69,9 @@ will result in new databases that use the default comparison functions. damaged beyond repair permitting neither record storage nor retrieval.\fP The only available workaround is to modify the source for the -.BR mdb_load (1) +.BR mdbx_load (1) utility to load the database using the correct comparison functions. .SH "SEE ALSO" -.BR mdb_load (1) +.BR mdbx_load (1) .SH AUTHOR Howard Chu of Symas Corporation diff --git a/mdbx_dump.c b/mdbx_dump.c new file mode 100644 index 00000000..16543d09 --- /dev/null +++ b/mdbx_dump.c @@ -0,0 +1,316 @@ +/* mdbx_dump.c - memory-mapped database dump tool */ + +/* + * Copyright 2015-2017 Leonid Yuriev . + * Copyright 2011-2017 Howard Chu, Symas Corp. + * Copyright 2015,2016 Peter-Service R&D LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#include "mdbx.h" +#include <ctype.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#define PRINT 1 +static int mode; + +typedef struct flagbit { + int bit; + char *name; +} flagbit; + +flagbit dbflags[] = {{MDB_REVERSEKEY, "reversekey"}, + {MDB_DUPSORT, "dupsort"}, + {MDB_INTEGERKEY, "integerkey"}, + {MDB_DUPFIXED, "dupfixed"}, + {MDB_INTEGERDUP, "integerdup"}, + {MDB_REVERSEDUP, "reversedup"}, + {0, NULL}}; + +static volatile sig_atomic_t gotsig; + +static void dumpsig(int sig) { + (void)sig; + gotsig = 1; +} + +static const char hexc[] = "0123456789abcdef"; + +static void hex(unsigned char c) { + putchar(hexc[c >> 4]); + putchar(hexc[c & 0xf]); +} + +static void text(MDB_val *v) { + unsigned char *c, *end; + + putchar(' '); + c = v->mv_data; + end = c + v->mv_size; + while (c < end) { + if (isprint(*c)) { + putchar(*c); + } else { + putchar('\\'); + hex(*c); + } + c++; + } + putchar('\n'); +} + +static void byte(MDB_val *v) { + unsigned char *c, *end; + + putchar(' '); + c = v->mv_data; + end = c + v->mv_size; + while (c < end) { + hex(*c++); + } + putchar('\n'); +} + +/* Dump in BDB-compatible format */ +static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) { + MDB_cursor *mc; + MDBX_stat ms; + MDB_val key, data; + MDBX_envinfo info; + unsigned int flags; + int rc, i; + + rc = mdbx_dbi_flags(txn, dbi, &flags); + if (rc) + return rc; + + rc = mdbx_stat(txn, dbi, &ms, sizeof(ms)); + if (rc) + return rc; + + rc = mdbx_env_info(mdbx_txn_env(txn), &info, sizeof(info)); + if (rc) + return rc; + + printf("VERSION=3\n"); + printf("format=%s\n", mode & PRINT ?
"print" : "bytevalue"); + if (name) + printf("database=%s\n", name); + printf("type=btree\n"); + printf("mapsize=%zu\n", info.me_mapsize); + if (info.me_mapaddr) + printf("mapaddr=%p\n", info.me_mapaddr); + printf("maxreaders=%u\n", info.me_maxreaders); + + for (i = 0; dbflags[i].bit; i++) + if (flags & dbflags[i].bit) + printf("%s=1\n", dbflags[i].name); + + printf("db_pagesize=%d\n", ms.ms_psize); + printf("HEADER=END\n"); + + rc = mdbx_cursor_open(txn, dbi, &mc); + if (rc) + return rc; + + while ((rc = mdbx_cursor_get(mc, &key, &data, MDB_NEXT)) == MDB_SUCCESS) { + if (gotsig) { + rc = EINTR; + break; + } + if (mode & PRINT) { + text(&key); + text(&data); + } else { + byte(&key); + byte(&data); + } + } + printf("DATA=END\n"); + if (rc == MDB_NOTFOUND) + rc = MDB_SUCCESS; + + return rc; +} + +static void usage(char *prog) { + fprintf(stderr, + "usage: %s [-V] [-f output] [-l] [-n] [-p] [-a|-s subdb] dbpath\n", + prog); + exit(EXIT_FAILURE); +} + +int main(int argc, char *argv[]) { + int i, rc; + MDB_env *env; + MDB_txn *txn; + MDB_dbi dbi; + char *prog = argv[0]; + char *envname; + char *subname = NULL; + int alldbs = 0, envflags = 0, list = 0; + + if (argc < 2) { + usage(prog); + } + + /* -a: dump main DB and all subDBs + * -s: dump only the named subDB + * -n: use NOSUBDIR flag on env_open + * -p: use printable characters + * -f: write to file instead of stdout + * -V: print version and exit + * (default) dump only the main DB + */ + while ((i = getopt(argc, argv, "af:lnps:V")) != EOF) { + switch (i) { + case 'V': + printf("%s\n", MDB_VERSION_STRING); + exit(0); + break; + case 'l': + list = 1; + /*FALLTHROUGH*/; + case 'a': + if (subname) + usage(prog); + alldbs++; + break; + case 'f': + if (freopen(optarg, "w", stdout) == NULL) { + fprintf(stderr, "%s: %s: reopen: %s\n", prog, optarg, strerror(errno)); + exit(EXIT_FAILURE); + } + break; + case 'n': + envflags |= MDB_NOSUBDIR; + break; + case 'p': + mode |= PRINT; + break; + case 's': + if (alldbs) + 
usage(prog); + subname = optarg; + break; + default: + usage(prog); + } + } + + if (optind != argc - 1) + usage(prog); + +#ifdef SIGPIPE + signal(SIGPIPE, dumpsig); +#endif +#ifdef SIGHUP + signal(SIGHUP, dumpsig); +#endif + signal(SIGINT, dumpsig); + signal(SIGTERM, dumpsig); + + envname = argv[optind]; + rc = mdbx_env_create(&env); + if (rc) { + fprintf(stderr, "mdbx_env_create failed, error %d %s\n", rc, + mdbx_strerror(rc)); + return EXIT_FAILURE; + } + + if (alldbs || subname) { + mdbx_env_set_maxdbs(env, 2); + } + + rc = mdbx_env_open(env, envname, envflags | MDB_RDONLY, 0664); + if (rc) { + fprintf(stderr, "mdbx_env_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + + rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) { + fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + + rc = mdbx_dbi_open(txn, subname, 0, &dbi); + if (rc) { + fprintf(stderr, "mdbx_open failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto txn_abort; + } + + if (alldbs) { + MDB_cursor *cursor; + MDB_val key; + int count = 0; + + rc = mdbx_cursor_open(txn, dbi, &cursor); + if (rc) { + fprintf(stderr, "mdbx_cursor_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + while ((rc = mdbx_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { + char *str; + MDB_dbi db2; + if (memchr(key.mv_data, '\0', key.mv_size)) + continue; + count++; + str = malloc(key.mv_size + 1); + memcpy(str, key.mv_data, key.mv_size); + str[key.mv_size] = '\0'; + rc = mdbx_dbi_open(txn, str, 0, &db2); + if (rc == MDB_SUCCESS) { + if (list) { + printf("%s\n", str); + list++; + } else { + rc = dumpit(txn, db2, str); + if (rc) + break; + } + mdbx_dbi_close(env, db2); + } + free(str); + if (rc) + continue; + } + mdbx_cursor_close(cursor); + if (!count) { + fprintf(stderr, "%s: %s does not contain multiple databases\n", prog, + envname); + rc = MDB_NOTFOUND; + } else if (rc == MDB_INCOMPATIBLE) { + /* LY: 
the record it not a named sub-db. */ + rc = MDB_SUCCESS; + } + } else { + rc = dumpit(txn, dbi, subname); + } + if (rc && rc != MDB_NOTFOUND) + fprintf(stderr, "%s: %s: %s\n", prog, envname, mdbx_strerror(rc)); + + mdbx_dbi_close(env, dbi); +txn_abort: + mdbx_txn_abort(txn); +env_close: + mdbx_env_close(env); + + return rc ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/mdb_load.1 b/mdbx_load.1 similarity index 94% rename from mdb_load.1 rename to mdbx_load.1 index 5e082f67..63b88f10 100644 --- a/mdb_load.1 +++ b/mdbx_load.1 @@ -4,9 +4,9 @@ .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .TH MDB_LOAD 1 "2014/06/20" "LMDB 0.9.14" .SH NAME -mdb_load \- LMDB environment import tool +mdbx_load \- LMDB environment import tool .SH SYNOPSIS -.B mdb_load +.B mdbx_load [\c .BR \-V ] [\c @@ -22,15 +22,15 @@ mdb_load \- LMDB environment import tool .BR \ envpath .SH DESCRIPTION The -.B mdb_load +.B mdbx_load utility reads from the standard input and loads it into the LMDB environment .BR envpath . The input to -.B mdb_load +.B mdbx_load must be in the output format specified by the -.BR mdb_dump (1) +.BR mdbx_dump (1) utility or as specified by the .B -T option below. @@ -66,7 +66,7 @@ character; for example, \\0a is a newline character in the ASCII character set. For this reason, any backslash or newline characters that naturally occur in the text input must be escaped to avoid misinterpretation by -.BR mdb_load . +.BR mdbx_load . .SH DIAGNOSTICS Exit status is zero if no errors occur. @@ -74,6 +74,6 @@ Errors result in a non-zero exit status and a diagnostic message being written to standard error. .SH "SEE ALSO" -.BR mdb_dump (1) +.BR mdbx_dump (1) .SH AUTHOR Howard Chu of Symas Corporation diff --git a/mdbx_load.c b/mdbx_load.c new file mode 100644 index 00000000..a211b24e --- /dev/null +++ b/mdbx_load.c @@ -0,0 +1,466 @@ +/* mdbx_load.c - memory-mapped database load tool */ + +/* + * Copyright 2015-2017 Leonid Yuriev . 
+ * Copyright 2011-2017 Howard Chu, Symas Corp. + * Copyright 2015,2016 Peter-Service R&D LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "mdbx.h" +#include +#include +#include +#include +#include +#include + +#define PRINT 1 +#define NOHDR 2 +static int mode; + +static char *subname = NULL; + +static size_t lineno; +static int version; + +static int dbi_flags; + +static char *prog; + +static int Eof; + +static MDBX_envinfo info; + +static MDB_val kbuf, dbuf; + +#define STRLENOF(s) (sizeof(s) - 1) + +typedef struct flagbit { + int bit; + char *name; + int len; +} flagbit; + +#define S(s) s, STRLENOF(s) + +flagbit dbflags[] = {{MDB_REVERSEKEY, S("reversekey")}, + {MDB_DUPSORT, S("dupsort")}, + {MDB_INTEGERKEY, S("integerkey")}, + {MDB_DUPFIXED, S("dupfixed")}, + {MDB_INTEGERDUP, S("integerdup")}, + {MDB_REVERSEDUP, S("reversedup")}, + {0, NULL, 0}}; + +static void readhdr(void) { + char *ptr; + + dbi_flags = 0; + while (fgets(dbuf.mv_data, dbuf.mv_size, stdin) != NULL) { + lineno++; + if (!strncmp(dbuf.mv_data, "db_pagesize=", STRLENOF("db_pagesize=")) || + !strncmp(dbuf.mv_data, "duplicates=", STRLENOF("duplicates="))) { + /* LY: silently ignore information fields. 
*/ + continue; + } else if (!strncmp(dbuf.mv_data, "VERSION=", STRLENOF("VERSION="))) { + version = atoi((char *)dbuf.mv_data + STRLENOF("VERSION=")); + if (version > 3) { + fprintf(stderr, "%s: line %zd: unsupported VERSION %d\n", prog, lineno, + version); + exit(EXIT_FAILURE); + } + } else if (!strncmp(dbuf.mv_data, "HEADER=END", STRLENOF("HEADER=END"))) { + break; + } else if (!strncmp(dbuf.mv_data, "format=", STRLENOF("format="))) { + if (!strncmp((char *)dbuf.mv_data + STRLENOF("FORMAT="), "print", + STRLENOF("print"))) + mode |= PRINT; + else if (strncmp((char *)dbuf.mv_data + STRLENOF("FORMAT="), "bytevalue", + STRLENOF("bytevalue"))) { + fprintf(stderr, "%s: line %zd: unsupported FORMAT %s\n", prog, lineno, + (char *)dbuf.mv_data + STRLENOF("FORMAT=")); + exit(EXIT_FAILURE); + } + } else if (!strncmp(dbuf.mv_data, "database=", STRLENOF("database="))) { + ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); + if (ptr) + *ptr = '\0'; + if (subname) + free(subname); + subname = strdup((char *)dbuf.mv_data + STRLENOF("database=")); + } else if (!strncmp(dbuf.mv_data, "type=", STRLENOF("type="))) { + if (strncmp((char *)dbuf.mv_data + STRLENOF("type="), "btree", + STRLENOF("btree"))) { + fprintf(stderr, "%s: line %zd: unsupported type %s\n", prog, lineno, + (char *)dbuf.mv_data + STRLENOF("type=")); + exit(EXIT_FAILURE); + } + } else if (!strncmp(dbuf.mv_data, "mapaddr=", STRLENOF("mapaddr="))) { + int i; + ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); + if (ptr) + *ptr = '\0'; + i = sscanf((char *)dbuf.mv_data + STRLENOF("mapaddr="), "%p", + &info.me_mapaddr); + if (i != 1) { + fprintf(stderr, "%s: line %zd: invalid mapaddr %s\n", prog, lineno, + (char *)dbuf.mv_data + STRLENOF("mapaddr=")); + exit(EXIT_FAILURE); + } + } else if (!strncmp(dbuf.mv_data, "mapsize=", STRLENOF("mapsize="))) { + int i; + ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); + if (ptr) + *ptr = '\0'; + i = sscanf((char *)dbuf.mv_data + STRLENOF("mapsize="), "%zu", + &info.me_mapsize); + if (i 
!= 1) { + fprintf(stderr, "%s: line %zd: invalid mapsize %s\n", prog, lineno, + (char *)dbuf.mv_data + STRLENOF("mapsize=")); + exit(EXIT_FAILURE); + } + } else if (!strncmp(dbuf.mv_data, "maxreaders=", STRLENOF("maxreaders="))) { + int i; + ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); + if (ptr) + *ptr = '\0'; + i = sscanf((char *)dbuf.mv_data + STRLENOF("maxreaders="), "%u", + &info.me_maxreaders); + if (i != 1) { + fprintf(stderr, "%s: line %zd: invalid maxreaders %s\n", prog, lineno, + (char *)dbuf.mv_data + STRLENOF("maxreaders=")); + exit(EXIT_FAILURE); + } + } else { + int i; + for (i = 0; dbflags[i].bit; i++) { + if (!strncmp(dbuf.mv_data, dbflags[i].name, dbflags[i].len) && + ((char *)dbuf.mv_data)[dbflags[i].len] == '=') { + if (((char *)dbuf.mv_data)[dbflags[i].len + 1] == '1') + dbi_flags |= dbflags[i].bit; + break; + } + } + if (!dbflags[i].bit) { + ptr = memchr(dbuf.mv_data, '=', dbuf.mv_size); + if (!ptr) { + fprintf(stderr, "%s: line %zd: unexpected format\n", prog, lineno); + exit(EXIT_FAILURE); + } else { + *ptr = '\0'; + fprintf(stderr, "%s: line %zd: unrecognized keyword ignored: %s\n", + prog, lineno, (char *)dbuf.mv_data); + } + } + } + } +} + +static void badend(void) { + fprintf(stderr, "%s: line %zd: unexpected end of input\n", prog, lineno); +} + +static int unhex(unsigned char *c2) { + int x, c; + x = *c2++ & 0x4f; + if (x & 0x40) + x -= 55; + c = x << 4; + x = *c2 & 0x4f; + if (x & 0x40) + x -= 55; + c |= x; + return c; +} + +static int readline(MDB_val *out, MDB_val *buf) { + unsigned char *c1, *c2, *end; + size_t len, l2; + int c; + + if (!(mode & NOHDR)) { + c = fgetc(stdin); + if (c == EOF) { + Eof = 1; + return EOF; + } + if (c != ' ') { + lineno++; + if (fgets(buf->mv_data, buf->mv_size, stdin) == NULL) { + badend: + Eof = 1; + badend(); + return EOF; + } + if (c == 'D' && !strncmp(buf->mv_data, "ATA=END", STRLENOF("ATA=END"))) + return EOF; + goto badend; + } + } + if (fgets(buf->mv_data, buf->mv_size, stdin) == NULL) { + Eof = 
1; + return EOF; + } + lineno++; + + c1 = buf->mv_data; + len = strlen((char *)c1); + l2 = len; + + /* Is buffer too short? */ + while (c1[len - 1] != '\n') { + buf->mv_data = realloc(buf->mv_data, buf->mv_size * 2); + if (!buf->mv_data) { + Eof = 1; + fprintf(stderr, "%s: line %zd: out of memory, line too long\n", prog, + lineno); + return EOF; + } + c1 = buf->mv_data; + c1 += l2; + if (fgets((char *)c1, buf->mv_size + 1, stdin) == NULL) { + Eof = 1; + badend(); + return EOF; + } + buf->mv_size *= 2; + len = strlen((char *)c1); + l2 += len; + } + c1 = c2 = buf->mv_data; + len = l2; + c1[--len] = '\0'; + end = c1 + len; + + if (mode & PRINT) { + while (c2 < end) { + if (*c2 == '\\') { + if (c2[1] == '\\') { + c1++; + c2 += 2; + } else { + if (c2 + 3 > end || !isxdigit(c2[1]) || !isxdigit(c2[2])) { + Eof = 1; + badend(); + return EOF; + } + *c1++ = unhex(++c2); + c2 += 2; + } + } else { + /* copies are redundant when no escapes were used */ + *c1++ = *c2++; + } + } + } else { + /* odd length not allowed */ + if (len & 1) { + Eof = 1; + badend(); + return EOF; + } + while (c2 < end) { + if (!isxdigit(*c2) || !isxdigit(c2[1])) { + Eof = 1; + badend(); + return EOF; + } + *c1++ = unhex(c2); + c2 += 2; + } + } + c2 = out->mv_data = buf->mv_data; + out->mv_size = c1 - c2; + + return 0; +} + +static void usage(void) { + fprintf(stderr, "usage: %s [-V] [-f input] [-n] [-s name] [-N] [-T] dbpath\n", + prog); + exit(EXIT_FAILURE); +} + +int main(int argc, char *argv[]) { + int i, rc; + MDB_env *env; + MDB_txn *txn; + MDB_cursor *mc; + MDB_dbi dbi; + char *envname; + int envflags = 0, putflags = 0; + + prog = argv[0]; + + if (argc < 2) { + usage(); + } + + /* -f: load file instead of stdin + * -n: use NOSUBDIR flag on env_open + * -s: load into named subDB + * -N: use NOOVERWRITE on puts + * -T: read plaintext + * -V: print version and exit + */ + while ((i = getopt(argc, argv, "f:ns:NTV")) != EOF) { + switch (i) { + case 'V': + printf("%s\n", MDB_VERSION_STRING); + exit(0); 
+ break; + case 'f': + if (freopen(optarg, "r", stdin) == NULL) { + fprintf(stderr, "%s: %s: reopen: %s\n", prog, optarg, strerror(errno)); + exit(EXIT_FAILURE); + } + break; + case 'n': + envflags |= MDB_NOSUBDIR; + break; + case 's': + subname = strdup(optarg); + break; + case 'N': + putflags = MDB_NOOVERWRITE | MDB_NODUPDATA; + break; + case 'T': + mode |= NOHDR | PRINT; + break; + default: + usage(); + } + } + + if (optind != argc - 1) + usage(); + + dbuf.mv_size = 4096; + dbuf.mv_data = malloc(dbuf.mv_size); + + if (!(mode & NOHDR)) + readhdr(); + + envname = argv[optind]; + rc = mdbx_env_create(&env); + if (rc) { + fprintf(stderr, "mdbx_env_create failed, error %d %s\n", rc, + mdbx_strerror(rc)); + return EXIT_FAILURE; + } + + mdbx_env_set_maxdbs(env, 2); + + if (info.me_maxreaders) + mdbx_env_set_maxreaders(env, info.me_maxreaders); + + if (info.me_mapsize) + mdbx_env_set_mapsize(env, info.me_mapsize); + + if (info.me_mapaddr) + envflags |= MDB_FIXEDMAP; + + rc = mdbx_env_open(env, envname, envflags, 0664); + if (rc) { + fprintf(stderr, "mdbx_env_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + + kbuf.mv_size = mdbx_env_get_maxkeysize(env) * 2 + 2; + kbuf.mv_data = malloc(kbuf.mv_size); + + while (!Eof) { + MDB_val key, data; + int batch = 0; + + rc = mdbx_txn_begin(env, NULL, 0, &txn); + if (rc) { + fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + + rc = mdbx_dbi_open(txn, subname, dbi_flags | MDB_CREATE, &dbi); + if (rc) { + fprintf(stderr, "mdbx_open failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto txn_abort; + } + + rc = mdbx_cursor_open(txn, dbi, &mc); + if (rc) { + fprintf(stderr, "mdbx_cursor_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + + while (1) { + rc = readline(&key, &kbuf); + if (rc) /* rc == EOF */ + break; + + rc = readline(&data, &dbuf); + if (rc) { + fprintf(stderr, "%s: line %zd: failed to read key value\n", prog, + 
lineno); + goto txn_abort; + } + + rc = mdbx_cursor_put(mc, &key, &data, putflags); + if (rc == MDB_KEYEXIST && putflags) + continue; + if (rc) { + fprintf(stderr, "mdbx_cursor_put failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + batch++; + if (batch == 100) { + rc = mdbx_txn_commit(txn); + if (rc) { + fprintf(stderr, "%s: line %zd: txn_commit: %s\n", prog, lineno, + mdbx_strerror(rc)); + goto env_close; + } + rc = mdbx_txn_begin(env, NULL, 0, &txn); + if (rc) { + fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + rc = mdbx_cursor_open(txn, dbi, &mc); + if (rc) { + fprintf(stderr, "mdbx_cursor_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + batch = 0; + } + } + rc = mdbx_txn_commit(txn); + txn = NULL; + if (rc) { + fprintf(stderr, "%s: line %zd: txn_commit: %s\n", prog, lineno, + mdbx_strerror(rc)); + goto env_close; + } + mdbx_dbi_close(env, dbi); + if (!(mode & NOHDR)) + readhdr(); + } + +txn_abort: + mdbx_txn_abort(txn); +env_close: + mdbx_env_close(env); + + return rc ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/mdb_stat.1 b/mdbx_stat.1 similarity index 95% rename from mdb_stat.1 rename to mdbx_stat.1 index bb659744..096fffc1 100644 --- a/mdb_stat.1 +++ b/mdbx_stat.1 @@ -4,9 +4,9 @@ .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .TH MDB_STAT 1 "2014/06/20" "LMDB 0.9.14" .SH NAME -mdb_stat \- LMDB environment status tool +mdbx_stat \- LMDB environment status tool .SH SYNOPSIS -.B mdb_stat +.B mdbx_stat [\c .BR \-V ] [\c @@ -23,7 +23,7 @@ mdb_stat \- LMDB environment status tool .BR \ envpath .SH DESCRIPTION The -.B mdb_stat +.B mdbx_stat utility displays the status of an LMDB environment. .SH OPTIONS .TP @@ -61,6 +61,6 @@ Exit status is zero if no errors occur. Errors result in a non-zero exit status and a diagnostic message being written to standard error. 
.SH "SEE ALSO" -.BR mdb_copy (1) +.BR mdbx_copy (1) .SH AUTHOR Howard Chu of Symas Corporation diff --git a/mdbx_stat.c b/mdbx_stat.c new file mode 100644 index 00000000..ca72b290 --- /dev/null +++ b/mdbx_stat.c @@ -0,0 +1,306 @@ +/* mdbx_stat.c - memory-mapped database status tool */ + +/* + * Copyright 2015-2017 Leonid Yuriev . + * Copyright 2011-2017 Howard Chu, Symas Corp. + * Copyright 2015,2016 Peter-Service R&D LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "mdbx.h" +#include +#include +#include +#include + +static void prstat(MDBX_stat *ms) { + printf(" Page size: %u\n", ms->ms_psize); + printf(" Tree depth: %u\n", ms->ms_depth); + printf(" Branch pages: %zu\n", ms->ms_branch_pages); + printf(" Leaf pages: %zu\n", ms->ms_leaf_pages); + printf(" Overflow pages: %zu\n", ms->ms_overflow_pages); + printf(" Entries: %zu\n", ms->ms_entries); +} + +static void usage(char *prog) { + fprintf(stderr, + "usage: %s [-V] [-n] [-e] [-r[r]] [-f[f[f]]] [-a|-s subdb] dbpath\n", + prog); + exit(EXIT_FAILURE); +} + +int main(int argc, char *argv[]) { + int i, rc; + MDB_env *env; + MDB_txn *txn; + MDB_dbi dbi; + MDBX_stat mst; + MDBX_envinfo mei; + char *prog = argv[0]; + char *envname; + char *subname = NULL; + int alldbs = 0, envinfo = 0, envflags = 0, freinfo = 0, rdrinfo = 0; + + if (argc < 2) { + usage(prog); + } + + /* -a: print stat of main DB and all subDBs + * -s: print stat of only the named subDB + * -e: print env info + * -f: print freelist info + * -r: print reader info + * -n: use NOSUBDIR flag on env_open + * -V: print version and exit + * (default) print stat of only the main DB + */ + while ((i = getopt(argc, argv, "Vaefnrs:")) != EOF) { + switch (i) { + case 
'V': + printf("%s\n", MDB_VERSION_STRING); + exit(0); + break; + case 'a': + if (subname) + usage(prog); + alldbs++; + break; + case 'e': + envinfo++; + break; + case 'f': + freinfo++; + break; + case 'n': + envflags |= MDB_NOSUBDIR; + break; + case 'r': + rdrinfo++; + break; + case 's': + if (alldbs) + usage(prog); + subname = optarg; + break; + default: + usage(prog); + } + } + + if (optind != argc - 1) + usage(prog); + + envname = argv[optind]; + rc = mdbx_env_create(&env); + if (rc) { + fprintf(stderr, "mdbx_env_create failed, error %d %s\n", rc, + mdbx_strerror(rc)); + return EXIT_FAILURE; + } + + if (alldbs || subname) { + mdbx_env_set_maxdbs(env, 4); + } + + rc = mdbx_env_open(env, envname, envflags | MDB_RDONLY, 0664); + if (rc) { + fprintf(stderr, "mdbx_env_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + + if (envinfo) { + (void)mdbx_env_stat(env, &mst, sizeof(mst)); + (void)mdbx_env_info(env, &mei, sizeof(mei)); + printf("Environment Info\n"); + printf(" Map address: %p\n", mei.me_mapaddr); + printf(" Map size: %zu\n", mei.me_mapsize); + printf(" Page size: %u\n", mst.ms_psize); + printf(" Max pages: %zu\n", mei.me_mapsize / mst.ms_psize); + printf(" Number of pages used: %zu\n", mei.me_last_pgno + 1); + printf(" Last transaction ID: %zu\n", mei.me_last_txnid); + printf(" Tail transaction ID: %zu (%zi)\n", mei.me_tail_txnid, + mei.me_tail_txnid - mei.me_last_txnid); + printf(" Max readers: %u\n", mei.me_maxreaders); + printf(" Number of readers used: %u\n", mei.me_numreaders); + } else { + /* LY: zap warnings from gcc */ + memset(&mst, 0, sizeof(mst)); + memset(&mei, 0, sizeof(mei)); + } + + if (rdrinfo) { + printf("Reader Table Status\n"); + rc = mdbx_reader_list(env, (MDB_msg_func *)fputs, stdout); + if (rdrinfo > 1) { + int dead; + mdbx_reader_check(env, &dead); + printf(" %d stale readers cleared.\n", dead); + rc = mdbx_reader_list(env, (MDB_msg_func *)fputs, stdout); + } + if (!(subname || alldbs || freinfo)) + goto 
env_close; + } + + rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + if (rc) { + fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto env_close; + } + + if (freinfo) { + MDB_cursor *cursor; + MDB_val key, data; + size_t pages = 0, *iptr; + size_t reclaimable = 0; + + printf("Freelist Status\n"); + dbi = 0; + rc = mdbx_cursor_open(txn, dbi, &cursor); + if (rc) { + fprintf(stderr, "mdbx_cursor_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + rc = mdbx_stat(txn, dbi, &mst, sizeof(mst)); + if (rc) { + fprintf(stderr, "mdbx_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto txn_abort; + } + prstat(&mst); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + iptr = data.mv_data; + pages += *iptr; + if (envinfo && mei.me_tail_txnid > *(size_t *)key.mv_data) + reclaimable += *iptr; + if (freinfo > 1) { + char *bad = ""; + size_t pg, prev; + ssize_t i, j, span = 0; + j = *iptr++; + for (i = j, prev = 1; --i >= 0;) { + pg = iptr[i]; + if (pg <= prev) + bad = " [bad sequence]"; + prev = pg; + pg += span; + for (; i >= span && iptr[i - span] == pg; span++, pg++) + ; + } + printf(" Transaction %zu, %zd pages, maxspan %zd%s\n", + *(size_t *)key.mv_data, j, span, bad); + if (freinfo > 2) { + for (--j; j >= 0;) { + pg = iptr[j]; + for (span = 1; --j >= 0 && iptr[j] == pg + span; span++) + ; + if (span > 1) + printf(" %9zu[%zd]\n", pg, span); + else + printf(" %9zu\n", pg); + } + } + } + } + mdbx_cursor_close(cursor); + if (envinfo) { + size_t value = mei.me_mapsize / mst.ms_psize; + double percent = value / 100.0; + printf("Page Allocation Info\n"); + printf(" Max pages: %9zu 100%%\n", value); + + value = mei.me_last_pgno + 1; + printf(" Number of pages used: %zu %.1f%%\n", value, value / percent); + + value = mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1); + printf(" Remained: %zu %.1f%%\n", value, value / percent); + + value = mei.me_last_pgno + 1 - pages; + printf(" Used now: 
%zu %.1f%%\n", value, value / percent); + + value = pages; + printf(" Unallocated: %zu %.1f%%\n", value, value / percent); + + value = pages - reclaimable; + printf(" Detained: %zu %.1f%%\n", value, value / percent); + + value = reclaimable; + printf(" Reclaimable: %zu %.1f%%\n", value, value / percent); + + value = + mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1) + reclaimable; + printf(" Available: %zu %.1f%%\n", value, value / percent); + } else + printf(" Free pages: %zu\n", pages); + } + + rc = mdbx_dbi_open(txn, subname, 0, &dbi); + if (rc) { + fprintf(stderr, "mdbx_open failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto txn_abort; + } + + rc = mdbx_stat(txn, dbi, &mst, sizeof(mst)); + if (rc) { + fprintf(stderr, "mdbx_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); + goto txn_abort; + } + printf("Status of %s\n", subname ? subname : "Main DB"); + prstat(&mst); + + if (alldbs) { + MDB_cursor *cursor; + MDB_val key; + + rc = mdbx_cursor_open(txn, dbi, &cursor); + if (rc) { + fprintf(stderr, "mdbx_cursor_open failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + while ((rc = mdbx_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { + char *str; + MDB_dbi db2; + if (memchr(key.mv_data, '\0', key.mv_size)) + continue; + str = malloc(key.mv_size + 1); + memcpy(str, key.mv_data, key.mv_size); + str[key.mv_size] = '\0'; + rc = mdbx_dbi_open(txn, str, 0, &db2); + if (rc == MDB_SUCCESS) + printf("Status of %s\n", str); + free(str); + if (rc) + continue; + rc = mdbx_stat(txn, db2, &mst, sizeof(mst)); + if (rc) { + fprintf(stderr, "mdbx_stat failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto txn_abort; + } + prstat(&mst); + mdbx_dbi_close(env, db2); + } + mdbx_cursor_close(cursor); + } + + if (rc == MDB_NOTFOUND) + rc = MDB_SUCCESS; + + mdbx_dbi_close(env, dbi); +txn_abort: + mdbx_txn_abort(txn); +env_close: + mdbx_env_close(env); + + return rc ? 
EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/midl.c b/midl.c deleted file mode 100644 index 6d2417ac..00000000 --- a/midl.c +++ /dev/null @@ -1,361 +0,0 @@ -/** @file midl.c - * @brief ldap bdb back-end ID List functions */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2000-2017 The OpenLDAP Foundation. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#include -#include -#include -#include -#include -#include "midl.h" - -/** @defgroup internal LMDB Internals - * @{ - */ -/** @defgroup idls ID List Management - * @{ - */ - -static unsigned __hot -mdb_midl_search( MDB_IDL ids, MDB_ID id ) -{ - /* - * binary search of id in ids - * if found, returns position of id - * if not found, returns first position greater than id - */ - unsigned base = 0; - unsigned cursor = 1; - int val = 0; - unsigned n = ids[0]; - - while( 0 < n ) { - unsigned pivot = n >> 1; - cursor = base + pivot + 1; - val = mdbx_cmp2int( ids[cursor], id ); - - if( val < 0 ) { - n = pivot; - - } else if ( val > 0 ) { - base = cursor; - n -= pivot + 1; - - } else { - return cursor; - } - } - - if( val > 0 ) { - ++cursor; - } - return cursor; -} - -#if 0 /* superseded by append/sort */ -static int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) -{ - unsigned x, i; - - x = mdb_midl_search( ids, id ); - assert( x > 0 ); - - if( x < 1 ) { - /* internal error */ - return -2; - } - - if ( x <= ids[0] && ids[x] == id ) { - /* duplicate */ - assert(0); - return -1; - } - - if ( ++ids[0] >= MDB_IDL_DB_MAX ) { - /* no room */ - --ids[0]; - return -2; - - } else { - /* insert id */ - for (i=ids[0]; i>x; i--) - ids[i] = ids[i-1]; - ids[x] = id; - } - - return 0; -} -#endif - 
-static MDB_IDL mdb_midl_alloc(int num) -{ - MDB_IDL ids = malloc((num+2) * sizeof(MDB_ID)); - if (ids) { - *ids++ = num; - *ids = 0; - } - return ids; -} - -static void mdb_midl_free(MDB_IDL ids) -{ - if (ids) - free(ids-1); -} - -static void mdb_midl_shrink( MDB_IDL *idp ) -{ - MDB_IDL ids = *idp; - if (*(--ids) > MDB_IDL_UM_MAX && - (ids = realloc(ids, (MDB_IDL_UM_MAX+2) * sizeof(MDB_ID)))) - { - *ids++ = MDB_IDL_UM_MAX; - *idp = ids; - } -} - -static int mdb_midl_grow( MDB_IDL *idp, int num ) -{ - MDB_IDL idn = *idp-1; - /* grow it */ - idn = realloc(idn, (*idn + num + 2) * sizeof(MDB_ID)); - if (!idn) - return ENOMEM; - *idn++ += num; - *idp = idn; - return 0; -} - -static int mdb_midl_need( MDB_IDL *idp, unsigned num ) -{ - MDB_IDL ids = *idp; - num += ids[0]; - if (num > ids[-1]) { - num = (num + num/4 + (256 + 2)) & -256; - if (!(ids = realloc(ids-1, num * sizeof(MDB_ID)))) - return ENOMEM; - *ids++ = num - 2; - *idp = ids; - } - return 0; -} - -static int mdb_midl_append( MDB_IDL *idp, MDB_ID id ) -{ - MDB_IDL ids = *idp; - /* Too big? */ - if (ids[0] >= ids[-1]) { - if (mdb_midl_grow(idp, MDB_IDL_UM_MAX)) - return ENOMEM; - ids = *idp; - } - ids[0]++; - ids[ids[0]] = id; - return 0; -} - -static int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app ) -{ - MDB_IDL ids = *idp; - /* Too big? */ - if (ids[0] + app[0] >= ids[-1]) { - if (mdb_midl_grow(idp, app[0])) - return ENOMEM; - ids = *idp; - } - memcpy(&ids[ids[0]+1], &app[1], app[0] * sizeof(MDB_ID)); - ids[0] += app[0]; - return 0; -} - -static int mdb_midl_append_range( MDB_IDL *idp, MDB_ID id, unsigned n ) -{ - MDB_ID *ids = *idp, len = ids[0]; - /* Too big? 
*/ - if (len + n > ids[-1]) { - if (mdb_midl_grow(idp, n | MDB_IDL_UM_MAX)) - return ENOMEM; - ids = *idp; - } - ids[0] = len + n; - ids += len; - while (n) - ids[n--] = id++; - return 0; -} - -static void __hot -mdb_midl_xmerge( MDB_IDL idl, MDB_IDL merge ) -{ - MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i+j, total = k; - idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */ - old_id = idl[j]; - while (i) { - merge_id = merge[i--]; - for (; old_id < merge_id; old_id = idl[--j]) - idl[k--] = old_id; - idl[k--] = merge_id; - } - idl[0] = total; -} - -/* Quicksort + Insertion sort for small arrays */ - -#define SMALL 8 -#define MIDL_SWAP(a,b) { itmp=(a); (a)=(b); (b)=itmp; } - -static void __hot -mdb_midl_sort( MDB_IDL ids ) -{ - /* Max possible depth of int-indexed tree * 2 items/level */ - int istack[sizeof(int)*CHAR_BIT * 2]; - int i,j,k,l,ir,jstack; - MDB_ID a, itmp; - - ir = (int)ids[0]; - l = 1; - jstack = 0; - for(;;) { - if (ir - l < SMALL) { /* Insertion sort */ - for (j=l+1;j<=ir;j++) { - a = ids[j]; - for (i=j-1;i>=1;i--) { - if (ids[i] >= a) break; - ids[i+1] = ids[i]; - } - ids[i+1] = a; - } - if (jstack == 0) break; - ir = istack[jstack--]; - l = istack[jstack--]; - } else { - k = (l + ir) >> 1; /* Choose median of left, center, right */ - MIDL_SWAP(ids[k], ids[l+1]); - if (ids[l] < ids[ir]) { - MIDL_SWAP(ids[l], ids[ir]); - } - if (ids[l+1] < ids[ir]) { - MIDL_SWAP(ids[l+1], ids[ir]); - } - if (ids[l] < ids[l+1]) { - MIDL_SWAP(ids[l], ids[l+1]); - } - i = l+1; - j = ir; - a = ids[l+1]; - for(;;) { - do i++; while(ids[i] > a); - do j--; while(ids[j] < a); - if (j < i) break; - MIDL_SWAP(ids[i],ids[j]); - } - ids[l+1] = ids[j]; - ids[j] = a; - jstack += 2; - if (ir-i+1 >= j-l) { - istack[jstack] = ir; - istack[jstack-1] = i; - ir = j-1; - } else { - istack[jstack] = j-1; - istack[jstack-1] = l; - l = i; - } - } - } -} - -static unsigned __hot -mdb_mid2l_search( MDB_ID2L ids, MDB_ID id ) -{ - /* - * binary search of id in ids - * if found, 
returns position of id - * if not found, returns first position greater than id - */ - unsigned base = 0; - unsigned cursor = 1; - int val = 0; - unsigned n = (unsigned)ids[0].mid; - - while( 0 < n ) { - unsigned pivot = n >> 1; - cursor = base + pivot + 1; - val = mdbx_cmp2int( id, ids[cursor].mid ); - - if( val < 0 ) { - n = pivot; - - } else if ( val > 0 ) { - base = cursor; - n -= pivot + 1; - - } else { - return cursor; - } - } - - if( val > 0 ) { - ++cursor; - } - return cursor; -} - -static int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id ) -{ - unsigned x, i; - - x = mdb_mid2l_search( ids, id->mid ); - - if( x < 1 ) { - /* internal error */ - return -2; - } - - if ( x <= ids[0].mid && ids[x].mid == id->mid ) { - /* duplicate */ - return -1; - } - - if ( ids[0].mid >= MDB_IDL_UM_MAX ) { - /* too big */ - return -2; - - } else { - /* insert id */ - ids[0].mid++; - for (i=(unsigned)ids[0].mid; i>x; i--) - ids[i] = ids[i-1]; - ids[x] = *id; - } - - return 0; -} - -static int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ) -{ - /* Too big? */ - if (ids[0].mid >= MDB_IDL_UM_MAX) { - return -2; - } - ids[0].mid++; - ids[ids[0].mid] = *id; - return 0; -} - -/** @} */ -/** @} */ diff --git a/midl.h b/midl.h index 1bdffce1..eccc6099 100644 --- a/midl.h +++ b/midl.h @@ -1,190 +1,53 @@ -/** @file midl.h - * @brief LMDB ID List header file. - * - * This file was originally part of back-bdb but has been - * modified for use in libmdb. Most of the macros defined - * in this file are unused, just left over from the original. - * - * This file is only used internally in libmdb and its definitions - * are not exposed publicly. +/** A generic unsigned ID number. These were entryIDs in back-bdb. + * Preferably it should have the same size as a pointer. */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2000-2017 The OpenLDAP Foundation. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#ifndef _MDB_MIDL_H_ -#define _MDB_MIDL_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/** @defgroup internal LMDB Internals - * @{ - */ - -/** @defgroup idls ID List Management - * @{ - */ - /** A generic unsigned ID number. These were entryIDs in back-bdb. - * Preferably it should have the same size as a pointer. - */ typedef size_t MDB_ID; - /** An IDL is an ID List, a sorted array of IDs. The first - * element of the array is a counter for how many actual - * IDs are in the list. In the original back-bdb code, IDLs are - * sorted in ascending order. For libmdb IDLs are sorted in - * descending order. - */ +/** An IDL is an ID List, a sorted array of IDs. The first + * element of the array is a counter for how many actual + * IDs are in the list. In the original back-bdb code, IDLs are + * sorted in ascending order. For libmdb IDLs are sorted in + * descending order. + */ typedef MDB_ID *MDB_IDL; /* IDL sizes - likely should be even bigger * limiting factors: sizeof(ID), thread stack size */ -#define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ -#define MDB_IDL_DB_SIZE (1<. */ +#include "mdbx.h" +#include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #include #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif -void* thread_entry(void *ctx) -{ - MDB_env *env = ctx; - MDB_txn *txn; - int rc; +void *thread_entry(void *ctx) { + MDB_env *env = ctx; + MDB_txn *txn; + int rc; - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - mdb_txn_abort(txn); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + mdbx_txn_abort(txn); - return NULL; + return NULL; } -int main(int argc,char * argv[]) -{ - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor, *cur2; - MDB_cursor_op op; - int count; - int *values; - char sval[32] = ""; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor, *cur2; + MDB_cursor_op op; + int count; + int *values; + char sval[32] = ""; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - srand(time(NULL)); + (void)argc; + (void)argv; + srand(time(NULL)); - count = (rand()%384) + 64; - values = (int *)malloc(count*sizeof(int)); + count = (rand() % 384) + 64; + values = (int *)malloc(count * sizeof(int)); - for(i = 0;i in each iteration, since MDB_NOOVERWRITE may modify it */ - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_KEYEXIST, mdb_put(txn, dbi, &key, &data, MDB_NOOVERWRITE))) { - j++; - data.mv_size = sizeof(sval); - data.mv_data = sval; - } - } - if (j) printf("%d duplicates skipped\n", j); - E(mdb_txn_commit(txn)); - E(mdb_env_stat(env, &mst)); + printf("Adding %d values\n", count); + for (i = 0; i < count; i++) { + sprintf(sval, "%03x %d foo bar", values[i], values[i]); + /* Set in each iteration, since MDB_NOOVERWRITE may modify it */ + data.mv_size = sizeof(sval); + data.mv_data = sval; + 
if (RES(MDB_KEYEXIST, mdbx_put(txn, dbi, &key, &data, MDB_NOOVERWRITE))) { + j++; + data.mv_size = sizeof(sval); + data.mv_data = sval; + } + } + if (j) + printf("%d duplicates skipped\n", j); + E(mdbx_txn_commit(txn)); + E(mdbx_env_stat(env, &mst, sizeof(mst))); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %p %.*s, data: %p %.*s\n", - key.mv_data, (int) key.mv_size, (char *) key.mv_data, - data.mv_data, (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, + (char *)key.mv_data, data.mv_data, (int)data.mv_size, + (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - j=0; - key.mv_data = sval; - for (i= count - 1; i > -1; i-= (rand()%5)) { - j++; - txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); - sprintf(sval, "%03x ", values[i]); - if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, NULL))) { - j--; - mdb_txn_abort(txn); - } else { - E(mdb_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); + j = 0; + key.mv_data = sval; + for (i = count - 1; i > -1; i -= (rand() % 5)) { + j++; + txn = NULL; + E(mdbx_txn_begin(env, NULL, 0, &txn)); + sprintf(sval, "%03x ", values[i]); + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, NULL))) { + j--; + mdbx_txn_abort(txn); + } else { + E(mdbx_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = 
mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Cursor last\n"); - E(mdb_cursor_get(cursor, &key, &data, MDB_LAST)); - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - printf("Cursor prev\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Cursor last/prev\n"); - E(mdb_cursor_get(cursor, &key, &data, MDB_LAST)); - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - E(mdb_cursor_get(cursor, &key, &data, MDB_PREV)); - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Cursor last\n"); + E(mdbx_cursor_get(cursor, &key, &data, MDB_LAST)); + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + printf("Cursor prev\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Cursor last/prev\n"); 
+ E(mdbx_cursor_get(cursor, &key, &data, MDB_LAST)); + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + E(mdbx_cursor_get(cursor, &key, &data, MDB_PREV)); + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - printf("Deleting with cursor\n"); - E(mdb_txn_begin(env, NULL, 0, &txn)); - E(mdb_cursor_open(txn, dbi, &cur2)); - for (i=0; i<50; i++) { - if (RES(MDB_NOTFOUND, mdb_cursor_get(cur2, &key, &data, MDB_NEXT))) - break; - printf("key: %p %.*s, data: %p %.*s\n", - key.mv_data, (int) key.mv_size, (char *) key.mv_data, - data.mv_data, (int) data.mv_size, (char *) data.mv_data); - E(mdb_del(txn, dbi, &key, NULL)); - } + printf("Deleting with cursor\n"); + E(mdbx_txn_begin(env, NULL, 0, &txn)); + E(mdbx_cursor_open(txn, dbi, &cur2)); + for (i = 0; i < 50; i++) { + if (RES(MDB_NOTFOUND, mdbx_cursor_get(cur2, &key, &data, MDB_NEXT))) + break; + printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, + (char *)key.mv_data, data.mv_data, (int)data.mv_size, + (char *)data.mv_data); + E(mdbx_del(txn, dbi, &key, NULL)); + } - printf("Restarting cursor in txn\n"); - for (op=MDB_FIRST, i=0; i<=32; op=MDB_NEXT, i++) { - if (RES(MDB_NOTFOUND, mdb_cursor_get(cur2, &key, &data, op))) - break; - printf("key: %p %.*s, data: %p %.*s\n", - key.mv_data, (int) key.mv_size, (char *) key.mv_data, - data.mv_data, (int) data.mv_size, (char *) data.mv_data); - } - mdb_cursor_close(cur2); - E(mdb_txn_commit(txn)); + printf("Restarting cursor in txn\n"); + for (op = MDB_FIRST, i = 0; i <= 32; op = MDB_NEXT, i++) { + if (RES(MDB_NOTFOUND, mdbx_cursor_get(cur2, &key, &data, op))) + break; + printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, + (char *)key.mv_data, data.mv_data, (int)data.mv_size, + (char *)data.mv_data); + } + 
mdbx_cursor_close(cur2); + E(mdbx_txn_commit(txn)); - for(i = 0; i < 41; ++i) { - pthread_t thread; - pthread_create(&thread, NULL, thread_entry, env); - } + for (i = 0; i < 41; ++i) { + pthread_t thread; + pthread_create(&thread, NULL, thread_entry, env); + } - printf("Restarting cursor outside txn\n"); - E(mdb_txn_begin(env, NULL, 0, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - for (op=MDB_FIRST, i=0; i<=32; op=MDB_NEXT, i++) { - if (RES(MDB_NOTFOUND, mdb_cursor_get(cursor, &key, &data, op))) - break; - printf("key: %p %.*s, data: %p %.*s\n", - key.mv_data, (int) key.mv_size, (char *) key.mv_data, - data.mv_data, (int) data.mv_size, (char *) data.mv_data); - } - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + printf("Restarting cursor outside txn\n"); + E(mdbx_txn_begin(env, NULL, 0, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + for (op = MDB_FIRST, i = 0; i <= 32; op = MDB_NEXT, i++) { + if (RES(MDB_NOTFOUND, mdbx_cursor_get(cursor, &key, &data, op))) + break; + printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, + (char *)key.mv_data, data.mv_data, (int)data.mv_size, + (char *)data.mv_data); + } + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); - mdb_env_close(env); + mdbx_dbi_close(env, dbi); + mdbx_env_close(env); - return 0; + return 0; } diff --git a/mtest1.c b/mtest1.c index ffe79123..826462dc 100644 --- a/mtest1.c +++ b/mtest1.c @@ -14,187 +14,186 @@ /* Based on mtest2.c - memory-mapped database tester/toy */ +#include "mdbx.h" +#include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif -int main(int argc,char * argv[]) -{ - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[32] = ""; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor; + int count; + int *values; + char sval[32] = ""; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - srand(time(NULL)); + (void)argc; + (void)argv; + srand(time(NULL)); - count = (rand()%384) + 64; - values = (int *)malloc(count*sizeof(int)); + count = (rand() % 384) + 64; + values = (int *)malloc(count * sizeof(int)); - for(i = 0;i -1; i -= (rand()%5)) { - txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); - sprintf(sval, "%03x ", values[i]); - if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, NULL))) { - mdb_txn_abort(txn); - } else { - E(mdb_txn_commit(txn)); - deleted++; - } - } - free(values); - printf("Deleted %d values\n", deleted); + int deleted = 0; + key.mv_data = sval; + for (i = count - 1; i > -1; i -= (rand() % 5)) { + txn = NULL; + E(mdbx_txn_begin(env, NULL, 0, &txn)); + sprintf(sval, "%03x ", values[i]); + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, NULL))) { + mdbx_txn_abort(txn); + } else { + E(mdbx_txn_commit(txn)); + deleted++; + } + } + free(values); + printf("Deleted %d values\n", deleted); - printf("check-preset-b.cursor-next\n"); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - int present_b = 0; - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, 
(char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - ++present_b; - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - CHECK(present_b == present_a - deleted, "mismatch"); + printf("check-preset-b.cursor-next\n"); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + int present_b = 0; + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + ++present_b; + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + CHECK(present_b == present_a - deleted, "mismatch"); - printf("check-preset-b.cursor-prev\n"); - j = 1; - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - ++j; - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - CHECK(present_b == j, "mismatch"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + printf("check-preset-b.cursor-prev\n"); + j = 1; + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + ++j; + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + CHECK(present_b == j, "mismatch"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); - /********************* LY: kept DB dirty ****************/ - mdbx_env_close_ex(env, 1); - E(mdb_env_create(&env)); - E(mdb_env_set_maxdbs(env, 4)); - E(mdb_env_open(env, DBPATH, env_oflags, 0664)); + mdbx_dbi_close(env, dbi); + /********************* LY: kept DB dirty ****************/ + mdbx_env_close_ex(env, 1); + E(mdbx_env_create(&env)); + E(mdbx_env_set_maxdbs(env, 4)); + E(mdbx_env_open(env, DBPATH, env_oflags, 0664)); - printf("check-preset-c.cursor-next\n"); - 
E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_dbi_open(txn, "id1", 0, &dbi)); - E(mdb_cursor_open(txn, dbi, &cursor)); - int present_c = 0; - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - ++present_c; - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Rolled back %d deletion(s)\n", present_c - (present_a - deleted)); - CHECK(present_c > present_a - deleted, "mismatch"); + printf("check-preset-c.cursor-next\n"); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_dbi_open(txn, "id1", 0, &dbi)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + int present_c = 0; + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + ++present_c; + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Rolled back %d deletion(s)\n", present_c - (present_a - deleted)); + CHECK(present_c > present_a - deleted, "mismatch"); - printf("check-preset-d.cursor-prev\n"); - j = 1; - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - ++j; - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - CHECK(present_c == j, "mismatch"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + printf("check-preset-d.cursor-prev\n"); + j = 1; + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + ++j; + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + CHECK(present_c == j, "mismatch"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, 
dbi); - mdbx_env_close_ex(env, 0); + mdbx_dbi_close(env, dbi); + mdbx_env_close_ex(env, 0); - return 0; + return 0; } diff --git a/mtest2.c b/mtest2.c index 12b1e126..93caa6e9 100644 --- a/mtest2.c +++ b/mtest2.c @@ -17,136 +17,137 @@ /* Just like mtest.c, but using a subDB instead of the main DB */ +#include "mdbx.h" +#include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? (void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif -int main(int argc,char * argv[]) -{ - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[32] = ""; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor; + int count; + int *values; + char sval[32] = ""; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - srand(time(NULL)); + (void)argc; + (void)argv; + srand(time(NULL)); - count = (rand()%384) + 64; - values = (int *)malloc(count*sizeof(int)); + count = (rand() % 384) + 64; + values = (int *)malloc(count * sizeof(int)); - for(i = 0;i -1; i-= (rand()%5)) { - j++; - txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); - sprintf(sval, "%03x ", values[i]); - if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, NULL))) { - j--; - mdb_txn_abort(txn); - } else { - E(mdb_txn_commit(txn)); - } - } - free(values); - 
printf("Deleted %d values\n", j); + j = 0; + key.mv_data = sval; + for (i = count - 1; i > -1; i -= (rand() % 5)) { + j++; + txn = NULL; + E(mdbx_txn_begin(env, NULL, 0, &txn)); + sprintf(sval, "%03x ", values[i]); + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, NULL))) { + j--; + mdbx_txn_abort(txn); + } else { + E(mdbx_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Cursor prev\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); - mdb_env_close(env); - return 0; + mdbx_dbi_close(env, dbi); + mdbx_env_close(env); + return 0; } diff --git a/mtest3.c 
b/mtest3.c index a55ec604..be46fe06 100644 --- a/mtest3.c +++ b/mtest3.c @@ -16,146 +16,147 @@ */ /* Tests for sorted duplicate DBs */ +#include "mdbx.h" +#include #include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? (void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif -int main(int argc,char * argv[]) -{ - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[32]; - char kval[sizeof(int)]; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor; + int count; + int *values; + char sval[32]; + char kval[sizeof(int)]; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - srand(time(NULL)); + (void)argc; + (void)argv; + srand(time(NULL)); - memset(sval, 0, sizeof(sval)); + memset(sval, 0, sizeof(sval)); - count = (rand()%384) + 64; - values = (int *)malloc(count*sizeof(int)); + count = (rand() % 384) + 64; + values = (int *)malloc(count * sizeof(int)); - for(i = 0;i -1; i-= (rand()%5)) { - j++; - txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); - sprintf(kval, "%03x", values[i & ~0x0f]); - sprintf(sval, "%03x %d foo bar", values[i], values[i]); - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_NOTFOUND, 
mdb_del(txn, dbi, &key, &data))) { - j--; - mdb_txn_abort(txn); - } else { - E(mdb_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); + for (i = count - 1; i > -1; i -= (rand() % 5)) { + j++; + txn = NULL; + E(mdbx_txn_begin(env, NULL, 0, &txn)); + sprintf(kval, "%03x", values[i & ~0x0f]); + sprintf(sval, "%03x %d foo bar", values[i], values[i]); + key.mv_size = sizeof(int); + key.mv_data = kval; + data.mv_size = sizeof(sval); + data.mv_data = sval; + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, &data))) { + j--; + mdbx_txn_abort(txn); + } else { + E(mdbx_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Cursor prev\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char 
*)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); - mdb_env_close(env); - return 0; + mdbx_dbi_close(env, dbi); + mdbx_env_close(env); + return 0; } diff --git a/mtest4.c b/mtest4.c index 3d67a0f9..16aca90c 100644 --- a/mtest4.c +++ b/mtest4.c @@ -16,181 +16,181 @@ */ /* Tests for sorted duplicate DBs with fixed-size keys */ +#include "mdbx.h" +#include #include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? (void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif -int main(int argc,char * argv[]) -{ - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[8]; - char kval[sizeof(int)]; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor; + int count; + int *values; + char sval[8]; + char kval[sizeof(int)]; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - memset(sval, 0, sizeof(sval)); + (void)argc; + (void)argv; + memset(sval, 0, sizeof(sval)); - count = 510; - values = (int *)malloc(count*sizeof(int)); + count = 510; + values = (int *)malloc(count * sizeof(int)); - for(i = 0;i -1; i-= (rand()%3)) { - j++; - txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); - sprintf(sval, "%07x", 
values[i]); - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, &data))) { - j--; - mdb_txn_abort(txn); - } else { - E(mdb_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); + for (i = count - 1; i > -1; i -= (rand() % 3)) { + j++; + txn = NULL; + E(mdbx_txn_begin(env, NULL, 0, &txn)); + sprintf(sval, "%07x", values[i]); + key.mv_size = sizeof(int); + key.mv_data = kval; + data.mv_size = sizeof(sval); + data.mv_data = sval; + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, &data))) { + j--; + mdbx_txn_abort(txn); + } else { + E(mdbx_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Cursor prev\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", 
(int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); - mdb_env_close(env); - return 0; + mdbx_dbi_close(env, dbi); + mdbx_env_close(env); + return 0; } diff --git a/mtest5.c b/mtest5.c index ed19f412..abca4e72 100644 --- a/mtest5.c +++ b/mtest5.c @@ -16,148 +16,149 @@ */ /* Tests for sorted duplicate DBs using cursor_put */ +#include "mdbx.h" +#include #include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? (void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif -int main(int argc,char * argv[]) -{ - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[32]; - char kval[sizeof(int)]; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, j = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor; + int count; + int *values; + char sval[32]; + char kval[sizeof(int)]; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - srand(time(NULL)); + (void)argc; + (void)argv; + srand(time(NULL)); - memset(sval, 0, sizeof(sval)); + memset(sval, 0, sizeof(sval)); - count = (rand()%384) + 64; - values = (int *)malloc(count*sizeof(int)); + count = (rand() % 384) + 64; + values = (int *)malloc(count 
* sizeof(int)); - for(i = 0;i -1; i-= (rand()%5)) { - j++; - txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); - sprintf(kval, "%03x", values[i & ~0x0f]); - sprintf(sval, "%03x %d foo bar", values[i], values[i]); - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, &data))) { - j--; - mdb_txn_abort(txn); - } else { - E(mdb_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); + for (i = count - 1; i > -1; i -= (rand() % 5)) { + j++; + txn = NULL; + E(mdbx_txn_begin(env, NULL, 0, &txn)); + sprintf(kval, "%03x", values[i & ~0x0f]); + sprintf(sval, "%03x %d foo bar", values[i], values[i]); + key.mv_size = sizeof(int); + key.mv_data = kval; + data.mv_size = sizeof(sval); + data.mv_data = sval; + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, &data))) { + j--; + mdbx_txn_abort(txn); + } else { + E(mdbx_txn_commit(txn)); + } + } + free(values); + printf("Deleted %d values\n", j); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + printf("Cursor next\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + printf("key: %.*s, data: %.*s\n", 
(int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + printf("Cursor prev\n"); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, + (int)data.mv_size, (char *)data.mv_data); + } + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); - mdb_env_close(env); - return 0; + mdbx_dbi_close(env, dbi); + mdbx_env_close(env); + return 0; } diff --git a/mtest6.c b/mtest6.c index d988c93c..e7de6ab5 100644 --- a/mtest6.c +++ b/mtest6.c @@ -16,105 +16,109 @@ */ /* Tests for DB splits and merges */ +#include "mdbx.h" +#include #include #include #include +#include #include #include -#include -#include -#include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif char dkbuf[1024]; -int main(int argc,char * argv[]) -{ - int i = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data, sdata; - MDB_txn *txn; - MDB_stat mst; - MDB_cursor *cursor; - long kval; - char *sval; - int env_oflags; - struct stat db_stat, exe_stat; +int main(int argc, char *argv[]) { + int i = 0, rc; + MDB_env *env; + MDB_dbi dbi; + MDB_val key, data, sdata; + MDB_txn *txn; + MDBX_stat mst; + MDB_cursor *cursor; + long kval; + char *sval; + int env_oflags; + struct stat db_stat, exe_stat; - (void) argc; - (void) argv; - srand(time(NULL)); + (void)argc; + (void)argv; + srand(time(NULL)); - E(mdb_env_create(&env)); - E(mdb_env_set_mapsize(env, 10485760)); - E(mdb_env_set_maxdbs(env, 4)); + E(mdbx_env_create(&env)); + E(mdbx_env_set_mapsize(env, 10485760)); + E(mdbx_env_set_maxdbs(env, 4)); - E(stat("/proc/self/exe", &exe_stat)?errno:0); - E(stat(DBPATH "/.", &db_stat)?errno:0); - env_oflags = MDB_FIXEDMAP | MDB_NOSYNC; - if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { - /* LY: Assume running inside a CI-environment: - * 1) don't use FIXEDMAP to avoid EBUSY in case collision, - * which could be inspired by address space randomisation feature. - * 2) drop MDB_NOSYNC expecting that DBPATH is at a tmpfs or some dedicated storage. - */ - env_oflags = 0; - } - E(mdb_env_open(env, DBPATH, env_oflags, 0664)); + E(stat("/proc/self/exe", &exe_stat) ? errno : 0); + E(stat(DBPATH "/.", &db_stat) ? errno : 0); + env_oflags = MDB_FIXEDMAP | MDB_NOSYNC; + if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { + /* LY: Assume running inside a CI-environment: + * 1) don't use FIXEDMAP to avoid EBUSY in case collision, + * which could be inspired by address space randomisation feature. + * 2) drop MDB_NOSYNC expecting that DBPATH is at a tmpfs or some + * dedicated storage. 
+ */ + env_oflags = 0; + } + E(mdbx_env_open(env, DBPATH, env_oflags, 0664)); - E(mdb_txn_begin(env, NULL, 0, &txn)); - if (mdb_dbi_open(txn, "id6", MDB_CREATE, &dbi) == MDB_SUCCESS) - E(mdb_drop(txn, dbi, 1)); - E(mdb_dbi_open(txn, "id6", MDB_CREATE|MDB_INTEGERKEY, &dbi)); - E(mdb_cursor_open(txn, dbi, &cursor)); - E(mdb_stat(txn, dbi, &mst)); + E(mdbx_txn_begin(env, NULL, 0, &txn)); + if (mdbx_dbi_open(txn, "id6", MDB_CREATE, &dbi) == MDB_SUCCESS) + E(mdbx_drop(txn, dbi, 1)); + E(mdbx_dbi_open(txn, "id6", MDB_CREATE | MDB_INTEGERKEY, &dbi)); + E(mdbx_cursor_open(txn, dbi, &cursor)); + E(mdbx_stat(txn, dbi, &mst, sizeof(mst))); - sval = calloc(1, mst.ms_psize / 4); - key.mv_size = sizeof(long); - key.mv_data = &kval; - sdata.mv_size = mst.ms_psize / 4 - 30; - sdata.mv_data = sval; + sval = calloc(1, mst.ms_psize / 4); + key.mv_size = sizeof(long); + key.mv_data = &kval; + sdata.mv_size = mst.ms_psize / 4 - 30; + sdata.mv_data = sval; - printf("Adding 12 values, should yield 3 splits\n"); - for (i=0;i<12;i++) { - kval = i*5; - sprintf(sval, "%08lx", kval); - data = sdata; - (void)RES(MDB_KEYEXIST, mdb_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); - } - printf("Adding 12 more values, should yield 3 splits\n"); - for (i=0;i<12;i++) { - kval = i*5+4; - sprintf(sval, "%08lx", kval); - data = sdata; - (void)RES(MDB_KEYEXIST, mdb_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); - } - printf("Adding 12 more values, should yield 3 splits\n"); - for (i=0;i<12;i++) { - kval = i*5+1; - sprintf(sval, "%08lx", kval); - data = sdata; - (void)RES(MDB_KEYEXIST, mdb_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); - } - E(mdb_cursor_get(cursor, &key, &data, MDB_FIRST)); + printf("Adding 12 values, should yield 3 splits\n"); + for (i = 0; i < 12; i++) { + kval = i * 5; + sprintf(sval, "%08lx", kval); + data = sdata; + (void)RES(MDB_KEYEXIST, + mdbx_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); + } + printf("Adding 12 more values, should yield 3 splits\n"); + for (i = 
0; i < 12; i++) { + kval = i * 5 + 4; + sprintf(sval, "%08lx", kval); + data = sdata; + (void)RES(MDB_KEYEXIST, + mdbx_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); + } + printf("Adding 12 more values, should yield 3 splits\n"); + for (i = 0; i < 12; i++) { + kval = i * 5 + 1; + sprintf(sval, "%08lx", kval); + data = sdata; + (void)RES(MDB_KEYEXIST, + mdbx_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); + } + E(mdbx_cursor_get(cursor, &key, &data, MDB_FIRST)); - do { - printf("key: %p %s, data: %p %.*s\n", - key.mv_data, mdb_dkey(&key, dkbuf), - data.mv_data, (int) data.mv_size, (char *) data.mv_data); - } while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0); - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_commit(txn); + do { + printf("key: %p %s, data: %p %.*s\n", key.mv_data, mdbx_dkey(&key, dkbuf), + data.mv_data, (int)data.mv_size, (char *)data.mv_data); + } while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0); + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_commit(txn); #if 0 int j=0; @@ -124,47 +128,47 @@ int main(int argc,char * argv[]) for (i= count - 1; i > -1; i-= (rand()%5)) { j++; txn=NULL; - E(mdb_txn_begin(env, NULL, 0, &txn)); + E(mdbx_txn_begin(env, NULL, 0, &txn)); sprintf(kval, "%03x", values[i & ~0x0f]); sprintf(sval, "%03x %d foo bar", values[i], values[i]); key.mv_size = sizeof(int); key.mv_data = kval; data.mv_size = sizeof(sval); data.mv_data = sval; - if (RES(MDB_NOTFOUND, mdb_del(txn, dbi, &key, &data))) { + if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, &data))) { j--; - mdb_txn_abort(txn); + mdbx_txn_abort(txn); } else { - E(mdb_txn_commit(txn)); + E(mdbx_txn_commit(txn)); } } free(values); printf("Deleted %d values\n", j); - E(mdb_env_stat(env, &mst)); - E(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdb_cursor_open(txn, dbi, &cursor)); + E(mdbx_env_stat(env, &mst, sizeof(mst))); + E(mdbx_txn_begin(env, NULL, MDB_RDONLY, 
&txn)); + E(mdbx_cursor_open(txn, dbi, &cursor)); printf("Cursor next\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { printf("key: %.*s, data: %.*s\n", (int) key.mv_size, (char *) key.mv_data, (int) data.mv_size, (char *) data.mv_data); } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); printf("Cursor prev\n"); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { printf("key: %.*s, data: %.*s\n", (int) key.mv_size, (char *) key.mv_data, (int) data.mv_size, (char *) data.mv_data); } - CHECK(rc == MDB_NOTFOUND, "mdb_cursor_get"); - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); - mdb_dbi_close(env, dbi); + mdbx_dbi_close(env, dbi); #endif - mdb_env_close(env); - free(sval); + mdbx_env_close(env); + free(sval); - return 0; + return 0; } diff --git a/sample-mdb.txt b/sample-mdb.txt index 24fccdb9..194afdcc 100644 --- a/sample-mdb.txt +++ b/sample-mdb.txt @@ -33,10 +33,10 @@ int main(int argc,char * argv[]) /* Note: Most error checking omitted for simplicity */ - rc = mdb_env_create(&env); - rc = mdb_env_open(env, "./testdb", 0, 0664); - rc = mdb_txn_begin(env, NULL, 0, &txn); - rc = mdb_dbi_open(txn, NULL, 0, &dbi); + rc = mdbx_env_create(&env); + rc = mdbx_env_open(env, "./testdb", 0, 0664); + rc = mdbx_txn_begin(env, NULL, 0, &txn); + rc = mdbx_dbi_open(txn, NULL, 0, &dbi); key.mv_size = sizeof(int); key.mv_data = sval; @@ -44,23 +44,23 @@ int main(int argc,char * argv[]) data.mv_data = sval; sprintf(sval, "%03x %d foo bar", 32, 3141592); - rc = mdb_put(txn, dbi, &key, &data, 0); - rc = mdb_txn_commit(txn); + rc = mdbx_put(txn, dbi, &key, &data, 0); + rc = mdbx_txn_commit(txn); if (rc) { - fprintf(stderr, "mdb_txn_commit: (%d) %s\n", rc, 
mdb_strerror(rc)); + fprintf(stderr, "mdbx_txn_commit: (%d) %s\n", rc, mdbx_strerror(rc)); goto leave; } - rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); - rc = mdb_cursor_open(txn, dbi, &cursor); - while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + rc = mdbx_cursor_open(txn, dbi, &cursor); + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int) key.mv_size, (char *) key.mv_data, data.mv_data, (int) data.mv_size, (char *) data.mv_data); } - mdb_cursor_close(cursor); - mdb_txn_abort(txn); + mdbx_cursor_close(cursor); + mdbx_txn_abort(txn); leave: - mdb_dbi_close(env, dbi); - mdb_env_close(env); + mdbx_dbi_close(env, dbi); + mdbx_env_close(env); return 0; } diff --git a/wbench.c b/wbench.c index e5fdc64a..debb8be9 100644 --- a/wbench.c +++ b/wbench.c @@ -12,248 +12,249 @@ * . */ +#include #include #include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include #include "mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) ((test) ? (void)0 : ((void)fprintf(stderr, \ - "%s:%d: %s: %s\n", __FILE__, __LINE__, msg, mdb_strerror(rc)), abort())) +#define CHECK(test, msg) \ + ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ + __LINE__, msg, mdbx_strerror(rc)), \ + abort())) #ifndef DBPATH -# define DBPATH "./testdb" +#define DBPATH "./testdb" #endif struct t0 { - struct rusage ru; - struct timespec ts; + struct rusage ru; + struct timespec ts; }; -void t0(struct t0 *t0) -{ - int rc; - E(getrusage(RUSAGE_SELF, &t0->ru)); - E(clock_gettime(CLOCK_MONOTONIC_RAW, &t0->ts)); +void t0(struct t0 *t0) { + int rc; + E(getrusage(RUSAGE_SELF, &t0->ru)); + E(clock_gettime(CLOCK_MONOTONIC_RAW, &t0->ts)); } struct info { - double wall_s, cpu_sys_s, cpu_user_s; - long iops_r, iops_w, iops_pf; + double wall_s, cpu_sys_s, cpu_user_s; + long iops_r, iops_w, iops_pf; }; -double delta_s(const struct timeval *begin, const struct timeval *end) -{ - return end->tv_sec - begin->tv_sec - + (end->tv_usec - begin->tv_usec) / 1000000.0; +double delta_s(const struct timeval *begin, const struct timeval *end) { + return end->tv_sec - begin->tv_sec + + (end->tv_usec - begin->tv_usec) / 1000000.0; } -double delta2_s(const struct timespec *begin, const struct timespec *end) -{ - return end->tv_sec - begin->tv_sec - + (end->tv_nsec - begin->tv_nsec) / 1000000000.0; +double delta2_s(const struct timespec *begin, const struct timespec *end) { + return end->tv_sec - begin->tv_sec + + (end->tv_nsec - begin->tv_nsec) / 1000000000.0; } -void measure(const struct t0 *t0, struct info *i) -{ - struct t0 t1; - int rc; +void measure(const struct t0 *t0, struct info *i) { + struct t0 t1; + int rc; - E(clock_gettime(CLOCK_MONOTONIC_RAW, &t1.ts)); - E(getrusage(RUSAGE_SELF, &t1.ru)); + E(clock_gettime(CLOCK_MONOTONIC_RAW, &t1.ts)); + E(getrusage(RUSAGE_SELF, &t1.ru)); - i->wall_s = delta2_s(&t0->ts, &t1.ts); - i->cpu_user_s = delta_s(&t0->ru.ru_utime, &t1.ru.ru_utime); - i->cpu_sys_s = delta_s(&t0->ru.ru_stime, &t1.ru.ru_stime); - i->iops_r = t1.ru.ru_inblock - t0->ru.ru_inblock; - i->iops_w = t1.ru.ru_oublock - t0->ru.ru_oublock; - i->iops_pf = t1.ru.ru_majflt - 
t0->ru.ru_majflt - + t1.ru.ru_minflt - t0->ru.ru_minflt; + i->wall_s = delta2_s(&t0->ts, &t1.ts); + i->cpu_user_s = delta_s(&t0->ru.ru_utime, &t1.ru.ru_utime); + i->cpu_sys_s = delta_s(&t0->ru.ru_stime, &t1.ru.ru_stime); + i->iops_r = t1.ru.ru_inblock - t0->ru.ru_inblock; + i->iops_w = t1.ru.ru_oublock - t0->ru.ru_oublock; + i->iops_pf = + t1.ru.ru_majflt - t0->ru.ru_majflt + t1.ru.ru_minflt - t0->ru.ru_minflt; } -void print(struct info *i) -{ - printf("wall-clock %.3f, iops: %lu reads, %lu writes, %lu page-faults, " - "cpu: %.3f user, %.3f sys\n", - i->wall_s, i->iops_r, i->iops_w, i->iops_pf, i->cpu_user_s, i->cpu_sys_s); - +void print(struct info *i) { + printf("wall-clock %.3f, iops: %lu reads, %lu writes, %lu page-faults, " + "cpu: %.3f user, %.3f sys\n", + i->wall_s, i->iops_r, i->iops_w, i->iops_pf, i->cpu_user_s, + i->cpu_sys_s); } -static void wbench(int flags, int mb, int count, int salt) -{ - MDB_env *env; - MDB_dbi dbi; - MDB_txn *txn; - MDB_val key, data; - unsigned key_value = salt; - char data_value[777]; - int i, rc; - struct t0 start; - struct info ra, rd, rs, rt; +static void wbench(int flags, int mb, int count, int salt) { + MDB_env *env; + MDB_dbi dbi; + MDB_txn *txn; + MDB_val key, data; + unsigned key_value = salt; + char data_value[777]; + int i, rc; + struct t0 start; + struct info ra, rd, rs, rt; - mkdir(DBPATH, 0755); - unlink(DBPATH "/data.mdb"); - unlink(DBPATH "/lock.mdb"); + mkdir(DBPATH, 0755); + unlink(DBPATH "/data.mdb"); + unlink(DBPATH "/lock.mdb"); - printf("\nProbing %d Mb, %d items, flags:", mb, count); - if (flags & MDB_NOSYNC) - printf(" NOSYNC"); - if (flags & MDB_NOMETASYNC) - printf(" NOMETASYNC"); - if (flags & MDB_WRITEMAP) - printf(" WRITEMAP"); - if (flags & MDB_MAPASYNC) - printf(" MAPASYNC"); + printf("\nProbing %d Mb, %d items, flags:", mb, count); + if (flags & MDB_NOSYNC) + printf(" NOSYNC"); + if (flags & MDB_NOMETASYNC) + printf(" NOMETASYNC"); + if (flags & MDB_WRITEMAP) + printf(" WRITEMAP"); + if (flags & 
MDB_MAPASYNC) + printf(" MAPASYNC"); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) - if (flags & MDBX_COALESCE) - printf(" COALESCE"); - if (flags & MDBX_LIFORECLAIM) - printf(" LIFO"); + if (flags & MDBX_COALESCE) + printf(" COALESCE"); + if (flags & MDBX_LIFORECLAIM) + printf(" LIFO"); #endif - printf(" 0x%X\n", flags); + printf(" 0x%X\n", flags); - E(mdb_env_create(&env)); - E(mdb_env_set_mapsize(env, (1ull << 20) * mb)); - E(mdb_env_open(env, DBPATH, flags, 0664)); + E(mdbx_env_create(&env)); + E(mdbx_env_set_mapsize(env, (1ull << 20) * mb)); + E(mdbx_env_open(env, DBPATH, flags, 0664)); - key.mv_size = sizeof(key_value); - key.mv_data = &key_value; - data.mv_size = sizeof(data_value); - data.mv_data = &data_value; + key.mv_size = sizeof(key_value); + key.mv_data = &key_value; + data.mv_size = sizeof(data_value); + data.mv_data = &data_value; - printf("\tAdding %d values...", count); - fflush(stdout); - key_value = salt; - t0(&start); - for(i = 0; i < count; ++i) { - E(mdb_txn_begin(env, NULL, 0, &txn)); - E(mdb_dbi_open(txn, NULL, 0, &dbi)); + printf("\tAdding %d values...", count); + fflush(stdout); + key_value = salt; + t0(&start); + for (i = 0; i < count; ++i) { + E(mdbx_txn_begin(env, NULL, 0, &txn)); + E(mdbx_dbi_open(txn, NULL, 0, &dbi)); - snprintf(data_value, sizeof(data_value), "value=%u", key_value); - E(mdb_put(txn, dbi, &key, &data, MDB_NOOVERWRITE)); - E(mdb_txn_commit(txn)); + snprintf(data_value, sizeof(data_value), "value=%u", key_value); + E(mdbx_put(txn, dbi, &key, &data, MDB_NOOVERWRITE)); + E(mdbx_txn_commit(txn)); - key_value = key_value * 1664525 + 1013904223; - } - measure(&start, &ra); - print(&ra); + key_value = key_value * 1664525 + 1013904223; + } + measure(&start, &ra); + print(&ra); - printf("\tDeleting %d values...", count); - fflush(stdout); - key_value = salt; - t0(&start); - for(i = 0; i < count; ++i) { - E(mdb_txn_begin(env, NULL, 0, &txn)); - E(mdb_dbi_open(txn, NULL, 0, &dbi)); + printf("\tDeleting %d values...", 
count); + fflush(stdout); + key_value = salt; + t0(&start); + for (i = 0; i < count; ++i) { + E(mdbx_txn_begin(env, NULL, 0, &txn)); + E(mdbx_dbi_open(txn, NULL, 0, &dbi)); - E(mdb_del(txn, dbi, &key, NULL)); - E(mdb_txn_commit(txn)); + E(mdbx_del(txn, dbi, &key, NULL)); + E(mdbx_txn_commit(txn)); - key_value = key_value * 1664525 + 1013904223; - } - measure(&start, &rd); - print(&rd); + key_value = key_value * 1664525 + 1013904223; + } + measure(&start, &rd); + print(&rd); - printf("\tCheckpoint..."); - fflush(stdout); - t0(&start); - mdb_env_sync(env, 1); - measure(&start, &rs); - print(&rs); + printf("\tCheckpoint..."); + fflush(stdout); + t0(&start); + mdbx_env_sync(env, 1); + measure(&start, &rs); + print(&rs); - mdb_env_close(env); - rt.wall_s = ra.wall_s + rd.wall_s + rs.wall_s; - rt.cpu_sys_s = ra.cpu_sys_s + rd.cpu_sys_s + rs.cpu_sys_s; - rt.cpu_user_s = ra.cpu_user_s + rd.cpu_user_s + rs.cpu_user_s; - rt.iops_r = ra.iops_r + rd.iops_r + rs.iops_r; - rt.iops_w = ra.iops_w + rd.iops_w + rs.iops_w; - rt.iops_pf = ra.iops_pf + rd.iops_pf + rs.iops_pf; - printf("Total "); - print(&rt); + mdbx_env_close(env); + rt.wall_s = ra.wall_s + rd.wall_s + rs.wall_s; + rt.cpu_sys_s = ra.cpu_sys_s + rd.cpu_sys_s + rs.cpu_sys_s; + rt.cpu_user_s = ra.cpu_user_s + rd.cpu_user_s + rs.cpu_user_s; + rt.iops_r = ra.iops_r + rd.iops_r + rs.iops_r; + rt.iops_w = ra.iops_w + rd.iops_w + rs.iops_w; + rt.iops_pf = ra.iops_pf + rd.iops_pf + rs.iops_pf; + printf("Total "); + print(&rt); - fprintf(stderr, "flags: "); - if (flags & MDB_NOSYNC) - fprintf(stderr, " NOSYNC"); - if (flags & MDB_NOMETASYNC) - fprintf(stderr, " NOMETASYNC"); - if (flags & MDB_WRITEMAP) - fprintf(stderr, " WRITEMAP"); - if (flags & MDB_MAPASYNC) - fprintf(stderr, " MAPASYNC"); + fprintf(stderr, "flags: "); + if (flags & MDB_NOSYNC) + fprintf(stderr, " NOSYNC"); + if (flags & MDB_NOMETASYNC) + fprintf(stderr, " NOMETASYNC"); + if (flags & MDB_WRITEMAP) + fprintf(stderr, " WRITEMAP"); + if (flags & MDB_MAPASYNC) 
+ fprintf(stderr, " MAPASYNC"); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) - if (flags & MDBX_COALESCE) - fprintf(stderr, " COALESCE"); - if (flags & MDBX_LIFORECLAIM) - fprintf(stderr, " LIFO"); + if (flags & MDBX_COALESCE) + fprintf(stderr, " COALESCE"); + if (flags & MDBX_LIFORECLAIM) + fprintf(stderr, " LIFO"); #endif - fprintf(stderr, "\t%.3f\t%.3f\t%.3f\t%.3f\n", rt.iops_w / 1000.0, rt.cpu_user_s, rt.cpu_sys_s, rt.wall_s); - + fprintf(stderr, "\t%.3f\t%.3f\t%.3f\t%.3f\n", rt.iops_w / 1000.0, + rt.cpu_user_s, rt.cpu_sys_s, rt.wall_s); } -int main(int argc,char * argv[]) -{ - (void) argc; - (void) argv; +int main(int argc, char *argv[]) { + (void)argc; + (void)argv; -#define SALT 1 -#define COUNT 10000 -#define SIZE 12 +#define SALT 1 +#define COUNT 10000 +#define SIZE 12 - printf("\nDefault 'sync' mode..."); - wbench(0, SIZE, COUNT, SALT); + printf("\nDefault 'sync' mode..."); + wbench(0, SIZE, COUNT, SALT); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) -// wbench(MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); + // wbench(MDBX_COALESCE, SIZE, COUNT, SALT); + wbench(MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); // wbench(MDBX_LIFORECLAIM, SIZE, COUNT, SALT); #endif - printf("\nno-meta-sync hack..."); - wbench(MDB_NOMETASYNC, SIZE, COUNT, SALT); + printf("\nno-meta-sync hack..."); + wbench(MDB_NOMETASYNC, SIZE, COUNT, SALT); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) -// wbench(MDB_NOMETASYNC | MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDB_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); + // wbench(MDB_NOMETASYNC | MDBX_COALESCE, SIZE, COUNT, SALT); + wbench(MDB_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); // wbench(MDB_NOMETASYNC | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); #endif - printf("\nno-sync..."); - wbench(MDB_NOSYNC, SIZE, COUNT, SALT); + printf("\nno-sync..."); + wbench(MDB_NOSYNC, SIZE, COUNT, SALT); #if 
defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) // wbench(MDB_NOSYNC | MDBX_COALESCE, SIZE, COUNT, SALT); -// wbench(MDB_NOSYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); +// wbench(MDB_NOSYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, +// SALT); // wbench(MDB_NOSYNC | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); #endif - printf("\nr/w-map..."); - wbench(MDB_WRITEMAP, SIZE, COUNT, SALT); + printf("\nr/w-map..."); + wbench(MDB_WRITEMAP, SIZE, COUNT, SALT); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) -// wbench(MDB_WRITEMAP | MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDB_WRITEMAP | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); + // wbench(MDB_WRITEMAP | MDBX_COALESCE, SIZE, COUNT, SALT); + wbench(MDB_WRITEMAP | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); // wbench(MDB_WRITEMAP | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); #endif - printf("\nasync..."); - wbench(MDB_WRITEMAP | MDB_MAPASYNC, SIZE, COUNT, SALT); + printf("\nasync..."); + wbench(MDB_WRITEMAP | MDB_MAPASYNC, SIZE, COUNT, SALT); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) -// wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); -// wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); + // wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_COALESCE, SIZE, COUNT, + // SALT); + wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, + COUNT, SALT); +// wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_LIFORECLAIM, SIZE, COUNT, +// SALT); #endif - printf("\nr/w-map + no-sync..."); - wbench(MDB_NOSYNC | MDB_WRITEMAP, SIZE, COUNT, SALT); + printf("\nr/w-map + no-sync..."); + wbench(MDB_NOSYNC | MDB_WRITEMAP, SIZE, COUNT, SALT); #if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) -// wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_COALESCE | 
MDBX_LIFORECLAIM, SIZE, COUNT, SALT); -// wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); + // wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_COALESCE, SIZE, COUNT, SALT); + wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, + COUNT, SALT); +// wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, SIZE, COUNT, +// SALT); #endif - return 0; + return 0; } diff --git a/yota_test1.c b/yota_test1.c index 0cad5468..701d748c 100644 --- a/yota_test1.c +++ b/yota_test1.c @@ -1,6 +1,7 @@ /* * Copyright 2016-2017 Leonid Yuriev . - * Copyright 2015 Vladimir Romanov , Yota Lab. + * Copyright 2015 Vladimir Romanov + * , Yota Lab. * * This file is part of libmdbx. * @@ -18,243 +19,259 @@ * along with this program. If not, see . */ -#include #include +#include +#include "mdbx.h" +#include #include #include -#include -#include #include +#include #include -#include -#include "mdbx.h" +#include -#define IP_PRINTF_ARG_HOST(addr) (int)((addr) >> 24), (int)((addr) >> 16 & 0xff), (int)((addr) >> 8 & 0xff), (int)((addr) & 0xff) +#define IP_PRINTF_ARG_HOST(addr) \ + (int)((addr) >> 24), (int)((addr) >> 16 & 0xff), (int)((addr) >> 8 & 0xff), \ + (int)((addr)&0xff) -char opt_db_path[PATH_MAX] = "/dev/shm/lmdb_bench1"; +char opt_db_path[PATH_MAX] = "/dev/shm/x_bench1"; static MDB_env *env; #define REC_COUNT 1000000 int64_t ids[REC_COUNT + REC_COUNT / 10]; int32_t ids_count = 0; -int64_t lmdb_add = 0; -int64_t lmdb_del = 0; +int64_t x_add = 0; +int64_t x_del = 0; int64_t obj_id = 0; static void add_id_to_pool(int64_t id) { - ids[ids_count] = id; - ids_count++; + ids[ids_count] = id; + ids_count++; } static inline int64_t getTimeMicroseconds(void) { - struct timeval val; - gettimeofday(&val, NULL); - return val.tv_sec * ((int64_t) 1000000) + val.tv_usec; + struct timeval val; + gettimeofday(&val, NULL); + return val.tv_sec * ((int64_t)1000000) + val.tv_usec; } static int64_t get_id_from_pool() { - if (ids_count == 0) { - return -1; - } - int32_t index = 
rand() % ids_count; - int64_t id = ids[index]; - ids[index] = ids[ids_count - 1]; - ids_count--; - return id; + if (ids_count == 0) { + return -1; + } + int32_t index = rand() % ids_count; + int64_t id = ids[index]; + ids[index] = ids[ids_count - 1]; + ids_count--; + return id; } -#define LMDB_CHECK(x) \ - do {\ - const int rc = (x);\ - if ( rc != MDB_SUCCESS ) {\ - printf("Error [%d] %s in %s at %s:%d\n", rc, mdb_strerror(rc), #x, __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - }\ - } while(0) +#define LMDB_CHECK(x) \ + do { \ + const int rc = (x); \ + if (rc != MDB_SUCCESS) { \ + printf("Error [%d] %s in %s at %s:%d\n", rc, mdbx_strerror(rc), #x, \ + __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) static void db_connect() { - LMDB_CHECK(mdb_env_create(&env)); - LMDB_CHECK(mdb_env_set_mapsize(env, 3L * 1024L * 1024L * 1024L)); - LMDB_CHECK(mdb_env_set_maxdbs(env, 30)); + LMDB_CHECK(mdbx_env_create(&env)); + LMDB_CHECK(mdbx_env_set_mapsize(env, 3L * 1024L * 1024L * 1024L)); + LMDB_CHECK(mdbx_env_set_maxdbs(env, 30)); #if defined(MDBX_LIFORECLAIM) - LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, 0664)); + LMDB_CHECK(mdbx_env_open( + env, opt_db_path, + MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, 0664)); #else - LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664)); + LMDB_CHECK(mdbx_env_open(env, opt_db_path, + MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664)); #endif - printf("Connection open\n"); + printf("Connection open\n"); } typedef struct { - char session_id1[100]; - char session_id2[100]; - char ip[20]; - uint8_t fill[100]; + char session_id1[100]; + char session_id2[100]; + char ip[20]; + uint8_t fill[100]; } session_data_t; typedef struct { - int64_t obj_id; - int8_t event_type; + int64_t obj_id; + int8_t event_type; } __attribute__((__packed__)) event_data_t; static void create_record(int64_t record_id) { - MDB_dbi dbi_session; - 
MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; - event_data_t event; - MDB_txn *txn; - session_data_t data; - // transaction init - snprintf(data.session_id1, sizeof (data.session_id1), "mskugw%02ld_%02ld.gx.yota.ru;3800464060;4152;%ld", record_id % 3 + 1, record_id % 9 + 1, record_id); - snprintf(data.session_id2, sizeof (data.session_id2), "gx_service;%ld;%ld;node@spb-jsm1", record_id, record_id % 1000000000 + 99999); - snprintf(data.ip, sizeof (data.ip), "%d.%d.%d.%d", IP_PRINTF_ARG_HOST(record_id & 0xFFFFFFFF)); - event.obj_id = record_id; - event.event_type = 1; + MDB_dbi dbi_session; + MDB_dbi dbi_session_id; + MDB_dbi dbi_event; + MDB_dbi dbi_ip; + event_data_t event; + MDB_txn *txn; + session_data_t data; + // transaction init + snprintf(data.session_id1, sizeof(data.session_id1), + "mskugw%02ld_%02ld.gx.yota.ru;3800464060;4152;%ld", + record_id % 3 + 1, record_id % 9 + 1, record_id); + snprintf(data.session_id2, sizeof(data.session_id2), + "gx_service;%ld;%ld;node@spb-jsm1", record_id, + record_id % 1000000000 + 99999); + snprintf(data.ip, sizeof(data.ip), "%d.%d.%d.%d", + IP_PRINTF_ARG_HOST(record_id & 0xFFFFFFFF)); + event.obj_id = record_id; + event.event_type = 1; - MDB_val _session_id1_rec = {data.session_id1, strlen(data.session_id1)}; - MDB_val _session_id2_rec = {data.session_id2, strlen(data.session_id2)}; - MDB_val _ip_rec = {data.ip, strlen(data.ip)}; - MDB_val _obj_id_rec = {&record_id, sizeof (record_id)}; - MDB_val _data_rec = {&data, offsetof(session_data_t, fill) + (rand() % sizeof (data.fill))}; - MDB_val _event_rec = {&event, sizeof (event)}; + MDB_val _session_id1_rec = {data.session_id1, strlen(data.session_id1)}; + MDB_val _session_id2_rec = {data.session_id2, strlen(data.session_id2)}; + MDB_val _ip_rec = {data.ip, strlen(data.ip)}; + MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; + MDB_val _data_rec = {&data, offsetof(session_data_t, fill) + + (rand() % sizeof(data.fill))}; + MDB_val _event_rec = {&event, 
sizeof(event)}; - LMDB_CHECK(mdb_txn_begin(env, NULL, 0, &txn)); - LMDB_CHECK(mdb_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdb_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdb_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdb_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - LMDB_CHECK(mdb_put(txn, dbi_session, &_obj_id_rec, &_data_rec, MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdb_put(txn, dbi_session_id, &_session_id1_rec, &_obj_id_rec, MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdb_put(txn, dbi_session_id, &_session_id2_rec, &_obj_id_rec, MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdb_put(txn, dbi_ip, &_ip_rec, &_obj_id_rec, 0)); - LMDB_CHECK(mdb_put(txn, dbi_event, &_event_rec, &_obj_id_rec, 0)); + LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); + LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); + LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); + LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); + LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); + LMDB_CHECK(mdbx_put(txn, dbi_session, &_obj_id_rec, &_data_rec, + MDB_NOOVERWRITE | MDB_NODUPDATA)); + LMDB_CHECK(mdbx_put(txn, dbi_session_id, &_session_id1_rec, &_obj_id_rec, + MDB_NOOVERWRITE | MDB_NODUPDATA)); + LMDB_CHECK(mdbx_put(txn, dbi_session_id, &_session_id2_rec, &_obj_id_rec, + MDB_NOOVERWRITE | MDB_NODUPDATA)); + LMDB_CHECK(mdbx_put(txn, dbi_ip, &_ip_rec, &_obj_id_rec, 0)); + LMDB_CHECK(mdbx_put(txn, dbi_event, &_event_rec, &_obj_id_rec, 0)); - // transaction commit - LMDB_CHECK(mdb_txn_commit(txn)); - lmdb_add++; + // transaction commit + LMDB_CHECK(mdbx_txn_commit(txn)); + x_add++; } static void delete_record(int64_t record_id) { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; - event_data_t event; - MDB_txn *txn; + MDB_dbi dbi_session; + MDB_dbi dbi_session_id; + MDB_dbi dbi_event; + MDB_dbi dbi_ip; + event_data_t 
event; + MDB_txn *txn; - // transaction init - LMDB_CHECK(mdb_txn_begin(env, NULL, 0, &txn)); - // open database in read-write mode - LMDB_CHECK(mdb_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdb_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdb_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdb_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - // put data - MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; - MDB_val v_rec; - // get data - LMDB_CHECK(mdb_get(txn, dbi_session, &_obj_id_rec, &v_rec)); - session_data_t* data = (session_data_t*) v_rec.mv_data; + // transaction init + LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); + // open database in read-write mode + LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); + LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); + LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); + LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); + // put data + MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; + MDB_val v_rec; + // get data + LMDB_CHECK(mdbx_get(txn, dbi_session, &_obj_id_rec, &v_rec)); + session_data_t *data = (session_data_t *)v_rec.mv_data; - MDB_val _session_id1_rec = {data->session_id1, strlen(data->session_id1)}; - MDB_val _session_id2_rec = {data->session_id2, strlen(data->session_id2)}; - MDB_val _ip_rec = {data->ip, strlen(data->ip)}; - LMDB_CHECK(mdb_del(txn, dbi_session_id, &_session_id1_rec, NULL)); - LMDB_CHECK(mdb_del(txn, dbi_session_id, &_session_id2_rec, NULL)); - LMDB_CHECK(mdb_del(txn, dbi_ip, &_ip_rec, NULL)); - event.obj_id = record_id; - event.event_type = 1; - MDB_val _event_rec = {&event, sizeof(event)}; - LMDB_CHECK(mdb_del(txn, dbi_event, &_event_rec, NULL)); - LMDB_CHECK(mdb_del(txn, dbi_session, &_obj_id_rec, NULL)); + MDB_val _session_id1_rec = {data->session_id1, strlen(data->session_id1)}; + MDB_val _session_id2_rec = {data->session_id2, 
strlen(data->session_id2)}; + MDB_val _ip_rec = {data->ip, strlen(data->ip)}; + LMDB_CHECK(mdbx_del(txn, dbi_session_id, &_session_id1_rec, NULL)); + LMDB_CHECK(mdbx_del(txn, dbi_session_id, &_session_id2_rec, NULL)); + LMDB_CHECK(mdbx_del(txn, dbi_ip, &_ip_rec, NULL)); + event.obj_id = record_id; + event.event_type = 1; + MDB_val _event_rec = {&event, sizeof(event)}; + LMDB_CHECK(mdbx_del(txn, dbi_event, &_event_rec, NULL)); + LMDB_CHECK(mdbx_del(txn, dbi_session, &_obj_id_rec, NULL)); - // transaction commit - LMDB_CHECK(mdb_txn_commit(txn)); - lmdb_del++; + // transaction commit + LMDB_CHECK(mdbx_txn_commit(txn)); + x_del++; } static void db_disconnect() { - mdb_env_close(env); - printf("Connection closed\n"); + mdbx_env_close(env); + printf("Connection closed\n"); } -static void get_db_stat(const char* db, int64_t* ms_branch_pages, int64_t* ms_leaf_pages) { - MDB_txn *txn; - MDB_stat stat; - MDB_dbi dbi; +static void get_db_stat(const char *db, int64_t *ms_branch_pages, + int64_t *ms_leaf_pages) { + MDB_txn *txn; + MDBX_stat stat; + MDB_dbi dbi; - LMDB_CHECK(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - LMDB_CHECK(mdb_dbi_open(txn, db, MDB_CREATE, &dbi)); - LMDB_CHECK(mdb_stat(txn, dbi, &stat)); - mdb_txn_abort(txn); - printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", - db, - stat.ms_branch_pages, - stat.ms_depth, - stat.ms_entries, - stat.ms_leaf_pages, - stat.ms_overflow_pages); - (*ms_branch_pages) += stat.ms_branch_pages; - (*ms_leaf_pages) += stat.ms_leaf_pages; + LMDB_CHECK(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + LMDB_CHECK(mdbx_dbi_open(txn, db, MDB_CREATE, &dbi)); + LMDB_CHECK(mdbx_stat(txn, dbi, &stat, sizeof(stat))); + mdbx_txn_abort(txn); + printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", db, + stat.ms_branch_pages, stat.ms_depth, stat.ms_entries, + stat.ms_leaf_pages, stat.ms_overflow_pages); + (*ms_branch_pages) += stat.ms_branch_pages; + (*ms_leaf_pages) += stat.ms_leaf_pages; } static void periodic_stat(void) { - int64_t 
ms_branch_pages = 0; - int64_t ms_leaf_pages = 0; - printf(" Name | ms_branch_pages | depth | entries | leaf_pages | overf_pages |\n"); - get_db_stat("session", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("session_id", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("event", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("ip", &ms_branch_pages, &ms_leaf_pages); - printf("%15s | %15ld | %5s | %10s | %10ld | %11s |\n", "", ms_branch_pages, "", "", ms_leaf_pages, ""); - static int64_t prev_add; - static int64_t prev_del; - static int64_t t = -1; - if (t > 0) { - int64_t delta = getTimeMicroseconds() - t; - printf("CPS: add %ld, delete %ld, items processed - %ld\n", (lmdb_add - prev_add)*1000000 / delta, (lmdb_del - prev_del)*1000000 / delta, obj_id); - } - t = getTimeMicroseconds(); - prev_add = lmdb_add; - prev_del = lmdb_del; + int64_t ms_branch_pages = 0; + int64_t ms_leaf_pages = 0; + printf(" Name | ms_branch_pages | depth | entries | " + "leaf_pages | overf_pages |\n"); + get_db_stat("session", &ms_branch_pages, &ms_leaf_pages); + get_db_stat("session_id", &ms_branch_pages, &ms_leaf_pages); + get_db_stat("event", &ms_branch_pages, &ms_leaf_pages); + get_db_stat("ip", &ms_branch_pages, &ms_leaf_pages); + printf("%15s | %15ld | %5s | %10s | %10ld | %11s |\n", "", ms_branch_pages, + "", "", ms_leaf_pages, ""); + static int64_t prev_add; + static int64_t prev_del; + static int64_t t = -1; + if (t > 0) { + int64_t delta = getTimeMicroseconds() - t; + printf("CPS: add %ld, delete %ld, items processed - %ld\n", + (x_add - prev_add) * 1000000 / delta, + (x_del - prev_del) * 1000000 / delta, obj_id); + } + t = getTimeMicroseconds(); + prev_add = x_add; + prev_del = x_del; } static void periodic_add_rec() { - int i; - for (i = 0; i < 10000; i++) { - if (ids_count <= REC_COUNT) { - int64_t id = obj_id++; - create_record(id); - add_id_to_pool(id); - } - if (ids_count > REC_COUNT) { - int64_t id = get_id_from_pool(); - delete_record(id); - } - } - periodic_stat(); + 
int i; + for (i = 0; i < 10000; i++) { + if (ids_count <= REC_COUNT) { + int64_t id = obj_id++; + create_record(id); + add_id_to_pool(id); + } + if (ids_count > REC_COUNT) { + int64_t id = get_id_from_pool(); + delete_record(id); + } + } + periodic_stat(); } -int main(int argc, char** argv) { - (void) argc; - (void) argv; +int main(int argc, char **argv) { + (void)argc; + (void)argv; - char filename[PATH_MAX]; - mkdir(opt_db_path, 0775); + char filename[PATH_MAX]; + mkdir(opt_db_path, 0775); - strcpy(filename, opt_db_path); - strcat(filename, "/data.mdb"); - remove(filename); + strcpy(filename, opt_db_path); + strcat(filename, "/data.mdb"); + remove(filename); - strcpy(filename, opt_db_path); - strcat(filename, "/lock.mdb"); - remove(filename); + strcpy(filename, opt_db_path); + strcat(filename, "/lock.mdb"); + remove(filename); - db_connect(); - while (1) { - periodic_add_rec(); - } - db_disconnect(); - return 0; + db_connect(); + while (1) { + periodic_add_rec(); + } + db_disconnect(); + return 0; } diff --git a/yota_test2.c b/yota_test2.c index 80dc4f2f..69d41c7c 100644 --- a/yota_test2.c +++ b/yota_test2.c @@ -1,6 +1,7 @@ /* * Copyright 2016-2017 Leonid Yuriev . - * Copyright 2015 Vladimir Romanov , Yota Lab. + * Copyright 2015 Vladimir Romanov + * , Yota Lab. * * This file is part of libmdbx. * @@ -18,233 +19,257 @@ * along with this program. If not, see . 
*/ -#include #include +#include +#include "mdbx.h" +#include #include #include -#include -#include #include +#include #include -#include -#include "mdbx.h" +#include -#define IP_PRINTF_ARG_HOST(addr) (int)((addr) >> 24), (int)((addr) >> 16 & 0xff), (int)((addr) >> 8 & 0xff), (int)((addr) & 0xff) +#define IP_PRINTF_ARG_HOST(addr) \ + (int)((addr) >> 24), (int)((addr) >> 16 & 0xff), (int)((addr) >> 8 & 0xff), \ + (int)((addr)&0xff) -char opt_db_path[PATH_MAX] = "/dev/shm/lmdb_bench2"; +char opt_db_path[PATH_MAX] = "/dev/shm/x_bench2"; static MDB_env *env; #define REC_COUNT 1024000 int64_t ids[REC_COUNT * 10]; int32_t ids_count = 0; -int64_t lmdb_add = 0; -int64_t lmdb_del = 0; +int64_t x_add = 0; +int64_t x_del = 0; int64_t obj_id = 0; -int64_t lmdb_data_size = 0; -int64_t lmdb_key_size = 0; +int64_t x_data_size = 0; +int64_t x_key_size = 0; static void add_id_to_pool(int64_t id) { - ids[ids_count] = id; - ids_count++; + ids[ids_count] = id; + ids_count++; } static inline int64_t getTimeMicroseconds(void) { - struct timeval val; - gettimeofday(&val, NULL); - return val.tv_sec * ((int64_t) 1000000) + val.tv_usec; + struct timeval val; + gettimeofday(&val, NULL); + return val.tv_sec * ((int64_t)1000000) + val.tv_usec; } static int64_t get_id_from_pool() { - if (ids_count == 0) { - return -1; - } - int32_t index = rand() % ids_count; - int64_t id = ids[index]; - ids[index] = ids[ids_count - 1]; - ids_count--; - return id; + if (ids_count == 0) { + return -1; + } + int32_t index = rand() % ids_count; + int64_t id = ids[index]; + ids[index] = ids[ids_count - 1]; + ids_count--; + return id; } -#define LMDB_CHECK(x) \ - do {\ - const int rc = (x);\ - if ( rc != MDB_SUCCESS ) {\ - printf("Error [%d] %s in %s at %s:%d\n", rc, mdb_strerror(rc), #x, __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - }\ - } while(0) +#define LMDB_CHECK(x) \ + do { \ + const int rc = (x); \ + if (rc != MDB_SUCCESS) { \ + printf("Error [%d] %s in %s at %s:%d\n", rc, mdbx_strerror(rc), #x, \ + 
__FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) static void db_connect() { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; + MDB_dbi dbi_session; + MDB_dbi dbi_session_id; + MDB_dbi dbi_event; + MDB_dbi dbi_ip; - LMDB_CHECK(mdb_env_create(&env)); - LMDB_CHECK(mdb_env_set_mapsize(env, 300000L * 4096L)); - LMDB_CHECK(mdb_env_set_maxdbs(env, 30)); + LMDB_CHECK(mdbx_env_create(&env)); + LMDB_CHECK(mdbx_env_set_mapsize(env, 300000L * 4096L)); + LMDB_CHECK(mdbx_env_set_maxdbs(env, 30)); #if defined(MDBX_LIFORECLAIM) - LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, 0664)); + LMDB_CHECK(mdbx_env_open( + env, opt_db_path, + MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, 0664)); #else - LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664)); + LMDB_CHECK(mdbx_env_open(env, opt_db_path, + MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664)); #endif - MDB_txn *txn; + MDB_txn *txn; - // transaction init - LMDB_CHECK(mdb_txn_begin(env, NULL, 0, &txn)); - // open database in read-write mode - LMDB_CHECK(mdb_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdb_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdb_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdb_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - // transaction commit - LMDB_CHECK(mdb_txn_commit(txn)); - printf("Connection open\n"); + // transaction init + LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); + // open database in read-write mode + LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); + LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); + LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); + LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); + // transaction commit + LMDB_CHECK(mdbx_txn_commit(txn)); + printf("Connection open\n"); } typedef struct 
{ - char session_id1[100]; - char session_id2[100]; - char ip[20]; - uint8_t fill[100]; + char session_id1[100]; + char session_id2[100]; + char ip[20]; + uint8_t fill[100]; } session_data_t; typedef struct { - int64_t obj_id; - int8_t event_type; + int64_t obj_id; + int8_t event_type; } __attribute__((__packed__)) event_data_t; static void create_record(int64_t record_id) { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; - event_data_t event; - MDB_txn *txn; - session_data_t data; - // transaction init - snprintf(data.session_id1, sizeof (data.session_id1), "mskugw%02ld_%02ld.gx.yota.ru;3800464060;4152;%ld", record_id % 3 + 1, record_id % 9 + 1, record_id); - snprintf(data.session_id2, sizeof (data.session_id2), "gx_service;%ld;%ld;node@spb-jsm1", record_id, record_id % 1000000000 + 99999); - snprintf(data.ip, sizeof (data.ip), "%d.%d.%d.%d", IP_PRINTF_ARG_HOST(record_id & 0xFFFFFFFF)); - event.obj_id = record_id; - event.event_type = 1; + MDB_dbi dbi_session; + MDB_dbi dbi_session_id; + MDB_dbi dbi_event; + MDB_dbi dbi_ip; + event_data_t event; + MDB_txn *txn; + session_data_t data; + // transaction init + snprintf(data.session_id1, sizeof(data.session_id1), + "mskugw%02ld_%02ld.gx.yota.ru;3800464060;4152;%ld", + record_id % 3 + 1, record_id % 9 + 1, record_id); + snprintf(data.session_id2, sizeof(data.session_id2), + "gx_service;%ld;%ld;node@spb-jsm1", record_id, + record_id % 1000000000 + 99999); + snprintf(data.ip, sizeof(data.ip), "%d.%d.%d.%d", + IP_PRINTF_ARG_HOST(record_id & 0xFFFFFFFF)); + event.obj_id = record_id; + event.event_type = 1; - MDB_val _session_id1_rec = {data.session_id1, strlen(data.session_id1)}; - MDB_val _session_id2_rec = {data.session_id2, strlen(data.session_id2)}; - MDB_val _ip_rec = {data.ip, strlen(data.ip)}; - MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; - MDB_val _data_rec = {&data, offsetof(session_data_t, fill) + (rand() % sizeof (data.fill))}; - MDB_val _event_rec = {&event, 
sizeof(event)}; + MDB_val _session_id1_rec = {data.session_id1, strlen(data.session_id1)}; + MDB_val _session_id2_rec = {data.session_id2, strlen(data.session_id2)}; + MDB_val _ip_rec = {data.ip, strlen(data.ip)}; + MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; + MDB_val _data_rec = {&data, offsetof(session_data_t, fill) + + (rand() % sizeof(data.fill))}; + MDB_val _event_rec = {&event, sizeof(event)}; - LMDB_CHECK(mdb_txn_begin(env, NULL, 0, &txn)); - LMDB_CHECK(mdb_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdb_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdb_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdb_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - LMDB_CHECK(mdb_put(txn, dbi_session, &_obj_id_rec, &_data_rec, MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdb_put(txn, dbi_session_id, &_session_id1_rec, &_obj_id_rec, MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdb_put(txn, dbi_session_id, &_session_id2_rec, &_obj_id_rec, MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdb_put(txn, dbi_ip, &_ip_rec, &_obj_id_rec, 0)); - LMDB_CHECK(mdb_put(txn, dbi_event, &_event_rec, &_obj_id_rec, 0)); - lmdb_data_size += (_data_rec.mv_size + _obj_id_rec.mv_size * 4); - lmdb_key_size += (_obj_id_rec.mv_size + _session_id1_rec.mv_size + _session_id2_rec.mv_size + _ip_rec.mv_size + _event_rec.mv_size); + LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); + LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); + LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); + LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); + LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); + LMDB_CHECK(mdbx_put(txn, dbi_session, &_obj_id_rec, &_data_rec, + MDB_NOOVERWRITE | MDB_NODUPDATA)); + LMDB_CHECK(mdbx_put(txn, dbi_session_id, &_session_id1_rec, &_obj_id_rec, + MDB_NOOVERWRITE | MDB_NODUPDATA)); + LMDB_CHECK(mdbx_put(txn, dbi_session_id, &_session_id2_rec, 
&_obj_id_rec, + MDB_NOOVERWRITE | MDB_NODUPDATA)); + LMDB_CHECK(mdbx_put(txn, dbi_ip, &_ip_rec, &_obj_id_rec, 0)); + LMDB_CHECK(mdbx_put(txn, dbi_event, &_event_rec, &_obj_id_rec, 0)); + x_data_size += (_data_rec.mv_size + _obj_id_rec.mv_size * 4); + x_key_size += + (_obj_id_rec.mv_size + _session_id1_rec.mv_size + + _session_id2_rec.mv_size + _ip_rec.mv_size + _event_rec.mv_size); - // transaction commit - LMDB_CHECK(mdb_txn_commit(txn)); - lmdb_add++; + // transaction commit + LMDB_CHECK(mdbx_txn_commit(txn)); + x_add++; } static void delete_record(int64_t record_id) { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; - event_data_t event; - MDB_txn *txn; + MDB_dbi dbi_session; + MDB_dbi dbi_session_id; + MDB_dbi dbi_event; + MDB_dbi dbi_ip; + event_data_t event; + MDB_txn *txn; - // transaction init - LMDB_CHECK(mdb_txn_begin(env, NULL, 0, &txn)); - // open database in read-write mode - LMDB_CHECK(mdb_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdb_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdb_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdb_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - // put data - MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; - MDB_val _data_rec; - // get data - LMDB_CHECK(mdb_get(txn, dbi_session, &_obj_id_rec, &_data_rec)); - session_data_t* data = (session_data_t*) _data_rec.mv_data; + // transaction init + LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); + // open database in read-write mode + LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); + LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); + LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); + LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); + // put data + MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; + MDB_val _data_rec; + // get data + LMDB_CHECK(mdbx_get(txn, dbi_session, &_obj_id_rec, 
&_data_rec)); + session_data_t *data = (session_data_t *)_data_rec.mv_data; - MDB_val _session_id1_rec = {data->session_id1, strlen(data->session_id1)}; - MDB_val _session_id2_rec = {data->session_id2, strlen(data->session_id2)}; - MDB_val _ip_rec = {data->ip, strlen(data->ip)}; - LMDB_CHECK(mdb_del(txn, dbi_session_id, &_session_id1_rec, NULL)); - LMDB_CHECK(mdb_del(txn, dbi_session_id, &_session_id2_rec, NULL)); - LMDB_CHECK(mdb_del(txn, dbi_ip, &_ip_rec, NULL)); - event.obj_id = record_id; - event.event_type = 1; - MDB_val _event_rec = {&event, sizeof(event)}; - LMDB_CHECK(mdb_del(txn, dbi_event, &_event_rec, NULL)); - LMDB_CHECK(mdb_del(txn, dbi_session, &_obj_id_rec, NULL)); + MDB_val _session_id1_rec = {data->session_id1, strlen(data->session_id1)}; + MDB_val _session_id2_rec = {data->session_id2, strlen(data->session_id2)}; + MDB_val _ip_rec = {data->ip, strlen(data->ip)}; + LMDB_CHECK(mdbx_del(txn, dbi_session_id, &_session_id1_rec, NULL)); + LMDB_CHECK(mdbx_del(txn, dbi_session_id, &_session_id2_rec, NULL)); + LMDB_CHECK(mdbx_del(txn, dbi_ip, &_ip_rec, NULL)); + event.obj_id = record_id; + event.event_type = 1; + MDB_val _event_rec = {&event, sizeof(event)}; + LMDB_CHECK(mdbx_del(txn, dbi_event, &_event_rec, NULL)); + LMDB_CHECK(mdbx_del(txn, dbi_session, &_obj_id_rec, NULL)); - lmdb_data_size -= (_data_rec.mv_size + _obj_id_rec.mv_size * 4); - lmdb_key_size -= (_obj_id_rec.mv_size + _session_id1_rec.mv_size + _session_id2_rec.mv_size + _ip_rec.mv_size + _event_rec.mv_size); + x_data_size -= (_data_rec.mv_size + _obj_id_rec.mv_size * 4); + x_key_size -= + (_obj_id_rec.mv_size + _session_id1_rec.mv_size + + _session_id2_rec.mv_size + _ip_rec.mv_size + _event_rec.mv_size); - // transaction commit - LMDB_CHECK(mdb_txn_commit(txn)); - lmdb_del++; + // transaction commit + LMDB_CHECK(mdbx_txn_commit(txn)); + x_del++; } static void db_disconnect() { - mdb_env_close(env); - printf("Connection closed\n"); + mdbx_env_close(env); + printf("Connection closed\n"); } 
-static void get_db_stat(const char* db, int64_t* ms_branch_pages, int64_t* ms_leaf_pages) { - MDB_txn *txn; - MDB_stat stat; - MDB_dbi dbi; +static void get_db_stat(const char *db, int64_t *ms_branch_pages, + int64_t *ms_leaf_pages) { + MDB_txn *txn; + MDBX_stat stat; + MDB_dbi dbi; - LMDB_CHECK(mdb_txn_begin(env, NULL, MDB_RDONLY, &txn)); - LMDB_CHECK(mdb_dbi_open(txn, db, MDB_CREATE, &dbi)); - LMDB_CHECK(mdb_stat(txn, dbi, &stat)); - mdb_txn_abort(txn); - printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", - db, - stat.ms_branch_pages, - stat.ms_depth, - stat.ms_entries, - stat.ms_leaf_pages, - stat.ms_overflow_pages); - (*ms_branch_pages) += stat.ms_branch_pages; - (*ms_leaf_pages) += stat.ms_leaf_pages; + LMDB_CHECK(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); + LMDB_CHECK(mdbx_dbi_open(txn, db, MDB_CREATE, &dbi)); + LMDB_CHECK(mdbx_stat(txn, dbi, &stat, sizeof(stat))); + mdbx_txn_abort(txn); + printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", db, + stat.ms_branch_pages, stat.ms_depth, stat.ms_entries, + stat.ms_leaf_pages, stat.ms_overflow_pages); + (*ms_branch_pages) += stat.ms_branch_pages; + (*ms_leaf_pages) += stat.ms_leaf_pages; } static void periodic_stat(void) { - int64_t ms_branch_pages = 0; - int64_t ms_leaf_pages = 0; - printf(" Name | ms_branch_pages | depth | entries | leaf_pages | overf_pages |\n"); - get_db_stat("session", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("session_id", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("event", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("ip", &ms_branch_pages, &ms_leaf_pages); - printf("%15s | %15ld | %5s | %10s | %10ld | %11s |\n", "", ms_branch_pages, "", "", ms_leaf_pages, ""); - static int64_t prev_add; - static int64_t prev_del; - static int64_t t = -1; - if (t > 0) { - int64_t delta = getTimeMicroseconds() - t; - printf("CPS: add %ld, delete %ld, items processed - %ldK data=%ldK key=%ldK\n", (lmdb_add - prev_add)*1000000 / delta, (lmdb_del - prev_del)*1000000 / delta, 
obj_id / 1024, lmdb_data_size / 1024, lmdb_key_size / 1024); - printf("usage data=%ld%%\n", ((lmdb_data_size + lmdb_key_size) * 100) / ((ms_leaf_pages + ms_branch_pages)*4096)); - } - t = getTimeMicroseconds(); - prev_add = lmdb_add; - prev_del = lmdb_del; + int64_t ms_branch_pages = 0; + int64_t ms_leaf_pages = 0; + printf(" Name | ms_branch_pages | depth | entries | " + "leaf_pages | overf_pages |\n"); + get_db_stat("session", &ms_branch_pages, &ms_leaf_pages); + get_db_stat("session_id", &ms_branch_pages, &ms_leaf_pages); + get_db_stat("event", &ms_branch_pages, &ms_leaf_pages); + get_db_stat("ip", &ms_branch_pages, &ms_leaf_pages); + printf("%15s | %15ld | %5s | %10s | %10ld | %11s |\n", "", ms_branch_pages, + "", "", ms_leaf_pages, ""); + static int64_t prev_add; + static int64_t prev_del; + static int64_t t = -1; + if (t > 0) { + int64_t delta = getTimeMicroseconds() - t; + printf("CPS: add %ld, delete %ld, items processed - %ldK data=%ldK " + "key=%ldK\n", + (x_add - prev_add) * 1000000 / delta, + (x_del - prev_del) * 1000000 / delta, obj_id / 1024, + x_data_size / 1024, x_key_size / 1024); + printf("usage data=%ld%%\n", + ((x_data_size + x_key_size) * 100) / + ((ms_leaf_pages + ms_branch_pages) * 4096)); + } + t = getTimeMicroseconds(); + prev_add = x_add; + prev_del = x_del; } -//static void periodic_add_rec() { +// static void periodic_add_rec() { // for (int i = 0; i < 10240; i++) { // if (ids_count <= REC_COUNT) { // int64_t id = obj_id++; @@ -259,52 +284,52 @@ static void periodic_stat(void) { // periodic_stat(); //} -int main(int argc, char** argv) { - (void) argc; - (void) argv; +int main(int argc, char **argv) { + (void)argc; + (void)argv; - char filename[PATH_MAX]; - int i; - int64_t t; + char filename[PATH_MAX]; + int i; + int64_t t; - mkdir(opt_db_path, 0775); + mkdir(opt_db_path, 0775); - strcpy(filename, opt_db_path); - strcat(filename, "/data.mdb"); - remove(filename); + strcpy(filename, opt_db_path); + strcat(filename, "/data.mdb"); + 
remove(filename); - strcpy(filename, opt_db_path); - strcat(filename, "/lock.mdb"); - remove(filename); + strcpy(filename, opt_db_path); + strcat(filename, "/lock.mdb"); + remove(filename); - db_connect(); - periodic_stat(); - for (i = 0; i < 1024000; i++) { - int64_t id = obj_id++; - create_record(id); - add_id_to_pool(id); - } - periodic_stat(); - t = getTimeMicroseconds(); - while (1) { - int i; - int64_t now; - for (i = 0; i < 100; i++) { - int64_t id = obj_id++; - create_record(id); - add_id_to_pool(id); - id = get_id_from_pool(); - delete_record(id); - } - //int64_t id = obj_id++; - //create_record(id); - //add_id_to_pool(id); - now = getTimeMicroseconds(); - if ((now - t) > 100000) { - periodic_stat(); - t = now; - } - } - db_disconnect(); - return 0; + db_connect(); + periodic_stat(); + for (i = 0; i < 1024000; i++) { + int64_t id = obj_id++; + create_record(id); + add_id_to_pool(id); + } + periodic_stat(); + t = getTimeMicroseconds(); + while (1) { + int i; + int64_t now; + for (i = 0; i < 100; i++) { + int64_t id = obj_id++; + create_record(id); + add_id_to_pool(id); + id = get_id_from_pool(); + delete_record(id); + } + // int64_t id = obj_id++; + // create_record(id); + // add_id_to_pool(id); + now = getTimeMicroseconds(); + if ((now - t) > 100000) { + periodic_stat(); + t = now; + } + } + db_disconnect(); + return 0; } From febe2e2748b3840d7bc6923446fce7a29900d885 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 17 Feb 2017 19:55:10 +0300 Subject: [PATCH 004/303] mdbx: minor refine clearing C_DEL. 
--- mdbx.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mdbx.c b/mdbx.c index b96857c1..16bbab74 100644 --- a/mdbx.c +++ b/mdbx.c @@ -6682,9 +6682,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, break; } - if (mc->mc_flags & C_DEL) - mc->mc_flags ^= C_DEL; - + mc->mc_flags &= ~C_DEL; return rc; } @@ -6851,8 +6849,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return rc; } - if (mc->mc_flags & C_DEL) - mc->mc_flags ^= C_DEL; + mc->mc_flags &= ~C_DEL; /* Cursor is positioned, check for room in the dirty list */ if (!nospill) { From daa08e610259d00de9c96953313f761dbf9a9055 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 21 Feb 2017 17:15:04 +0300 Subject: [PATCH 005/303] mdbx: assert for NODEPTR. --- mdbx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mdbx.c b/mdbx.c index 16bbab74..961cf7bb 100644 --- a/mdbx.c +++ b/mdbx.c @@ -763,7 +763,11 @@ typedef struct MDB_node { #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) /** Address of node \b i in page \b p */ -#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE)) +#define NODEPTR(p, i) \ + ({ \ + assert(NUMKEYS(p) > (unsigned)(i)); \ + (MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); \ + }) /** Address of the key for the node */ #define NODEKEY(node) (void *)((node)->mn_data) From 3d08b9e76f81f65bb7f4e8dd0baddd7ee1ec2bbe Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 21 Feb 2017 17:16:01 +0300 Subject: [PATCH 006/303] mdbx: fix mdbx_node_search(). --- mdbx.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mdbx.c b/mdbx.c index 961cf7bb..fbeaff94 100644 --- a/mdbx.c +++ b/mdbx.c @@ -5558,11 +5558,9 @@ static MDB_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, } } - if (rc > 0) { /* Found entry is less than the key. */ - i++; /* Skip to get the smallest entry larger than key. 
*/ - if (!IS_LEAF2(mp)) - node = NODEPTR(mp, i); - } + if (rc > 0) /* Found entry is less than the key. */ + i++; /* Skip to get the smallest entry larger than key. */ + if (exactp) *exactp = (rc == 0 && nkeys > 0); /* store the key index */ @@ -5572,7 +5570,7 @@ static MDB_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, return NULL; /* nodeptr is fake for LEAF2 */ - return node; + return IS_LEAF2(mp) ? node : NODEPTR(mp, i); } #if 0 From abc7dca2476aefbdc6881f297e56123c1003a9cc Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 21 Feb 2017 23:03:16 +0300 Subject: [PATCH 007/303] mdbx: MDB_END_EOTDONE and refine txn_commit(). --- mdbx.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/mdbx.c b/mdbx.c index fbeaff94..0f8c6150 100644 --- a/mdbx.c +++ b/mdbx.c @@ -1261,6 +1261,7 @@ enum { #define MDB_END_OPMASK 0x0F /**< mask for #mdbx_txn_end() operation number */ #define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ #define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ +#define MDB_END_EOTDONE 0x40 /**< txn's cursors already closed */ #define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ static int mdbx_txn_end(MDB_txn *txn, unsigned mode); @@ -3266,7 +3267,7 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { pgno_t *pghead = env->me_pghead; - if (!(mode & MDB_END_UPDATE)) /* !(already closed cursors) */ + if (!(mode & MDB_END_EOTDONE)) /* !(already closed cursors) */ mdbx_cursors_eot(txn, 0); if (!(env->me_flags & MDB_WRITEMAP)) { mdbx_dlist_free(txn); @@ -3806,8 +3807,7 @@ done: int mdbx_txn_commit(MDB_txn *txn) { int rc; - unsigned i, end_mode; - MDB_env *env; + unsigned i; if (unlikely(txn == NULL)) return EINVAL; @@ -3815,15 +3815,12 @@ int mdbx_txn_commit(MDB_txn *txn) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDB_VERSION_MISMATCH; - if (unlikely(txn->mt_env->me_pid != 
getpid())) { - txn->mt_env->me_flags |= MDB_FATAL_ERROR; + MDB_env *env = txn->mt_env; + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; return MDB_PANIC; } - /* mdbx_txn_end() mode for a commit which writes nothing */ - end_mode = - MDB_END_EMPTY_COMMIT | MDB_END_UPDATE | MDB_END_SLOT | MDB_END_FREE; - if (txn->mt_child) { rc = mdbx_txn_commit(txn->mt_child); txn->mt_child = NULL; @@ -3831,11 +3828,11 @@ int mdbx_txn_commit(MDB_txn *txn) { goto fail; } - env = txn->mt_env; - - if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) { + /* mdbx_txn_end() mode for a commit which writes nothing */ + unsigned end_mode = + MDB_END_EMPTY_COMMIT | MDB_END_UPDATE | MDB_END_SLOT | MDB_END_FREE; + if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) goto done; - } if (unlikely(txn->mt_flags & (MDB_TXN_FINISHED | MDB_TXN_ERROR))) { mdbx_debug("error flag is set, can't commit"); @@ -3989,7 +3986,6 @@ int mdbx_txn_commit(MDB_txn *txn) { return rc; } - env = txn->mt_env; if (unlikely(txn != env->me_txn)) { mdbx_debug("attempt to commit unknown transaction"); rc = EINVAL; @@ -3997,6 +3993,7 @@ int mdbx_txn_commit(MDB_txn *txn) { } mdbx_cursors_eot(txn, 0); + end_mode |= MDB_END_EOTDONE; if (!txn->mt_u.dirty_list[0].mid && !(txn->mt_flags & (MDB_TXN_DIRTY | MDB_TXN_SPILLS))) @@ -4054,7 +4051,7 @@ int mdbx_txn_commit(MDB_txn *txn) { } if (unlikely(rc != MDB_SUCCESS)) goto fail; - end_mode = MDB_END_COMMITTED | MDB_END_UPDATE; + end_mode = MDB_END_COMMITTED | MDB_END_UPDATE | MDB_END_EOTDONE; done: return mdbx_txn_end(txn, end_mode); From 44b378b8ea4f7577e05131149c2a268f122f249a Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 17 Feb 2017 19:58:39 +0300 Subject: [PATCH 008/303] mdbx: refine mdbx_cursor_get(). 
--- mdbx.c | 142 +++++++++++++++++++++++---------------------------------- 1 file changed, 58 insertions(+), 84 deletions(-) diff --git a/mdbx.c b/mdbx.c index 0f8c6150..382c4add 100644 --- a/mdbx.c +++ b/mdbx.c @@ -6513,86 +6513,72 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, return MDB_BAD_TXN; switch (op) { - case MDB_GET_CURRENT: - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { - rc = EINVAL; + case MDB_GET_CURRENT: { + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) + return EINVAL; + MDB_page *mp = mc->mc_pg[mc->mc_top]; + int nkeys = NUMKEYS(mp); + if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { + mc->mc_ki[mc->mc_top] = nkeys; + return MDB_NOTFOUND; + } + + rc = MDB_SUCCESS; + if (IS_LEAF2(mp)) { + key->mv_size = mc->mc_db->md_xsize; + key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); } else { - MDB_page *mp = mc->mc_pg[mc->mc_top]; - int nkeys = NUMKEYS(mp); - if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { - mc->mc_ki[mc->mc_top] = nkeys; - rc = MDB_NOTFOUND; - break; - } - rc = MDB_SUCCESS; - if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); - } else { - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDB_GET_KEY(leaf, key); - if (data) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (unlikely( - !(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { - mdbx_xcursor_init1(mc, leaf); - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - break; - } - rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, - MDB_GET_CURRENT); - } else { - rc = mdbx_node_read(mc, leaf, data); + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDB_GET_KEY(leaf, key); + if (data) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { + mdbx_xcursor_init1(mc, leaf); + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if 
(unlikely(rc)) + return rc; } + rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, + MDB_GET_CURRENT); + } else { + rc = mdbx_node_read(mc, leaf, data); } + if (unlikely(rc)) + return rc; } } break; + } case MDB_GET_BOTH: case MDB_GET_BOTH_RANGE: - if (unlikely(data == NULL)) { - rc = EINVAL; - break; - } - if (unlikely(mc->mc_xcursor == NULL)) { - rc = MDB_INCOMPATIBLE; - break; - } + if (unlikely(data == NULL)) + return EINVAL; + if (unlikely(mc->mc_xcursor == NULL)) + return MDB_INCOMPATIBLE; /* FALLTHRU */ case MDB_SET: case MDB_SET_KEY: case MDB_SET_RANGE: - if (unlikely(key == NULL)) { - rc = EINVAL; - } else { - rc = mdbx_cursor_set(mc, key, data, op, - op == MDB_SET_RANGE ? NULL : &exact); - } + if (unlikely(key == NULL)) + return EINVAL; + rc = + mdbx_cursor_set(mc, key, data, op, op == MDB_SET_RANGE ? NULL : &exact); break; case MDB_GET_MULTIPLE: - if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) { - rc = EINVAL; - break; - } - if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) { - rc = MDB_INCOMPATIBLE; - break; - } + if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) + return EINVAL; + if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) + return MDB_INCOMPATIBLE; rc = MDB_SUCCESS; if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) break; goto fetchm; case MDB_NEXT_MULTIPLE: - if (unlikely(data == NULL)) { - rc = EINVAL; - break; - } - if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) { - rc = MDB_INCOMPATIBLE; - break; - } + if (unlikely(data == NULL)) + return EINVAL; + if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) + return MDB_INCOMPATIBLE; rc = mdbx_cursor_next(mc, key, data, MDB_NEXT_DUP); if (rc == MDB_SUCCESS) { if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { @@ -6608,18 +6594,13 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, } break; case MDB_PREV_MULTIPLE: - if (data == NULL) { - rc = EINVAL; - break; - } - if 
(!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - rc = MDB_INCOMPATIBLE; - break; - } + if (data == NULL) + return EINVAL; + if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) + return MDB_INCOMPATIBLE; + rc = MDB_SUCCESS; if (!(mc->mc_flags & C_INITIALIZED)) rc = mdbx_cursor_last(mc, key, data); - else - rc = MDB_SUCCESS; if (rc == MDB_SUCCESS) { MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; if (mx->mc_flags & C_INITIALIZED) { @@ -6647,14 +6628,10 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, case MDB_FIRST_DUP: mfunc = mdbx_cursor_first; mmove: - if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) { - rc = EINVAL; - break; - } - if (unlikely(mc->mc_xcursor == NULL)) { - rc = MDB_INCOMPATIBLE; - break; - } + if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) + return EINVAL; + if (unlikely(mc->mc_xcursor == NULL)) + return MDB_INCOMPATIBLE; { MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -6663,10 +6640,8 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, break; } } - if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { - rc = EINVAL; - break; - } + if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) + return EINVAL; rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); break; case MDB_LAST: @@ -6677,8 +6652,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, goto mmove; default: mdbx_debug("unhandled/unimplemented cursor operation %u", op); - rc = EINVAL; - break; + return EINVAL; } mc->mc_flags &= ~C_DEL; From a034502657c262dfb61e93f24098a7c3f1eaf67c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 21 Feb 2017 19:53:05 +0300 Subject: [PATCH 009/303] mdbx: fix wrong cursor's state after a deletion. 
--- mdbx.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/mdbx.c b/mdbx.c index 382c4add..3628072c 100644 --- a/mdbx.c +++ b/mdbx.c @@ -6060,9 +6060,8 @@ static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, return mdbx_cursor_first(mc, key, data); mp = mc->mc_pg[mc->mc_top]; - if (mc->mc_flags & C_EOF) { - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp) - 1) + if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) return MDB_NOTFOUND; mc->mc_flags ^= C_EOF; } @@ -6142,6 +6141,9 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_node *leaf; int rc; + if ((mc->mc_flags & C_DEL) && op == MDB_PREV_DUP) + return MDB_NOTFOUND; + if (!(mc->mc_flags & C_INITIALIZED)) { rc = mdbx_cursor_last(mc, key, data); if (unlikely(rc)) @@ -6150,8 +6152,8 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, } mp = mc->mc_pg[mc->mc_top]; - - if (mc->mc_db->md_flags & MDB_DUPSORT) { + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { if (op == MDB_PREV || op == MDB_PREV_DUP) { @@ -6461,7 +6463,7 @@ static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) { if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (likely(!(mc->mc_flags & C_EOF))) { + if (likely((mc->mc_flags & (C_EOF | C_DEL)) != C_EOF)) { if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { rc = mdbx_page_search(mc, NULL, MDB_PS_LAST); if (unlikely(rc != MDB_SUCCESS)) @@ -6517,11 +6519,12 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return EINVAL; MDB_page *mp = mc->mc_pg[mc->mc_top]; - int nkeys = NUMKEYS(mp); - if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) { + unsigned nkeys = NUMKEYS(mp); + if (mc->mc_ki[mc->mc_top] >= nkeys) { mc->mc_ki[mc->mc_top] = nkeys; return MDB_NOTFOUND; } + 
assert(nkeys > 0); rc = MDB_SUCCESS; if (IS_LEAF2(mp)) { @@ -8502,11 +8505,12 @@ static int mdbx_rebalance(MDB_cursor *mc) { if (mc->mc_snum < 2) { MDB_page *mp = mc->mc_pg[0]; + unsigned nkeys = NUMKEYS(mp); if (IS_SUBP(mp)) { mdbx_debug("Can't rebalance a subpage, ignoring"); return MDB_SUCCESS; } - if (NUMKEYS(mp) == 0) { + if (nkeys == 0) { mdbx_debug("tree is completely empty"); mc->mc_db->md_root = P_INVALID; mc->mc_db->md_depth = 0; @@ -8700,8 +8704,10 @@ static int mdbx_cursor_del0(MDB_cursor *mc) { * Other cursors adjustments were already done * by mdbx_rebalance and aren't needed here. */ - if (!mc->mc_snum) + if (!mc->mc_snum) { + mc->mc_flags |= C_DEL | C_EOF; return rc; + } mp = mc->mc_pg[mc->mc_top]; nkeys = NUMKEYS(mp); From d2d8403f454e73656cb4964ecb6cb4b260ec249d Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 27 Feb 2017 20:12:55 +0300 Subject: [PATCH 010/303] mdbx: drop MDB_rel_func and related. --- mdbx.c | 32 -------------------------------- mdbx.h | 57 --------------------------------------------------------- 2 files changed, 89 deletions(-) diff --git a/mdbx.c b/mdbx.c index 3628072c..81e82a6d 100644 --- a/mdbx.c +++ b/mdbx.c @@ -930,8 +930,6 @@ typedef struct MDB_dbx { MDB_val md_name; /**< name of the database */ MDB_cmp_func *md_cmp; /**< function for comparing keys */ MDB_cmp_func *md_dcmp; /**< function for comparing data items */ - MDB_rel_func *md_rel; /**< user relocate function */ - void *md_relctx; /**< user-provided context for md_rel */ } MDB_dbx; #if MDBX_MODE_ENABLED @@ -7723,7 +7721,6 @@ static void mdbx_xcursor_init0(MDB_cursor *mc) { mx->mx_dbx.md_name.mv_data = NULL; mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; mx->mx_dbx.md_dcmp = NULL; - mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; } /** Final setup of a sorted-dups cursor. @@ -10102,7 +10099,6 @@ int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, unsigned slot = unused ? 
unused : txn->mt_numdbs; txn->mt_dbxs[slot].md_name.mv_data = namedup; txn->mt_dbxs[slot].md_name.mv_size = len; - txn->mt_dbxs[slot].md_rel = NULL; txn->mt_dbflags[slot] = dbflag; /* txn-> and env-> are the same in read txns, use * tmp variable to avoid undefined assignment @@ -10354,34 +10350,6 @@ int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { return MDB_SUCCESS; } -int mdbx_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel) { - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - txn->mt_dbxs[dbi].md_rel = rel; - return MDB_SUCCESS; -} - -int mdbx_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx) { - if (unlikely(!txn)) - return EINVAL; - - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; - - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; - - txn->mt_dbxs[dbi].md_relctx = ctx; - return MDB_SUCCESS; -} - int __cold mdbx_env_get_maxkeysize(MDB_env *env) { if (!env || env->me_signature != MDBX_ME_SIGNATURE) return EINVAL; diff --git a/mdbx.h b/mdbx.h index b86136a4..2b3a8a8a 100644 --- a/mdbx.h +++ b/mdbx.h @@ -117,25 +117,6 @@ typedef struct iovec MDB_val; /** @brief A callback function used to compare two keys in a database */ typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); -/** @brief A callback function used to relocate a position-dependent data item - * in a fixed-address database. - * - * The \b newptr gives the item's desired address in - * the memory map, and \b oldptr gives its previous address. The item's actual - * data resides at the address in \b item. This callback is expected to walk - * through the fields of the record in \b item and modify any - * values based at the \b oldptr address to be relative to the \b newptr - * address. - * @param[in,out] item The item that is to be relocated. 
- * @param[in] oldptr The previous address. - * @param[in] newptr The new address to relocate to. - * @param[in] relctx An application-provided context, set by - * #mdbx_set_relctx(). - * @todo This feature is currently unimplemented. - */ -typedef void(MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, - void *relctx); - /** @defgroup mdbx_env Environment Flags * @{ */ @@ -1243,44 +1224,6 @@ int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); */ int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); -/** @brief Set a relocation function for a #MDB_FIXEDMAP database. - * - * @todo The relocation function is called whenever it is necessary to move - *the data - * of an item to a different position in the database (e.g. through tree - * balancing operations, shifts as a result of adds or deletes, etc.). It is - * intended to allow address/position-dependent data items to be stored in - * a database in an environment opened with the #MDB_FIXEDMAP option. - * Currently the relocation feature is unimplemented and setting - * this function has no effect. - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[in] rel A #MDB_rel_func function - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdbx_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel); - -/** @brief Set a context pointer for a #MDB_FIXEDMAP database's relocation - *function. - * - * See #mdbx_set_relfunc and #MDB_rel_func for more details. - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[in] ctx An arbitrary pointer for whatever the application needs. - * It will be passed to the callback function set by #mdbx_set_relfunc - * as its \b relctx parameter whenever the callback is invoked. - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
- */ -int mdbx_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx); - /** @brief Get items from a database. * * This function retrieves key/data pairs from the database. The address From 00ec50cfb39048b74dda033a96029c71073d2626 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 27 Feb 2017 20:55:05 +0300 Subject: [PATCH 011/303] mdbx: partially remove Doxygen tags. --- mdbx.h | 1272 ++++++++++++++++++++++++++------------------------------ 1 file changed, 593 insertions(+), 679 deletions(-) diff --git a/mdbx.h b/mdbx.h index 2b3a8a8a..05c4205f 100644 --- a/mdbx.h +++ b/mdbx.h @@ -63,334 +63,313 @@ extern "C" { #define MDB_VERSION_FULL \ MDB_VERINT(MDB_VERSION_MAJOR, MDB_VERSION_MINOR, MDB_VERSION_PATCH) -/** The release date of this library version */ +/* The release date of this library version */ #define MDB_VERSION_DATE "DEVEL" -/** A stringifier for the version info */ +/* A stringifier for the version info */ #define MDB_VERSTR(a, b, c, d) \ "MDBX " #a "." #b "." #c ": (" d ", https://github.com/ReOpen/libmdbx)" -/** A helper for the stringifier macro */ +/* A helper for the stringifier macro */ #define MDB_VERFOO(a, b, c, d) MDB_VERSTR(a, b, c, d) -/** The full library version as a C string */ +/* The full library version as a C string */ #define MDB_VERSION_STRING \ MDB_VERFOO(MDB_VERSION_MAJOR, MDB_VERSION_MINOR, MDB_VERSION_PATCH, \ MDB_VERSION_DATE) -/** @} */ -/** @brief Opaque structure for a database environment. +/* Opaque structure for a database environment. * * A DB environment supports multiple databases, all residing in the same * shared-memory map. */ typedef struct MDB_env MDB_env; -/** @brief Opaque structure for a transaction handle. +/* Opaque structure for a transaction handle. * * All database operations require a transaction handle. Transactions may be * read-only or read-write. */ typedef struct MDB_txn MDB_txn; -/** @brief A handle for an individual database in the DB environment. 
*/ +/* A handle for an individual database in the DB environment. */ typedef unsigned MDB_dbi; -/** @brief Opaque structure for navigating through a database */ +/* Opaque structure for navigating through a database */ typedef struct MDB_cursor MDB_cursor; -/** @brief Generic structure used for passing keys and data in and out +/* Generic structure used for passing keys and data in and out * of the database. * * Values returned from the database are valid only until a subsequent * update operation, or the end of the transaction. Do not modify or * free them, they commonly point into the database itself. * - * Key sizes must be between 1 and #mdbx_env_get_maxkeysize() inclusive. - * The same applies to data sizes in databases with the #MDB_DUPSORT flag. + * Key sizes must be between 1 and mdbx_env_get_maxkeysize() inclusive. + * The same applies to data sizes in databases with the MDB_DUPSORT flag. * Other data items can in theory be from 0 to 0xffffffff bytes long. */ typedef struct iovec MDB_val; #define mv_size iov_len #define mv_data iov_base -/** @brief A callback function used to compare two keys in a database */ +/* A callback function used to compare two keys in a database */ typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); -/** @defgroup mdbx_env Environment Flags - * @{ - */ -/** mmap at a fixed address (experimental) */ +/* Environment Flags */ +/* mmap at a fixed address (experimental) */ #define MDB_FIXEDMAP 0x01 -/** no environment directory */ +/* no environment directory */ #define MDB_NOSUBDIR 0x4000 -/** don't fsync after commit */ +/* don't fsync after commit */ #define MDB_NOSYNC 0x10000 -/** read only */ +/* read only */ #define MDB_RDONLY 0x20000 -/** don't fsync metapage after commit */ +/* don't fsync metapage after commit */ #define MDB_NOMETASYNC 0x40000 -/** use writable mmap */ +/* use writable mmap */ #define MDB_WRITEMAP 0x80000 -/** use asynchronous msync when #MDB_WRITEMAP is used */ +/* use asynchronous msync when 
MDB_WRITEMAP is used */ #define MDB_MAPASYNC 0x100000 -/** tie reader locktable slots to #MDB_txn objects instead of to threads */ +/* tie reader locktable slots to MDB_txn objects instead of to threads */ #define MDB_NOTLS 0x200000 -/** don't do any locking, caller must manage their own locks +/* don't do any locking, caller must manage their own locks * WARNING: libmdbx don't support this mode. */ #define MDB_NOLOCK__UNSUPPORTED 0x400000 -/** don't do readahead */ +/* don't do readahead */ #define MDB_NORDAHEAD 0x800000 -/** don't initialize malloc'd memory before writing to datafile */ +/* don't initialize malloc'd memory before writing to datafile */ #define MDB_NOMEMINIT 0x1000000 #if MDBX_MODE_ENABLED -/** aim to coalesce FreeDB records */ +/* aim to coalesce FreeDB records */ #define MDBX_COALESCE 0x2000000 -/** LIFO policy for reclaiming FreeDB records */ +/* LIFO policy for reclaiming FreeDB records */ #define MDBX_LIFORECLAIM 0x4000000 #endif /* MDBX_MODE_ENABLED */ -/** make a steady-sync only on close and explicit env-sync */ +/* make a steady-sync only on close and explicit env-sync */ #define MDBX_UTTERLY_NOSYNC (MDB_NOSYNC | MDB_MAPASYNC) -/** debuging option, fill/perturb released pages */ +/* debuging option, fill/perturb released pages */ #define MDBX_PAGEPERTURB 0x8000000 -/** @} */ -/** @defgroup mdbx_dbi_open Database Flags - * @{ - */ -/** use reverse string keys */ +/* Database Flags */ +/* use reverse string keys */ #define MDB_REVERSEKEY 0x02 -/** use sorted duplicates */ +/* use sorted duplicates */ #define MDB_DUPSORT 0x04 -/** numeric keys in native byte order, either unsigned int or #mdbx_size_t. +/* numeric keys in native byte order, either unsigned int or mdbx_size_t. * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdbx_size_t.) * The keys must all be of the same size. 
*/ #define MDB_INTEGERKEY 0x08 -/** with #MDB_DUPSORT, sorted dup items have fixed size */ +/* with MDB_DUPSORT, sorted dup items have fixed size */ #define MDB_DUPFIXED 0x10 -/** with #MDB_DUPSORT, dups are #MDB_INTEGERKEY-style integers */ +/* with MDB_DUPSORT, dups are MDB_INTEGERKEY-style integers */ #define MDB_INTEGERDUP 0x20 -/** with #MDB_DUPSORT, use reverse string dups */ +/* with MDB_DUPSORT, use reverse string dups */ #define MDB_REVERSEDUP 0x40 -/** create DB if not already existing */ +/* create DB if not already existing */ #define MDB_CREATE 0x40000 -/** @} */ -/** @defgroup mdbx_put Write Flags - * @{ - */ -/** For put: Don't write if the key already exists. */ +/* Write Flags */ +/* For put: Don't write if the key already exists. */ #define MDB_NOOVERWRITE 0x10 -/** Only for #MDB_DUPSORT
- * For put: don't write if the key and data pair already exist.
+/* Only for MDB_DUPSORT + * For put: don't write if the key and data pair already exist. * For mdbx_cursor_del: remove all duplicate data items. */ #define MDB_NODUPDATA 0x20 -/** For mdbx_cursor_put: overwrite the current key/data pair +/* For mdbx_cursor_put: overwrite the current key/data pair * MDBX allows this flag for mdbx_put() for explicit overwrite/update without * insertion. */ #define MDB_CURRENT 0x40 -/** For put: Just reserve space for data, don't copy it. Return a +/* For put: Just reserve space for data, don't copy it. Return a * pointer to the reserved space. */ #define MDB_RESERVE 0x10000 -/** Data is being appended, don't split full pages. */ +/* Data is being appended, don't split full pages. */ #define MDB_APPEND 0x20000 -/** Duplicate data is being appended, don't split full pages. */ +/* Duplicate data is being appended, don't split full pages. */ #define MDB_APPENDDUP 0x40000 -/** Store multiple data items in one call. Only for #MDB_DUPFIXED. */ +/* Store multiple data items in one call. Only for MDB_DUPFIXED. */ #define MDB_MULTIPLE 0x80000 -/* @} */ -/** @defgroup mdbx_copy Copy Flags - * @{ - */ -/** Compacting copy: Omit free space from copy, and renumber all - * pages sequentially. - */ -#define MDB_CP_COMPACT 0x01 -/* @} */ +/* Copy Flags */ +/* Compacting copy: Omit free space from copy, and renumber all + * pages sequentially. */ +#define MDB_CP_COMPACT 1 -/** @brief Cursor Get operations. +/* Cursor Get operations. * * This is the set of all operations for retrieving data * using a cursor. */ typedef enum MDB_cursor_op { - MDB_FIRST, /**< Position at first key/data item */ - MDB_FIRST_DUP, /**< Position at first data item of current key. - Only for #MDB_DUPSORT */ - MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ - MDB_GET_BOTH_RANGE, /**< position at key, nearest data. 
Only for - #MDB_DUPSORT */ - MDB_GET_CURRENT, /**< Return key/data at current cursor position */ - MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items + MDB_FIRST, /* Position at first key/data item */ + MDB_FIRST_DUP, /* Position at first data item of current key. + Only for MDB_DUPSORT */ + MDB_GET_BOTH, /* Position at key/data pair. Only for MDB_DUPSORT */ + MDB_GET_BOTH_RANGE, /* position at key, nearest data. Only for + MDB_DUPSORT */ + MDB_GET_CURRENT, /* Return key/data at current cursor position */ + MDB_GET_MULTIPLE, /* Return key and up to a page of duplicate data items from current cursor position. Move cursor to prepare - for #MDB_NEXT_MULTIPLE. Only for - #MDB_DUPFIXED */ - MDB_LAST, /**< Position at last key/data item */ - MDB_LAST_DUP, /**< Position at last data item of current key. - Only for #MDB_DUPSORT */ - MDB_NEXT, /**< Position at next data item */ - MDB_NEXT_DUP, /**< Position at next data item of current key. - Only for #MDB_DUPSORT */ - MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items + for MDB_NEXT_MULTIPLE. Only for + MDB_DUPFIXED */ + MDB_LAST, /* Position at last key/data item */ + MDB_LAST_DUP, /* Position at last data item of current key. + Only for MDB_DUPSORT */ + MDB_NEXT, /* Position at next data item */ + MDB_NEXT_DUP, /* Position at next data item of current key. + Only for MDB_DUPSORT */ + MDB_NEXT_MULTIPLE, /* Return key and up to a page of duplicate data items from next cursor position. Move cursor to prepare - for #MDB_NEXT_MULTIPLE. Only for - #MDB_DUPFIXED */ - MDB_NEXT_NODUP, /**< Position at first data item of next key */ - MDB_PREV, /**< Position at previous data item */ - MDB_PREV_DUP, /**< Position at previous data item of current key. 
- Only for #MDB_DUPSORT */ - MDB_PREV_NODUP, /**< Position at last data item of previous key */ - MDB_SET, /**< Position at specified key */ - MDB_SET_KEY, /**< Position at specified key, return key + data */ - MDB_SET_RANGE, /**< Position at first key greater than or equal to specified + for MDB_NEXT_MULTIPLE. Only for + MDB_DUPFIXED */ + MDB_NEXT_NODUP, /* Position at first data item of next key */ + MDB_PREV, /* Position at previous data item */ + MDB_PREV_DUP, /* Position at previous data item of current key. + Only for MDB_DUPSORT */ + MDB_PREV_NODUP, /* Position at last data item of previous key */ + MDB_SET, /* Position at specified key */ + MDB_SET_KEY, /* Position at specified key, return key + data */ + MDB_SET_RANGE, /* Position at first key greater than or equal to specified key. */ - MDB_PREV_MULTIPLE /**< Position at previous page and return key and up to + MDB_PREV_MULTIPLE /* Position at previous page and return key and up to a page of duplicate data items. - Only for #MDB_DUPFIXED */ + Only for MDB_DUPFIXED */ } MDB_cursor_op; -/** @defgroup errors Return Codes - * - * BerkeleyDB uses -30800 to -30999, we'll go under them - * @{ - */ -/** Successful result */ +/* Return Codes + * BerkeleyDB uses -30800 to -30999, we'll go under them */ + +/* Successful result */ #define MDB_SUCCESS 0 -/** key/data pair already exists */ +/* key/data pair already exists */ #define MDB_KEYEXIST (-30799) -/** key/data pair not found (EOF) */ +/* key/data pair not found (EOF) */ #define MDB_NOTFOUND (-30798) -/** Requested page not found - this usually indicates corruption */ +/* Requested page not found - this usually indicates corruption */ #define MDB_PAGE_NOTFOUND (-30797) -/** Located page was wrong type */ +/* Located page was wrong type */ #define MDB_CORRUPTED (-30796) -/** Update of meta page failed or environment had fatal error */ +/* Update of meta page failed or environment had fatal error */ #define MDB_PANIC (-30795) -/** Environment version mismatch */ 
+/* Environment version mismatch */ #define MDB_VERSION_MISMATCH (-30794) -/** File is not a valid LMDB file */ +/* File is not a valid LMDB file */ #define MDB_INVALID (-30793) -/** Environment mapsize reached */ +/* Environment mapsize reached */ #define MDB_MAP_FULL (-30792) -/** Environment maxdbs reached */ +/* Environment maxdbs reached */ #define MDB_DBS_FULL (-30791) -/** Environment maxreaders reached */ +/* Environment maxreaders reached */ #define MDB_READERS_FULL (-30790) -/** Txn has too many dirty pages */ +/* Txn has too many dirty pages */ #define MDB_TXN_FULL (-30788) -/** Cursor stack too deep - internal error */ +/* Cursor stack too deep - internal error */ #define MDB_CURSOR_FULL (-30787) -/** Page has not enough space - internal error */ +/* Page has not enough space - internal error */ #define MDB_PAGE_FULL (-30786) -/** Database contents grew beyond environment mapsize */ +/* Database contents grew beyond environment mapsize */ #define MDB_MAP_RESIZED (-30785) -/** Operation and DB incompatible, or DB type changed. This can mean: - *
    - *
  • The operation expects an #MDB_DUPSORT / #MDB_DUPFIXED database. - *
  • Opening a named DB when the unnamed DB has #MDB_DUPSORT / - *#MDB_INTEGERKEY. - *
  • Accessing a data record as a database, or vice versa. - *
  • The database was dropped and recreated with different flags. - *
+/* Operation and DB incompatible, or DB type changed. This can mean: + * - The operation expects an MDB_DUPSORT / MDB_DUPFIXED database. + * - Opening a named DB when the unnamed DB has MDB_DUPSORT/MDB_INTEGERKEY. + * - Accessing a data record as a database, or vice versa. + * - The database was dropped and recreated with different flags. */ #define MDB_INCOMPATIBLE (-30784) -/** Invalid reuse of reader locktable slot */ +/* Invalid reuse of reader locktable slot */ #define MDB_BAD_RSLOT (-30783) -/** Transaction must abort, has a child, or is invalid */ +/* Transaction must abort, has a child, or is invalid */ #define MDB_BAD_TXN (-30782) -/** Unsupported size of key/DB name/data, or wrong DUPFIXED size */ +/* Unsupported size of key/DB name/data, or wrong DUPFIXED size */ #define MDB_BAD_VALSIZE (-30781) -/** The specified DBI was changed unexpectedly */ +/* The specified DBI was changed unexpectedly */ #define MDB_BAD_DBI (-30780) -/** Unexpected problem - txn should abort */ +/* Unexpected problem - txn should abort */ #define MDB_PROBLEM (-30779) -/** The last defined error code */ +/* The last defined error code */ #define MDB_LAST_ERRCODE MDB_PROBLEM -/** @} */ -/** @brief Statistics for a database in the environment */ +/* Statistics for a database in the environment */ typedef struct MDBX_stat { - unsigned ms_psize; /**< Size of a database page. + unsigned ms_psize; /* Size of a database page. This is currently the same for all databases. 
*/ - unsigned ms_depth; /**< Depth (height) of the B-tree */ - size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ - size_t ms_leaf_pages; /**< Number of leaf pages */ - size_t ms_overflow_pages; /**< Number of overflow pages */ - size_t ms_entries; /**< Number of data items */ + unsigned ms_depth; /* Depth (height) of the B-tree */ + size_t ms_branch_pages; /* Number of internal (non-leaf) pages */ + size_t ms_leaf_pages; /* Number of leaf pages */ + size_t ms_overflow_pages; /* Number of overflow pages */ + size_t ms_entries; /* Number of data items */ } MDBX_stat; -/** @brief Information about the environment */ +/* Information about the environment */ typedef struct MDBX_envinfo { - void *me_mapaddr; /**< Address of map, if fixed */ - size_t me_mapsize; /**< Size of the data memory map */ - size_t me_last_pgno; /**< ID of the last used page */ - size_t me_last_txnid; /**< ID of the last committed transaction */ - unsigned me_maxreaders; /**< max reader slots in the environment */ - unsigned me_numreaders; /**< max reader slots used in the environment */ - size_t me_tail_txnid; /**< ID of the last reader transaction */ + void *me_mapaddr; /* Address of map, if fixed */ + size_t me_mapsize; /* Size of the data memory map */ + size_t me_last_pgno; /* ID of the last used page */ + size_t me_last_txnid; /* ID of the last committed transaction */ + unsigned me_maxreaders; /* max reader slots in the environment */ + unsigned me_numreaders; /* max reader slots used in the environment */ + size_t me_tail_txnid; /* ID of the last reader transaction */ size_t me_meta1_txnid, me_meta1_sign; size_t me_meta2_txnid, me_meta2_sign; } MDBX_envinfo; -/** @brief Return the LMDB library version information. +/* Return the LMDB library version information. 
* - * @param[out] major if non-NULL, the library major version number is copied + * [out] major if non-NULL, the library major version number is copied * here - * @param[out] minor if non-NULL, the library minor version number is copied + * [out] minor if non-NULL, the library minor version number is copied * here - * @param[out] patch if non-NULL, the library patch version number is copied + * [out] patch if non-NULL, the library patch version number is copied * here - * @retval "version string" The library version as a string + * Returns "version string" The library version as a string */ char *mdbx_version(int *major, int *minor, int *patch); -/** @brief Return a string describing a given error code. +/* Return a string describing a given error code. * * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) * function. If the error code is greater than or equal to 0, then the string * returned by the system function strerror(3) is returned. If the error code * is less than 0, an error string corresponding to the LMDB library error is - * returned. See @ref errors for a list of LMDB-specific error codes. - * @param[in] err The error code - * @retval "error message" The description of the error + * returned. See errors for a list of LMDB-specific error codes. + * [in] err The error code + * Returns "error message" The description of the error */ char *mdbx_strerror(int err); -/** @brief Create an LMDB environment handle. +/* Create an LMDB environment handle. * - * This function allocates memory for a #MDB_env structure. To release - * the allocated memory and discard the handle, call #mdbx_env_close(). - * Before the handle may be used, it must be opened using #mdbx_env_open(). + * This function allocates memory for a MDB_env structure. To release + * the allocated memory and discard the handle, call mdbx_env_close(). + * Before the handle may be used, it must be opened using mdbx_env_open(). 
* Various other options may also need to be set before opening the handle, - * e.g. #mdbx_env_set_mapsize(), #mdbx_env_set_maxreaders(), - * #mdbx_env_set_maxdbs(), + * e.g. mdbx_env_set_mapsize(), mdbx_env_set_maxreaders(), + * mdbx_env_set_maxdbs(), * depending on usage requirements. - * @param[out] env The address where the new handle will be stored - * @return A non-zero error value on failure and 0 on success. + * [out] env The address where the new handle will be stored + * Returns A non-zero error value on failure and 0 on success. */ int mdbx_env_create(MDB_env **env); -/** @brief Open an environment handle. +/* Open an environment handle. * - * If this function fails, #mdbx_env_close() must be called to discard the - *#MDB_env handle. - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[in] path The directory in which the database files reside. This + * If this function fails, mdbx_env_close() must be called to discard the + *MDB_env handle. + * [in] env An environment handle returned by mdbx_env_create() + * [in] path The directory in which the database files reside. This * directory must already exist and be writable. - * @param[in] flags Special options for this environment. This parameter + * [in] flags Special options for this environment. This parameter * must be set to 0 or by bitwise OR'ing together one or more of the * values described here. * Flags set by mdbx_env_set_flags() are also used. - *
    - *
  • #MDB_FIXEDMAP + * - MDB_FIXEDMAP * use a fixed address for the mmap region. This flag must be specified * when creating the environment, and is stored persistently in the *environment. @@ -403,7 +382,7 @@ int mdbx_env_create(MDB_env **env); * how the operating system has allocated memory to shared *libraries and other uses. * The feature is highly experimental. - *
  • #MDB_NOSUBDIR + * - MDB_NOSUBDIR * By default, LMDB creates its environment in a directory whose * pathname is given in \b path, and creates its data and lock *files @@ -412,13 +391,13 @@ int mdbx_env_create(MDB_env **env); * the database main data file. The database lock file is the \b *path * with "-lock" appended. - *
  • #MDB_RDONLY + * - MDB_RDONLY * Open the environment in read-only mode. No write operations will *be * allowed. LMDB will still modify the lock file - except on *read-only * filesystems, where LMDB does not use locks. - *
  • #MDB_WRITEMAP + * - MDB_WRITEMAP * Use a writeable memory map unless MDB_RDONLY is set. This uses * fewer mallocs but loses protection from application bugs * like wild pointer writes and other bad updates into the @@ -428,13 +407,13 @@ int mdbx_env_create(MDB_env **env); * is slower for DBs larger than RAM. * Incompatible with nested transactions. * Do not mix processes with and without MDB_WRITEMAP on the same - * environment. This can defeat durability (#mdbx_env_sync etc). - *
  • #MDB_NOMETASYNC + * environment. This can defeat durability (mdbx_env_sync etc). + * - MDB_NOMETASYNC * Flush system buffers to disk only once per transaction, omit *the * metadata flush. Defer that until the system flushes files to *disk, - * or next non-MDB_RDONLY commit or #mdbx_env_sync(). This + * or next non-MDB_RDONLY commit or mdbx_env_sync(). This *optimization * maintains database integrity, but a system crash may undo the *last @@ -442,8 +421,8 @@ int mdbx_env_create(MDB_env **env); * consistency, isolation) but not D (durability) database *property. * This flag may be changed at any time using - *#mdbx_env_set_flags(). - *
  • #MDB_NOSYNC + *mdbx_env_set_flags(). + * - MDB_NOSYNC * Don't flush system buffers to disk when committing a *transaction. * This optimization means a system crash can corrupt the database @@ -452,9 +431,9 @@ int mdbx_env_create(MDB_env **env); *disk. * The risk is governed by how often the system flushes dirty *buffers - * to disk and how often #mdbx_env_sync() is called. However, if + * to disk and how often mdbx_env_sync() is called. However, if *the - * filesystem preserves write order and the #MDB_WRITEMAP flag is + * filesystem preserves write order and the MDB_WRITEMAP flag is *not * used, transactions exhibit ACI (atomicity, consistency, *isolation) @@ -462,26 +441,26 @@ int mdbx_env_create(MDB_env **env); *integrity * is maintained, but a system crash may undo the final *transactions. - * Note that (#MDB_NOSYNC | #MDB_WRITEMAP) leaves the system with + * Note that (MDB_NOSYNC | MDB_WRITEMAP) leaves the system with *no * hint for when to write transactions to disk, unless - *#mdbx_env_sync() - * is called. (#MDB_MAPASYNC | #MDB_WRITEMAP) may be preferable. + *mdbx_env_sync() + * is called. (MDB_MAPASYNC | MDB_WRITEMAP) may be preferable. * This flag may be changed at any time using - *#mdbx_env_set_flags(). - *
  • #MDB_MAPASYNC - * When using #MDB_WRITEMAP, use asynchronous flushes to disk. - * As with #MDB_NOSYNC, a system crash can then corrupt the + *mdbx_env_set_flags(). + * - MDB_MAPASYNC + * When using MDB_WRITEMAP, use asynchronous flushes to disk. + * As with MDB_NOSYNC, a system crash can then corrupt the * database or lose the last transactions. Calling - *#mdbx_env_sync() + *mdbx_env_sync() * ensures on-disk database integrity until next commit. * This flag may be changed at any time using - *#mdbx_env_set_flags(). - *
  • #MDB_NOTLS + *mdbx_env_set_flags(). + * - MDB_NOTLS * Don't use Thread-Local Storage. Tie reader locktable slots to - * #MDB_txn objects instead of to threads. I.e. #mdbx_txn_reset() + * MDB_txn objects instead of to threads. I.e. mdbx_txn_reset() *keeps - * the slot reseved for the #MDB_txn object. A thread may use + * the slot reseved for the MDB_txn object. A thread may use *parallel * read-only transactions. A read-only transaction may span threads *if @@ -493,7 +472,7 @@ int mdbx_env_create(MDB_env **env); *OS * thread, since LMDB's write locking is unaware of the user *threads. - *
  • #MDB_NOLOCK + * - MDB_NOLOCK * Don't do any locking. If concurrent access is anticipated, the * caller must manage all concurrency itself. For proper *operation @@ -503,13 +482,13 @@ int mdbx_env_create(MDB_env **env); * active. The simplest approach is to use an exclusive lock so *that * no readers may be active at all when a writer begins. - *
  • #MDB_NORDAHEAD + * - MDB_NORDAHEAD * Turn off readahead. Most operating systems perform readahead *on * read requests by default. This option turns it off if the OS * supports it. Turning it off may help random read performance * when the DB is larger than RAM and system RAM is full. - *
  • #MDB_NOMEMINIT + * - MDB_NOMEMINIT * Don't initialize malloc'd memory before writing to unused *spaces * in the data file. By default, memory for pages written to the @@ -534,159 +513,152 @@ int mdbx_env_create(MDB_env **env); * which handle sensitive data like passwords, and it makes *memory * checkers like Valgrind noisy. This flag is not needed with - *#MDB_WRITEMAP, + *MDB_WRITEMAP, * which writes directly to the mmap instead of using malloc for *pages. The - * initialization is also skipped if #MDB_RESERVE is used; the + * initialization is also skipped if MDB_RESERVE is used; the * caller is expected to overwrite all of the memory that was * reserved in that case. * This flag may be changed at any time using - *#mdbx_env_set_flags(). - *
  • #MDBX_COALESCE + *mdbx_env_set_flags(). + * - #MDBX_COALESCE * Aim to coalesce records while reclaiming FreeDB. * This flag may be changed at any time using - *#mdbx_env_set_flags(). - *
  • #MDBX_LIFORECLAIM + *mdbx_env_set_flags(). + * - #MDBX_LIFORECLAIM * LIFO policy for reclaiming FreeDB records. This significantly *reduce * write IPOS in case MDB_NOSYNC with periodically checkpoints. - *
- * @param[in] mode The UNIX permissions to set on created files and + * [in] mode The UNIX permissions to set on created files and *semaphores. * This parameter is ignored on Windows. - * @return A non-zero error value on failure and 0 on success. Some possible + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • #MDB_VERSION_MISMATCH - the version of the LMDB library doesn't + * - MDB_VERSION_MISMATCH - the version of the LMDB library doesn't *match the * version that created the database environment. - *
  • #MDB_INVALID - the environment file headers are corrupted. - *
  • ENOENT - the directory specified by the path parameter doesn't + * - MDB_INVALID - the environment file headers are corrupted. + * - ENOENT - the directory specified by the path parameter doesn't *exist. - *
  • EACCES - the user didn't have permission to access the environment + * - EACCES - the user didn't have permission to access the environment *files. - *
  • EAGAIN - the environment was locked by another process. - *
+ * - EAGAIN - the environment was locked by another process. */ int mdbx_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode); int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, mode_t mode, int *exclusive); -/** @brief Copy an LMDB environment to the specified path. +/* Copy an LMDB environment to the specified path. * * This function may be used to make a backup of an existing environment. * No lockfile is created, since it gets recreated at need. - * @note This call can trigger significant file size growth if run in + * Note: This call can trigger significant file size growth if run in * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdbx_env_create(). It + * transaction. See long-lived transactions under caveats_sec. + * [in] env An environment handle returned by mdbx_env_create(). It * must have already been opened successfully. - * @param[in] path The directory in which the copy will reside. This + * [in] path The directory in which the copy will reside. This * directory must already exist and be writable but must otherwise be * empty. - * @return A non-zero error value on failure and 0 on success. + * Returns A non-zero error value on failure and 0 on success. */ int mdbx_env_copy(MDB_env *env, const char *path); -/** @brief Copy an LMDB environment to the specified file descriptor. +/* Copy an LMDB environment to the specified file descriptor. * * This function may be used to make a backup of an existing environment. * No lockfile is created, since it gets recreated at need. - * @note This call can trigger significant file size growth if run in + * Note: This call can trigger significant file size growth if run in * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. 
- * @param[in] env An environment handle returned by #mdbx_env_create(). It + * transaction. See long-lived transactions under caveats_sec. + * [in] env An environment handle returned by mdbx_env_create(). It * must have already been opened successfully. - * @param[in] fd The filedescriptor to write the copy to. It must + * [in] fd The filedescriptor to write the copy to. It must * have already been opened for Write access. - * @return A non-zero error value on failure and 0 on success. + * Returns A non-zero error value on failure and 0 on success. */ int mdbx_env_copyfd(MDB_env *env, int fd); -/** @brief Copy an LMDB environment to the specified path, with options. +/* Copy an LMDB environment to the specified path, with options. * * This function may be used to make a backup of an existing environment. * No lockfile is created, since it gets recreated at need. - * @note This call can trigger significant file size growth if run in + * Note: This call can trigger significant file size growth if run in * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdbx_env_create(). It + * transaction. See long-lived transactions under caveats_sec. + * [in] env An environment handle returned by mdbx_env_create(). It * must have already been opened successfully. - * @param[in] path The directory in which the copy will reside. This + * [in] path The directory in which the copy will reside. This * directory must already exist and be writable but must otherwise be * empty. - * @param[in] flags Special options for this operation. This parameter + * [in] flags Special options for this operation. This parameter * must be set to 0 or by bitwise OR'ing together one or more of the * values described here. - *
    - *
  • #MDB_CP_COMPACT - Perform compaction while copying: omit free + * - MDB_CP_COMPACT - Perform compaction while copying: omit free * pages and sequentially renumber all pages in output. This *option * consumes more CPU and runs more slowly than the default. * Currently it fails if the environment has suffered a page *leak. - *
- * @return A non-zero error value on failure and 0 on success. + * Returns A non-zero error value on failure and 0 on success. */ int mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags); -/** @brief Copy an LMDB environment to the specified file descriptor, +/* Copy an LMDB environment to the specified file descriptor, * with options. * * This function may be used to make a backup of an existing environment. * No lockfile is created, since it gets recreated at need. See - * #mdbx_env_copy2() for further details. - * @note This call can trigger significant file size growth if run in + * mdbx_env_copy2() for further details. + * Note: This call can trigger significant file size growth if run in * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under @ref caveats_sec. - * @param[in] env An environment handle returned by #mdbx_env_create(). It + * transaction. See long-lived transactions under caveats_sec. + * [in] env An environment handle returned by mdbx_env_create(). It * must have already been opened successfully. - * @param[in] fd The filedescriptor to write the copy to. It must + * [in] fd The filedescriptor to write the copy to. It must * have already been opened for Write access. - * @param[in] flags Special options for this operation. - * See #mdbx_env_copy2() for options. - * @return A non-zero error value on failure and 0 on success. + * [in] flags Special options for this operation. + * See mdbx_env_copy2() for options. + * Returns A non-zero error value on failure and 0 on success. */ int mdbx_env_copyfd2(MDB_env *env, int fd, unsigned flags); -/** @brief Return statistics about the LMDB environment. +/* Return statistics about the LMDB environment. 
* - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[out] stat The address of an #MDB_stat structure + * [in] env An environment handle returned by mdbx_env_create() + * [out] stat The address of an MDB_stat structure * where the statistics will be copied */ int mdbx_env_stat(MDB_env *env, MDBX_stat *stat, size_t bytes); -/** @brief Return information about the LMDB environment. +/* Return information about the LMDB environment. * - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[out] stat The address of an #MDB_envinfo structure + * [in] env An environment handle returned by mdbx_env_create() + * [out] stat The address of an MDB_envinfo structure * where the information will be copied */ int mdbx_env_info(MDB_env *env, MDBX_envinfo *info, size_t bytes); -/** @brief Flush the data buffers to disk. +/* Flush the data buffers to disk. * - * Data is always written to disk when #mdbx_txn_commit() is called, + * Data is always written to disk when mdbx_txn_commit() is called, * but the operating system may keep it buffered. LMDB always flushes * the OS buffers upon commit as well, unless the environment was - * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. This call is - * not valid if the environment was opened with #MDB_RDONLY. - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[in] force If non-zero, force a synchronous flush. Otherwise - * if the environment has the #MDB_NOSYNC flag set the flushes - * will be omitted, and with #MDB_MAPASYNC they will be asynchronous. - * @return A non-zero error value on failure and 0 on success. Some possible + * opened with MDB_NOSYNC or in part MDB_NOMETASYNC. This call is + * not valid if the environment was opened with MDB_RDONLY. + * [in] env An environment handle returned by mdbx_env_create() + * [in] force If non-zero, force a synchronous flush. 
Otherwise + * if the environment has the MDB_NOSYNC flag set the flushes + * will be omitted, and with MDB_MAPASYNC they will be asynchronous. + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EACCES - the environment is read-only. - *
  • EINVAL - an invalid parameter was specified. - *
  • EIO - an error occurred during synchronization. - *
+ * - EACCES - the environment is read-only. + * - EINVAL - an invalid parameter was specified. + * - EIO - an error occurred during synchronization. */ int mdbx_env_sync(MDB_env *env, int force); -/** @brief Close the environment and release the memory map. +/* Close the environment and release the memory map. * * Only a single thread may call this function. All transactions, databases, * and cursors must already be closed before calling this function. Attempts @@ -694,8 +666,8 @@ int mdbx_env_sync(MDB_env *env, int force); * use any such handles after calling this function will cause a SIGSEGV. * The environment handle will be freed and must not be used again after this * call. - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[in] dont_sync A dont'sync flag, if non-zero the last checkpoint + * [in] env An environment handle returned by mdbx_env_create() + * [in] dont_sync A dont'sync flag, if non-zero the last checkpoint * (meta-page update) will be kept "as is" and may be still "weak" * in NOSYNC/MAPASYNC modes. Such "weak" checkpoint will be ignored * on opening next time, and transactions since the last non-weak @@ -703,72 +675,64 @@ int mdbx_env_sync(MDB_env *env, int force); */ void mdbx_env_close(MDB_env *env); -/** @brief Set environment flags. +/* Set environment flags. * * This may be used to set some flags in addition to those from - * #mdbx_env_open(), or to unset these flags. If several threads + * mdbx_env_open(), or to unset these flags. If several threads * change the flags at the same time, the result is undefined. - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[in] flags The flags to change, bitwise OR'ed together - * @param[in] onoff A non-zero value sets the flags, zero clears them. - * @return A non-zero error value on failure and 0 on success. 
Some possible + * [in] env An environment handle returned by mdbx_env_create() + * [in] flags The flags to change, bitwise OR'ed together + * [in] onoff A non-zero value sets the flags, zero clears them. + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EINVAL - an invalid parameter was specified. */ int mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff); -/** @brief Get environment flags. +/* Get environment flags. * - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[out] flags The address of an integer to store the flags - * @return A non-zero error value on failure and 0 on success. Some possible + * [in] env An environment handle returned by mdbx_env_create() + * [out] flags The address of an integer to store the flags + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EINVAL - an invalid parameter was specified. */ int mdbx_env_get_flags(MDB_env *env, unsigned *flags); -/** @brief Return the path that was used in #mdbx_env_open(). +/* Return the path that was used in mdbx_env_open(). * - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[out] path Address of a string pointer to contain the path. This + * [in] env An environment handle returned by mdbx_env_create() + * [out] path Address of a string pointer to contain the path. This * is the actual string in the environment, not a copy. It should not be * altered in any way. - * @return A non-zero error value on failure and 0 on success. Some possible + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EINVAL - an invalid parameter was specified. */ int mdbx_env_get_path(MDB_env *env, const char **path); -/** @brief Return the filedescriptor for the given environment. +/* Return the filedescriptor for the given environment. * * This function may be called after fork(), so the descriptor can be * closed before exec*(). Other LMDB file descriptors have FD_CLOEXEC. * (Until LMDB 0.9.18, only the lockfile had that.) * - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[out] fd Address of a int to contain the descriptor. - * @return A non-zero error value on failure and 0 on success. Some possible + * [in] env An environment handle returned by mdbx_env_create() + * [out] fd Address of a int to contain the descriptor. + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EINVAL - an invalid parameter was specified. */ int mdbx_env_get_fd(MDB_env *env, int *fd); -/** @brief Set the size of the memory map to use for this environment. +/* Set the size of the memory map to use for this environment. * * The size should be a multiple of the OS page size. The default is * 10485760 bytes. The size of the memory map is also the maximum size * of the database. The value should be chosen as large as possible, * to accommodate future growth of the database. - * This function should be called after #mdbx_env_create() and before - *#mdbx_env_open(). + * This function should be called after mdbx_env_create() and before + *mdbx_env_open(). * It may be called at later times if no transactions are active in * this process. Note that the library does not check for this condition, * the caller must ensure it explicitly. @@ -779,263 +743,247 @@ int mdbx_env_get_fd(MDB_env *env, int *fd); * persisted into the environment. * * If the mapsize is increased by another process, and data has grown - * beyond the range of the current mapsize, #mdbx_txn_begin() will - * return #MDB_MAP_RESIZED. This function may be called with a size + * beyond the range of the current mapsize, mdbx_txn_begin() will + * return MDB_MAP_RESIZED. This function may be called with a size * of zero to adopt the new size. * * Any attempt to set a size smaller than the space already consumed * by the environment will be silently changed to the current size of the used *space. - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[in] size The size in bytes - * @return A non-zero error value on failure and 0 on success. Some possible + * [in] env An environment handle returned by mdbx_env_create() + * [in] size The size in bytes + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified, or the environment + * - EINVAL - an invalid parameter was specified, or the environment *has * an active write transaction. - *
*/ int mdbx_env_set_mapsize(MDB_env *env, size_t size); -/** @brief Set the maximum number of threads/reader slots for the environment. +/* Set the maximum number of threads/reader slots for the environment. * * This defines the number of slots in the lock table that is used to track *readers in the * the environment. The default is 126. * Starting a read-only transaction normally ties a lock table slot to the * current thread until the environment closes or the thread exits. If - * MDB_NOTLS is in use, #mdbx_txn_begin() instead ties the slot to the - * MDB_txn object until it or the #MDB_env object is destroyed. - * This function may only be called after #mdbx_env_create() and before - *#mdbx_env_open(). - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[in] readers The maximum number of reader lock table slots - * @return A non-zero error value on failure and 0 on success. Some possible + * MDB_NOTLS is in use, mdbx_txn_begin() instead ties the slot to the + * MDB_txn object until it or the MDB_env object is destroyed. + * This function may only be called after mdbx_env_create() and before + *mdbx_env_open(). + * [in] env An environment handle returned by mdbx_env_create() + * [in] readers The maximum number of reader lock table slots + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified, or the environment is + * - EINVAL - an invalid parameter was specified, or the environment is *already open. - *
*/ int mdbx_env_set_maxreaders(MDB_env *env, unsigned readers); -/** @brief Get the maximum number of threads/reader slots for the environment. +/* Get the maximum number of threads/reader slots for the environment. * - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[out] readers Address of an integer to store the number of readers - * @return A non-zero error value on failure and 0 on success. Some possible + * [in] env An environment handle returned by mdbx_env_create() + * [out] readers Address of an integer to store the number of readers + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EINVAL - an invalid parameter was specified. */ int mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers); -/** @brief Set the maximum number of named databases for the environment. +/* Set the maximum number of named databases for the environment. * * This function is only needed if multiple databases will be used in the * environment. Simpler applications that use the environment as a single * unnamed database can ignore this option. - * This function may only be called after #mdbx_env_create() and before - *#mdbx_env_open(). + * This function may only be called after mdbx_env_create() and before + *mdbx_env_open(). * * Currently a moderate number of slots are cheap but a huge number gets - * expensive: 7-120 words per transaction, and every #mdbx_dbi_open() + * expensive: 7-120 words per transaction, and every mdbx_dbi_open() * does a linear search of the opened slots. - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[in] dbs The maximum number of databases - * @return A non-zero error value on failure and 0 on success. Some possible + * [in] env An environment handle returned by mdbx_env_create() + * [in] dbs The maximum number of databases + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified, or the environment is + * - EINVAL - an invalid parameter was specified, or the environment is *already open. - *
*/ int mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); -/** @brief Get the maximum size of keys and #MDB_DUPSORT data we can write. +/* Get the maximum size of keys and MDB_DUPSORT data we can write. * - * Depends on the compile-time constant #MDB_MAXKEYSIZE. Default 511. - * See @ref MDB_val. - * @param[in] env An environment handle returned by #mdbx_env_create() - * @return The maximum size of a key we can write + * Depends on the compile-time constant MDB_MAXKEYSIZE. Default 511. + * See MDB_val. + * [in] env An environment handle returned by mdbx_env_create() + * Returns The maximum size of a key we can write */ int mdbx_env_get_maxkeysize(MDB_env *env); -/** @brief Set application information associated with the #MDB_env. +/* Set application information associated with the MDB_env. * - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[in] ctx An arbitrary pointer for whatever the application needs. - * @return A non-zero error value on failure and 0 on success. + * [in] env An environment handle returned by mdbx_env_create() + * [in] ctx An arbitrary pointer for whatever the application needs. + * Returns A non-zero error value on failure and 0 on success. */ int mdbx_env_set_userctx(MDB_env *env, void *ctx); -/** @brief Get the application information associated with the #MDB_env. +/* Get the application information associated with the MDB_env. * - * @param[in] env An environment handle returned by #mdbx_env_create() - * @return The pointer set by #mdbx_env_set_userctx(). + * [in] env An environment handle returned by mdbx_env_create() + * Returns The pointer set by mdbx_env_set_userctx(). */ void *mdbx_env_get_userctx(MDB_env *env); -/** @brief A callback function for most LMDB assert() failures, +/* A callback function for most LMDB assert() failures, * called before printing the message and aborting. * - * @param[in] env An environment handle returned by #mdbx_env_create(). 
- * @param[in] msg The assertion message, not including newline. + * [in] env An environment handle returned by mdbx_env_create(). + * [in] msg The assertion message, not including newline. */ typedef void MDB_assert_func(MDB_env *env, const char *msg, const char *function, unsigned line); -/** Set or reset the assert() callback of the environment. +/* Set or reset the assert() callback of the environment. * Disabled if liblmdb is buillt with MDB_DEBUG=0. - * @note This hack should become obsolete as lmdb's error handling matures. - * @param[in] env An environment handle returned by #mdbx_env_create(). - * @param[in] func An #MDB_assert_func function, or 0. - * @return A non-zero error value on failure and 0 on success. + * Note: This hack should become obsolete as lmdb's error handling matures. + * [in] env An environment handle returned by mdbx_env_create(). + * [in] func An MDB_assert_func function, or 0. + * Returns A non-zero error value on failure and 0 on success. */ int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); -/** @brief Create a transaction for use with the environment. +/* Create a transaction for use with the environment. * - * The transaction handle may be discarded using #mdbx_txn_abort() or - *#mdbx_txn_commit(). - * @note A transaction and its cursors must only be used by a single + * The transaction handle may be discarded using mdbx_txn_abort() or + *mdbx_txn_commit(). + * Note: A transaction and its cursors must only be used by a single * thread, and a thread may only have a single transaction at a time. - * If #MDB_NOTLS is in use, this does not apply to read-only transactions. - * @note Cursors may not span transactions. - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[in] parent If this parameter is non-NULL, the new transaction + * If MDB_NOTLS is in use, this does not apply to read-only transactions. + * Note: Cursors may not span transactions. 
+ * [in] env An environment handle returned by mdbx_env_create() + * [in] parent If this parameter is non-NULL, the new transaction * will be a nested transaction, with the transaction indicated by \b parent * as its parent. Transactions may be nested to any level. A parent * transaction and its cursors may not issue any other operations than * mdbx_txn_commit and mdbx_txn_abort while it has active child transactions. - * @param[in] flags Special options for this transaction. This parameter + * [in] flags Special options for this transaction. This parameter * must be set to 0 or by bitwise OR'ing together one or more of the * values described here. - *
    - *
  • #MDB_RDONLY + * - MDB_RDONLY * This transaction will not perform any write operations. - *
- * @param[out] txn Address where the new #MDB_txn handle will be stored - * @return A non-zero error value on failure and 0 on success. Some possible + * [out] txn Address where the new MDB_txn handle will be stored + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • #MDB_PANIC - a fatal error occurred earlier and the environment + * - MDB_PANIC - a fatal error occurred earlier and the environment * must be shut down. - *
  • #MDB_MAP_RESIZED - another process wrote data beyond this + * - MDB_MAP_RESIZED - another process wrote data beyond this *MDB_env's * mapsize and this environment's map must be resized as well. - * See #mdbx_env_set_mapsize(). - *
  • #MDB_READERS_FULL - a read-only transaction was requested and - * the reader lock table is full. See #mdbx_env_set_maxreaders(). - *
  • ENOMEM - out of memory. - *
+ * See mdbx_env_set_mapsize(). + * - MDB_READERS_FULL - a read-only transaction was requested and + * the reader lock table is full. See mdbx_env_set_maxreaders(). + * - ENOMEM - out of memory. */ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, MDB_txn **txn); -/** @brief Returns the transaction's #MDB_env +/* Returns the transaction's MDB_env * - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * [in] txn A transaction handle returned by mdbx_txn_begin() */ MDB_env *mdbx_txn_env(MDB_txn *txn); -/** @brief Return the transaction's ID. +/* Return the transaction's ID. * * This returns the identifier associated with this transaction. For a * read-only transaction, this corresponds to the snapshot being read; * concurrent readers will frequently have the same transaction ID. * - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @return A transaction ID, valid if input is an active transaction. + * [in] txn A transaction handle returned by mdbx_txn_begin() + * Returns A transaction ID, valid if input is an active transaction. */ size_t mdbx_txn_id(MDB_txn *txn); -/** @brief Commit all the operations of a transaction into the database. +/* Commit all the operations of a transaction into the database. * * The transaction handle is freed. It and its cursors must not be used - * again after this call, except with #mdbx_cursor_renew(). + * again after this call, except with mdbx_cursor_renew(). * - * @note MDBX-mode: + * Note: MDBX-mode: * A cursor must be closed explicitly always, before * or after its transaction ends. It can be reused with - * #mdbx_cursor_renew() before finally closing it. + * mdbx_cursor_renew() before finally closing it. * - * @note LMDB-compatible mode: + * Note: LMDB-compatible mode: * Earlier documentation incorrectly said all cursors would be freed. * Only write-transactions free cursors. 
* - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @return A non-zero error value on failure and 0 on success. Some possible + * [in] txn A transaction handle returned by mdbx_txn_begin() + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
  • ENOSPC - no more disk space. - *
  • EIO - a low-level I/O error occurred while writing. - *
  • ENOMEM - out of memory. - *
+ * - EINVAL - an invalid parameter was specified. + * - ENOSPC - no more disk space. + * - EIO - a low-level I/O error occurred while writing. + * - ENOMEM - out of memory. */ int mdbx_txn_commit(MDB_txn *txn); -/** @brief Abandon all the operations of the transaction instead of saving +/* Abandon all the operations of the transaction instead of saving * them. * * The transaction handle is freed. It and its cursors must not be used - * again after this call, except with #mdbx_cursor_renew(). + * again after this call, except with mdbx_cursor_renew(). * - * @note MDBX-mode: + * Note: MDBX-mode: * A cursor must be closed explicitly always, before * or after its transaction ends. It can be reused with - * #mdbx_cursor_renew() before finally closing it. + * mdbx_cursor_renew() before finally closing it. * - * @note LMDB-compatible mode: + * Note: LMDB-compatible mode: * Earlier documentation incorrectly said all cursors would be freed. * Only write-transactions free cursors. * - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * [in] txn A transaction handle returned by mdbx_txn_begin() */ int mdbx_txn_abort(MDB_txn *txn); -/** @brief Reset a read-only transaction. +/* Reset a read-only transaction. * - * Abort the transaction like #mdbx_txn_abort(), but keep the transaction - * handle. #mdbx_txn_renew() may reuse the handle. This saves allocation + * Abort the transaction like mdbx_txn_abort(), but keep the transaction + * handle. mdbx_txn_renew() may reuse the handle. This saves allocation * overhead if the process will start a new read-only transaction soon, - * and also locking overhead if #MDB_NOTLS is in use. The reader table + * and also locking overhead if MDB_NOTLS is in use. The reader table * lock is released, but the table slot stays tied to its thread or - * #MDB_txn. Use mdbx_txn_abort() to discard a reset handle, and to free + * MDB_txn. 
Use mdbx_txn_abort() to discard a reset handle, and to free * its lock table slot if MDB_NOTLS is in use. * Cursors opened within the transaction must not be used - * again after this call, except with #mdbx_cursor_renew(). + * again after this call, except with mdbx_cursor_renew(). * Reader locks generally don't interfere with writers, but they keep old * versions of database pages allocated. Thus they prevent the old pages * from being reused when writers commit new data, and so under heavy load * the database size may grow much more rapidly than otherwise. - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() + * [in] txn A transaction handle returned by mdbx_txn_begin() */ int mdbx_txn_reset(MDB_txn *txn); -/** @brief Renew a read-only transaction. +/* Renew a read-only transaction. * * This acquires a new reader lock for a transaction handle that had been - * released by #mdbx_txn_reset(). It must be called before a reset transaction + * released by mdbx_txn_reset(). It must be called before a reset transaction * may be used again. - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @return A non-zero error value on failure and 0 on success. Some possible + * [in] txn A transaction handle returned by mdbx_txn_begin() + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • #MDB_PANIC - a fatal error occurred earlier and the environment + * - MDB_PANIC - a fatal error occurred earlier and the environment * must be shut down. - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EINVAL - an invalid parameter was specified. */ int mdbx_txn_renew(MDB_txn *txn); -/** @brief Open a database in the environment. +/* Open a database in the environment. * A database handle denotes the name and parameters of a database, * independently of whether such a database exists. - * The database handle may be discarded by calling #mdbx_dbi_close(). + * The database handle may be discarded by calling mdbx_dbi_close(). * The old database handle is returned if the database was already open. * The handle may only be closed once. * @@ -1050,96 +998,90 @@ int mdbx_txn_renew(MDB_txn *txn); * this function must finish (either commit or abort) before * any other transaction in the process may use this function. * - * To use named databases (with name != NULL), #mdbx_env_set_maxdbs() + * To use named databases (with name != NULL), mdbx_env_set_maxdbs() * must be called before opening the environment. Database names are * keys in the unnamed database, and may be read but not written. * - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] name The name of the database to open. If only a single + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] name The name of the database to open. If only a single * database is needed in the environment, this value may be NULL. - * @param[in] flags Special options for this database. This parameter + * [in] flags Special options for this database. This parameter * must be set to 0 or by bitwise OR'ing together one or more of the * values described here. - *
    - *
  • #MDB_REVERSEKEY + * - MDB_REVERSEKEY * Keys are strings to be compared in reverse order, from the end * of the strings to the beginning. By default, Keys are treated as *strings and * compared from beginning to end. - *
  • #MDB_DUPSORT + * - MDB_DUPSORT * Duplicate keys may be used in the database. (Or, from another *perspective, * keys may have multiple data items, stored in sorted order.) By *default * keys must be unique and may have only a single data item. - *
  • #MDB_INTEGERKEY + * - MDB_INTEGERKEY * Keys are binary integers in native byte order, either unsigned *int - * or #mdbx_size_t, and will be sorted as such. + * or mdbx_size_t, and will be sorted as such. * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdbx_size_t.) * The keys must all be of the same size. - *
  • #MDB_DUPFIXED - * This flag may only be used in combination with #MDB_DUPSORT. + * - MDB_DUPFIXED + * This flag may only be used in combination with MDB_DUPSORT. *This option * tells the library that the data items for this database are all *the same * size, which allows further optimizations in storage and *retrieval. When - * all data items are the same size, the #MDB_GET_MULTIPLE, - *#MDB_NEXT_MULTIPLE - * and #MDB_PREV_MULTIPLE cursor operations may be used to retrieve + * all data items are the same size, the MDB_GET_MULTIPLE, + *MDB_NEXT_MULTIPLE + * and MDB_PREV_MULTIPLE cursor operations may be used to retrieve *multiple * items at once. - *
  • #MDB_INTEGERDUP + * - MDB_INTEGERDUP * This option specifies that duplicate data items are binary *integers, - * similar to #MDB_INTEGERKEY keys. - *
  • #MDB_REVERSEDUP + * similar to MDB_INTEGERKEY keys. + * - MDB_REVERSEDUP * This option specifies that duplicate data items should be *compared as * strings in reverse order. - *
  • #MDB_CREATE + * - MDB_CREATE * Create the named database if it doesn't exist. This option is *not * allowed in a read-only transaction or a read-only environment. - *
- * @param[out] dbi Address where the new #MDB_dbi handle will be stored - * @return A non-zero error value on failure and 0 on success. Some possible + * [out] dbi Address where the new MDB_dbi handle will be stored + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • #MDB_NOTFOUND - the specified database doesn't exist in the + * - MDB_NOTFOUND - the specified database doesn't exist in the *environment - * and #MDB_CREATE was not specified. - *
  • #MDB_DBS_FULL - too many databases have been opened. See - *#mdbx_env_set_maxdbs(). - *
+ * and MDB_CREATE was not specified. + * - MDB_DBS_FULL - too many databases have been opened. See + *mdbx_env_set_maxdbs(). */ int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi); -/** @brief Retrieve statistics for a database. +/* Retrieve statistics for a database. * - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[out] stat The address of an #MDB_stat structure + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [out] stat The address of an MDB_stat structure * where the statistics will be copied - * @return A non-zero error value on failure and 0 on success. Some possible + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EINVAL - an invalid parameter was specified. */ int mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, size_t bytes); -/** @brief Retrieve the DB flags for a database handle. +/* Retrieve the DB flags for a database handle. * - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[out] flags Address where the flags will be returned. - * @return A non-zero error value on failure and 0 on success. + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [out] flags Address where the flags will be returned. + * Returns A non-zero error value on failure and 0 on success. */ int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); -/** @brief Close a database handle. Normally unnecessary. Use with care: +/* Close a database handle. Normally unnecessary. Use with care: * * This call is not mutex protected. Handles should only be closed by * a single thread, and only if no other threads are going to reference @@ -1148,140 +1090,133 @@ int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); * Doing so can cause misbehavior from database corruption to errors * like MDB_BAD_VALSIZE (since the DB name is gone). * - * Closing a database handle is not necessary, but lets #mdbx_dbi_open() + * Closing a database handle is not necessary, but lets mdbx_dbi_open() * reuse the handle value. Usually it's better to set a bigger - * #mdbx_env_set_maxdbs(), unless that value would be large. + * mdbx_env_set_maxdbs(), unless that value would be large. * - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() + * [in] env An environment handle returned by mdbx_env_create() + * [in] dbi A database handle returned by mdbx_dbi_open() */ void mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); -/** @brief Empty or delete+close a database. 
+/* Empty or delete+close a database. * - * See #mdbx_dbi_close() for restrictions about closing the DB handle. - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[in] del 0 to empty the DB, 1 to delete it from the + * See mdbx_dbi_close() for restrictions about closing the DB handle. + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] del 0 to empty the DB, 1 to delete it from the * environment and close the DB handle. - * @return A non-zero error value on failure and 0 on success. + * Returns A non-zero error value on failure and 0 on success. */ int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); -/** @brief Set a custom key comparison function for a database. +/* Set a custom key comparison function for a database. * * The comparison function is called whenever it is necessary to compare a * key specified by the application with a key currently stored in the *database. * If no comparison function is specified, and no special key flags were *specified - * with #mdbx_dbi_open(), the keys are compared lexically, with shorter keys + * with mdbx_dbi_open(), the keys are compared lexically, with shorter keys *collating * before longer keys. - * @warning This function must be called before any data access functions are + * Warning: This function must be called before any data access functions are *used, * otherwise data corruption may occur. The same comparison function must be *used by every * program accessing the database, every time the database is used. - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[in] cmp A #MDB_cmp_func function - * @return A non-zero error value on failure and 0 on success. 
Some possible + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] cmp A MDB_cmp_func function + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EINVAL - an invalid parameter was specified. */ int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); -/** @brief Set a custom data comparison function for a #MDB_DUPSORT database. +/* Set a custom data comparison function for a MDB_DUPSORT database. * * This comparison function is called whenever it is necessary to compare a *data * item specified by the application with a data item currently stored in the *database. * This function only takes effect if the database was opened with the - *#MDB_DUPSORT + *MDB_DUPSORT * flag. * If no comparison function is specified, and no special key flags were *specified - * with #mdbx_dbi_open(), the data items are compared lexically, with shorter + * with mdbx_dbi_open(), the data items are compared lexically, with shorter *items collating * before longer items. - * @warning This function must be called before any data access functions are + * Warning: This function must be called before any data access functions are *used, * otherwise data corruption may occur. The same comparison function must be *used by every * program accessing the database, every time the database is used. - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[in] cmp A #MDB_cmp_func function - * @return A non-zero error value on failure and 0 on success. Some possible + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] cmp A MDB_cmp_func function + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EINVAL - an invalid parameter was specified. */ int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); -/** @brief Get items from a database. +/* Get items from a database. * * This function retrieves key/data pairs from the database. The address * and length of the data associated with the specified \b key are returned * in the structure to which \b data refers. - * If the database supports duplicate keys (#MDB_DUPSORT) then the + * If the database supports duplicate keys (MDB_DUPSORT) then the * first data item for the key will be returned. Retrieval of other - * items requires the use of #mdbx_cursor_get(). + * items requires the use of mdbx_cursor_get(). * - * @note The memory pointed to by the returned values is owned by the + * Note: The memory pointed to by the returned values is owned by the * database. The caller need not dispose of the memory, and may not * modify it in any way. For values returned in a read-only transaction * any modification attempts will cause a SIGSEGV. - * @note Values returned from the database are valid only until a + * Note: Values returned from the database are valid only until a * subsequent update operation, or the end of the transaction. - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[in] key The key to search for in the database - * @param[out] data The data corresponding to the key - * @return A non-zero error value on failure and 0 on success. Some possible + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] key The key to search for in the database + * [out] data The data corresponding to the key + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • #MDB_NOTFOUND - the key was not in the database. - *
  • EINVAL - an invalid parameter was specified. - *
+ * - MDB_NOTFOUND - the key was not in the database. + * - EINVAL - an invalid parameter was specified. */ int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); -/** @brief Store items into a database. +/* Store items into a database. * * This function stores key/data pairs in the database. The default behavior * is to enter the new key/data pair, replacing any previously existing key * if duplicates are disallowed, or adding a duplicate data item if - * duplicates are allowed (#MDB_DUPSORT). - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[in] key The key to store in the database - * @param[in,out] data The data to store - * @param[in] flags Special options for this operation. This parameter + * duplicates are allowed (MDB_DUPSORT). + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] key The key to store in the database + * [in,out] data The data to store + * [in] flags Special options for this operation. This parameter * must be set to 0 or by bitwise OR'ing together one or more of the * values described here. - *
    - *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not + * - MDB_NODUPDATA - enter the new key/data pair only if it does not * already appear in the database. This flag may only be *specified - * if the database was opened with #MDB_DUPSORT. The function + * if the database was opened with MDB_DUPSORT. The function *will - * return #MDB_KEYEXIST if the key/data pair already appears in + * return MDB_KEYEXIST if the key/data pair already appears in *the * database. - *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key + * - MDB_NOOVERWRITE - enter the new key/data pair only if the key * does not already appear in the database. The function will *return - * #MDB_KEYEXIST if the key already appears in the database, even + * MDB_KEYEXIST if the key already appears in the database, even *if - * the database supports duplicates (#MDB_DUPSORT). The \b data + * the database supports duplicates (MDB_DUPSORT). The \b data * parameter will be set to point to the existing item. - *
  • #MDB_RESERVE - reserve space for data of the given size, but + * - MDB_RESERVE - reserve space for data of the given size, but * don't copy the given data. Instead, return a pointer to the * reserved space, which the caller can fill in later - before * the next update operation or the transaction ends. This saves @@ -1289,27 +1224,24 @@ int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); * LMDB does nothing else with this memory, the caller is *expected * to modify all of the space requested. This flag must not be - * specified if the database was opened with #MDB_DUPSORT. - *
  • #MDB_APPEND - append the given key/data pair to the end of the + * specified if the database was opened with MDB_DUPSORT. + * - MDB_APPEND - append the given key/data pair to the end of the * database. This option allows fast bulk loading when keys are * already known to be in the correct order. Loading unsorted *keys - * with this flag will cause a #MDB_KEYEXIST error. - *
  • #MDB_APPENDDUP - as above, but for sorted dup data. - *
- * @return A non-zero error value on failure and 0 on success. Some possible + * with this flag will cause a MDB_KEYEXIST error. + * - MDB_APPENDDUP - as above, but for sorted dup data. + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • #MDB_MAP_FULL - the database is full, see #mdbx_env_set_mapsize(). - *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
+ * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). + * - MDB_TXN_FULL - the transaction has too many dirty pages. + * - EACCES - an attempt was made to write in a read-only transaction. + * - EINVAL - an invalid parameter was specified. */ int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); -/** @brief Delete items from a database. +/* Delete items from a database. * * This function removes key/data pairs from the database. * @@ -1320,68 +1252,64 @@ int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, * * LMDB-compatible mode: * If the database does not support sorted duplicate data items - * (#MDB_DUPSORT) the data parameter is ignored. + * (MDB_DUPSORT) the data parameter is ignored. * If the database supports sorted duplicates and the data parameter * is NULL, all of the duplicate data items for the key will be * deleted. Otherwise, if the data parameter is non-NULL * only the matching data item will be deleted. * - * This function will return #MDB_NOTFOUND if the specified key/data + * This function will return MDB_NOTFOUND if the specified key/data * pair is not in the database. - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[in] key The key to delete from the database - * @param[in] data The data to delete - * @return A non-zero error value on failure and 0 on success. Some possible + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] key The key to delete from the database + * [in] data The data to delete + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EACCES - an attempt was made to write in a read-only transaction. + * - EINVAL - an invalid parameter was specified. */ int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); -/** @brief Create a cursor handle. +/* Create a cursor handle. * * A cursor is associated with a specific transaction and database. * A cursor cannot be used when its database handle is closed. Nor - * when its transaction has ended, except with #mdbx_cursor_renew(). - * It can be discarded with #mdbx_cursor_close(). + * when its transaction has ended, except with mdbx_cursor_renew(). + * It can be discarded with mdbx_cursor_close(). * * MDBX-mode: * A cursor must be closed explicitly always, before * or after its transaction ends. It can be reused with - * #mdbx_cursor_renew() before finally closing it. + * mdbx_cursor_renew() before finally closing it. * * LMDB-compatible mode: * A cursor in a write-transaction can be closed before its transaction * ends, and will otherwise be closed when its transaction ends. * A cursor in a read-only transaction must be closed explicitly, before * or after its transaction ends. It can be reused with - * #mdbx_cursor_renew() before finally closing it. - * @note Earlier documentation said that cursors in every transaction + * mdbx_cursor_renew() before finally closing it. + * Note: Earlier documentation said that cursors in every transaction * were closed when the transaction committed or aborted. * - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[out] cursor Address where the new #MDB_cursor handle will be stored - * @return A non-zero error value on failure and 0 on success. 
Some possible + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [out] cursor Address where the new MDB_cursor handle will be stored + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EINVAL - an invalid parameter was specified. */ int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); -/** @brief Close a cursor handle. +/* Close a cursor handle. * * The cursor handle will be freed and must not be used again after this call. * Its transaction must still be live if it is a write-transaction. - * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * [in] cursor A cursor handle returned by mdbx_cursor_open() */ void mdbx_cursor_close(MDB_cursor *cursor); -/** @brief Renew a cursor handle. +/* Renew a cursor handle. * * A cursor is associated with a specific transaction and database. * Cursors that are only used in read-only @@ -1389,105 +1317,100 @@ void mdbx_cursor_close(MDB_cursor *cursor); * The cursor may be associated with a new read-only transaction, and * referencing the same database handle as it was created with. * This may be done whether the previous transaction is live or dead. - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() - * @return A non-zero error value on failure and 0 on success. Some possible + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] cursor A cursor handle returned by mdbx_cursor_open() + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EINVAL - an invalid parameter was specified. */ int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); -/** @brief Return the cursor's transaction handle. +/* Return the cursor's transaction handle. * - * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * [in] cursor A cursor handle returned by mdbx_cursor_open() */ MDB_txn *mdbx_cursor_txn(MDB_cursor *cursor); -/** @brief Return the cursor's database handle. +/* Return the cursor's database handle. * - * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() + * [in] cursor A cursor handle returned by mdbx_cursor_open() */ MDB_dbi mdbx_cursor_dbi(MDB_cursor *cursor); -/** @brief Retrieve by cursor. +/* Retrieve by cursor. * * This function retrieves key/data pairs from the database. The address and *length * of the key are returned in the object to which \b key refers (except for *the - * case of the #MDB_SET option, in which the \b key object is unchanged), and + * case of the MDB_SET option, in which the \b key object is unchanged), and * the address and length of the data are returned in the object to which \b *data * refers. - * See #mdbx_get() for restrictions on using the output values. - * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() - * @param[in,out] key The key for a retrieved item - * @param[in,out] data The data of a retrieved item - * @param[in] op A cursor operation #MDB_cursor_op - * @return A non-zero error value on failure and 0 on success. Some possible + * See mdbx_get() for restrictions on using the output values. + * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [in,out] key The key for a retrieved item + * [in,out] data The data of a retrieved item + * [in] op A cursor operation MDB_cursor_op + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • #MDB_NOTFOUND - no matching key found. - *
  • EINVAL - an invalid parameter was specified. - *
+ * - MDB_NOTFOUND - no matching key found. + * - EINVAL - an invalid parameter was specified. */ int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, MDB_cursor_op op); -/** @brief Store by cursor. +/* Store by cursor. * * This function stores key/data pairs into the database. * The cursor is positioned at the new item, or on failure usually near it. - * @note Earlier documentation incorrectly said errors would leave the + * Note: Earlier documentation incorrectly said errors would leave the * state of the cursor unchanged. - * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() - * @param[in] key The key operated on. - * @param[in] data The data operated on. - * @param[in] flags Options for this operation. This parameter + * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [in] key The key operated on. + * [in] data The data operated on. + * [in] flags Options for this operation. This parameter * must be set to 0 or one of the values described here. - *
    - *
  • #MDB_CURRENT - replace the item at the current cursor position. + * - MDB_CURRENT - replace the item at the current cursor position. * The \b key parameter must still be provided, and must match *it. - * If using sorted duplicates (#MDB_DUPSORT) the data item must + * If using sorted duplicates (MDB_DUPSORT) the data item must *still * sort into the same place. This is intended to be used when the * new data is the same size as the old. Otherwise it will simply * perform a delete of the old record followed by an insert. - *
  • #MDB_NODUPDATA - enter the new key/data pair only if it does not + * - MDB_NODUPDATA - enter the new key/data pair only if it does not * already appear in the database. This flag may only be *specified - * if the database was opened with #MDB_DUPSORT. The function + * if the database was opened with MDB_DUPSORT. The function *will - * return #MDB_KEYEXIST if the key/data pair already appears in + * return MDB_KEYEXIST if the key/data pair already appears in *the * database. - *
  • #MDB_NOOVERWRITE - enter the new key/data pair only if the key + * - MDB_NOOVERWRITE - enter the new key/data pair only if the key * does not already appear in the database. The function will *return - * #MDB_KEYEXIST if the key already appears in the database, even + * MDB_KEYEXIST if the key already appears in the database, even *if - * the database supports duplicates (#MDB_DUPSORT). - *
  • #MDB_RESERVE - reserve space for data of the given size, but + * the database supports duplicates (MDB_DUPSORT). + * - MDB_RESERVE - reserve space for data of the given size, but * don't copy the given data. Instead, return a pointer to the * reserved space, which the caller can fill in later - before * the next update operation or the transaction ends. This saves * an extra memcpy if the data is being generated later. This *flag * must not be specified if the database was opened with - *#MDB_DUPSORT. - *
  • #MDB_APPEND - append the given key/data pair to the end of the + *MDB_DUPSORT. + * - MDB_APPEND - append the given key/data pair to the end of the * database. No key comparisons are performed. This option allows * fast bulk loading when keys are already known to be in the * correct order. Loading unsorted keys with this flag will cause - * a #MDB_KEYEXIST error. - *
  • #MDB_APPENDDUP - as above, but for sorted dup data. - *
  • #MDB_MULTIPLE - store multiple contiguous data elements in a + * a MDB_KEYEXIST error. + * - MDB_APPENDDUP - as above, but for sorted dup data. + * - MDB_MULTIPLE - store multiple contiguous data elements in a * single request. This flag may only be specified if the *database - * was opened with #MDB_DUPFIXED. The \b data argument must be an + * was opened with MDB_DUPFIXED. The \b data argument must be an * array of two MDB_vals. The mv_size of the first MDB_val must *be * the size of a single data element. The mv_data of the first @@ -1500,100 +1423,91 @@ int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * the count of the number of elements actually written. The *mv_data * of the second MDB_val is unused. - *
- * @return A non-zero error value on failure and 0 on success. Some possible + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • #MDB_MAP_FULL - the database is full, see #mdbx_env_set_mapsize(). - *
  • #MDB_TXN_FULL - the transaction has too many dirty pages. - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
+ * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). + * - MDB_TXN_FULL - the transaction has too many dirty pages. + * - EACCES - an attempt was made to write in a read-only transaction. + * - EINVAL - an invalid parameter was specified. */ int mdbx_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, unsigned flags); -/** @brief Delete current key/data pair +/* Delete current key/data pair * * This function deletes the key/data pair to which the cursor refers. - * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() - * @param[in] flags Options for this operation. This parameter + * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [in] flags Options for this operation. This parameter * must be set to 0 or one of the values described here. - *
    - *
  • #MDB_NODUPDATA - delete all of the data items for the current key. + * - MDB_NODUPDATA - delete all of the data items for the current key. * This flag may only be specified if the database was opened with - *#MDB_DUPSORT. - *
- * @return A non-zero error value on failure and 0 on success. Some possible + *MDB_DUPSORT. + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EACCES - an attempt was made to write in a read-only transaction. - *
  • EINVAL - an invalid parameter was specified. - *
+ * - EACCES - an attempt was made to write in a read-only transaction. + * - EINVAL - an invalid parameter was specified. */ int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); -/** @brief Return count of duplicates for current key. +/* Return count of duplicates for current key. * * This call is only valid on databases that support sorted duplicate - * data items #MDB_DUPSORT. - * @param[in] cursor A cursor handle returned by #mdbx_cursor_open() - * @param[out] countp Address where the count will be stored - * @return A non-zero error value on failure and 0 on success. Some possible + * data items MDB_DUPSORT. + * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [out] countp Address where the count will be stored + * Returns A non-zero error value on failure and 0 on success. Some possible * errors are: - *
    - *
  • EINVAL - cursor is not initialized, or an invalid parameter was + * - EINVAL - cursor is not initialized, or an invalid parameter was *specified. - *
*/ int mdbx_cursor_count(MDB_cursor *cursor, size_t *countp); -/** @brief Compare two data items according to a particular database. +/* Compare two data items according to a particular database. * * This returns a comparison as if the two data items were keys in the * specified database. - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[in] a The first item to compare - * @param[in] b The second item to compare - * @return < 0 if a < b, 0 if a == b, > 0 if a > b + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] a The first item to compare + * [in] b The second item to compare + * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); -/** @brief Compare two data items according to a particular database. +/* Compare two data items according to a particular database. * * This returns a comparison as if the two items were data items of - * the specified database. The database must have the #MDB_DUPSORT flag. - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - * @param[in] a The first item to compare - * @param[in] b The second item to compare - * @return < 0 if a < b, 0 if a == b, > 0 if a > b + * the specified database. The database must have the MDB_DUPSORT flag. + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] a The first item to compare + * [in] b The second item to compare + * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); -/** @brief A callback function used to print a message from the library. +/* A callback function used to print a message from the library. 
* - * @param[in] msg The string to be printed. - * @param[in] ctx An arbitrary context pointer for the callback. - * @return < 0 on failure, >= 0 on success. + * [in] msg The string to be printed. + * [in] ctx An arbitrary context pointer for the callback. + * Returns < 0 on failure, >= 0 on success. */ typedef int(MDB_msg_func)(const char *msg, void *ctx); -/** @brief Dump the entries in the reader lock table. +/* Dump the entries in the reader lock table. * - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[in] func A #MDB_msg_func function - * @param[in] ctx Anything the message function needs - * @return < 0 on failure, >= 0 on success. + * [in] env An environment handle returned by mdbx_env_create() + * [in] func A MDB_msg_func function + * [in] ctx Anything the message function needs + * Returns < 0 on failure, >= 0 on success. */ int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); -/** @brief Check for stale entries in the reader lock table. +/* Check for stale entries in the reader lock table. * - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[out] dead Number of stale slots that were cleared - * @return 0 on success, non-zero on failure. + * [in] env An environment handle returned by mdbx_env_create() + * [out] dead Number of stale slots that were cleared + * Returns 0 on success, non-zero on failure. */ int mdbx_reader_check(MDB_env *env, int *dead); @@ -1601,46 +1515,46 @@ char *mdbx_dkey(MDB_val *key, char *buf); int mdbx_env_close_ex(MDB_env *env, int dont_sync); -/** @brief Set threshold to force flush the data buffers to disk, - * even of #MDB_NOSYNC, #MDB_NOMETASYNC and #MDB_MAPASYNC flags +/* Set threshold to force flush the data buffers to disk, + * regardless of the MDB_NOSYNC, MDB_NOMETASYNC and MDB_MAPASYNC flags * in the environment.
* - * Data is always written to disk when #mdbx_txn_commit() is called, + * Data is always written to disk when mdbx_txn_commit() is called, * but the operating system may keep it buffered. LMDB always flushes * the OS buffers upon commit as well, unless the environment was - * opened with #MDB_NOSYNC or in part #MDB_NOMETASYNC. + * opened with MDB_NOSYNC or in part MDB_NOMETASYNC. * * The default is 0, than mean no any threshold checked, * and no additional flush will be made. * - * @param[in] env An environment handle returned by #mdbx_env_create() - * @param[in] bytes The size in bytes of summary changes + * [in] env An environment handle returned by mdbx_env_create() + * [in] bytes The size in bytes of summary changes * when a synchronous flush would be made. - * @return A non-zero error value on failure and 0 on success. + * Returns A non-zero error value on failure and 0 on success. */ int mdbx_env_set_syncbytes(MDB_env *env, size_t bytes); -/** @brief Returns a lag of the reading. +/* Returns a lag of the reading. * * Returns an information for estimate how much given read-only * transaction is lagging relative the to actual head. * - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[out] percent Percentage of page allocation in the database. - * @return Number of transactions committed after the given was started for + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [out] percent Percentage of page allocation in the database. + * Returns Number of transactions committed after the given was started for * read, or -1 on failure. */ int mdbx_txn_straggler(MDB_txn *txn, int *percent); -/** @brief A callback function for killing a laggard readers, +/* A callback function for killing a laggard readers, * but also could waiting ones. Called in case of MDB_MAP_FULL error. * - * @param[in] env An environment handle returned by #mdbx_env_create(). - * @param[in] pid pid of the reader process. 
- * @param[in] thread_id thread_id of the reader thread. - * @param[in] txn Transaction number on which stalled. - * @param[in] gap a lag from the last commited txn. - * @param[in] retry a retry number, less that zero for notify end of OOM-loop. - * @return -1 on failure (reader is not killed), + * [in] env An environment handle returned by mdbx_env_create(). + * [in] pid pid of the reader process. + * [in] thread_id thread_id of the reader thread. + * [in] txn Transaction number on which stalled. + * [in] gap a lag from the last committed txn. + * [in] retry a retry number, less than zero for notify end of OOM-loop. + * Returns -1 on failure (reader is not killed), * 0 on a race condition (no such reader), * 1 on success (reader was killed), * >1 on success (reader was SURE killed). @@ -1648,23 +1562,23 @@ int mdbx_txn_straggler(MDB_txn *txn, int *percent); typedef int(MDBX_oom_func)(MDB_env *env, int pid, void *thread_id, size_t txn, unsigned gap, int retry); -/** @brief Set the OOM callback. +/* Set the OOM callback. * * Callback will be called only on out-of-pages case for killing * a laggard readers to allowing reclaiming of freeDB. * - * @param[in] env An environment handle returned by #mdbx_env_create(). - * @param[in] oomfunc A #MDBX_oom_func function or NULL to disable. + * [in] env An environment handle returned by mdbx_env_create(). + * [in] oomfunc A #MDBX_oom_func function or NULL to disable. */ void mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); -/** @brief Get the current oom_func callback. +/* Get the current oom_func callback. * * Callback will be called only on out-of-pages case for killing * a laggard readers to allowing reclaiming of freeDB. * - * @param[in] env An environment handle returned by #mdbx_env_create(). - * @return A #MDBX_oom_func function or NULL if disabled. + * [in] env An environment handle returned by mdbx_env_create(). + * Returns A #MDBX_oom_func function or NULL if disabled.
*/ MDBX_oom_func *mdbx_env_get_oomfunc(MDB_env *env); From 8fb252327614e6a035b8f711b9fc977f7ba77453 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 27 Feb 2017 21:17:22 +0300 Subject: [PATCH 012/303] mdbx: add MDBX_EBADSIGN. --- mdbx.c | 76 +++++++++++++++++++++++++++++----------------------------- mdbx.h | 19 +++++++++++---- 2 files changed, 52 insertions(+), 43 deletions(-) diff --git a/mdbx.c b/mdbx.c index 81e82a6d..175406d2 100644 --- a/mdbx.c +++ b/mdbx.c @@ -2697,7 +2697,7 @@ int mdbx_env_sync(MDB_env *env, int force) { return EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!env->me_txns)) return MDB_PANIC; @@ -3054,7 +3054,7 @@ int mdbx_txn_renew(MDB_txn *txn) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY | MDB_TXN_FINISHED))) return EINVAL; @@ -3078,7 +3078,7 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, return EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(env->me_pid != getpid())) { env->me_flags |= MDB_FATAL_ERROR; @@ -3318,7 +3318,7 @@ int mdbx_txn_reset(MDB_txn *txn) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; /* This call is only valid for read-only txns */ if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY))) @@ -3337,7 +3337,7 @@ int mdbx_txn_abort(MDB_txn *txn) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; #if MDBX_MODE_ENABLED if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) @@ -3811,7 +3811,7 @@ int mdbx_txn_commit(MDB_txn *txn) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; MDB_env *env = 
txn->mt_env; if (unlikely(env->me_pid != getpid())) { @@ -4452,7 +4452,7 @@ int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { return EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(size < env->me_psize * 8)) return EINVAL; @@ -4495,7 +4495,7 @@ int __cold mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { return EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(env->me_map)) return EINVAL; @@ -4509,7 +4509,7 @@ int __cold mdbx_env_set_maxreaders(MDB_env *env, unsigned readers) { return EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(env->me_map)) return EINVAL; @@ -4523,7 +4523,7 @@ int __cold mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers) { return EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; *readers = env->me_maxreaders; return MDB_SUCCESS; @@ -5108,7 +5108,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, return EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (env->me_fd != INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE | CHANGELESS))) @@ -5322,7 +5322,7 @@ int __cold mdbx_env_close_ex(MDB_env *env, int dont_sync) { if (unlikely(!env)) return EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (!dont_sync && env->me_txns) rc = mdbx_env_sync(env, 1); @@ -5976,7 +5976,7 @@ int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return EINVAL; @@ -6507,7 +6507,7 @@ int 
mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, return EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) return MDB_BAD_TXN; @@ -6712,7 +6712,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; env = mc->mc_txn->mt_env; @@ -7270,7 +7270,7 @@ int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { return EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; @@ -7835,7 +7835,7 @@ int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) return EINVAL; @@ -7870,7 +7870,7 @@ int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE && mc->mc_signature != MDBX_MC_READY4CLOSE)) @@ -7908,7 +7908,7 @@ int mdbx_cursor_count(MDB_cursor *mc, size_t *countp) { return EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) return MDB_BAD_TXN; @@ -8757,7 +8757,7 @@ int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return EINVAL; @@ -9250,7 +9250,7 @@ int 
mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return EINVAL; @@ -10009,7 +10009,7 @@ int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(flags & ~VALID_FLAGS)) return EINVAL; @@ -10122,7 +10122,7 @@ int __cold mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *arg, size_t bytes) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) return EINVAL; @@ -10162,7 +10162,7 @@ int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) return EINVAL; @@ -10274,7 +10274,7 @@ int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return EINVAL; @@ -10327,7 +10327,7 @@ int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return EINVAL; @@ -10341,7 +10341,7 @@ int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return EINVAL; @@ -10366,7 +10366,7 @@ int __cold mdbx_reader_list(MDB_env *env, 
MDB_msg_func *func, void *ctx) { return -EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; rdrs = env->me_txns->mti_numreaders; mr = env->me_txns->mti_readers; @@ -10920,7 +10920,7 @@ int __cold mdbx_env_set_syncbytes(MDB_env *env, size_t bytes) { return EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; env->me_sync_threshold = bytes; return env->me_map ? mdbx_env_sync(env, 0) : MDB_SUCCESS; @@ -10949,7 +10949,7 @@ ATTRIBUTE_NO_SANITIZE_THREAD /* LY: avoid tsan-trap by me_txn, mm_last_pg and return -EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!txn->mt_u.reader)) return -1; @@ -11109,7 +11109,7 @@ int __cold mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, if (unlikely(!txn)) return MDB_BAD_TXN; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; ctx.mw_txn = txn; ctx.mw_user = user; @@ -11132,7 +11132,7 @@ int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) return EACCES; @@ -11162,7 +11162,7 @@ int mdbx_cursor_on_first(MDB_cursor *mc) { return EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (!(mc->mc_flags & C_INITIALIZED)) return MDBX_RESULT_FALSE; @@ -11181,7 +11181,7 @@ int mdbx_cursor_on_last(MDB_cursor *mc) { return EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (!(mc->mc_flags & C_INITIALIZED)) return MDBX_RESULT_FALSE; @@ -11201,7 +11201,7 @@ int mdbx_cursor_eof(MDB_cursor *mc) { return EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) - 
return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if ((mc->mc_flags & C_INITIALIZED) == 0) return MDBX_RESULT_TRUE; @@ -11253,7 +11253,7 @@ int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(old_data->iov_base == NULL && old_data->iov_len)) return EINVAL; @@ -11406,7 +11406,7 @@ int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return EINVAL; @@ -11467,7 +11467,7 @@ int mdbx_is_dirty(const MDB_txn *txn, const void *ptr) { return EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDB_VERSION_MISMATCH; + return MDBX_EBADSIGN; if (unlikely(txn->mt_flags & MDB_TXN_RDONLY)) return MDB_BAD_TXN; diff --git a/mdbx.h b/mdbx.h index 05c4205f..77f05c21 100644 --- a/mdbx.h +++ b/mdbx.h @@ -247,6 +247,9 @@ typedef enum MDB_cursor_op { /* Successful result */ #define MDB_SUCCESS 0 +#define MDBX_RESULT_FALSE MDB_SUCCESS +#define MDBX_RESULT_TRUE (-1) + /* key/data pair already exists */ #define MDB_KEYEXIST (-30799) /* key/data pair not found (EOF) */ @@ -257,7 +260,7 @@ typedef enum MDB_cursor_op { #define MDB_CORRUPTED (-30796) /* Update of meta page failed or environment had fatal error */ #define MDB_PANIC (-30795) -/* Environment version mismatch */ +/* DB file version mismatch with libmdbx */ #define MDB_VERSION_MISMATCH (-30794) /* File is not a valid LMDB file */ #define MDB_INVALID (-30793) @@ -295,6 +298,16 @@ typedef enum MDB_cursor_op { /* The last defined error code */ #define MDB_LAST_ERRCODE MDB_PROBLEM +/* The mdbx_put() or mdbx_replace() was called for key, + that has more than one associated value.
*/ +#define MDBX_EMULTIVAL (-30421) + +/* Bad signature of a runtime object(s), this can mean: + * - memory corruption or double-free; + * - ABI version mismatch (rare case); */ +#define MDBX_EBADSIGN (-30420) + + /* Statistics for a database in the environment */ typedef struct MDBX_stat { unsigned ms_psize; /* Size of a database page. @@ -1621,10 +1634,6 @@ int mdbx_cursor_on_first(MDB_cursor *mc); /* Returns: MDBX_RESULT_TRUE, MDBX_RESULT_FALSE or Error code. */ int mdbx_cursor_on_last(MDB_cursor *mc); -#define MDBX_EMULTIVAL (MDB_LAST_ERRCODE - 42) -#define MDBX_RESULT_FALSE MDB_SUCCESS -#define MDBX_RESULT_TRUE (-1) - int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, MDB_val *old_data, unsigned flags); /* Same as mdbx_get(), but: From 57dc59ecfbf2ec06f84819d8002e1f7a923488b4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 28 Feb 2017 16:54:10 +0300 Subject: [PATCH 013/303] mdbx: add mdbx_strerror_r(). --- mdbx.c | 87 ++++++++++++++++++++++++++++++++++------------------------ mdbx.h | 5 ++-- 2 files changed, 54 insertions(+), 38 deletions(-) diff --git a/mdbx.c b/mdbx.c index 175406d2..3c5beff4 100644 --- a/mdbx.c +++ b/mdbx.c @@ -1328,7 +1328,7 @@ static pthread_mutex_t tsan_mutex = PTHREAD_MUTEX_INITIALIZER; #endif /** Return the library version info. 
*/ -char *__cold mdbx_version(int *major, int *minor, int *patch) { +const char *mdbx_version(int *major, int *minor, int *patch) { if (major) *major = MDB_VERSION_MAJOR; if (minor) @@ -1338,45 +1338,60 @@ char *__cold mdbx_version(int *major, int *minor, int *patch) { return MDB_VERSION_STRING; } -/** Table of descriptions for LMDB @ref errors */ -static char *const mdbx_errstr[] = { - "MDB_KEYEXIST: Key/data pair already exists", - "MDB_NOTFOUND: No matching key/data pair found", - "MDB_PAGE_NOTFOUND: Requested page not found", - "MDB_CORRUPTED: Located page was wrong type", - "MDB_PANIC: Update of meta page failed or environment had fatal error", - "MDB_VERSION_MISMATCH: Database environment version mismatch", - "MDB_INVALID: File is not an LMDB file", - "MDB_MAP_FULL: Environment mapsize limit reached", - "MDB_DBS_FULL: Environment maxdbs limit reached", - "MDB_READERS_FULL: Environment maxreaders limit reached", - "MDB_TLS_FULL: Thread-local storage keys full - too many environments " - "open", - "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too " - "big", - "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", - "MDB_PAGE_FULL: Internal error - page has no more space", - "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", - "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", - "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", - "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", - "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong " - "DUPFIXED size", - "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", - "MDB_PROBLEM: Unexpected problem - txn should abort", -}; - -char *__cold mdbx_strerror(int err) { - int i; - if (!err) - return ("Successful return: 0"); +static const char *__mdbx_strerr(int err) { + /* Table of descriptions for LMDB errors */ + static const char *const tbl[] = { + "MDB_KEYEXIST: Key/data pair already exists", + 
"MDB_NOTFOUND: No matching key/data pair found", + "MDB_PAGE_NOTFOUND: Requested page not found", + "MDB_CORRUPTED: Located page was wrong type", + "MDB_PANIC: Update of meta page failed or environment had fatal error", + "MDB_VERSION_MISMATCH: Database environment version mismatch", + "MDB_INVALID: File is not an LMDB file", + "MDB_MAP_FULL: Environment mapsize limit reached", + "MDB_DBS_FULL: Environment maxdbs limit reached", + "MDB_READERS_FULL: Environment maxreaders limit reached", + "MDB_TLS_FULL: Thread-local storage keys full - too many environments " + "open", + "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too " + "big", + "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", + "MDB_PAGE_FULL: Internal error - page has no more space", + "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", + "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", + "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", + "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", + "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong " + "DUPFIXED size", + "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", + "MDB_PROBLEM: Unexpected problem - txn should abort", + }; if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { - i = err - MDB_KEYEXIST; - return mdbx_errstr[i]; + int i = err - MDB_KEYEXIST; + return tbl[i]; } - return strerror(err); + switch (err) { + case MDB_SUCCESS: + return "Successful return: 0"; + case MDBX_EMULTIVAL: + return ""; + case MDBX_EBADSIGN: + return ""; + default: + return NULL; + } +} + +const char *mdbx_strerror_r(int err, char *buf, size_t buflen) { + const char *msg = __mdbx_strerr(err); + return msg ? msg : strerror_r(err, buf, buflen); +} + +const char *__cold mdbx_strerror(int err) { + const char *msg = __mdbx_strerr(err); + return msg ? 
msg : strerror(err); } #if MDBX_MODE_ENABLED diff --git a/mdbx.h b/mdbx.h index 77f05c21..5aadfbb6 100644 --- a/mdbx.h +++ b/mdbx.h @@ -343,7 +343,7 @@ typedef struct MDBX_envinfo { * here * Returns "version string" The library version as a string */ -char *mdbx_version(int *major, int *minor, int *patch); +const char *mdbx_version(int *major, int *minor, int *patch); /* Return a string describing a given error code. * @@ -355,7 +355,8 @@ char *mdbx_version(int *major, int *minor, int *patch); * [in] err The error code * Returns "error message" The description of the error */ -char *mdbx_strerror(int err); +const char *mdbx_strerror(int err); +const char *mdbx_strerror_r(int err, char *buf, size_t buflen); /* Create an LMDB environment handle. * From f4bf45429835606d732c0584ca660ad9fccc0d0e Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 28 Feb 2017 17:24:29 +0300 Subject: [PATCH 014/303] mdbx: refine error-msg. --- mdbx.c | 37 ++++++++++++++++++------------------- mdbx.h | 4 ++-- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/mdbx.c b/mdbx.c index 3c5beff4..5f34b376 100644 --- a/mdbx.c +++ b/mdbx.c @@ -1338,21 +1338,20 @@ const char *mdbx_version(int *major, int *minor, int *patch) { return MDB_VERSION_STRING; } -static const char *__mdbx_strerr(int err) { +static const char *__mdbx_strerr(int errnum) { /* Table of descriptions for LMDB errors */ static const char *const tbl[] = { "MDB_KEYEXIST: Key/data pair already exists", "MDB_NOTFOUND: No matching key/data pair found", "MDB_PAGE_NOTFOUND: Requested page not found", - "MDB_CORRUPTED: Located page was wrong type", + "MDB_CORRUPTED: Located page was wrong data", "MDB_PANIC: Update of meta page failed or environment had fatal error", - "MDB_VERSION_MISMATCH: Database environment version mismatch", + "MDB_VERSION_MISMATCH: DB version mismatch libmdbx", "MDB_INVALID: File is not an LMDB file", "MDB_MAP_FULL: Environment mapsize limit reached", - "MDB_DBS_FULL: Environment maxdbs limit 
reached", - "MDB_READERS_FULL: Environment maxreaders limit reached", - "MDB_TLS_FULL: Thread-local storage keys full - too many environments " - "open", + "MDB_DBS_FULL: Too may DBI (maxdbs reached)", + "MDB_READERS_FULL: Too many readers (maxreaders reached)", + NULL /* -30789 unused in MDBX */, "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too " "big", "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", @@ -1367,31 +1366,31 @@ static const char *__mdbx_strerr(int err) { "MDB_PROBLEM: Unexpected problem - txn should abort", }; - if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) { - int i = err - MDB_KEYEXIST; + if (errnum >= MDB_KEYEXIST && errnum <= MDB_LAST_ERRCODE) { + int i = errnum - MDB_KEYEXIST; return tbl[i]; } - switch (err) { + switch (errnum) { case MDB_SUCCESS: - return "Successful return: 0"; + return "MDB_SUCCESS: Successful"; case MDBX_EMULTIVAL: - return ""; + return "MDBX_EMULTIVAL: Unable to update multi-value for the given key"; case MDBX_EBADSIGN: - return ""; + return "MDBX_EBADSIGN: Wrong signature of a runtime object(s)"; default: return NULL; } } -const char *mdbx_strerror_r(int err, char *buf, size_t buflen) { - const char *msg = __mdbx_strerr(err); - return msg ? msg : strerror_r(err, buf, buflen); +const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen) { + const char *msg = __mdbx_strerr(errnum); + return msg ? msg : strerror_r(errnum, buf, buflen); } -const char *__cold mdbx_strerror(int err) { - const char *msg = __mdbx_strerr(err); - return msg ? msg : strerror(err); +const char *__cold mdbx_strerror(int errnum) { + const char *msg = __mdbx_strerr(errnum); + return msg ? 
msg : strerror(errnum); } #if MDBX_MODE_ENABLED diff --git a/mdbx.h b/mdbx.h index 5aadfbb6..91aaf081 100644 --- a/mdbx.h +++ b/mdbx.h @@ -355,8 +355,8 @@ const char *mdbx_version(int *major, int *minor, int *patch); * [in] err The error code * Returns "error message" The description of the error */ -const char *mdbx_strerror(int err); -const char *mdbx_strerror_r(int err, char *buf, size_t buflen); +const char *mdbx_strerror(int errnum); +const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen); /* Create an LMDB environment handle. * From c2fda6be5aca460b74e2d04512f40890063f3a9d Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 6 Mar 2017 20:18:42 +0300 Subject: [PATCH 015/303] mdbx: refine mdbx_is_dirty(). --- mdbx.c | 56 ++++++++++++++++++++++---------------------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/mdbx.c b/mdbx.c index 5f34b376..d201c495 100644 --- a/mdbx.c +++ b/mdbx.c @@ -11484,7 +11484,7 @@ int mdbx_is_dirty(const MDB_txn *txn, const void *ptr) { return MDBX_EBADSIGN; if (unlikely(txn->mt_flags & MDB_TXN_RDONLY)) - return MDB_BAD_TXN; + return MDBX_RESULT_FALSE; const MDB_env *env = txn->mt_env; const uintptr_t mask = ~(uintptr_t)(env->me_psize - 1); @@ -11495,49 +11495,37 @@ int mdbx_is_dirty(const MDB_txn *txn, const void *ptr) { * что было исходно задумано, детали см в логике кода mdbx_page_touch(). * * Более того, в режиме БЕЗ WRITEMAP грязные страницы выделяются через - * malloc(), т.е. находятся вне mmap-диаппазона. + * malloc(), т.е. находятся вне mmap-диапазона. * - * Тем не менее, однозначно страница "не грязная" если: - * - адрес находится внутри mmap-диаппазона и в заголовке страницы - * нет флажка P_DIRTY, то однозначно страница "не грязная". - * - адрес вне mmap-диаппазона и его нет среди списка "грязных" страниц. - */ + * Тем не менее, однозначно страница "не грязная" если адрес находится + * внутри mmap-диапазона и в заголовке страницы нет флажка P_DIRTY. 
*/ if (env->me_map < (char *)page) { const size_t used_size = env->me_psize * txn->mt_next_pgno; - if (env->me_map + used_size > (char *)page) { - /* страница внутри диапазона */ - if (page->mp_flags & P_DIRTY) - return MDBX_RESULT_TRUE; - return MDBX_RESULT_FALSE; + if ((char *)page < env->me_map + used_size) { + /* страница внутри диапазона, смотрим на флажки */ + if ((page->mp_flags & (P_DIRTY | P_LOOSE | P_KEEP)) == 0) + return MDBX_RESULT_FALSE; } /* Гипотетически здесь возможна ситуация, когда указатель адресует что-то * в пределах mmap, но за границей распределенных страниц. Это тяжелая - * ошибка, которой не возможно добиться без каких-то мега-нарушений. + * ошибка, к которой не возможно прийти без каких-то больших нарушений. * Поэтому не проверяем этот случай кроме как assert-ом, ибо бестолку. */ mdbx_tassert(txn, env->me_map + env->me_mapsize > (char *)page); } - /* Страница вне mmap-диаппазона */ - if (env->me_flags & MDB_WRITEMAP) - /* Если MDB_WRITEMAP, то результат уже ясен. */ - return MDBX_RESULT_FALSE; - - /* Смотрим список грязных страниц у заданной транзакции. */ - MDB_ID2 *list = txn->mt_u.dirty_list; - if (list) { - unsigned i, n = list[0].mid; - for (i = 1; i <= n; i++) { - const MDB_page *dirty = list[i].mptr; - if (dirty == page) - return MDBX_RESULT_TRUE; - } - } - - /* При вложенных транзакциях, страница может быть в dirty-списке - * родительской транзакции, но в этом случае она будет скопирована перед - * изменением в текущей транзакции, т.е. относительно заданной транзакции - * проверяемый адрес "не грязный". */ - return MDBX_RESULT_FALSE; + /* Страница вне используемого mmap-диапазона, т.е. либо в функцию был + * передан некорректный адрес, либо адрес в теневой странице, которая была + * выделена посредством malloc(). + * + * Поэтому всегда считаем что страница вне mmap-диапазона "грязная", + * не просматривая при этом списки грязных и spilled страниц у каких-либо + * транзакций. 
Такая логика имеет ряд преимуществ: + * - не тратим время на просмотр списков; + * - результат всегда безопасен (может быть ложно-положительным, но + * не ложно-отрицательным); + * - результат не зависит от вложенности транзакций и от относительного + * положения переданной транзакции в этой рекурсии. */ + return MDBX_RESULT_TRUE; } int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, From 95e606606a2c51a4c0ea12b804a7d6957bcf34f2 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 16 Mar 2017 17:27:05 +0300 Subject: [PATCH 016/303] mdbx: subdirs. Change-Id: Iea70b29ed39f55ee363729300f6ce54127b4e880 --- Makefile | 107 ++++++---------------- barriers.h => src/barriers.h | 0 mdbx.c => src/mdbx.c | 0 mdbx.h => src/mdbx.h | 0 mdbx_chk.c => src/mdbx_chk.c | 0 mdbx_copy.1 => src/mdbx_copy.1 | 0 mdbx_copy.c => src/mdbx_copy.c | 0 mdbx_dump.1 => src/mdbx_dump.1 | 0 mdbx_dump.c => src/mdbx_dump.c | 0 mdbx_load.1 => src/mdbx_load.1 | 0 mdbx_load.c => src/mdbx_load.c | 0 mdbx_stat.1 => src/mdbx_stat.1 | 0 mdbx_stat.c => src/mdbx_stat.c | 0 midl.h => src/midl.h | 0 reopen.h => src/reopen.h | 0 mtest0.c => test/test0.c | 2 +- mtest1.c => test/test1.c | 2 +- mtest2.c => test/test2.c | 2 +- mtest3.c => test/test3.c | 2 +- mtest4.c => test/test4.c | 2 +- mtest5.c => test/test5.c | 2 +- mtest6.c => test/test6.c | 2 +- wbench.c => test/test_bench.c | 2 +- yota_test1.c => test/test_yota1.c | 0 yota_test2.c => test/test_yota2.c | 0 tutorial/README.md | 1 + sample-bdb.txt => tutorial/sample-bdb.txt | 0 sample-mdb.txt => tutorial/sample-mdb.txt | 0 28 files changed, 39 insertions(+), 85 deletions(-) rename barriers.h => src/barriers.h (100%) rename mdbx.c => src/mdbx.c (100%) rename mdbx.h => src/mdbx.h (100%) rename mdbx_chk.c => src/mdbx_chk.c (100%) rename mdbx_copy.1 => src/mdbx_copy.1 (100%) rename mdbx_copy.c => src/mdbx_copy.c (100%) rename mdbx_dump.1 => src/mdbx_dump.1 (100%) rename mdbx_dump.c => src/mdbx_dump.c (100%) rename mdbx_load.1 => src/mdbx_load.1 
(100%) rename mdbx_load.c => src/mdbx_load.c (100%) rename mdbx_stat.1 => src/mdbx_stat.1 (100%) rename mdbx_stat.c => src/mdbx_stat.c (100%) rename midl.h => src/midl.h (100%) rename reopen.h => src/reopen.h (100%) rename mtest0.c => test/test0.c (99%) rename mtest1.c => test/test1.c (99%) rename mtest2.c => test/test2.c (99%) rename mtest3.c => test/test3.c (99%) rename mtest4.c => test/test4.c (99%) rename mtest5.c => test/test5.c (99%) rename mtest6.c => test/test6.c (99%) rename wbench.c => test/test_bench.c (99%) rename yota_test1.c => test/test_yota1.c (100%) rename yota_test2.c => test/test_yota2.c (100%) create mode 100644 tutorial/README.md rename sample-bdb.txt => tutorial/sample-bdb.txt (100%) rename sample-mdb.txt => tutorial/sample-mdb.txt (100%) diff --git a/Makefile b/Makefile index 4b6c3614..fb3db7ba 100644 --- a/Makefile +++ b/Makefile @@ -24,8 +24,9 @@ suffix ?= CC ?= gcc XCFLAGS ?= -DNDEBUG=1 -DMDB_DEBUG=0 -CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections +CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC CFLAGS += -std=gnu99 -pthread $(XCFLAGS) +COVER ?= -coverage -fprofile-arcs -ftest-coverage -O0 # LY: for ability to built with modern glibc, # but then run with the old @@ -40,10 +41,10 @@ HEADERS := mdbx.h LIBRARIES := libmdbx.a libmdbx.so TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 -TESTS := mtest0 mtest1 mtest2 mtest3 mtest4 mtest5 mtest6 wbench \ - yota_test1 yota_test2 +TESTS := test0 test1 test2 test3 test4 test5 test6 test_bench \ + test_yota1 test_yota2 -SRC_MDBX := mdbx.c mdbx.h reopen.h barriers.h +SRC_MDBX := $(add_prefix src/, mdbx.c mdbx.h reopen.h barriers.h midl.h) .PHONY: mdbx all install clean check tests coverage @@ -64,94 +65,46 @@ install: $(LIBRARIES) $(TOOLS) $(HEADERS) && cp -t $(SANDBOX)$(mandir)/man1 $(MANPAGES) clean: - rm -rf $(TOOLS) $(TESTS) @* *.[ao] *.[ls]o *~ testdb/* *.gcov + rm -rf $(TOOLS) $(TESTS) @* *.[ao] 
*.[ls]o *~ tmp.db/* *.gcov *.log *.err tests: $(TESTS) check: tests - [ -d testdb ] || mkdir testdb && rm -f testdb/* \ - && echo "*** LMDB-TEST-0" && ./mtest0 && ./mdbx_chk -v testdb \ - && echo "*** LMDB-TEST-1" && ./mtest1 && ./mdbx_chk -v testdb \ - && echo "*** LMDB-TEST-2" && ./mtest2 && ./mdbx_chk -v testdb \ - && echo "*** LMDB-TEST-3" && ./mtest3 && ./mdbx_chk -v testdb \ - && echo "*** LMDB-TEST-4" && ./mtest4 && ./mdbx_chk -v testdb \ - && echo "*** LMDB-TEST-5" && ./mtest5 && ./mdbx_chk -v testdb \ - && echo "*** LMDB-TEST-6" && ./mtest6 && ./mdbx_chk -v testdb \ + [ -d tmp.db ] || mkdir tmp.db && rm -f tmp.db/* \ + && echo "*** LMDB-TEST-0" && ./test0 && ./mdbx_chk -v tmp.db \ + && echo "*** LMDB-TEST-1" && ./test1 && ./mdbx_chk -v tmp.db \ + && echo "*** LMDB-TEST-2" && ./test2 && ./mdbx_chk -v tmp.db \ + && echo "*** LMDB-TEST-3" && ./test3 && ./mdbx_chk -v tmp.db \ + && echo "*** LMDB-TEST-4" && ./test4 && ./mdbx_chk -v tmp.db \ + && echo "*** LMDB-TEST-5" && ./test5 && ./mdbx_chk -v tmp.db \ + && echo "*** LMDB-TEST-6" && ./test6 && ./mdbx_chk -v tmp.db \ && echo "*** LMDB-TESTs - all done" +mdbx.o: $(SRC_MDBX) Makefile + $(CC) $(CFLAGS) -c src/mdbx.c -o $@ + libmdbx.a: mdbx.o $(AR) rs $@ $^ -libmdbx.so: mdbx.lo +libmdbx.so: mdbx.o $(CC) $(CFLAGS) $(LDFLAGS) -save-temps -pthread -shared $(LDOPS) -o $@ $^ -mdbx_stat: mdbx_stat.o mdbx.o +mdbx_%: src/mdbx_%.c mdbx.o $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ -mdbx_copy: mdbx_copy.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ +test%: test/test%.c mdbx.o + $(CC) $(CFLAGS) $(LDFLAGS) -Isrc -o $@ $^ -mdbx_dump: mdbx_dump.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ +gcov-mdbx.o: $(SRC_MDBX) Makefile + $(CC) $(CFLAGS) $(COVER) -c src/mdbx.c -o $@ -mdbx_load: mdbx_load.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ - -mdbx_chk: mdbx_chk.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ - -mtest0: mtest0.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ - -mtest1: mtest1.o 
mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ - -mtest2: mtest2.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ - -mtest3: mtest3.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ - -mtest4: mtest4.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ - -mtest5: mtest5.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ - -mtest6: mtest6.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ - -yota_test1: yota_test1.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ - -yota_test2: yota_test2.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ - -wbench: wbench.o mdbx.o - $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ - -mdbx.o: $(SRC_MDBX) - $(CC) $(CFLAGS) -c mdbx.c -o $@ - -mdbx.lo: $(SRC_MDBX) - $(CC) $(CFLAGS) -fPIC -c mdbx.c -o $@ - -%: %.o - $(CC) $(CFLAGS) $(LDFLAGS) $^ -o $@ - -%.o: %.c mdbx.h - $(CC) $(CFLAGS) -c $< - -COFLAGS = -fprofile-arcs -ftest-coverage - -@gcov-mdb.o: $(SRC_MDBX) - $(CC) $(CFLAGS) $(COFLAGS) -O0 -c mdbx.c -o $@ - -coverage: @gcov-mdb.o - for t in mtest*.c; do x=`basename \$$t .c`; $(MAKE) $$x.o; \ - gcc -o @gcov-$$x $$x.o $^ -pthread $(COFLAGS); \ - rm -rf testdb; mkdir testdb; ./@gcov-$$x; done - gcov @gcov-mdb +# Seem this useless :( +coverage: gcov-mdbx.o + for t in test/test[0-9]*.c; do x=`basename \$$t .c`; \ + $(CC) $(CFLAGS) $(COVER) -Isrc $$t -o gcov-$$x $^; \ + rm -rf tmp.db; mkdir tmp.db; ./gcov-$$x; \ + done + gcov *.gcno ifneq ($(wildcard $(IOARENA)),) diff --git a/barriers.h b/src/barriers.h similarity index 100% rename from barriers.h rename to src/barriers.h diff --git a/mdbx.c b/src/mdbx.c similarity index 100% rename from mdbx.c rename to src/mdbx.c diff --git a/mdbx.h b/src/mdbx.h similarity index 100% rename from mdbx.h rename to src/mdbx.h diff --git a/mdbx_chk.c b/src/mdbx_chk.c similarity index 100% rename from mdbx_chk.c rename to src/mdbx_chk.c diff --git a/mdbx_copy.1 b/src/mdbx_copy.1 similarity index 100% rename from mdbx_copy.1 rename to src/mdbx_copy.1 diff --git a/mdbx_copy.c b/src/mdbx_copy.c similarity index 100% rename from mdbx_copy.c rename 
to src/mdbx_copy.c diff --git a/mdbx_dump.1 b/src/mdbx_dump.1 similarity index 100% rename from mdbx_dump.1 rename to src/mdbx_dump.1 diff --git a/mdbx_dump.c b/src/mdbx_dump.c similarity index 100% rename from mdbx_dump.c rename to src/mdbx_dump.c diff --git a/mdbx_load.1 b/src/mdbx_load.1 similarity index 100% rename from mdbx_load.1 rename to src/mdbx_load.1 diff --git a/mdbx_load.c b/src/mdbx_load.c similarity index 100% rename from mdbx_load.c rename to src/mdbx_load.c diff --git a/mdbx_stat.1 b/src/mdbx_stat.1 similarity index 100% rename from mdbx_stat.1 rename to src/mdbx_stat.1 diff --git a/mdbx_stat.c b/src/mdbx_stat.c similarity index 100% rename from mdbx_stat.c rename to src/mdbx_stat.c diff --git a/midl.h b/src/midl.h similarity index 100% rename from midl.h rename to src/midl.h diff --git a/reopen.h b/src/reopen.h similarity index 100% rename from reopen.h rename to src/reopen.h diff --git a/mtest0.c b/test/test0.c similarity index 99% rename from mtest0.c rename to test/test0.c index 162cfec9..29037618 100644 --- a/mtest0.c +++ b/test/test0.c @@ -33,7 +33,7 @@ abort())) #ifndef DBPATH -#define DBPATH "./testdb" +#define DBPATH "./tmp.db" #endif void *thread_entry(void *ctx) { diff --git a/mtest1.c b/test/test1.c similarity index 99% rename from mtest1.c rename to test/test1.c index 826462dc..58be6928 100644 --- a/mtest1.c +++ b/test/test1.c @@ -30,7 +30,7 @@ abort())) #ifndef DBPATH -#define DBPATH "./testdb" +#define DBPATH "./tmp.db" #endif int main(int argc, char *argv[]) { diff --git a/mtest2.c b/test/test2.c similarity index 99% rename from mtest2.c rename to test/test2.c index 93caa6e9..2244c7b3 100644 --- a/mtest2.c +++ b/test/test2.c @@ -33,7 +33,7 @@ abort())) #ifndef DBPATH -#define DBPATH "./testdb" +#define DBPATH "./tmp.db" #endif int main(int argc, char *argv[]) { diff --git a/mtest3.c b/test/test3.c similarity index 99% rename from mtest3.c rename to test/test3.c index be46fe06..1f1cacac 100644 --- a/mtest3.c +++ b/test/test3.c @@ 
-33,7 +33,7 @@ abort())) #ifndef DBPATH -#define DBPATH "./testdb" +#define DBPATH "./tmp.db" #endif int main(int argc, char *argv[]) { diff --git a/mtest4.c b/test/test4.c similarity index 99% rename from mtest4.c rename to test/test4.c index 16aca90c..423535c7 100644 --- a/mtest4.c +++ b/test/test4.c @@ -33,7 +33,7 @@ abort())) #ifndef DBPATH -#define DBPATH "./testdb" +#define DBPATH "./tmp.db" #endif int main(int argc, char *argv[]) { diff --git a/mtest5.c b/test/test5.c similarity index 99% rename from mtest5.c rename to test/test5.c index abca4e72..a482274a 100644 --- a/mtest5.c +++ b/test/test5.c @@ -33,7 +33,7 @@ abort())) #ifndef DBPATH -#define DBPATH "./testdb" +#define DBPATH "./tmp.db" #endif int main(int argc, char *argv[]) { diff --git a/mtest6.c b/test/test6.c similarity index 99% rename from mtest6.c rename to test/test6.c index e7de6ab5..ccd6c93a 100644 --- a/mtest6.c +++ b/test/test6.c @@ -33,7 +33,7 @@ abort())) #ifndef DBPATH -#define DBPATH "./testdb" +#define DBPATH "./tmp.db" #endif char dkbuf[1024]; diff --git a/wbench.c b/test/test_bench.c similarity index 99% rename from wbench.c rename to test/test_bench.c index debb8be9..95dc60d6 100644 --- a/wbench.c +++ b/test/test_bench.c @@ -32,7 +32,7 @@ abort())) #ifndef DBPATH -#define DBPATH "./testdb" +#define DBPATH "./tmp.db" #endif struct t0 { diff --git a/yota_test1.c b/test/test_yota1.c similarity index 100% rename from yota_test1.c rename to test/test_yota1.c diff --git a/yota_test2.c b/test/test_yota2.c similarity index 100% rename from yota_test2.c rename to test/test_yota2.c diff --git a/tutorial/README.md b/tutorial/README.md new file mode 100644 index 00000000..b5218da3 --- /dev/null +++ b/tutorial/README.md @@ -0,0 +1 @@ +This directory is just a placeholder for now. Tutorial and examples will be added later. 
diff --git a/sample-bdb.txt b/tutorial/sample-bdb.txt similarity index 100% rename from sample-bdb.txt rename to tutorial/sample-bdb.txt diff --git a/sample-mdb.txt b/tutorial/sample-mdb.txt similarity index 100% rename from sample-mdb.txt rename to tutorial/sample-mdb.txt From 0f49ed6e53d395d348311362d3281f8736a277c4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 16 Mar 2017 18:09:27 +0300 Subject: [PATCH 017/303] mdbx: big-bang (initial). - OS Abstraction Layer; - Windows Support. - preparation for more changes. Change-Id: I53772eda9091ba361cbc9a28656190ea0d4c5cee --- .clang-format | 3 + AUTHORS | 27 + Makefile | 28 +- src/mdbx.h => mdbx.h | 413 +++--- src/barriers.h | 156 --- src/bits.h | 796 ++++++++++++ src/defs.h | 302 +++++ src/lck-posix.c | 251 ++++ src/lck-windows.c | 314 +++++ src/mdbx.c | 2453 +++++++++-------------------------- src/midl.h | 45 +- src/osal.c | 625 +++++++++ src/osal.h | 423 ++++++ src/reopen.h | 236 ---- src/{ => tools}/mdbx_chk.c | 31 +- src/{ => tools}/mdbx_copy.1 | 0 src/{ => tools}/mdbx_copy.c | 10 +- src/{ => tools}/mdbx_dump.1 | 0 src/{ => tools}/mdbx_dump.c | 9 +- src/{ => tools}/mdbx_load.1 | 0 src/{ => tools}/mdbx_load.c | 11 +- src/{ => tools}/mdbx_stat.1 | 0 src/{ => tools}/mdbx_stat.c | 9 +- test/test0.c | 4 +- test/test1.c | 4 +- test/test2.c | 4 +- test/test3.c | 4 +- test/test4.c | 4 +- test/test5.c | 4 +- test/test6.c | 4 +- test/test_bench.c | 2 +- test/test_yota1.c | 2 +- test/test_yota2.c | 2 +- 33 files changed, 3678 insertions(+), 2498 deletions(-) create mode 100644 .clang-format create mode 100644 AUTHORS rename src/mdbx.h => mdbx.h (84%) delete mode 100644 src/barriers.h create mode 100644 src/bits.h create mode 100644 src/defs.h create mode 100644 src/lck-posix.c create mode 100644 src/lck-windows.c create mode 100644 src/osal.c create mode 100644 src/osal.h delete mode 100644 src/reopen.h rename src/{ => tools}/mdbx_chk.c (97%) rename src/{ => tools}/mdbx_copy.1 (100%) rename src/{ => tools}/mdbx_copy.c 
(89%) rename src/{ => tools}/mdbx_dump.1 (100%) rename src/{ => tools}/mdbx_dump.c (97%) rename src/{ => tools}/mdbx_load.1 (100%) rename src/{ => tools}/mdbx_load.c (98%) rename src/{ => tools}/mdbx_stat.1 (100%) rename src/{ => tools}/mdbx_stat.c (97%) diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..6c59ef3a --- /dev/null +++ b/.clang-format @@ -0,0 +1,3 @@ +BasedOnStyle: LLVM +Standard: Cpp11 +ReflowComments: true diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 00000000..b65b8080 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,27 @@ +Contributors +============ + +Alexey Naumov +Chris Mikkelson +Claude Brisson +David Barbour +David Wilson +Hallvard Furuseth , +Heiko Becker +Howard Chu , +Ignacio Casal Quinteiro +Jean-Christophe DUBOIS +John Hewson +Kurt Zeilenga +Leonid Yuriev , +Lorenz Bauer +Luke Yeager +Martin Hedenfalk +Ondrej Kuznik +Orivej Desh +Oskari Timperi +Pavel Medvedev +Philipp Storz +Quanah Gibson-Mount +Salvador Ortiz +Sebastien Launay diff --git a/Makefile b/Makefile index fb3db7ba..2dbfbeea 100644 --- a/Makefile +++ b/Makefile @@ -23,9 +23,9 @@ mandir ?= $(prefix)/man suffix ?= CC ?= gcc -XCFLAGS ?= -DNDEBUG=1 -DMDB_DEBUG=0 -CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -CFLAGS += -std=gnu99 -pthread $(XCFLAGS) +XCFLAGS ?= -DNDEBUG=1 -DMDB_DEBUG=0 -DMDBX_EXPORTS=1 +CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden +CFLAGS += -D_GNU_SOURCE=1 -std=gnu99 -pthread $(XCFLAGS) COVER ?= -coverage -fprofile-arcs -ftest-coverage -O0 # LY: for ability to built with modern glibc, @@ -44,7 +44,7 @@ MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 TESTS := test0 test1 test2 test3 test4 test5 test6 test_bench \ test_yota1 test_yota2 -SRC_MDBX := $(add_prefix src/, mdbx.c mdbx.h reopen.h barriers.h midl.h) +MDBX_SRC := mdbx.h $(addprefix src/, mdbx.c osal.c lck-posix.c defs.h bits.h osal.h midl.h) .PHONY: mdbx all install clean check tests coverage @@ -80,22 
+80,28 @@ check: tests && echo "*** LMDB-TEST-6" && ./test6 && ./mdbx_chk -v tmp.db \ && echo "*** LMDB-TESTs - all done" -mdbx.o: $(SRC_MDBX) Makefile +mdbx.o: $(MDBX_SRC) Makefile $(CC) $(CFLAGS) -c src/mdbx.c -o $@ -libmdbx.a: mdbx.o - $(AR) rs $@ $^ +osal.o: $(MDBX_SRC) Makefile + $(CC) $(CFLAGS) -c src/osal.c -o $@ -libmdbx.so: mdbx.o +lck-posix.o: $(MDBX_SRC) Makefile + $(CC) $(CFLAGS) -c src/lck-posix.c -o $@ + +libmdbx.a: mdbx.o osal.o lck-posix.o + $(AR) rs $@ $? + +libmdbx.so: mdbx.o osal.o lck-posix.o $(CC) $(CFLAGS) $(LDFLAGS) -save-temps -pthread -shared $(LDOPS) -o $@ $^ -mdbx_%: src/mdbx_%.c mdbx.o +mdbx_%: src/tools/mdbx_%.c libmdbx.a $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ -test%: test/test%.c mdbx.o +test%: test/test%.c libmdbx.a $(CC) $(CFLAGS) $(LDFLAGS) -Isrc -o $@ $^ -gcov-mdbx.o: $(SRC_MDBX) Makefile +gcov-mdbx.o: $(MDBX_SRC) Makefile $(CC) $(CFLAGS) $(COVER) -c src/mdbx.c -o $@ # Seem this useless :( diff --git a/src/mdbx.h b/mdbx.h similarity index 84% rename from src/mdbx.h rename to mdbx.h index 91aaf081..90c1564c 100644 --- a/src/mdbx.h +++ b/mdbx.h @@ -1,5 +1,17 @@ /* - * Copyright 2015-2017 Leonid Yuriev . + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ * + * --- * * This code is derived from "LMDB engine" written by * Howard Chu (Symas Corporation), which itself derived from btree.c @@ -35,48 +47,140 @@ */ #pragma once +/* *INDENT-OFF* */ +/* clang-format off */ + #ifndef _MDBX_H_ #define _MDBX_H_ #define MDBX_MODE_ENABLED 1 +#ifndef __has_attribute +# define __has_attribute(x) (0) +#endif + +#ifndef __dll_export +# if defined(_WIN32) || defined(__CYGWIN__) +# if defined(__GNUC__) || __has_attribute(dllexport) +# define __dll_export __attribute__((dllexport)) +# elif defined(_MSC_VER) +# define __dll_export __declspec(dllexport) +# else +# define __dll_export +# endif +# elif defined(__GNUC__) || __has_attribute(visibility) +# define __dll_export __attribute__((visibility("default"))) +# else +# define __dll_export +# endif +#endif /* __dll_export */ + +#ifndef __dll_import +# if defined(_WIN32) || defined(__CYGWIN__) +# if defined(__GNUC__) || __has_attribute(dllimport) +# define __dll_import __attribute__((dllimport)) +# elif defined(_MSC_VER) +# define __dll_import __declspec(dllimport) +# else +# define __dll_import +# endif +# else +# define __dll_import +# endif +#endif /* __dll_import */ + +#if defined(LIBMDBX_EXPORTS) || defined(MDBX_EXPORTS) /* the Makefile builds the library with -DMDBX_EXPORTS=1 */ +# define LIBMDBX_API __dll_export +#elif defined(LIBMDBX_IMPORTS) || defined(MDBX_IMPORTS) /* either spelling selects the import decoration */ +# define LIBMDBX_API __dll_import +#else +# define LIBMDBX_API +#endif /* LIBMDBX_API */ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4514) /* 'xyz': unreferenced inline function \ + has been removed */ +#pragma warning(disable : 4710) /* 'xyz': function not inlined */ +#pragma warning(disable : 4711) /* function 'xyz' selected for \ + automatic inline expansion */ +#pragma warning(disable : 4061) /* enumerator 'abc' in switch of enum \ + 'xyz' is not explicitly handled by a case \ + label */ +#pragma warning(disable : 4201) /* nonstandard extension used : \ + nameless struct / union */ +#pragma warning(disable : 4127) /* conditional expression is constant \ + */ + +#pragma warning(push, 1) +#pragma
warning(disable : 4530) /* C++ exception handler used, but \ + unwind semantics are not enabled. Specify \ + /EHsc */ +#pragma warning(disable : 4577) /* 'noexcept' used with no exception \ + handling mode specified; termination on \ + exception is not guaranteed. Specify /EHsc \ + */ +#endif /* _MSC_VER (warnings) */ + #include +#include #include -#include -#include -#include + +#if defined(_WIN32) || defined(_WIN64) +# include +# include + typedef unsigned mode_t; + typedef HANDLE mdbx_filehandle_t; + typedef DWORD mdbx_pid_t; + typedef DWORD mdbx_tid_t; +#else +# include /* for pthread_t */ +# include /* for struct iovec */ +# include /* for pid_t */ +# define HAVE_STRUCT_IOVEC 1 + typedef int mdbx_filehandle_t; + typedef pid_t mdbx_pid_t; + typedef pthread_t mdbx_tid_t; +#endif + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/* *INDENT-ON* */ +/* clang-format on */ #ifdef __cplusplus extern "C" { #endif /** Library major version */ -#define MDB_VERSION_MAJOR 0 +#define MDBX_VERSION_MAJOR 0 /** Library minor version */ -#define MDB_VERSION_MINOR 9 +#define MDBX_VERSION_MINOR 2 /** Library patch version */ -#define MDB_VERSION_PATCH 19 +#define MDBX_VERSION_PATCH 0 /** Combine args a,b,c into a single integer for easy version comparisons */ #define MDB_VERINT(a, b, c) (((a) << 24) | ((b) << 16) | (c)) /** The full library version as a single integer */ -#define MDB_VERSION_FULL \ - MDB_VERINT(MDB_VERSION_MAJOR, MDB_VERSION_MINOR, MDB_VERSION_PATCH) +#define MDBX_VERSION_FULL \ + MDB_VERINT(MDBX_VERSION_MAJOR, MDBX_VERSION_MINOR, MDBX_VERSION_PATCH) /* The release date of this library version */ -#define MDB_VERSION_DATE "DEVEL" +#define MDBX_VERSION_DATE "DEVEL" /* A stringifier for the version info */ -#define MDB_VERSTR(a, b, c, d) \ +#define MDBX_VERSTR(a, b, c, d) \ "MDBX " #a "." #b "."
#c ": (" d ", https://github.com/ReOpen/libmdbx)" /* A helper for the stringifier macro */ -#define MDB_VERFOO(a, b, c, d) MDB_VERSTR(a, b, c, d) +#define MDBX_VERFOO(a, b, c, d) MDBX_VERSTR(a, b, c, d) /* The full library version as a C string */ -#define MDB_VERSION_STRING \ - MDB_VERFOO(MDB_VERSION_MAJOR, MDB_VERSION_MINOR, MDB_VERSION_PATCH, \ - MDB_VERSION_DATE) +#define MDBX_VERSION_STRING \ + MDBX_VERFOO(MDBX_VERSION_MAJOR, MDBX_VERSION_MINOR, MDBX_VERSION_PATCH, \ + MDBX_VERSION_DATE) /* Opaque structure for a database environment. * @@ -109,6 +213,14 @@ typedef struct MDB_cursor MDB_cursor; * The same applies to data sizes in databases with the MDB_DUPSORT flag. * Other data items can in theory be from 0 to 0xffffffff bytes long. */ +#ifndef HAVE_STRUCT_IOVEC +struct iovec { + void *iov_base; + size_t iov_len; +}; +#define HAVE_STRUCT_IOVEC +#endif /* HAVE_STRUCT_IOVEC */ + typedef struct iovec MDB_val; #define mv_size iov_len #define mv_data iov_base @@ -117,87 +229,85 @@ typedef struct iovec MDB_val; typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); /* Environment Flags */ -/* mmap at a fixed address (experimental) */ -#define MDB_FIXEDMAP 0x01 /* no environment directory */ -#define MDB_NOSUBDIR 0x4000 +#define MDB_NOSUBDIR 0x4000u /* don't fsync after commit */ -#define MDB_NOSYNC 0x10000 +#define MDB_NOSYNC 0x10000u /* read only */ -#define MDB_RDONLY 0x20000 +#define MDB_RDONLY 0x20000u /* don't fsync metapage after commit */ -#define MDB_NOMETASYNC 0x40000 +#define MDB_NOMETASYNC 0x40000u /* use writable mmap */ -#define MDB_WRITEMAP 0x80000 +#define MDB_WRITEMAP 0x80000u /* use asynchronous msync when MDB_WRITEMAP is used */ -#define MDB_MAPASYNC 0x100000 +#define MDB_MAPASYNC 0x100000u /* tie reader locktable slots to MDB_txn objects instead of to threads */ -#define MDB_NOTLS 0x200000 +#define MDB_NOTLS 0x200000u /* don't do any locking, caller must manage their own locks * WARNING: libmdbx don't support this mode. 
*/ -#define MDB_NOLOCK__UNSUPPORTED 0x400000 +#define MDB_NOLOCK__UNSUPPORTED 0x400000u /* don't do readahead */ -#define MDB_NORDAHEAD 0x800000 +#define MDB_NORDAHEAD 0x800000u /* don't initialize malloc'd memory before writing to datafile */ -#define MDB_NOMEMINIT 0x1000000 +#define MDB_NOMEMINIT 0x1000000u #if MDBX_MODE_ENABLED /* aim to coalesce FreeDB records */ -#define MDBX_COALESCE 0x2000000 +#define MDBX_COALESCE 0x2000000u /* LIFO policy for reclaiming FreeDB records */ -#define MDBX_LIFORECLAIM 0x4000000 +#define MDBX_LIFORECLAIM 0x4000000u #endif /* MDBX_MODE_ENABLED */ /* make a steady-sync only on close and explicit env-sync */ #define MDBX_UTTERLY_NOSYNC (MDB_NOSYNC | MDB_MAPASYNC) /* debuging option, fill/perturb released pages */ -#define MDBX_PAGEPERTURB 0x8000000 +#define MDBX_PAGEPERTURB 0x8000000u /* Database Flags */ /* use reverse string keys */ -#define MDB_REVERSEKEY 0x02 +#define MDB_REVERSEKEY 0x02u /* use sorted duplicates */ -#define MDB_DUPSORT 0x04 +#define MDB_DUPSORT 0x04u /* numeric keys in native byte order, either unsigned int or mdbx_size_t. * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdbx_size_t.) * The keys must all be of the same size. */ -#define MDB_INTEGERKEY 0x08 +#define MDB_INTEGERKEY 0x08u /* with MDB_DUPSORT, sorted dup items have fixed size */ -#define MDB_DUPFIXED 0x10 +#define MDB_DUPFIXED 0x10u /* with MDB_DUPSORT, dups are MDB_INTEGERKEY-style integers */ -#define MDB_INTEGERDUP 0x20 +#define MDB_INTEGERDUP 0x20u /* with MDB_DUPSORT, use reverse string dups */ -#define MDB_REVERSEDUP 0x40 +#define MDB_REVERSEDUP 0x40u /* create DB if not already existing */ -#define MDB_CREATE 0x40000 +#define MDB_CREATE 0x40000u /* Write Flags */ /* For put: Don't write if the key already exists. */ -#define MDB_NOOVERWRITE 0x10 +#define MDB_NOOVERWRITE 0x10u /* Only for MDB_DUPSORT * For put: don't write if the key and data pair already exist. * For mdbx_cursor_del: remove all duplicate data items. 
*/ -#define MDB_NODUPDATA 0x20 +#define MDB_NODUPDATA 0x20u /* For mdbx_cursor_put: overwrite the current key/data pair * MDBX allows this flag for mdbx_put() for explicit overwrite/update without * insertion. */ -#define MDB_CURRENT 0x40 +#define MDB_CURRENT 0x40u /* For put: Just reserve space for data, don't copy it. Return a * pointer to the reserved space. */ -#define MDB_RESERVE 0x10000 +#define MDB_RESERVE 0x10000u /* Data is being appended, don't split full pages. */ -#define MDB_APPEND 0x20000 +#define MDB_APPEND 0x20000u /* Duplicate data is being appended, don't split full pages. */ -#define MDB_APPENDDUP 0x40000 +#define MDB_APPENDDUP 0x40000u /* Store multiple data items in one call. Only for MDB_DUPFIXED. */ -#define MDB_MULTIPLE 0x80000 +#define MDB_MULTIPLE 0x80000u /* Copy Flags */ /* Compacting copy: Omit free space from copy, and renumber all * pages sequentially. */ -#define MDB_CP_COMPACT 1 +#define MDB_CP_COMPACT 1u /* Cursor Get operations. * @@ -235,8 +345,8 @@ typedef enum MDB_cursor_op { MDB_PREV_NODUP, /* Position at last data item of previous key */ MDB_SET, /* Position at specified key */ MDB_SET_KEY, /* Position at specified key, return key + data */ - MDB_SET_RANGE, /* Position at first key greater than or equal to specified - key. */ + MDB_SET_RANGE, /* Position at first key greater than or equal to specified + key. */ MDB_PREV_MULTIPLE /* Position at previous page and return key and up to a page of duplicate data items. Only for MDB_DUPFIXED */ @@ -280,7 +390,8 @@ typedef enum MDB_cursor_op { #define MDB_MAP_RESIZED (-30785) /* Operation and DB incompatible, or DB type changed. This can mean: * - The operation expects an MDB_DUPSORT / MDB_DUPFIXED database. - * - Opening a named DB when the unnamed DB has MDB_DUPSORT/MDB_INTEGERKEY. + * - Opening a named DB when the unnamed DB has + *MDB_DUPSORT/MDB_INTEGERKEY. * - Accessing a data record as a database, or vice versa. * - The database was dropped and recreated with different flags. 
*/ @@ -307,7 +418,6 @@ typedef enum MDB_cursor_op { * - ABI version mismatch (rare case); */ #define MDBX_EBADSIGN (-30420) - /* Statistics for a database in the environment */ typedef struct MDBX_stat { unsigned ms_psize; /* Size of a database page. @@ -325,12 +435,12 @@ typedef struct MDBX_envinfo { void *me_mapaddr; /* Address of map, if fixed */ size_t me_mapsize; /* Size of the data memory map */ size_t me_last_pgno; /* ID of the last used page */ - size_t me_last_txnid; /* ID of the last committed transaction */ + uint64_t me_last_txnid; /* ID of the last committed transaction */ unsigned me_maxreaders; /* max reader slots in the environment */ unsigned me_numreaders; /* max reader slots used in the environment */ - size_t me_tail_txnid; /* ID of the last reader transaction */ - size_t me_meta1_txnid, me_meta1_sign; - size_t me_meta2_txnid, me_meta2_sign; + uint64_t me_tail_txnid; /* ID of the last reader transaction */ + uint64_t me_meta1_txnid, me_meta1_sign; + uint64_t me_meta2_txnid, me_meta2_sign; } MDBX_envinfo; /* Return the LMDB library version information. @@ -343,7 +453,7 @@ typedef struct MDBX_envinfo { * here * Returns "version string" The library version as a string */ -const char *mdbx_version(int *major, int *minor, int *patch); +LIBMDBX_API const char *mdbx_version(int *major, int *minor, int *patch); /* Return a string describing a given error code. * @@ -355,8 +465,8 @@ const char *mdbx_version(int *major, int *minor, int *patch); * [in] err The error code * Returns "error message" The description of the error */ -const char *mdbx_strerror(int errnum); -const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen); +LIBMDBX_API const char *mdbx_strerror(int errnum); +LIBMDBX_API const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen); /* Create an LMDB environment handle. 
* @@ -370,7 +480,7 @@ const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen); * [out] env The address where the new handle will be stored * Returns A non-zero error value on failure and 0 on success. */ -int mdbx_env_create(MDB_env **env); +LIBMDBX_API int mdbx_env_create(MDB_env **env); /* Open an environment handle. * @@ -383,19 +493,6 @@ int mdbx_env_create(MDB_env **env); * must be set to 0 or by bitwise OR'ing together one or more of the * values described here. * Flags set by mdbx_env_set_flags() are also used. - * - MDB_FIXEDMAP - * use a fixed address for the mmap region. This flag must be specified - * when creating the environment, and is stored persistently in the - *environment. - * If successful, the memory map will always reside at the same - *virtual address - * and pointers used to reference data items in the database will - *be constant - * across multiple invocations. This option may not always work, - *depending on - * how the operating system has allocated memory to shared - *libraries and other uses. - * The feature is highly experimental. * - MDB_NOSUBDIR * By default, LMDB creates its environment in a directory whose * pathname is given in \b path, and creates its data and lock @@ -558,9 +655,10 @@ int mdbx_env_create(MDB_env **env); *files. * - EAGAIN - the environment was locked by another process. */ -int mdbx_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode); -int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, - mode_t mode, int *exclusive); +LIBMDBX_API int mdbx_env_open(MDB_env *env, const char *path, unsigned flags, + mode_t mode); +LIBMDBX_API int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, + mode_t mode, int *exclusive); /* Copy an LMDB environment to the specified path. * @@ -576,7 +674,7 @@ int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, * empty. * Returns A non-zero error value on failure and 0 on success. 
*/ -int mdbx_env_copy(MDB_env *env, const char *path); +LIBMDBX_API int mdbx_env_copy(MDB_env *env, const char *path); /* Copy an LMDB environment to the specified file descriptor. * @@ -591,7 +689,7 @@ int mdbx_env_copy(MDB_env *env, const char *path); * have already been opened for Write access. * Returns A non-zero error value on failure and 0 on success. */ -int mdbx_env_copyfd(MDB_env *env, int fd); +LIBMDBX_API int mdbx_env_copyfd(MDB_env *env, mdbx_filehandle_t fd); /* Copy an LMDB environment to the specified path, with options. * @@ -616,7 +714,7 @@ int mdbx_env_copyfd(MDB_env *env, int fd); *leak. * Returns A non-zero error value on failure and 0 on success. */ -int mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags); +LIBMDBX_API int mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags); /* Copy an LMDB environment to the specified file descriptor, * with options. @@ -635,7 +733,8 @@ int mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags); * See mdbx_env_copy2() for options. * Returns A non-zero error value on failure and 0 on success. */ -int mdbx_env_copyfd2(MDB_env *env, int fd, unsigned flags); +LIBMDBX_API int mdbx_env_copyfd2(MDB_env *env, mdbx_filehandle_t fd, + unsigned flags); /* Return statistics about the LMDB environment. * @@ -643,7 +742,7 @@ int mdbx_env_copyfd2(MDB_env *env, int fd, unsigned flags); * [out] stat The address of an MDB_stat structure * where the statistics will be copied */ -int mdbx_env_stat(MDB_env *env, MDBX_stat *stat, size_t bytes); +LIBMDBX_API int mdbx_env_stat(MDB_env *env, MDBX_stat *stat, size_t bytes); /* Return information about the LMDB environment. 
* @@ -651,7 +750,7 @@ int mdbx_env_stat(MDB_env *env, MDBX_stat *stat, size_t bytes); * [out] stat The address of an MDB_envinfo structure * where the information will be copied */ -int mdbx_env_info(MDB_env *env, MDBX_envinfo *info, size_t bytes); +LIBMDBX_API int mdbx_env_info(MDB_env *env, MDBX_envinfo *info, size_t bytes); /* Flush the data buffers to disk. * @@ -670,7 +769,7 @@ int mdbx_env_info(MDB_env *env, MDBX_envinfo *info, size_t bytes); * - EINVAL - an invalid parameter was specified. * - EIO - an error occurred during synchronization. */ -int mdbx_env_sync(MDB_env *env, int force); +LIBMDBX_API int mdbx_env_sync(MDB_env *env, int force); /* Close the environment and release the memory map. * @@ -687,7 +786,7 @@ int mdbx_env_sync(MDB_env *env, int force); * on opening next time, and transactions since the last non-weak * checkpoint (meta-page update) will rolledback for consistency guarantee. */ -void mdbx_env_close(MDB_env *env); +LIBMDBX_API void mdbx_env_close(MDB_env *env); /* Set environment flags. * @@ -701,7 +800,7 @@ void mdbx_env_close(MDB_env *env); * errors are: * - EINVAL - an invalid parameter was specified. */ -int mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff); +LIBMDBX_API int mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff); /* Get environment flags. * @@ -711,7 +810,7 @@ int mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff); * errors are: * - EINVAL - an invalid parameter was specified. */ -int mdbx_env_get_flags(MDB_env *env, unsigned *flags); +LIBMDBX_API int mdbx_env_get_flags(MDB_env *env, unsigned *flags); /* Return the path that was used in mdbx_env_open(). * @@ -723,7 +822,7 @@ int mdbx_env_get_flags(MDB_env *env, unsigned *flags); * errors are: * - EINVAL - an invalid parameter was specified. */ -int mdbx_env_get_path(MDB_env *env, const char **path); +LIBMDBX_API int mdbx_env_get_path(MDB_env *env, const char **path); /* Return the filedescriptor for the given environment. 
* @@ -737,7 +836,7 @@ int mdbx_env_get_path(MDB_env *env, const char **path); * errors are: * - EINVAL - an invalid parameter was specified. */ -int mdbx_env_get_fd(MDB_env *env, int *fd); +LIBMDBX_API int mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *fd); /* Set the size of the memory map to use for this environment. * @@ -772,7 +871,7 @@ int mdbx_env_get_fd(MDB_env *env, int *fd); *has * an active write transaction. */ -int mdbx_env_set_mapsize(MDB_env *env, size_t size); +LIBMDBX_API int mdbx_env_set_mapsize(MDB_env *env, size_t size); /* Set the maximum number of threads/reader slots for the environment. * @@ -792,7 +891,7 @@ int mdbx_env_set_mapsize(MDB_env *env, size_t size); * - EINVAL - an invalid parameter was specified, or the environment is *already open. */ -int mdbx_env_set_maxreaders(MDB_env *env, unsigned readers); +LIBMDBX_API int mdbx_env_set_maxreaders(MDB_env *env, unsigned readers); /* Get the maximum number of threads/reader slots for the environment. * @@ -802,7 +901,7 @@ int mdbx_env_set_maxreaders(MDB_env *env, unsigned readers); * errors are: * - EINVAL - an invalid parameter was specified. */ -int mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers); +LIBMDBX_API int mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers); /* Set the maximum number of named databases for the environment. * @@ -822,7 +921,7 @@ int mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers); * - EINVAL - an invalid parameter was specified, or the environment is *already open. */ -int mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); +LIBMDBX_API int mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); /* Get the maximum size of keys and MDB_DUPSORT data we can write. 
* @@ -831,7 +930,7 @@ int mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); * [in] env An environment handle returned by mdbx_env_create() * Returns The maximum size of a key we can write */ -int mdbx_env_get_maxkeysize(MDB_env *env); +LIBMDBX_API int mdbx_env_get_maxkeysize(MDB_env *env); /* Set application information associated with the MDB_env. * @@ -839,14 +938,14 @@ int mdbx_env_get_maxkeysize(MDB_env *env); * [in] ctx An arbitrary pointer for whatever the application needs. * Returns A non-zero error value on failure and 0 on success. */ -int mdbx_env_set_userctx(MDB_env *env, void *ctx); +LIBMDBX_API int mdbx_env_set_userctx(MDB_env *env, void *ctx); /* Get the application information associated with the MDB_env. * * [in] env An environment handle returned by mdbx_env_create() * Returns The pointer set by mdbx_env_set_userctx(). */ -void *mdbx_env_get_userctx(MDB_env *env); +LIBMDBX_API void *mdbx_env_get_userctx(MDB_env *env); /* A callback function for most LMDB assert() failures, * called before printing the message and aborting. @@ -864,7 +963,7 @@ typedef void MDB_assert_func(MDB_env *env, const char *msg, * [in] func An MDB_assert_func function, or 0. * Returns A non-zero error value on failure and 0 on success. */ -int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); +LIBMDBX_API int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); /* Create a transaction for use with the environment. * @@ -898,14 +997,14 @@ int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); * the reader lock table is full. See mdbx_env_set_maxreaders(). * - ENOMEM - out of memory. 
*/ -int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, - MDB_txn **txn); +LIBMDBX_API int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, + MDB_txn **txn); /* Returns the transaction's MDB_env * * [in] txn A transaction handle returned by mdbx_txn_begin() */ -MDB_env *mdbx_txn_env(MDB_txn *txn); +LIBMDBX_API MDB_env *mdbx_txn_env(MDB_txn *txn); /* Return the transaction's ID. * @@ -916,7 +1015,7 @@ MDB_env *mdbx_txn_env(MDB_txn *txn); * [in] txn A transaction handle returned by mdbx_txn_begin() * Returns A transaction ID, valid if input is an active transaction. */ -size_t mdbx_txn_id(MDB_txn *txn); +LIBMDBX_API size_t mdbx_txn_id(MDB_txn *txn); /* Commit all the operations of a transaction into the database. * @@ -940,7 +1039,7 @@ size_t mdbx_txn_id(MDB_txn *txn); * - EIO - a low-level I/O error occurred while writing. * - ENOMEM - out of memory. */ -int mdbx_txn_commit(MDB_txn *txn); +LIBMDBX_API int mdbx_txn_commit(MDB_txn *txn); /* Abandon all the operations of the transaction instead of saving * them. @@ -959,7 +1058,7 @@ int mdbx_txn_commit(MDB_txn *txn); * * [in] txn A transaction handle returned by mdbx_txn_begin() */ -int mdbx_txn_abort(MDB_txn *txn); +LIBMDBX_API int mdbx_txn_abort(MDB_txn *txn); /* Reset a read-only transaction. * @@ -978,7 +1077,7 @@ int mdbx_txn_abort(MDB_txn *txn); * the database size may grow much more rapidly than otherwise. * [in] txn A transaction handle returned by mdbx_txn_begin() */ -int mdbx_txn_reset(MDB_txn *txn); +LIBMDBX_API int mdbx_txn_reset(MDB_txn *txn); /* Renew a read-only transaction. * @@ -992,7 +1091,7 @@ int mdbx_txn_reset(MDB_txn *txn); * must be shut down. * - EINVAL - an invalid parameter was specified. */ -int mdbx_txn_renew(MDB_txn *txn); +LIBMDBX_API int mdbx_txn_renew(MDB_txn *txn); /* Open a database in the environment. 
* A database handle denotes the name and parameters of a database, @@ -1072,7 +1171,8 @@ int mdbx_txn_renew(MDB_txn *txn); * - MDB_DBS_FULL - too many databases have been opened. See *mdbx_env_set_maxdbs(). */ -int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi); +LIBMDBX_API int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, + MDB_dbi *dbi); /* Retrieve statistics for a database. * @@ -1084,7 +1184,8 @@ int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi); * errors are: * - EINVAL - an invalid parameter was specified. */ -int mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, size_t bytes); +LIBMDBX_API int mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, + size_t bytes); /* Retrieve the DB flags for a database handle. * @@ -1093,7 +1194,7 @@ int mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, size_t bytes); * [out] flags Address where the flags will be returned. * Returns A non-zero error value on failure and 0 on success. */ -int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); +LIBMDBX_API int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); /* Close a database handle. Normally unnecessary. Use with care: * @@ -1111,7 +1212,7 @@ int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); * [in] env An environment handle returned by mdbx_env_create() * [in] dbi A database handle returned by mdbx_dbi_open() */ -void mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); +LIBMDBX_API void mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); /* Empty or delete+close a database. * @@ -1122,7 +1223,7 @@ void mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); * environment and close the DB handle. * Returns A non-zero error value on failure and 0 on success. */ -int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); +LIBMDBX_API int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); /* Set a custom key comparison function for a database. 
* @@ -1146,7 +1247,7 @@ int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); * errors are: * - EINVAL - an invalid parameter was specified. */ -int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); +LIBMDBX_API int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); /* Set a custom data comparison function for a MDB_DUPSORT database. * @@ -1174,7 +1275,7 @@ int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); * errors are: * - EINVAL - an invalid parameter was specified. */ -int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); +LIBMDBX_API int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); /* Get items from a database. * @@ -1200,7 +1301,8 @@ int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); * - MDB_NOTFOUND - the key was not in the database. * - EINVAL - an invalid parameter was specified. */ -int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); +LIBMDBX_API int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, + MDB_val *data); /* Store items into a database. * @@ -1252,8 +1354,8 @@ int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); * - EACCES - an attempt was made to write in a read-only transaction. * - EINVAL - an invalid parameter was specified. */ -int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, - unsigned flags); +LIBMDBX_API int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, + unsigned flags); /* Delete items from a database. * @@ -1283,7 +1385,8 @@ int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, * - EACCES - an attempt was made to write in a read-only transaction. * - EINVAL - an invalid parameter was specified. */ -int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); +LIBMDBX_API int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, + MDB_val *data); /* Create a cursor handle. 
* @@ -1313,7 +1416,8 @@ int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); * errors are: * - EINVAL - an invalid parameter was specified. */ -int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); +LIBMDBX_API int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, + MDB_cursor **cursor); /* Close a cursor handle. * @@ -1321,7 +1425,7 @@ int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); * Its transaction must still be live if it is a write-transaction. * [in] cursor A cursor handle returned by mdbx_cursor_open() */ -void mdbx_cursor_close(MDB_cursor *cursor); +LIBMDBX_API void mdbx_cursor_close(MDB_cursor *cursor); /* Renew a cursor handle. * @@ -1337,19 +1441,19 @@ void mdbx_cursor_close(MDB_cursor *cursor); * errors are: * - EINVAL - an invalid parameter was specified. */ -int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); +LIBMDBX_API int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); /* Return the cursor's transaction handle. * * [in] cursor A cursor handle returned by mdbx_cursor_open() */ -MDB_txn *mdbx_cursor_txn(MDB_cursor *cursor); +LIBMDBX_API MDB_txn *mdbx_cursor_txn(MDB_cursor *cursor); /* Return the cursor's database handle. * * [in] cursor A cursor handle returned by mdbx_cursor_open() */ -MDB_dbi mdbx_cursor_dbi(MDB_cursor *cursor); +LIBMDBX_API MDB_dbi mdbx_cursor_dbi(MDB_cursor *cursor); /* Retrieve by cursor. * @@ -1371,8 +1475,8 @@ MDB_dbi mdbx_cursor_dbi(MDB_cursor *cursor); * - MDB_NOTFOUND - no matching key found. * - EINVAL - an invalid parameter was specified. */ -int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, - MDB_cursor_op op); +LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + MDB_cursor_op op); /* Store by cursor. * @@ -1444,8 +1548,8 @@ int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * - EACCES - an attempt was made to write in a read-only transaction. * - EINVAL - an invalid parameter was specified. 
*/ -int mdbx_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, - unsigned flags); +LIBMDBX_API int mdbx_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, + unsigned flags); /* Delete current key/data pair * @@ -1461,7 +1565,7 @@ int mdbx_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * - EACCES - an attempt was made to write in a read-only transaction. * - EINVAL - an invalid parameter was specified. */ -int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); +LIBMDBX_API int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); /* Return count of duplicates for current key. * @@ -1474,7 +1578,7 @@ int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); * - EINVAL - cursor is not initialized, or an invalid parameter was *specified. */ -int mdbx_cursor_count(MDB_cursor *cursor, size_t *countp); +LIBMDBX_API int mdbx_cursor_count(MDB_cursor *cursor, size_t *countp); /* Compare two data items according to a particular database. * @@ -1486,7 +1590,8 @@ int mdbx_cursor_count(MDB_cursor *cursor, size_t *countp); * [in] b The second item to compare * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ -int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); +LIBMDBX_API int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, + const MDB_val *b); /* Compare two data items according to a particular database. * @@ -1498,7 +1603,8 @@ int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); * [in] b The second item to compare * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ -int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); +LIBMDBX_API int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, + const MDB_val *b); /* A callback function used to print a message from the library. * @@ -1515,7 +1621,7 @@ typedef int(MDB_msg_func)(const char *msg, void *ctx); * [in] ctx Anything the message function needs * Returns < 0 on failure, >= 0 on success. 
*/ -int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); +LIBMDBX_API int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); /* Check for stale entries in the reader lock table. * @@ -1523,11 +1629,11 @@ int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); * [out] dead Number of stale slots that were cleared * Returns 0 on success, non-zero on failure. */ -int mdbx_reader_check(MDB_env *env, int *dead); +LIBMDBX_API int mdbx_reader_check(MDB_env *env, int *dead); -char *mdbx_dkey(MDB_val *key, char *buf); +LIBMDBX_API char *mdbx_dkey(MDB_val *key, char *buf); -int mdbx_env_close_ex(MDB_env *env, int dont_sync); +LIBMDBX_API int mdbx_env_close_ex(MDB_env *env, int dont_sync); /* Set threshold to force flush the data buffers to disk, * even of MDB_NOSYNC, MDB_NOMETASYNC and MDB_MAPASYNC flags @@ -1546,7 +1652,8 @@ int mdbx_env_close_ex(MDB_env *env, int dont_sync); * when a synchronous flush would be made. * Returns A non-zero error value on failure and 0 on success. */ -int mdbx_env_set_syncbytes(MDB_env *env, size_t bytes); +LIBMDBX_API int mdbx_env_set_syncbytes(MDB_env *env, size_t bytes); + /* Returns a lag of the reading. * * Returns an information for estimate how much given read-only @@ -1557,7 +1664,7 @@ int mdbx_env_set_syncbytes(MDB_env *env, size_t bytes); * Returns Number of transactions committed after the given was started for * read, or -1 on failure. */ -int mdbx_txn_straggler(MDB_txn *txn, int *percent); +LIBMDBX_API int mdbx_txn_straggler(MDB_txn *txn, int *percent); /* A callback function for killing a laggard readers, * but also could waiting ones. Called in case of MDB_MAP_FULL error. @@ -1573,8 +1680,8 @@ int mdbx_txn_straggler(MDB_txn *txn, int *percent); * 1 on success (reader was killed), * >1 on success (reader was SURE killed). 
*/ -typedef int(MDBX_oom_func)(MDB_env *env, int pid, void *thread_id, size_t txn, - unsigned gap, int retry); +typedef int(MDBX_oom_func)(MDB_env *env, int pid, mdbx_tid_t thread_id, + size_t txn, unsigned gap, int retry); /* Set the OOM callback. * @@ -1584,7 +1691,7 @@ typedef int(MDBX_oom_func)(MDB_env *env, int pid, void *thread_id, size_t txn, * [in] env An environment handle returned by mdbx_env_create(). * [in] oomfunc A #MDBX_oom_func function or NULL to disable. */ -void mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); +LIBMDBX_API void mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); /* Get the current oom_func callback. * @@ -1594,7 +1701,7 @@ void mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); * [in] env An environment handle returned by mdbx_env_create(). * Returns A #MDBX_oom_func function or NULL if disabled. */ -MDBX_oom_func *mdbx_env_get_oomfunc(MDB_env *env); +LIBMDBX_API MDBX_oom_func *mdbx_env_get_oomfunc(MDB_env *env); #define MDBX_DBG_ASSERT 1 #define MDBX_DBG_PRINT 2 @@ -1609,48 +1716,56 @@ MDBX_oom_func *mdbx_env_get_oomfunc(MDB_env *env); typedef void MDBX_debug_func(int type, const char *function, int line, const char *msg, va_list args); -int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn); +LIBMDBX_API int mdbx_setup_debug(int flags, MDBX_debug_func *logger, + long edge_txn); typedef int MDBX_pgvisitor_func(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, const char *type, int nentries, int payload_bytes, int header_bytes, int unused_bytes); -int mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, void *ctx); +LIBMDBX_API int mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, + void *ctx); typedef struct mdbx_canary { size_t x, y, z, v; } mdbx_canary; -int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary); -size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary); +LIBMDBX_API int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary); 
+LIBMDBX_API size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary); /* Returns: * - MDBX_RESULT_TRUE when no more data available * or cursor not positioned; * - MDBX_RESULT_FALSE when data available; * - Otherwise the error code. */ -int mdbx_cursor_eof(MDB_cursor *mc); +LIBMDBX_API int mdbx_cursor_eof(MDB_cursor *mc); /* Returns: MDBX_RESULT_TRUE, MDBX_RESULT_FALSE or Error code. */ -int mdbx_cursor_on_first(MDB_cursor *mc); +LIBMDBX_API int mdbx_cursor_on_first(MDB_cursor *mc); /* Returns: MDBX_RESULT_TRUE, MDBX_RESULT_FALSE or Error code. */ -int mdbx_cursor_on_last(MDB_cursor *mc); +LIBMDBX_API int mdbx_cursor_on_last(MDB_cursor *mc); -int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, - MDB_val *old_data, unsigned flags); +LIBMDBX_API int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, + MDB_val *new_data, MDB_val *old_data, + unsigned flags); /* Same as mdbx_get(), but: * 1) if values_count is not NULL, then returns the count * of multi-values/duplicates for a given key. * 2) updates the key for pointing to the actual key's data inside DB. */ -int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, - int *values_count); +LIBMDBX_API int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, + MDB_val *data, int *values_count); -int mdbx_is_dirty(const MDB_txn *txn, const void *ptr); +LIBMDBX_API int mdbx_is_dirty(const MDB_txn *txn, const void *ptr); -int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, - MDB_dbi *dbi, MDB_cmp_func *keycmp, MDB_cmp_func *datacmp); +LIBMDBX_API int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, + MDB_dbi *dbi, MDB_cmp_func *keycmp, + MDB_cmp_func *datacmp); #ifdef __cplusplus } #endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + #endif /* _MDBX_H_ */ diff --git a/src/barriers.h b/src/barriers.h deleted file mode 100644 index 317e60bc..00000000 --- a/src/barriers.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2015-2017 Leonid Yuriev . 
- * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -/***************************************************************************** - * Properly compiler/memory/coherence barriers - * in the most portable way for libmdbx project. - * - * Feedback and comments are welcome. - * https://gist.github.com/leo-yuriev/ba186a6bf5cf3a27bae7 */ - -#pragma once -/* *INDENT-OFF* */ -/* clang-format off */ - -#if defined(__mips) && defined(__linux) - /* Only MIPS has explicit cache control */ -# include -#endif - -#if defined(__GNUC__) || defined(__clang__) -# define MDBX_INLINE __inline -#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ -# include -# if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) -# pragma intrinsic(__mf) -# elif defined(__i386__) || defined(__x86_64__) -# pragma intrinsic(_mm_mfence) -# endif -# define MDBX_INLINE __inline -#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) -# include -# define MDBX_INLINE inline -#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) \ - && (defined(HP_IA64) || defined(__ia64)) -# include -# define MDBX_INLINE -#elif defined(__IBMC__) && defined(__powerpc) -# include -# define MDBX_INLINE -#elif defined(_AIX) -# include -# include -# define MDBX_INLINE -#elif (defined(__osf__) && defined(__DECC)) || defined(__alpha) -# include -# include -# define MDBX_INLINE -#elif defined(__MWERKS__) - /* CodeWarrior - troubles ? */ -# pragma gcc_extensions -# define MDBX_INLINE -#elif defined(__SNC__) - /* Sony PS3 - troubles ? 
*/ -# define MDBX_INLINE -#else -# define MDBX_INLINE -#endif - -#if defined(__i386__) || defined(__x86_64__) \ - || defined(_M_AMD64) || defined(_M_IX86) \ - || defined(__i386) || defined(__amd64) \ - || defined(i386) || defined(__x86_64) \ - || defined(_AMD64_) || defined(_M_X64) -# define MDB_CACHE_IS_COHERENT 1 -#elif defined(__hppa) || defined(__hppa__) -# define MDB_CACHE_IS_COHERENT 1 -#endif - -#ifndef MDB_CACHE_IS_COHERENT -# define MDB_CACHE_IS_COHERENT 0 -#endif - -#define MDBX_BARRIER_COMPILER 0 -#define MDBX_BARRIER_MEMORY 1 - -static MDBX_INLINE void mdbx_barrier(int type) { -#if defined(__clang__) - __asm__ __volatile__ ("" ::: "memory"); - if (type > MDBX_BARRIER_COMPILER) -# if __has_extension(c_atomic) || __has_extension(cxx_atomic) - __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); -# else - __sync_synchronize(); -# endif -#elif defined(__GNUC__) - __asm__ __volatile__ ("" ::: "memory"); - if (type > MDBX_BARRIER_COMPILER) -# if defined(__ATOMIC_SEQ_CST) - __atomic_thread_fence(__ATOMIC_SEQ_CST); -# else - __sync_synchronize(); -# endif -#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ - __memory_barrier(); - if (type > MDBX_BARRIER_COMPILER) -# if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) - __mf(); -# elif defined(__i386__) || defined(__x86_64__) - _mm_mfence(); -# else -# error "Unknown target for Intel Compiler, please report to us." -# endif -#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) - __compiler_barrier(); - if (type > MDBX_BARRIER_COMPILER) - __machine_rw_barrier(); -#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) \ - && (defined(HP_IA64) || defined(__ia64)) - _Asm_sched_fence(/* LY: no-arg meaning 'all expect ALU', e.g. 
0x3D3D */); - if (type > MDBX_BARRIER_COMPILER) - _Asm_mf(); -#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) \ - || defined(__ppc64__) || defined(__powerpc64__) - __fence(); - if (type > MDBX_BARRIER_COMPILER) - __lwsync(); -#elif (defined(__osf__) && defined(__DECC)) || defined(__alpha) - __PAL_DRAINA(); /* LY: excessive ? */ - __MB(); -#else -# error "Could not guess the kind of compiler, please report to us." -#endif -} - -#define mdbx_compiler_barrier() \ - mdbx_barrier(MDBX_BARRIER_COMPILER) -#define mdbx_memory_barrier() \ - mdbx_barrier(MDBX_BARRIER_MEMORY) -#define mdbx_coherent_barrier() \ - mdbx_barrier(MDB_CACHE_IS_COHERENT ? MDBX_BARRIER_COMPILER : MDBX_BARRIER_MEMORY) - -static MDBX_INLINE void mdbx_invalidate_cache(void *addr, int nbytes) { - mdbx_coherent_barrier(); -#if defined(__mips) && defined(__linux) - /* MIPS has cache coherency issues. - * Note: for any nbytes >= on-chip cache size, entire is flushed. */ - cacheflush(addr, nbytes, DCACHE); -#elif defined(_M_MRX000) || defined(_MIPS_) -# error "Sorry, cacheflush() for MIPS not implemented" -#else - /* LY: assume no mmap/dcache issues. */ - (void) addr; - (void) nbytes; -#endif -} diff --git a/src/bits.h b/src/bits.h new file mode 100644 index 00000000..29c1663c --- /dev/null +++ b/src/bits.h @@ -0,0 +1,796 @@ +/* + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#pragma once +/* *INDENT-OFF* */ +/* clang-format off */ + +#ifndef _FILE_OFFSET_BITS +# define _FILE_OFFSET_BITS 64 +#endif + +#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS) +# define _CRT_SECURE_NO_WARNINGS +#endif + +#ifdef _MSC_VER +#pragma warning(disable : 4464) /* C4464: relative include path contains '..' */ +#pragma warning(disable : 4710) /* C4710: 'xyz': function not inlined */ +#pragma warning(disable : 4711) /* C4711: function 'xyz' selected for automatic inline expansion */ +//#pragma warning(disable : 4061) /* C4061: enumerator 'abc' in switch of enum 'xyz' is not explicitly handled by a case label */ +#pragma warning(disable : 4201) /* C4201: nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4706) /* C4706: assignment within conditional expression */ +#pragma warning(disable : 4127) /* C4127: conditional expression is constant */ +#endif /* _MSC_VER (warnings) */ + +#include "../mdbx.h" +#include "./defs.h" + +#if defined(USE_VALGRIND) +# include <valgrind/memcheck.h> +# ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE +/* LY: available since Valgrind 3.10 */ +# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# endif +#else +# define VALGRIND_CREATE_MEMPOOL(h,r,z) +# define VALGRIND_DESTROY_MEMPOOL(h) +# define VALGRIND_MEMPOOL_TRIM(h,a,s) +# define VALGRIND_MEMPOOL_ALLOC(h,a,s) +# define VALGRIND_MEMPOOL_FREE(h,a) +# define VALGRIND_MEMPOOL_CHANGE(h,a,b,s) +# define VALGRIND_MAKE_MEM_NOACCESS(a,s) +# define VALGRIND_MAKE_MEM_DEFINED(a,s) +# define VALGRIND_MAKE_MEM_UNDEFINED(a,s) +# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0) +# define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0) +#endif /* USE_VALGRIND */ + +#ifdef __SANITIZE_ADDRESS__ +# include <sanitizer/asan_interface.h> +#else +# define ASAN_POISON_MEMORY_REGION(addr, size) \ + ((void)(addr), 
(void)(size)) +# define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + ((void)(addr), (void)(size)) +#endif /* __SANITIZE_ADDRESS__ */ + +#include "./osal.h" + +#ifndef MDB_DEBUG +# define MDB_DEBUG 0 +#endif + +#if MDB_DEBUG +# undef NDEBUG +#endif + +#if defined(__GNUC__) && !__GNUC_PREREQ(4,2) + /* Actually libmdbx was not tested with compilers older than GCC from RHEL6. + * But you could remove this #warning and try to continue at your own risk. + * In such case please don't raise issues related ONLY to old compilers. + */ +# warning "libmdbx required at least GCC 4.2 compatible C/C++ compiler." +#endif + +#if defined(__GLIBC__) && !__GLIBC_PREREQ(2,12) + /* Actually libmdbx was not tested with something older than glibc 2.12 (from RHEL6). + * But you could remove this #warning and try to continue at your own risk. + * In such case please don't raise issues related ONLY to old systems. + */ +# warning "libmdbx required at least GLIBC 2.12." +#endif + +#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) +# define MISALIGNED_OK 1 /* TODO */ +#endif +#ifndef MISALIGNED_OK +# define MISALIGNED_OK 0 +#endif /* MISALIGNED_OK */ + +#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF +# error "Sanity checking failed: Two's complement, reasonably sized integer types" +#endif + +/*----------------------------------------------------------------------------*/ + +#ifndef ARRAY_LENGTH +# ifdef __cplusplus + template <typename T, size_t N> + char (&__ArraySizeHelper(T (&array)[N]))[N]; +# define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array))) +# else +# define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0])) +# endif +#endif /* ARRAY_LENGTH */ + +#ifndef ARRAY_END +# define ARRAY_END(array) (&array[ARRAY_LENGTH(array)]) +#endif /* ARRAY_END */ + +#ifndef STRINGIFY +# define STRINGIFY_HELPER(x) #x +# define STRINGIFY(x) STRINGIFY_HELPER(x) +#endif /* STRINGIFY */ + +#ifndef offsetof +# define offsetof(type, member) __builtin_offsetof(type, member) 
+#endif /* offsetof */ + +#ifndef container_of +# define container_of(ptr, type, member) \ + ((type *)((char *)(ptr) - offsetof(type, member))) +#endif /* container_of */ + +/* *INDENT-ON* */ +/* clang-format on */ + +#define FIXME "FIXME: " __FILE__ ", " STRINGIFY(__LINE__) + +/*----------------------------------------------------------------------------*/ + +/** handle for the DB used to track free pages. */ +#define FREE_DBI 0 +/** handle for the default DB. */ +#define MAIN_DBI 1 +/** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ +#define CORE_DBS 2 + +/** Number of meta pages - also hardcoded elsewhere */ +#define NUM_METAS 2 + +/* A generic unsigned ID number. These were entryIDs in back-bdb. +* Preferably it should have the same size as a pointer. +*/ +typedef size_t MDB_ID; + +/** A page number in the database. +* Note that 64 bit page numbers are overkill, since pages themselves +* already represent 12-13 bits of addressable memory, and the OS will +* always limit applications to a maximum of 63 bits of address space. +* +* @note In the #MDB_node structure, we only store 48 bits of this value, +* which thus limits us to only 60 bits of addressable data. +*/ +typedef MDB_ID pgno_t; + +/** A transaction ID. +* See struct MDB_txn.mt_txnid for details. +*/ +typedef MDB_ID txnid_t; + +/* An IDL is an ID List, a sorted array of IDs. The first +* element of the array is a counter for how many actual +* IDs are in the list. In the original back-bdb code, IDLs are +* sorted in ascending order. For libmdb IDLs are sorted in +* descending order. +*/ +typedef MDB_ID *MDB_IDL; + +/* An ID2 is an ID/pointer pair. +*/ +typedef struct MDB_ID2 { + MDB_ID mid; /* The ID */ + void *mptr; /* The pointer */ +} MDB_ID2; + +/* An ID2L is an ID2 List, a sorted array of ID2s. +* The first element's \b mid member is a count of how many actual +* elements are in the array. The \b mptr member of the first element is +* unused. 
+* The array is sorted in ascending order by \b mid. +*/ +typedef MDB_ID2 *MDB_ID2L; + +/** Used for offsets within a single page. +* Since memory pages are typically 4 or 8KB in size, 12-13 bits, +* this is plenty. +*/ +typedef uint16_t indx_t; + +#pragma pack(push, 1) + +/** The information we store in a single slot of the reader table. +* In addition to a transaction ID, we also record the process and +* thread ID that owns a slot, so that we can detect stale information, +* e.g. threads or processes that went away without cleaning up. +* @note We currently don't check for stale records. We simply re-init +* the table when we know that we're the only process opening the +* lock file. +*/ +typedef struct MDB_rxbody { + /** Current Transaction ID when this transaction began, or (txnid_t)-1. + * Multiple readers that start at the same time will probably have the + * same ID here. Again, it's not important to exclude them from + * anything; all we need to know is which version of the DB they + * started from so we can avoid overwriting any data used in that + * particular version. + */ + volatile txnid_t mrb_txnid; + /** The process ID of the process owning this reader txn. */ + volatile mdbx_pid_t mrb_pid; + /** The thread ID of the thread owning this txn. */ + volatile mdbx_tid_t mrb_tid; +} MDB_rxbody; + +/** The actual reader record, with cacheline padding. */ +typedef struct MDB_reader { + union { + MDB_rxbody mrx; +/** shorthand for mrb_txnid */ +#define mr_txnid mru.mrx.mrb_txnid +#define mr_pid mru.mrx.mrb_pid +#define mr_tid mru.mrx.mrb_tid + /** cache line alignment */ + char pad[(sizeof(MDB_rxbody) + MDBX_CACHELINE_SIZE - 1) & + ~(MDBX_CACHELINE_SIZE - 1)]; + } mru; +} MDB_reader; + +/** Information about a single database in the environment. 
*/ +typedef struct MDB_db { + uint32_t md_xsize; /**< also ksize for LEAF2 pages */ + uint16_t md_flags; /**< @ref mdbx_dbi_open */ + uint16_t md_depth; /**< depth of this tree */ + pgno_t md_branch_pages; /**< number of internal pages */ + pgno_t md_leaf_pages; /**< number of leaf pages */ + pgno_t md_overflow_pages; /**< number of overflow pages */ + size_t md_entries; /**< number of data items */ + pgno_t md_root; /**< the root page of this tree */ +} MDB_db; + +/** Meta page content. +* A meta page is the start point for accessing a database snapshot. +* Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). +*/ +typedef struct MDB_meta { + /** Stamp identifying this as an LMDB file. It must be set + * to #MDB_MAGIC. */ + uint32_t mm_magic; + /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ + uint32_t mm_version; + size_t mm_mapsize; /**< size of mmap region */ + MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ + /** The size of pages used in this DB */ +#define mm_psize mm_dbs[FREE_DBI].md_xsize +/** Any persistent environment flags. @ref mdbx_env */ +#define mm_flags mm_dbs[FREE_DBI].md_flags + /** Last used page in the datafile. + * Actually the file may be shorter if the freeDB lists the final pages. + */ + pgno_t mm_last_pg; + volatile txnid_t mm_txnid; /**< txnid that committed this page */ +#define MDB_DATASIGN_NONE 0u +#define MDB_DATASIGN_WEAK 1u + volatile uint64_t mm_datasync_sign; +#define META_IS_WEAK(meta) ((meta)->mm_datasync_sign == MDB_DATASIGN_WEAK) +#define META_IS_STEADY(meta) ((meta)->mm_datasync_sign > MDB_DATASIGN_WEAK) + +#if MDBX_MODE_ENABLED + volatile mdbx_canary mm_canary; +#endif +} MDB_meta; + +/** Common header for all page types. The page type depends on #mp_flags. +* +* #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with +* sorted #mp_ptrs[] entries referring to them. 
Exception: #P_LEAF2 pages +* omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. +* +* #P_OVERFLOW records occupy one or more contiguous pages where only the +* first has a page header. They hold the real data of #F_BIGDATA nodes. +* +* #P_SUBP sub-pages are small leaf "pages" with duplicate data. +* A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. +* (Duplicate data can also go in sub-databases, which use normal pages.) +* +* #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. +* +* Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once +* in the snapshot: Either used by a database or listed in a freeDB record. +*/ +typedef struct MDB_page { +#define mp_pgno mp_p.p_pgno +#define mp_next mp_p.p_next + union { + pgno_t p_pgno; /**< page number */ + struct MDB_page *p_next; /**< for in-memory list of freed pages */ + } mp_p; + uint16_t mp_leaf2_ksize; /**< key size if this is a LEAF2 page */ + /** @defgroup mdbx_page Page Flags + * @ingroup internal + * Flags for the page headers. 
+ * @{ + */ +#define P_BRANCH 0x01 /**< branch page */ +#define P_LEAF 0x02 /**< leaf page */ +#define P_OVERFLOW 0x04 /**< overflow page */ +#define P_META 0x08 /**< meta page */ +#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ +#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ +#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ +#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ +#define P_KEEP 0x8000 /**< leave this page alone during spill */ + /** @} */ + uint16_t mp_flags; /**< @ref mdbx_page */ +#define mp_lower mp_pb.pb.pb_lower +#define mp_upper mp_pb.pb.pb_upper +#define mp_pages mp_pb.pb_pages + union { + struct { + indx_t pb_lower; /**< lower bound of free space */ + indx_t pb_upper; /**< upper bound of free space */ + } pb; + uint32_t pb_pages; /**< number of overflow pages */ + } mp_pb; + indx_t mp_ptrs[1]; /**< dynamic size */ +} MDB_page; + +/** Size of the page header, excluding dynamic data at the end */ +#define PAGEHDRSZ ((unsigned)offsetof(MDB_page, mp_ptrs)) + +/** Buffer for a stack-allocated meta page. +* The members define size and alignment, and silence type +* aliasing warnings. They are not used directly; that could +* mean incorrectly using several union members in parallel. +*/ +typedef union MDB_metabuf { + MDB_page mb_page; + struct { + char mm_pad[PAGEHDRSZ]; + MDB_meta mm_meta; + } mb_metabuf; +} MDB_metabuf; + +/* The header for the reader table (a memory-mapped lock file). */ +typedef struct MDBX_lockinfo { + /* Stamp identifying this as an LMDB file. It must be set to MDB_MAGIC. */ + uint64_t mti_magic; + /* Format of this lock file. Must be set to MDB_LOCK_FORMAT. */ + uint64_t mti_format; + + /* The ID of the last transaction committed to the database. + * This is recorded here only for convenience; the value can always + * be determined by reading the main database meta pages. 
*/ + volatile txnid_t mti_txnid; +#ifdef MDBX_OSAL_LOCK + MDBX_OSAL_LOCK mti_wmutex; +#endif + + /* The number of slots that have been used in the reader table. + * This always records the maximum count, it is not decremented + * when readers release their slots. */ + __cache_aligned volatile unsigned mti_numreaders; +#ifdef MDBX_OSAL_LOCK + /* Mutex protecting access to this table. */ + MDBX_OSAL_LOCK mti_rmutex; +#endif + MDB_reader mti_readers[1]; +} MDBX_lockinfo; + +#pragma pack(pop) + +/** Auxiliary DB info. +* The information here is mostly static/read-only. There is +* only a single copy of this record in the environment. +*/ +typedef struct MDB_dbx { + MDB_val md_name; /**< name of the database */ + MDB_cmp_func *md_cmp; /**< function for comparing keys */ + MDB_cmp_func *md_dcmp; /**< function for comparing data items */ +} MDB_dbx; + +#if MDBX_MODE_ENABLED +#define MDBX_MODE_SALT 0 +#else +#error !? +#endif + +/** A database transaction. +* Every operation requires a transaction handle. +*/ +struct MDB_txn { +#define MDBX_MT_SIGNATURE (0x93D53A31 ^ MDBX_MODE_SALT) + unsigned mt_signature; + MDB_txn *mt_parent; /**< parent of a nested txn */ + /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ + MDB_txn *mt_child; + pgno_t mt_next_pgno; /**< next unallocated page */ + /** The ID of this transaction. IDs are integers incrementing from 1. + * Only committed write transactions increment the ID. If a transaction + * aborts, the ID may be re-used by the next writer. + */ + txnid_t mt_txnid; + MDB_env *mt_env; /**< the DB environment */ + /** The list of reclaimed txns from freeDB */ + MDB_IDL mt_lifo_reclaimed; + /** The list of pages that became unused during this transaction. + */ + MDB_IDL mt_free_pgs; + /** The list of loose pages that became unused and may be reused + * in this transaction, linked through #NEXT_LOOSE_PAGE(page). 
+ */ + MDB_page *mt_loose_pgs; + /** Number of loose pages (#mt_loose_pgs) */ + int mt_loose_count; + /** The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. + */ + MDB_IDL mt_spill_pgs; + union { + /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ + MDB_ID2L dirty_list; + /** For read txns: This thread/txn's reader table slot, or NULL. */ + MDB_reader *reader; + } mt_u; + /** Array of records for each DB known in the environment. */ + MDB_dbx *mt_dbxs; + /** Array of MDB_db records for each known DB */ + MDB_db *mt_dbs; + /** Array of sequence numbers for each DB handle */ + unsigned *mt_dbiseqs; +/** @defgroup mt_dbflag Transaction DB Flags +* @ingroup internal +* @{ +*/ +#define DB_DIRTY 0x01 /**< DB was written in this txn */ +#define DB_STALE 0x02 /**< Named-DB record is older than txnID */ +#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ +#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ +#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ +#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ + /** @} */ + /** In write txns, array of cursors for each DB */ + MDB_cursor **mt_cursors; + /** Array of flags for each DB */ + unsigned char *mt_dbflags; + /** Number of DB records in use, or 0 when the txn is finished. + * This number only ever increments until the txn finishes; we + * don't decrement it when individual DB handles are closed. 
+ */ + MDB_dbi mt_numdbs; + +/** @defgroup mdbx_txn Transaction Flags +* @ingroup internal +* @{ +*/ +/** #mdbx_txn_begin() flags */ +#define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC | MDB_NOSYNC | MDB_RDONLY) +#define MDB_TXN_NOMETASYNC \ + MDB_NOMETASYNC /**< don't sync meta for this txn on commit */ +#define MDB_TXN_NOSYNC MDB_NOSYNC /**< don't sync this txn on commit */ +#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ + /* internal txn flags */ +#define MDB_TXN_WRITEMAP \ + MDB_WRITEMAP /**< copy of #MDB_env flag in writers \ + */ +#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ +#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ +#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ +#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ +#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ +/** most operations on the txn are currently illegal */ +#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED | MDB_TXN_ERROR | MDB_TXN_HAS_CHILD) + /** @} */ + unsigned mt_flags; /**< @ref mdbx_txn */ + /** #dirty_list room: Array size - \#dirty pages visible to this txn. + * Includes ancestor txns' dirty pages not hidden by other txns' + * dirty/spilled pages. Thus commit(nested txn) has room to merge + * dirty_list into mt_parent after freeing hidden mt_parent pages. + */ + unsigned mt_dirty_room; + +#if MDBX_MODE_ENABLED + mdbx_canary mt_canary; +#endif +}; + +/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. +* At 4 keys per node, enough for 2^64 nodes, so there's probably no need to +* raise this on a 64 bit machine. +*/ +#define CURSOR_STACK 32 + +struct MDB_xcursor; + +/** Cursors are used for all DB operations. +* A cursor holds a path of (page pointer, key index) from the DB +* root to a position in the DB, plus other state. #MDB_DUPSORT +* cursors include an xcursor to the current data item. 
Write txns +* track their cursors and keep them up to date when data moves. +* Exception: An xcursor's pointer to a #P_SUBP page can be stale. +* (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). +*/ +struct MDB_cursor { +#define MDBX_MC_SIGNATURE (0xFE05D5B1 ^ MDBX_MODE_SALT) +#define MDBX_MC_READY4CLOSE (0x2817A047 ^ MDBX_MODE_SALT) +#define MDBX_MC_WAIT4EOT (0x90E297A7 ^ MDBX_MODE_SALT) + unsigned mc_signature; + /** Next cursor on this DB in this txn */ + MDB_cursor *mc_next; + /** Backup of the original cursor if this cursor is a shadow */ + MDB_cursor *mc_backup; + /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ + struct MDB_xcursor *mc_xcursor; + /** The transaction that owns this cursor */ + MDB_txn *mc_txn; + /** The database handle this cursor operates on */ + MDB_dbi mc_dbi; + /** The database record for this cursor */ + MDB_db *mc_db; + /** The database auxiliary record for this cursor */ + MDB_dbx *mc_dbx; + /** The @ref mt_dbflag for this database */ + uint8_t *mc_dbflag; + uint16_t mc_snum; /**< number of pushed pages */ + uint16_t mc_top; /**< index of top page, normally mc_snum-1 */ + /** @defgroup mdbx_cursor Cursor Flags + * @ingroup internal + * Cursor state flags. + * @{ + */ +#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ +#define C_EOF 0x02 /**< No more data */ +#define C_SUB 0x04 /**< Cursor is a sub-cursor */ +#define C_DEL 0x08 /**< last op was a cursor_del */ +#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ +#define C_RECLAIMING 0x80 /**< FreeDB lookup is prohibited */ + /** @} */ + unsigned mc_flags; /**< @ref mdbx_cursor */ + MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ + indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ +}; + +/** Context for sorted-dup records. +* We could have gone to a fully recursive design, with arbitrarily +* deep nesting of sub-databases. 
/** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper.
 *
 * Evaluates to non-zero only when \b mc has an xcursor attached AND that
 * xcursor's embedded sub-cursor carries the #C_INITIALIZED flag.
 * \b mc is expanded more than once, so it must be free of side effects.
 */
#define XCURSOR_INITED(mc)                                                     \
  ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))

/** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed
 * when the node which contains the sub-page may have moved. Called
 * with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top].
 *
 * The macro dereferences \b mc->mc_xcursor without checking it, so it must
 * only be used after #XCURSOR_INITED(mc) has been verified. Only a node
 * flagged #F_DUPDATA but not #F_SUBDATA (i.e. one holding a sub-page)
 * causes an update: mc_pg[0] of the sub-cursor is re-pointed at the node
 * data. \b mp is captured once in a local before being handed to NODEPTR().
 */
#define XCURSOR_REFRESH(mc, mp, ki)                                            \
  do {                                                                         \
    MDB_page *xr_pg = (mp);                                                    \
    MDB_node *xr_node = NODEPTR(xr_pg, ki);                                    \
    if ((xr_node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA)            \
      (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node);                \
  } while (0)
/** The database environment.
 *
 * The first member is a signature word (expected value #MDBX_ME_SIGNATURE).
 * The me_flags word mixes the public environment flags with the internal
 * MDB_FATAL_ERROR / MDB_ENV_ACTIVE / MDB_ENV_TXKEY bits defined below.
 */
struct MDB_env {
#define MDBX_ME_SIGNATURE (0x9A899641 ^ MDBX_MODE_SALT)
  unsigned me_signature;
  mdbx_filehandle_t me_fd;  /**< The main data file */
  mdbx_filehandle_t me_lfd; /**< The lock file */
/** Failed to update the meta page. Probably an I/O error. */
#define MDB_FATAL_ERROR 0x80000000U
/** Some fields are initialized. */
#define MDB_ENV_ACTIVE 0x20000000U
/** me_txkey is set */
#define MDB_ENV_TXKEY 0x10000000U
  uint32_t me_flags;      /**< @ref mdbx_env */
  unsigned me_psize;      /**< DB page size, inited from me_os_psize */
  unsigned me_os_psize;   /**< OS page size, from mdbx_syspagesize() */
  unsigned me_maxreaders; /**< size of the reader table */
  /** Max #MDBX_lockinfo.mti_numreaders of interest to #mdbx_env_close() */
  unsigned me_close_readers;
  MDB_dbi me_numdbs;          /**< number of DBs opened */
  MDB_dbi me_maxdbs;          /**< size of the DB table */
  mdbx_pid_t me_pid;          /**< process ID of this env */
  char *me_path;              /**< path to the DB files */
  char *me_map;               /**< the memory map of the data file */
  MDBX_lockinfo *me_txns;     /**< the memory map of the lock file, never NULL */
  void *me_pbuf;              /**< scratch area for DUPSORT put() */
  MDB_txn *me_txn;            /**< current write transaction */
  MDB_txn *me_txn0;           /**< prealloc'd write transaction */
  size_t me_mapsize;          /**< size of the data memory map */
  pgno_t me_maxpg;            /**< me_mapsize / me_psize */
  MDB_dbx *me_dbxs;           /**< array of static DB info */
  uint16_t *me_dbflags;       /**< array of flags from MDB_db.md_flags */
  unsigned *me_dbiseqs;       /**< array of dbi sequence numbers */
  mdbx_thread_key_t me_txkey; /**< thread-key for readers */
  txnid_t me_pgoldest;        /**< ID of oldest reader last time we looked */
  MDB_pgstate me_pgstate;     /**< state of old pages from freeDB */
/** shorthand for me_pgstate.mf_pglast */
#define me_pglast me_pgstate.mf_pglast
/** shorthand for me_pgstate.mf_pghead */
#define me_pghead me_pgstate.mf_pghead
  MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */
  /** IDL of pages that became unused in a write txn */
  MDB_IDL me_free_pgs;
  /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
  MDB_ID2L me_dirty_list;
  /** Max number of freelist items that can fit in a single overflow page */
  unsigned me_maxfree_1pg;
  /** Max size of a node on a page */
  unsigned me_nodemax;
  unsigned me_maxkey_limit; /**< max size of a key */
  int me_live_reader;       /**< have liveness lock in reader table */
  void *me_userctx;         /**< User-settable context */
#if MDB_DEBUG
  MDB_assert_func *me_assert_func; /**< Callback for assertion failures */
#endif
  uint64_t me_sync_pending; /**< Total dirty/committed bytes since the last
                               mdbx_env_sync() */
  uint64_t
      me_sync_threshold; /**< Threshold of above to force synchronous flush */
#if MDBX_MODE_ENABLED
  MDBX_oom_func *me_oom_func; /**< Callback for kicking laggard readers */
#endif
#ifdef USE_VALGRIND
  int me_valgrind_handle;
#endif
};
/* Run-time switches for assertions, auditing and debug logging.
 *
 * With MDB_DEBUG enabled each predicate tests the corresponding bit(s) of
 * the global mdbx_runtime_flags; the result is wrapped in unlikely() because
 * these facilities are expected to be off on the hot path.
 *
 * mdbx_debug_enabled(type) is true when `type`, restricted to the
 * MDBX_DBG_TRACE / MDBX_DBG_EXTRA bits, intersects mdbx_runtime_flags.
 * The argument is parenthesized so that a compound expression such as
 * `A | B` is masked as a whole (`&` binds tighter than `|`).
 *
 * Without MDB_DEBUG assertions and audit are compiled out, and debug
 * logging is unconditionally on unless NDEBUG is defined. */
#if MDB_DEBUG

#define mdbx_assert_enabled() unlikely(mdbx_runtime_flags & MDBX_DBG_ASSERT)

#define mdbx_audit_enabled() unlikely(mdbx_runtime_flags & MDBX_DBG_AUDIT)

#define mdbx_debug_enabled(type)                                               \
  unlikely(mdbx_runtime_flags & ((type) & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA)))

#else
#ifndef NDEBUG
#define mdbx_debug_enabled(type) (1)
#else
#define mdbx_debug_enabled(type) (0)
#endif
#define mdbx_audit_enabled() (0)
#define mdbx_assert_enabled() (0)
#endif /* MDB_DEBUG */
\ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) \ + mdbx_debug_log(MDBX_DBG_EXTRA, NULL, 0, fmt, ##__VA_ARGS__); \ + } while (0) + +#define mdbx_ensure_msg(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + mdbx_assert_fail(env, msg, __FUNCTION__, __LINE__); \ + } while (0) + +#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define mdbx_assert(env, expr) \ + do { \ + if (mdbx_assert_enabled()) \ + mdbx_ensure(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) + +/*----------------------------------------------------------------------------*/ + +int mdbx_reader_check0(MDB_env *env, int rlocked, int *dead); + +#define METAPAGE_1(env) (&((MDB_metabuf *)(env)->me_map)->mb_metabuf.mm_meta) + +#define METAPAGE_2(env) \ + (&((MDB_metabuf *)((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) + +static __inline MDB_meta *mdbx_meta_head_w(MDB_env *env) { + MDB_meta *a = METAPAGE_1(env); + MDB_meta *b = METAPAGE_2(env); + txnid_t head_txnid = env->me_txns->mti_txnid; + + mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); + if (a->mm_txnid == head_txnid) + return a; + if (likely(b->mm_txnid == head_txnid)) + return b; + + mdbx_debug("me_txns->mti_txnid not match meta-pages"); + mdbx_assert(env, head_txnid == a->mm_txnid || head_txnid == b->mm_txnid); + env->me_flags |= MDB_FATAL_ERROR; + return a; +} + +void mdbx_rthc_dtor(void *rthc); +void mdbx_rthc_lock(void); +void mdbx_rthc_unlock(void); +int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDB_reader *begin, MDB_reader *end); +void mdbx_rthc_remove(mdbx_thread_key_t key); +void mdbx_rthc_cleanup(void); diff --git a/src/defs.h b/src/defs.h new file mode 100644 index 00000000..a00fce4a --- /dev/null +++ b/src/defs.h @@ -0,0 +1,302 @@ 
/* Version-test helpers: each one evaluates to non-zero when the respective
 * toolchain component is present and its (major, minor) version is at least
 * (maj, min), and to 0 otherwise. A definition already supplied by the
 * system headers is left untouched. Usable in #if expressions. */

#if !defined(__GNUC_PREREQ) && defined(__GNUC__) && defined(__GNUC_MINOR__)
#   define __GNUC_PREREQ(maj, min) \
      ((__GNUC__ * 65536 + __GNUC_MINOR__) >= ((maj) * 65536 + (min)))
#elif !defined(__GNUC_PREREQ)
#   define __GNUC_PREREQ(maj, min) (0)
#endif /* __GNUC_PREREQ */

#if !defined(__CLANG_PREREQ) && defined(__clang__)
#   define __CLANG_PREREQ(maj,min) \
      ((__clang_major__ * 65536 + __clang_minor__) >= ((maj) * 65536 + (min)))
#elif !defined(__CLANG_PREREQ)
#   define __CLANG_PREREQ(maj,min) (0)
#endif /* __CLANG_PREREQ */

#if !defined(__GLIBC_PREREQ) && defined(__GLIBC__) && defined(__GLIBC_MINOR__)
#   define __GLIBC_PREREQ(maj, min) \
      ((__GLIBC__ * 65536 + __GLIBC_MINOR__) >= ((maj) * 65536 + (min)))
#elif !defined(__GLIBC_PREREQ)
#   define __GLIBC_PREREQ(maj, min) (0)
#endif /* __GLIBC_PREREQ */
/* Provide `nullptr` as an alias of NULL for plain C and for pre-C++11
 * compilers (MSVC excluded in C++ mode, as before).
 *
 * The `!defined(nullptr)` guard now applies to BOTH cases. Previously,
 * because `&&` binds tighter than `||`, the guard covered only the plain-C
 * alternative and an existing `nullptr` macro could be redefined. */
#if !defined(nullptr) && \
    (!defined(__cplusplus) || (__cplusplus < 201103L && !defined(_MSC_VER)))
#   define nullptr NULL
#endif
*/ +# endif +#endif /* __noinline */ + +#ifndef __must_check_result +# if defined(__GNUC__) || __has_attribute(warn_unused_result) +# define __must_check_result __attribute__((warn_unused_result)) +# else +# define __must_check_result +# endif +#endif /* __must_check_result */ + +#ifndef __deprecated +# if defined(__GNUC__) || __has_attribute(deprecated) +# define __deprecated __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define __deprecated __declspec(deprecated) +# else +# define __deprecated +# endif +#endif /* __deprecated */ + +#ifndef __packed +# if defined(__GNUC__) || __has_attribute(packed) +# define __packed __attribute__((packed)) +# else +# define __packed +# endif +#endif /* __packed */ + +#ifndef __aligned +# if defined(__GNUC__) || __has_attribute(aligned) +# define __aligned(N) __attribute__((aligned(N))) +# elif defined(_MSC_VER) +# define __aligned(N) __declspec(align(N)) +# else +# define __aligned(N) +# endif +#endif /* __aligned */ + +#ifndef __noreturn +# if defined(__GNUC__) || __has_attribute(noreturn) +# define __noreturn __attribute__((noreturn)) +# elif defined(_MSC_VER) +# define __noreturn __declspec(noreturn) +# else +# define __noreturn +# endif +#endif /* __noreturn */ + +#ifndef __nothrow +# if defined(__GNUC__) || __has_attribute(nothrow) +# define __nothrow __attribute__((nothrow)) +# elif defined(_MSC_VER) && defined(__cplusplus) +# define __nothrow __declspec(nothrow) +# else +# define __nothrow +# endif +#endif /* __nothrow */ + +#ifndef __pure_function + /* Many functions have no effects except the return value and their + * return value depends only on the parameters and/or global variables. + * Such a function can be subject to common subexpression elimination + * and loop optimization just as an arithmetic operator would be. + * These functions should be declared with the attribute pure. 
/* Marks a symbol as not exported from the shared object.
 * The include guard tests the very macro being defined (`__hidden`);
 * the former guard tested `__dll_hidden`, a name that is never defined,
 * so it could not prevent a redefinition. */
#ifndef __hidden
#   if defined(__GNUC__) || __has_attribute(visibility)
#       define __hidden __attribute__((visibility("hidden")))
#   else
#       define __hidden
#   endif
#endif /* __hidden */
/* Packs four values into one 32-bit word, most significant first:
 * `a` lands in bits 31..24, `b` in 23..16, `c` in 15..8 and `d` in 7..0.
 * Each argument is converted to uint32_t before shifting. */
#define MDBX_TETRAD(a, b, c, d)                                                \
  ((((uint32_t)(a)) << 24) | (((uint32_t)(b)) << 16) |                         \
   (((uint32_t)(c)) << 8) | ((uint32_t)(d)))
+ * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "./bits.h" + +/* Some platforms define the EOWNERDEAD error code + * even though they don't support Robust Mutexes. + * Compile with -DMDB_USE_ROBUST=0. */ +#ifndef MDB_USE_ROBUST +/* Howard Chu: Android currently lacks Robust Mutex support */ +#if defined(EOWNERDEAD) && \ + !defined(ANDROID) /* LY: glibc before 2.10 has a troubles with Robust \ + Mutex too. */ \ + && __GLIBC_PREREQ(2, 10) +#define MDB_USE_ROBUST 1 +#else +#define MDB_USE_ROBUST 0 +#endif +#endif /* MDB_USE_ROBUST */ + +/*----------------------------------------------------------------------------*/ +/* rthc */ + +static mdbx_mutex_t mdbx_rthc_mutex = PTHREAD_MUTEX_INITIALIZER; + +void mdbx_rthc_lock(void) { + mdbx_ensure(NULL, pthread_mutex_lock(&mdbx_rthc_mutex) == 0); +} + +void mdbx_rthc_unlock(void) { + mdbx_ensure(NULL, pthread_mutex_unlock(&mdbx_rthc_mutex) == 0); +} + +/*----------------------------------------------------------------------------*/ +/* lck */ + +static int mdbx_lck_op(mdbx_filehandle_t fd, int op, int lck, off_t offset); +static int mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, int rc); + +int mdbx_lck_init(MDB_env *env) { + pthread_mutexattr_t ma; + int rc = pthread_mutexattr_init(&ma); + if (rc) + return rc; + + rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); + if (rc) + goto bailout; + +#if MDB_USE_ROBUST +#if __GLIBC_PREREQ(2, 12) + rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); +#else + rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); +#endif + if (rc) + goto bailout; +#endif /* MDB_USE_ROBUST */ + +#if _POSIX_C_SOURCE >= 199506L + rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT); + if (rc == ENOTSUP) + rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE); + if (rc) + goto bailout; +#endif /* PTHREAD_PRIO_INHERIT */ + + rc = 
pthread_mutex_init(&env->me_txns->mti_rmutex, &ma); + if (rc) + goto bailout; + rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &ma); + +bailout: + pthread_mutexattr_destroy(&ma); + return rc; +} + +void mdbx_lck_destroy(MDB_env *env) { + if (env->me_lfd != INVALID_HANDLE_VALUE) { + /* try get exclusive access */ + if (mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, 0) == 0) { + /* got exclusive, drown mutexes */ + int rc = pthread_mutex_destroy(&env->me_txns->mti_rmutex); + if (rc == 0) + rc = pthread_mutex_destroy(&env->me_txns->mti_wmutex); + assert(rc == 0); + (void)rc; + /* lock would be released (by kernel) while the me_lfd will be closed */ + } + } +} + +static int mdbx_robust_lock(MDB_env *env, pthread_mutex_t *mutex) { + int rc = pthread_mutex_lock(mutex); + if (unlikely(rc != 0)) + rc = mdbx_mutex_failed(env, mutex, rc); + return rc; +} + +static int mdbx_robust_unlock(MDB_env *env, pthread_mutex_t *mutex) { + int rc = pthread_mutex_unlock(mutex); + if (unlikely(rc != 0)) + rc = mdbx_mutex_failed(env, mutex, rc); + return rc; +} + +int mdbx_rdt_lock(MDB_env *env) { + return mdbx_robust_lock(env, &env->me_txns->mti_rmutex); +} + +void mdbx_rdt_unlock(MDB_env *env) { + int rc = mdbx_robust_unlock(env, &env->me_txns->mti_rmutex); + if (unlikely(rc != 0)) + mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); +} + +int mdbx_txn_lock(MDB_env *env) { + return mdbx_robust_lock(env, &env->me_txns->mti_wmutex); +} + +void mdbx_txn_unlock(MDB_env *env) { + int rc = mdbx_robust_unlock(env, &env->me_txns->mti_wmutex); + if (unlikely(rc != 0)) + mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); +} + +int mdbx_lck_seize(MDB_env *env) { + /* try exclusive access */ + int rc = mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, 0); + if (rc == 0) + /* got exclusive */ + return MDBX_RESULT_TRUE; + if (rc == EAGAIN || rc == EACCES || rc == EBUSY) { + /* get shared access */ + rc = mdbx_lck_op(env->me_lfd, F_SETLKW, F_RDLCK, 0); + if (rc == 0) { + /* got shared, try 
exclusive again */ + rc = mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, 0); + if (rc == 0) + /* now got exclusive */ + return MDBX_RESULT_TRUE; + if (rc == EAGAIN || rc == EACCES || rc == EBUSY) + /* unable exclusive, but stay shared */ + return MDBX_RESULT_FALSE; + } + } + assert(rc != MDBX_RESULT_FALSE && rc != MDBX_RESULT_TRUE); + return rc; +} + +int mdbx_lck_downgrade(MDB_env *env) { + return mdbx_lck_op(env->me_lfd, F_SETLK, F_RDLCK, 0); +} + +int mdbx_rpid_set(MDB_env *env) { + return mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, env->me_pid); +} + +int mdbx_rpid_clear(MDB_env *env) { + return mdbx_lck_op(env->me_lfd, F_SETLKW, F_UNLCK, env->me_pid); +} + +int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid) { + int rc = mdbx_lck_op(env->me_lfd, F_GETLK, F_WRLCK, pid); + if (rc == 0) + return MDBX_RESULT_FALSE; + if (rc < 0 && -rc == pid) + return MDBX_RESULT_TRUE; + return rc; +} + +static int mdbx_lck_op(mdbx_filehandle_t fd, int op, int lck, off_t offset) { + for (;;) { + int rc; + struct flock lock_op; + memset(&lock_op, 0, sizeof(lock_op)); + lock_op.l_type = lck; + lock_op.l_whence = SEEK_SET; + lock_op.l_start = offset; + lock_op.l_len = 1; + if ((rc = fcntl(fd, op, &lock_op)) == 0) { + if (op == F_GETLK && lock_op.l_type != F_UNLCK) + rc = -lock_op.l_pid; + } else if ((rc = errno) == EINTR) { + continue; + } + return rc; + } +} + +static int __cold mdbx_mutex_failed(MDB_env *env, mdbx_mutex_t *mutex, int rc) { +#if MDB_USE_ROBUST + if (unlikely(rc == EOWNERDEAD)) { + int rlocked, rc2; + + /* We own the mutex. Clean up after dead previous owner. */ + rc = MDB_SUCCESS; + rlocked = (mutex == &env->me_txns->mti_rmutex); + if (!rlocked) { + /* Keep mtb.mti_txnid updated, otherwise next writer can + * overwrite data which latest meta page refers to. + * + * LY: Hm, how this can happen, if the mtb.mti_txnid + * is updating only at the finish of a successful commit ? 
+ */ + MDB_meta *meta = mdbx_meta_head_w(env); + assert(env->me_txns->mti_txnid == meta->mm_txnid); + (void)meta; + /* env is hosed if the dead thread was ours (env->me_txn is still set) */ + if (env->me_txn) { + env->me_flags |= MDB_FATAL_ERROR; + env->me_txn = NULL; + rc = MDB_PANIC; + } + } + mdbx_debug("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? "this process' env is hosed" : "recovering")); + rc2 = mdbx_reader_check0(env, rlocked, NULL); + if (rc2 == 0) +#if __GLIBC_PREREQ(2, 12) + rc2 = pthread_mutex_consistent(mutex); +#else + rc2 = pthread_mutex_consistent_np(mutex); +#endif + if (rc || (rc = rc2)) { + mdbx_debug("mutex recovery failed, %s", mdbx_strerror(rc)); + pthread_mutex_unlock(mutex); + } + } +#endif /* MDB_USE_ROBUST */ + /* any remaining error except EDEADLK flags the env with MDB_FATAL_ERROR and is reported as MDB_PANIC */ + if (unlikely(rc)) { + mdbx_debug("lock mutex failed, %s", mdbx_strerror(rc)); + if (rc != EDEADLK) { + env->me_flags |= MDB_FATAL_ERROR; + rc = MDB_PANIC; + } + } + return rc; +} diff --git a/src/lck-windows.c b/src/lck-windows.c new file mode 100644 index 00000000..46525198 --- /dev/null +++ b/src/lck-windows.c @@ -0,0 +1,314 @@ +/* + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "./bits.h" + +/* PREAMBLE FOR WINDOWS: + * + * We are not concerned for performance here. + * If you are running Windows a performance could NOT be the goal. + * Otherwise please use Linux.
+ * + * Regards, + * LY + */ + +/*----------------------------------------------------------------------------*/ +/* rthc: the critical section behind mdbx_rthc_lock()/mdbx_rthc_unlock() */ + +static CRITICAL_SECTION rthc_critical_section; +/* TLS callback: creates/deletes the critical section on process attach/detach and calls mdbx_rthc_cleanup() on thread detach. */ +static void NTAPI tls_callback(PVOID module, DWORD reason, PVOID reserved) { + (void)module; + (void)reserved; + switch (reason) { + case DLL_PROCESS_ATTACH: + InitializeCriticalSection(&rthc_critical_section); + break; + case DLL_PROCESS_DETACH: + DeleteCriticalSection(&rthc_critical_section); + break; + + case DLL_THREAD_ATTACH: + break; + case DLL_THREAD_DETACH: + mdbx_rthc_cleanup(); + break; + } +} + +void mdbx_rthc_lock(void) { EnterCriticalSection(&rthc_critical_section); } + +void mdbx_rthc_unlock(void) { LeaveCriticalSection(&rthc_critical_section); } + +/* *INDENT-OFF* */ +/* clang-format off */ +#if defined(_MSC_VER) +# pragma const_seg(push) +# pragma data_seg(push) + +# ifdef _WIN64 + /* kick the linker to create the TLS directory if not already done */ +# pragma comment(linker, "/INCLUDE:_tls_used") + /* Force some symbol references. */ +# pragma comment(linker, "/INCLUDE:mdbx_tls_callback") + /* specific const-segment for WIN64 */ +# pragma const_seg(".CRT$XLB") + const +# else + /* kick the linker to create the TLS directory if not already done */ +# pragma comment(linker, "/INCLUDE:__tls_used") + /* Force some symbol references.
*/ +# pragma comment(linker, "/INCLUDE:_mdbx_tls_callback") + /* specific data-segment for WIN32 */ +# pragma data_seg(".CRT$XLB") +# endif + + PIMAGE_TLS_CALLBACK mdbx_tls_callback = tls_callback; +# pragma data_seg(pop) +# pragma const_seg(pop) + +#elif defined(__GNUC__) +# ifdef _WIN64 + const +# endif + PIMAGE_TLS_CALLBACK mdbx_tls_callback __attribute__((section(".CRT$XLB"), used)) + = tls_callback; +#else +# error FIXME +#endif +/* *INDENT-ON* */ +/* clang-format on */ + +/*----------------------------------------------------------------------------*/ + +#define LCK_SHARED 0 +#define LCK_EXCLUSIVE LOCKFILE_EXCLUSIVE_LOCK +#define LCK_WAITFOR 0 +#define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY +/* LockFileEx() wrapper for `bytes` bytes starting at `offset`. NOTE(review): only hEvent/Offset/OffsetHigh of the OVERLAPPED are set here - confirm the remaining members may stay uninitialized. */ +static BOOL flock(mdbx_filehandle_t fd, DWORD flags, off_t offset, + size_t bytes) { + OVERLAPPED ov; + ov.hEvent = 0; + ov.Offset = (DWORD)offset; + ov.OffsetHigh = HIGH_DWORD(offset); + return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov); +} +/* UnlockFile() wrapper for `bytes` bytes starting at `offset`. */ +static BOOL funlock(mdbx_filehandle_t fd, off_t offset, size_t bytes) { + return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, + HIGH_DWORD(bytes)); +} + +/*----------------------------------------------------------------------------*/ +/* global `write` lock for write-txn processing, + * (exclusive locking of the first me_psize * 2 bytes of the data file) */ + +int mdbx_txn_lock(MDB_env *env) { + if (flock(env->me_fd, LCK_EXCLUSIVE | LCK_WAITFOR, 0, env->me_psize * 2)) + return MDB_SUCCESS; + return GetLastError(); +} + +void mdbx_txn_unlock(MDB_env *env) { + if (!funlock(env->me_fd, 0, env->me_psize * 2)) + mdbx_panic("%s failed: errcode %u", mdbx_func_, GetLastError()); +} + +/*----------------------------------------------------------------------------*/ +/* global `read` lock for readers registration, + * exclusive locking `mti_numreaders` (second) cacheline */ + +#define LCK_LO_OFFSET 0 +#define LCK_LO_LEN offsetof(MDBX_lockinfo, mti_numreaders) +#define LCK_UP_OFFSET LCK_LO_LEN +#define LCK_UP_LEN
(MDBX_LOCKINFO_WHOLE_SIZE - LCK_UP_OFFSET) +#define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN +#define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN +/* Exclusively locks the upper part of the lock file, waiting if needed; does nothing when there is no lock file. */ +int mdbx_rdt_lock(MDB_env *env) { + if (env->me_lfd == INVALID_HANDLE_VALUE) + return MDB_SUCCESS; /* readonly database in readonly filesystem */ + + /* transition from S-? (used) to S-E (locked), i.e. exclusive lock upper-part */ + if (flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) + return MDB_SUCCESS; + return GetLastError(); +} +/* Unlocks the upper part of the lock file; panics if the unlock fails. */ +void mdbx_rdt_unlock(MDB_env *env) { + if (env->me_lfd != INVALID_HANDLE_VALUE) { + /* transition from S-E (locked) to S-? (used), i.e. unlock upper-part */ + if (!funlock(env->me_lfd, LCK_UPPER)) + mdbx_panic("%s failed: errcode %u", mdbx_func_, GetLastError()); + } +} + +/*----------------------------------------------------------------------------*/ +/* global `initial` lock for lockfile initialization, + * exclusive/shared locking first cacheline */ + +/* FIXME: locking scheme/algo description (a state reads <lower-part>-<upper-part>: ? = not locked, S = shared, E = exclusive). + ?-? = free + S-? = used + E-? + ?-S + ?-E = middle + S-S + S-E = locked + E-S + E-E = exclusive +*/ +/* Nothing to initialize in this implementation: always returns MDB_SUCCESS. */ +int mdbx_lck_init(MDB_env *env) { + (void)env; + return MDB_SUCCESS; +} + +/* Seize state as exclusive (E-E and returns MDBX_RESULT_TRUE) + * or used (S-? and returns MDBX_RESULT_FALSE), otherwise returns an error */ +int mdbx_lck_seize(MDB_env *env) { + /* 1) now on ?-?
(free), get ?-E (middle) */ + if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) + return GetLastError() /* 2) something went wrong, give up */; + + /* 3) now on ?-E (middle), try E-E (exclusive) */ + if (flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) + return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive), done */ + + /* 5) still on ?-E (middle) */ + int rc = GetLastError(); + if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { + /* 6) something went wrong, give up */ + if (!funlock(env->me_lfd, LCK_UPPER)) { + rc = GetLastError(); + mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + "?-E(middle) >> ?-?(free)", rc); + } + return rc; + } + + /* 7) still on ?-E (middle), try S-E (locked) */ + rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER) + ? MDBX_RESULT_FALSE + : GetLastError(); + + /* 8) now on S-E (locked) or still on ?-E (middle), + * transite to S-? (used) or ?-? (free) */ + if (!funlock(env->me_lfd, LCK_UPPER)) { + rc = GetLastError(); + mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + "X-E(locked/middle) >> X-?(used/free)", rc); + } + + /* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */ + return rc; +} + +/* Transite from exclusive state (E-E) to used (S-?) */ +int mdbx_lck_downgrade(MDB_env *env) { + int rc; + + /* 1) now at E-E (exclusive), continue transition to ?_E (middle) */ + if (!funlock(env->me_lfd, LCK_LOWER)) { + rc = GetLastError(); + mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + "E-E(exclusive) >> ?-E(middle)", rc); + } + + /* 2) now at ?-E (middle), transite to S-E (locked) */ + if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { + rc = GetLastError() /* 3) something went wrong, give up */; + return rc; + } + + /* 4) got S-E (locked), continue transition to S-? 
(used) */ + if (!funlock(env->me_lfd, LCK_UPPER)) { + rc = GetLastError(); + mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + "S-E(locked) >> S-?(used)", rc); + } + return MDB_SUCCESS /* 5) now at S-? (used), done */; +} + +void mdbx_lck_destroy(MDB_env *env) { + int rc; + + if (env->me_fd != INVALID_HANDLE_VALUE) { + /* explicitly unlock to avoid latency for other processes (windows kernel + * releases such locks via deferred queues) */ + while (funlock(env->me_fd, 0, env->me_psize * 2)) + ; + rc = GetLastError(); + assert(rc == ERROR_NOT_LOCKED); + (void)rc; + SetLastError(ERROR_SUCCESS); + } + + if (env->me_lfd != INVALID_HANDLE_VALUE) { + /* double `unlock` for robustly remove overlapped shared/exclusive locks */ + while (funlock(env->me_lfd, LCK_LOWER)) + ; + assert(rc == ERROR_NOT_LOCKED); + (void)rc; + SetLastError(ERROR_SUCCESS); + + while (funlock(env->me_lfd, LCK_UPPER)) + ; + assert(rc == ERROR_NOT_LOCKED); + (void)rc; + SetLastError(ERROR_SUCCESS); + } +} + +/*----------------------------------------------------------------------------*/ +/* reader checking (by pid) */ + +int mdbx_rpid_set(MDB_env *env) { + (void)env; + return MDB_SUCCESS; +} + +int mdbx_rpid_clear(MDB_env *env) { + (void)env; + return MDB_SUCCESS; +} + +int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid) { + (void)env; + HANDLE hProcess = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid); + int rc; + if (hProcess) { + rc = WaitForSingleObject(hProcess, 0); + CloseHandle(hProcess); + } else { + rc = GetLastError(); + } + + switch (rc) { + case ERROR_INVALID_PARAMETER: + /* pid seem invalid */ + return MDBX_RESULT_FALSE; + case WAIT_OBJECT_0: + /* process just exited */ + return MDBX_RESULT_FALSE; + case WAIT_TIMEOUT: + /* pid running */ + return MDBX_RESULT_TRUE; + default: + /* failure */ + return rc; + } +} diff --git a/src/mdbx.c b/src/mdbx.c index d201c495..ee2ab1fe 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1,5 +1,7 @@ /* - * Copyright 2015-2017 Leonid Yuriev . 
+ * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. * * This code is derived from "LMDB engine" written by * Howard Chu (Symas Corporation), which itself derived from btree.c @@ -34,112 +36,120 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#ifndef _GNU_SOURCE -#define _GNU_SOURCE -#endif +#include "./bits.h" +#include "./midl.h" -#include +/*----------------------------------------------------------------------------*/ +/* rthc (tls keys and destructors) */ -#include "mdbx.h" - -#ifndef MDB_DEBUG -#define MDB_DEBUG 0 -#endif - -/* LY: Please do not ask us for Windows support, just never! - * But you can make a fork for Windows, or become maintainer for FreeBSD... */ -#ifndef __gnu_linux__ -#warning "libmdbx supports only GNU Linux" -#endif - -#if !defined(__GNUC__) || !__GNUC_PREREQ(4, 2) -/* LY: Actualy libmdbx was not tested with compilers - * older than GCC 4.4 (from RHEL6). - * But you could remove this #error and try to continue at your own risk. - * In such case please don't rise up an issues related ONLY to old compilers. - */ -#warning "libmdbx required at least GCC 4.2 compatible C/C++ compiler." -#endif - -#if !defined(__GNU_LIBRARY__) || !__GLIBC_PREREQ(2, 12) -/* LY: Actualy libmdbx was not tested with something - * older than glibc 2.12 (from RHEL6). - * But you could remove this #error and try to continue at your own risk. - * In such case please don't rise up an issues related ONLY to old systems. - */ -#warning "libmdbx required at least GLIBC 2.12." 
-#endif +typedef struct rthc_entry_t { + MDB_reader *begin; + MDB_reader *end; + mdbx_thread_key_t key; +} rthc_entry_t; #if MDB_DEBUG -#undef NDEBUG -#endif - -#include -#include -#include -#include -#include -#ifdef HAVE_SYS_FILE_H -#include -#endif -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) -#include -#include /* defines BYTE_ORDER on HPUX and Solaris */ -#endif - -#ifndef _POSIX_SYNCHRONIZED_IO -#define fdatasync fsync -#endif - -#ifndef BYTE_ORDER -#if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && \ - !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)) -/* Solaris just defines one or the other */ -#define LITTLE_ENDIAN 1234 -#define BIG_ENDIAN 4321 -#ifdef _LITTLE_ENDIAN -#define BYTE_ORDER LITTLE_ENDIAN +#define RTHC_INITIAL_LIMIT 1 #else -#define BYTE_ORDER BIG_ENDIAN -#endif -#else -#define BYTE_ORDER __BYTE_ORDER -#endif +#define RTHC_INITIAL_LIMIT 16 #endif -#ifndef LITTLE_ENDIAN -#define LITTLE_ENDIAN __LITTLE_ENDIAN -#endif -#ifndef BIG_ENDIAN -#define BIG_ENDIAN __BIG_ENDIAN -#endif +static unsigned rthc_count; +static unsigned rthc_limit = RTHC_INITIAL_LIMIT; +static rthc_entry_t rthc_table_static[RTHC_INITIAL_LIMIT]; +static rthc_entry_t *rthc_table = rthc_table_static; -#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) -#define MISALIGNED_OK 1 -#endif +__cold void mdbx_rthc_dtor(void *ptr) { + MDB_reader *rthc = (MDB_reader *)ptr; + unsigned i; -#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) -#error "Unknown or unsupported endianness (BYTE_ORDER)" -#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF -#error "Two's complement, reasonably sized integer types, please" -#endif + mdbx_rthc_lock(); + for (i = 0; i < rthc_count; ++i) { + if (rthc >= rthc_table[i].begin && rthc < rthc_table[i].end) { + if (rthc->mr_pid && rthc->mr_pid == mdbx_getpid()) { + rthc->mr_pid = 0; + 
mdbx_coherent_barrier(); + } + break; + } + } + mdbx_rthc_unlock(); +} -#include "./barriers.h" -#include "./midl.h" -#include "./reopen.h" +__cold void mdbx_rthc_cleanup(void) { + unsigned i; + /* for each registered key: fetch the value stored under it, reset it to NULL and clear the slot's mr_pid if it equals our pid */ + mdbx_rthc_lock(); + for (i = 0; i < rthc_count; ++i) { + mdbx_thread_key_t key = rthc_table[i].key; + MDB_reader *rthc = mdbx_thread_rthc_get(key); + if (rthc) { + mdbx_thread_rthc_set(key, NULL); + if (rthc->mr_pid && rthc->mr_pid == mdbx_getpid()) { + rthc->mr_pid = 0; + mdbx_coherent_barrier(); + } + } + } + mdbx_rthc_unlock(); +} +/* Creates a thread key and records it with its reader range (begin, end) in rthc_table, doubling the table when it is full. Returns MDB_SUCCESS, the key-creation error, or ENOMEM (the new key is deleted again). */ +__cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDB_reader *begin, + MDB_reader *end) { + int rc = mdbx_thread_key_create(key); + if (rc != MDB_SUCCESS) + return rc; + + mdbx_rthc_lock(); + if (rthc_count == rthc_limit) { + rthc_entry_t *new_table = + realloc((rthc_table == rthc_table_static) ? NULL : rthc_table, + sizeof(rthc_entry_t) * rthc_limit * 2); + if (new_table == NULL) { + rc = ENOMEM; + goto bailout; + } + if (rthc_table == rthc_table_static) + memcpy(new_table, rthc_table_static, sizeof(rthc_table_static)); + rthc_table = new_table; + rthc_limit *= 2; + } + + rthc_table[rthc_count].key = *key; + rthc_table[rthc_count].begin = begin; + rthc_table[rthc_count].end = end; + ++rthc_count; + mdbx_rthc_unlock(); + return MDB_SUCCESS; + +bailout: + mdbx_thread_key_delete(*key); + mdbx_rthc_unlock(); + return rc; +} +/* Deletes the thread key and drops its entry from rthc_table (the last entry is moved into the freed position); once the table is empty, a heap-allocated table is freed and the static one restored. */ +__cold void mdbx_rthc_remove(mdbx_thread_key_t key) { + mdbx_rthc_lock(); + mdbx_thread_key_delete(key); + + unsigned i; + for (i = 0; i < rthc_count; ++i) { + if (key == rthc_table[i].key) { + if (--rthc_count > 0) + rthc_table[i] = rthc_table[rthc_count]; + else if (rthc_table != rthc_table_static) { + free(rthc_table); + rthc_table = rthc_table_static; + rthc_limit = RTHC_INITIAL_LIMIT; + } + break; + } + } + + mdbx_rthc_unlock(); +} + +/*----------------------------------------------------------------------------*/ /** Search for an ID in an IDL. * @param[in] ids The IDL to search.
@@ -227,6 +237,8 @@ static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id); */ static int mdbx_mid2l_append(MDB_ID2L ids, MDB_ID2 *id); +/*----------------------------------------------------------------------------*/ + int mdbx_runtime_flags = MDBX_DBG_PRINT #if MDB_DEBUG | MDBX_DBG_ASSERT @@ -242,94 +254,25 @@ int mdbx_runtime_flags = MDBX_DBG_PRINT #endif ; -static MDBX_debug_func *mdbx_debug_logger; +MDBX_debug_func *mdbx_debug_logger; int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn); +#if MDB_DEBUG +txnid_t mdbx_debug_edge; +#endif + /** Features under development */ #ifndef MDB_DEVEL #define MDB_DEVEL 0 #endif -/** Wrapper around __func__, which is a C99 feature */ -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L -#define mdbx_func_ __func__ -#elif (defined(__GNUC__) && __GNUC__ >= 2) || defined(__clang__) -#define mdbx_func_ __FUNCTION__ -#else -/* If a debug message says (), update the #if statements above */ -#define mdbx_func_ "" -#endif - -/** Some platforms define the EOWNERDEAD error code - * even though they don't support Robust Mutexes. - * Compile with -DMDB_USE_ROBUST=0. - */ -#ifndef MDB_USE_ROBUST -/* Howard Chu: Android currently lacks Robust Mutex support */ -#if defined(EOWNERDEAD) && \ - !defined(ANDROID) /* LY: glibc before 2.10 has a troubles with Robust \ - Mutex too. */ \ - && __GLIBC_PREREQ(2, 10) -#define MDB_USE_ROBUST 1 -#else -#define MDB_USE_ROBUST 0 -#endif -#endif /* MDB_USE_ROBUST */ - /* Internal error codes, not exposed outside liblmdb */ #define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) -/** Mutex for the reader table (rw = r) or write transaction (rw = w). - */ -#define MDB_MUTEX(env, rw) (&(env)->me_txns->mti_##rw##mutex) - -/** An abstraction for a file handle. - * On POSIX systems file handles are small integers. On Windows - * they're opaque pointers. - */ -#define HANDLE int - -/** A value for an invalid file handle. 
- * Mainly used to initialize file variables and signify that they are - * unused. - */ -#define INVALID_HANDLE_VALUE (-1) - -/** Get the size of a memory page for the system. - * This is the basic size that the platform's memory manager uses, and is - * fundamental to the use of memory-mapped files. - */ -#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) - -/** @} */ - -static int mdbx_mutex_lock(MDB_env *env, pthread_mutex_t *mutex); -static int mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, int rc); -static void mdbx_mutex_unlock(MDB_env *env, pthread_mutex_t *mutex); - -/** A page number in the database. - * Note that 64 bit page numbers are overkill, since pages themselves - * already represent 12-13 bits of addressable memory, and the OS will - * always limit applications to a maximum of 63 bits of address space. - * - * @note In the #MDB_node structure, we only store 48 bits of this value, - * which thus limits us to only 60 bits of addressable data. - */ -typedef MDB_ID pgno_t; - -/** A transaction ID. - * See struct MDB_txn.mt_txnid for details. - */ -typedef MDB_ID txnid_t; - -/** @defgroup debug Debug Macros - * @{ - */ -/** Debuging output value of a cursor DBI: Negative in a sub-cursor. */ +/* Debuging output value of a cursor DBI: Negative in a sub-cursor. */ #define DDBI(mc) \ (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) -/** @} */ /** @brief The maximum size of a database page. * @@ -433,12 +376,6 @@ typedef MDB_ID txnid_t; /** Round \b n up to an even number. */ #define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ -/** Used for offsets within a single page. - * Since memory pages are typically 4 or 8KB in size, 12-13 bits, - * this is plenty. - */ -typedef uint16_t indx_t; - /** Default size of memory map. * This is certainly too small for any actual applications. 
Apps should *always set @@ -508,163 +445,6 @@ typedef uint16_t indx_t; */ #define DEFAULT_READERS 126 -/** The information we store in a single slot of the reader table. - * In addition to a transaction ID, we also record the process and - * thread ID that owns a slot, so that we can detect stale information, - * e.g. threads or processes that went away without cleaning up. - * @note We currently don't check for stale records. We simply re-init - * the table when we know that we're the only process opening the - * lock file. - */ -typedef struct MDB_rxbody { - /** Current Transaction ID when this transaction began, or (txnid_t)-1. - * Multiple readers that start at the same time will probably have the - * same ID here. Again, it's not important to exclude them from - * anything; all we need to know is which version of the DB they - * started from so we can avoid overwriting any data used in that - * particular version. - */ - volatile txnid_t mrb_txnid; - /** The process ID of the process owning this reader txn. */ - volatile pid_t mrb_pid; - /** The thread ID of the thread owning this txn. */ - volatile pthread_t mrb_tid; -} MDB_rxbody; - -/** The actual reader record, with cacheline padding. */ -typedef struct MDB_reader { - union { - MDB_rxbody mrx; -/** shorthand for mrb_txnid */ -#define mr_txnid mru.mrx.mrb_txnid -#define mr_pid mru.mrx.mrb_pid -#define mr_tid mru.mrx.mrb_tid - /** cache line alignment */ - char pad[(sizeof(MDB_rxbody) + CACHELINE_SIZE - 1) & ~(CACHELINE_SIZE - 1)]; - } mru; -} MDB_reader; - -/** The header for the reader table. - * The table resides in a memory-mapped file. (This is a different file - * than is used for the main database.) - * - * For POSIX the actual mutexes reside in the shared memory of this - * mapped file. On Windows, mutexes are named objects allocated by the - * kernel; we store the mutex names in this mapped file so that other - * processes can grab them. 
This same approach is also used on - * MacOSX/Darwin (using named semaphores) since MacOSX doesn't support - * process-shared POSIX mutexes. For these cases where a named object - * is used, the object name is derived from a 64 bit FNV hash of the - * environment pathname. As such, naming collisions are extremely - * unlikely. If a collision occurs, the results are unpredictable. - */ -typedef struct MDB_txbody { - /** Stamp identifying this as an LMDB file. It must be set - * to #MDB_MAGIC. */ - uint32_t mtb_magic; - /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ - uint32_t mtb_format; - /** Mutex protecting access to this table. - * This is the #MDB_MUTEX(env,r) reader table lock. - */ - pthread_mutex_t mtb_rmutex; - /** The ID of the last transaction committed to the database. - * This is recorded here only for convenience; the value can always - * be determined by reading the main database meta pages. - */ - volatile txnid_t mtb_txnid; - /** The number of slots that have been used in the reader table. - * This always records the maximum count, it is not decremented - * when readers release their slots. - */ - volatile unsigned mtb_numreaders; -} MDB_txbody; - -/** The actual reader table definition. 
*/ -typedef struct MDB_txninfo { - union { - MDB_txbody mtb; -#define mti_magic mt1.mtb.mtb_magic -#define mti_format mt1.mtb.mtb_format -#define mti_rmutex mt1.mtb.mtb_rmutex -#define mti_rmname mt1.mtb.mtb_rmname -#define mti_txnid mt1.mtb.mtb_txnid -#define mti_numreaders mt1.mtb.mtb_numreaders - char pad[(sizeof(MDB_txbody) + CACHELINE_SIZE - 1) & ~(CACHELINE_SIZE - 1)]; - } mt1; - union { - pthread_mutex_t mt2_wmutex; -#define mti_wmutex mt2.mt2_wmutex - char pad[(sizeof(pthread_mutex_t) + CACHELINE_SIZE - 1) & - ~(CACHELINE_SIZE - 1)]; - } mt2; - MDB_reader mti_readers[1]; -} MDB_txninfo; - -/** Lockfile format signature: version, features and field layout */ -#define MDB_LOCK_FORMAT \ - ((uint32_t)((MDB_LOCK_VERSION) /* Flags which describe functionality */ \ - + (0 /* SYSV_SEM_FLAG */ << 18) + (1 /* MDB_PIDLOCK */ << 16))) -/** @} */ - -/** Common header for all page types. The page type depends on #mp_flags. - * - * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with - * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages - * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. - * - * #P_OVERFLOW records occupy one or more contiguous pages where only the - * first has a page header. They hold the real data of #F_BIGDATA nodes. - * - * #P_SUBP sub-pages are small leaf "pages" with duplicate data. - * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. - * (Duplicate data can also go in sub-databases, which use normal pages.) - * - * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. - * - * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once - * in the snapshot: Either used by a database or listed in a freeDB record. 
- */ -typedef struct MDB_page { -#define mp_pgno mp_p.p_pgno -#define mp_next mp_p.p_next - union { - pgno_t p_pgno; /**< page number */ - struct MDB_page *p_next; /**< for in-memory list of freed pages */ - } mp_p; - uint16_t mp_leaf2_ksize; /**< key size if this is a LEAF2 page */ - /** @defgroup mdbx_page Page Flags - * @ingroup internal - * Flags for the page headers. - * @{ - */ -#define P_BRANCH 0x01 /**< branch page */ -#define P_LEAF 0x02 /**< leaf page */ -#define P_OVERFLOW 0x04 /**< overflow page */ -#define P_META 0x08 /**< meta page */ -#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ -#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ -#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ -#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ -#define P_KEEP 0x8000 /**< leave this page alone during spill */ - /** @} */ - uint16_t mp_flags; /**< @ref mdbx_page */ -#define mp_lower mp_pb.pb.pb_lower -#define mp_upper mp_pb.pb.pb_upper -#define mp_pages mp_pb.pb_pages - union { - struct { - indx_t pb_lower; /**< lower bound of free space */ - indx_t pb_upper; /**< upper bound of free space */ - } pb; - uint32_t pb_pages; /**< number of overflow pages */ - } mp_pb; - indx_t mp_ptrs[1]; /**< dynamic size */ -} MDB_page; - -/** Size of the page header, excluding dynamic data at the end */ -#define PAGEHDRSZ ((unsigned)offsetof(MDB_page, mp_ptrs)) - /** Address of first usable data byte in a page, after the header */ #define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) @@ -720,18 +500,15 @@ typedef struct MDB_page { * a sub-page/sub-database, and named databases (just #F_SUBDATA). 
*/ typedef struct MDB_node { -/** part of data size or pgno - * @{ */ -#if BYTE_ORDER == LITTLE_ENDIAN - unsigned short mn_lo, mn_hi; +/* part of data size or pgno */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint16_t mn_lo, mn_hi; #else - unsigned short mn_hi, mn_lo; + uint16_t mn_hi, mn_lo; #endif -/** @} */ /** @defgroup mdbx_node Node Flags * @ingroup internal * Flags for node headers. - * @{ */ #define F_BIGDATA 0x01 /**< data put on overflow page */ #define F_SUBDATA 0x02 /**< data is a sub-database */ @@ -740,10 +517,9 @@ typedef struct MDB_node { /** valid flags for #mdbx_node_add() */ #define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDB_RESERVE | MDB_APPEND) - /** @} */ - unsigned short mn_flags; /**< @ref mdbx_node */ - unsigned short mn_ksize; /**< key size */ - char mn_data[1]; /**< key and data are appended here */ + uint16_t mn_flags; /**< @ref mdbx_node */ + uint16_t mn_ksize; /**< key size */ + uint8_t mn_data[1]; /**< key and data are appended here */ } MDB_node; /** Size of the node header, excluding dynamic data at the end */ @@ -762,12 +538,11 @@ typedef struct MDB_node { */ #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) -/** Address of node \b i in page \b p */ -#define NODEPTR(p, i) \ - ({ \ - assert(NUMKEYS(p) > (unsigned)(i)); \ - (MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); \ - }) +/* Address of node i in page p */ +static __inline MDB_node *NODEPTR(MDB_page *p, unsigned i) { + assert(NUMKEYS(p) > (unsigned)(i)); + return (MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); +} /** Address of the key for the node */ #define NODEKEY(node) (void *)((node)->mn_data) @@ -782,10 +557,10 @@ typedef struct MDB_node { /** Set the page number in a branch node */ #define SETPGNO(node, pgno) \ do { \ - (node)->mn_lo = (pgno)&0xffff; \ - (node)->mn_hi = (pgno) >> 16; \ + (node)->mn_lo = (uint16_t)(pgno); \ + (node)->mn_hi = (uint16_t)((pgno) >> 16); \ if (PGNO_TOPWORD) \ - (node)->mn_flags = (pgno) >> PGNO_TOPWORD; \ 
+ (node)->mn_flags = (uint16_t)((pgno) >> PGNO_TOPWORD); \ } while (0) /** Get the size of the data in a leaf node */ @@ -793,21 +568,21 @@ typedef struct MDB_node { /** Set the size of the data for a leaf node */ #define SETDSZ(node, size) \ do { \ - (node)->mn_lo = (size)&0xffff; \ - (node)->mn_hi = (size) >> 16; \ + (node)->mn_lo = (uint16_t)(size); \ + (node)->mn_hi = (uint16_t)((size) >> 16); \ } while (0) /** The size of a key in a node */ #define NODEKSZ(node) ((node)->mn_ksize) /** Copy a page number from src to dst */ -#ifdef MISALIGNED_OK +#if MISALIGNED_OK #define COPY_PGNO(dst, src) dst = src #elif SIZE_MAX > 4294967295UL #define COPY_PGNO(dst, src) \ do { \ - unsigned short *s, *d; \ - s = (unsigned short *)&(src); \ - d = (unsigned short *)&(dst); \ + uint16_t *s, *d; \ + s = (uint16_t *)&(src); \ + d = (uint16_t *)&(dst); \ *d++ = *s++; \ *d++ = *s++; \ *d++ = *s++; \ @@ -816,9 +591,9 @@ typedef struct MDB_node { #else #define COPY_PGNO(dst, src) \ do { \ - unsigned short *s, *d; \ - s = (unsigned short *)&(src); \ - d = (unsigned short *)&(dst); \ + uint16_t *s, *d; \ + s = (uint16_t *)&(src); \ + d = (uint16_t *)&(dst); \ *d++ = *s++; \ *d = *s; \ } while (0) @@ -847,18 +622,6 @@ typedef struct MDB_node { key.mv_data = NODEKEY(node); \ } -/** Information about a single database in the environment. 
*/ -typedef struct MDB_db { - uint32_t md_xsize; /**< also ksize for LEAF2 pages */ - uint16_t md_flags; /**< @ref mdbx_dbi_open */ - uint16_t md_depth; /**< depth of this tree */ - pgno_t md_branch_pages; /**< number of internal pages */ - pgno_t md_leaf_pages; /**< number of leaf pages */ - pgno_t md_overflow_pages; /**< number of overflow pages */ - size_t md_entries; /**< number of data items */ - pgno_t md_root; /**< the root page of this tree */ -} MDB_db; - #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) /** #mdbx_dbi_open() flags */ @@ -866,362 +629,13 @@ typedef struct MDB_db { (MDB_REVERSEKEY | MDB_DUPSORT | MDB_INTEGERKEY | MDB_DUPFIXED | \ MDB_INTEGERDUP | MDB_REVERSEDUP | MDB_CREATE) -/** Handle for the DB used to track free pages. */ -#define FREE_DBI 0 -/** Handle for the default DB. */ -#define MAIN_DBI 1 -/** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ -#define CORE_DBS 2 - -/** Number of meta pages - also hardcoded elsewhere */ -#define NUM_METAS 2 - -/** Meta page content. - * A meta page is the start point for accessing a database snapshot. - * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). - */ -typedef struct MDB_meta { - /** Stamp identifying this as an LMDB file. It must be set - * to #MDB_MAGIC. */ - uint32_t mm_magic; - /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ - uint32_t mm_version; - void *mm_address; /**< address for fixed mapping */ - size_t mm_mapsize; /**< size of mmap region */ - MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ - /** The size of pages used in this DB */ -#define mm_psize mm_dbs[FREE_DBI].md_xsize -/** Any persistent environment flags. @ref mdbx_env */ -#define mm_flags mm_dbs[FREE_DBI].md_flags - /** Last used page in the datafile. - * Actually the file may be shorter if the freeDB lists the final pages. 
- */ - pgno_t mm_last_pg; - volatile txnid_t mm_txnid; /**< txnid that committed this page */ -#define MDB_DATASIGN_NONE 0 -#define MDB_DATASIGN_WEAK 1 - volatile uint64_t mm_datasync_sign; -#define META_IS_WEAK(meta) ((meta)->mm_datasync_sign == MDB_DATASIGN_WEAK) -#define META_IS_STEADY(meta) ((meta)->mm_datasync_sign > MDB_DATASIGN_WEAK) - -#if MDBX_MODE_ENABLED - volatile mdbx_canary mm_canary; -#endif -} MDB_meta; - -/** Buffer for a stack-allocated meta page. - * The members define size and alignment, and silence type - * aliasing warnings. They are not used directly; that could - * mean incorrectly using several union members in parallel. - */ -typedef union MDB_metabuf { - MDB_page mb_page; - struct { - char mm_pad[PAGEHDRSZ]; - MDB_meta mm_meta; - } mb_metabuf; -} MDB_metabuf; - -/** Auxiliary DB info. - * The information here is mostly static/read-only. There is - * only a single copy of this record in the environment. - */ -typedef struct MDB_dbx { - MDB_val md_name; /**< name of the database */ - MDB_cmp_func *md_cmp; /**< function for comparing keys */ - MDB_cmp_func *md_dcmp; /**< function for comparing data items */ -} MDB_dbx; - -#if MDBX_MODE_ENABLED -#define MDBX_MODE_SALT 0 -#else -#error !? -#endif - -/** A database transaction. - * Every operation requires a transaction handle. - */ -struct MDB_txn { -#define MDBX_MT_SIGNATURE (0x93D53A31 ^ MDBX_MODE_SALT) - unsigned mt_signature; - MDB_txn *mt_parent; /**< parent of a nested txn */ - /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ - MDB_txn *mt_child; - pgno_t mt_next_pgno; /**< next unallocated page */ - /** The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. 
- */ - txnid_t mt_txnid; - MDB_env *mt_env; /**< the DB environment */ - /** The list of reclaimed txns from freeDB */ - MDB_IDL mt_lifo_reclaimed; - /** The list of pages that became unused during this transaction. - */ - MDB_IDL mt_free_pgs; - /** The list of loose pages that became unused and may be reused - * in this transaction, linked through #NEXT_LOOSE_PAGE(page). - */ - MDB_page *mt_loose_pgs; - /** Number of loose pages (#mt_loose_pgs) */ - int mt_loose_count; - /** The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. - */ - MDB_IDL mt_spill_pgs; - union { - /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ - MDB_ID2L dirty_list; - /** For read txns: This thread/txn's reader table slot, or NULL. */ - MDB_reader *reader; - } mt_u; - /** Array of records for each DB known in the environment. */ - MDB_dbx *mt_dbxs; - /** Array of MDB_db records for each known DB */ - MDB_db *mt_dbs; - /** Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; -/** @defgroup mt_dbflag Transaction DB Flags - * @ingroup internal - * @{ - */ -#define DB_DIRTY 0x01 /**< DB was written in this txn */ -#define DB_STALE 0x02 /**< Named-DB record is older than txnID */ -#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ -#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ -#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ -#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ - /** @} */ - /** In write txns, array of cursors for each DB */ - MDB_cursor **mt_cursors; - /** Array of flags for each DB */ - unsigned char *mt_dbflags; - /** Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. 
- */ - MDB_dbi mt_numdbs; - -/** @defgroup mdbx_txn Transaction Flags - * @ingroup internal - * @{ - */ -/** #mdbx_txn_begin() flags */ -#define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC | MDB_NOSYNC | MDB_RDONLY) -#define MDB_TXN_NOMETASYNC \ - MDB_NOMETASYNC /**< don't sync meta for this txn on commit */ -#define MDB_TXN_NOSYNC MDB_NOSYNC /**< don't sync this txn on commit */ -#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ - /* internal txn flags */ -#define MDB_TXN_WRITEMAP \ - MDB_WRITEMAP /**< copy of #MDB_env flag in writers \ - */ -#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ -#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ -#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ -#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ -#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ -/** most operations on the txn are currently illegal */ -#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED | MDB_TXN_ERROR | MDB_TXN_HAS_CHILD) - /** @} */ - unsigned mt_flags; /**< @ref mdbx_txn */ - /** #dirty_list room: Array size - \#dirty pages visible to this txn. - * Includes ancestor txns' dirty pages not hidden by other txns' - * dirty/spilled pages. Thus commit(nested txn) has room to merge - * dirty_list into mt_parent after freeing hidden mt_parent pages. - */ - unsigned mt_dirty_room; - -#if MDBX_MODE_ENABLED - mdbx_canary mt_canary; -#endif -}; - -/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. - * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to - * raise this on a 64 bit machine. - */ -#define CURSOR_STACK 32 - -struct MDB_xcursor; - -/** Cursors are used for all DB operations. - * A cursor holds a path of (page pointer, key index) from the DB - * root to a position in the DB, plus other state. #MDB_DUPSORT - * cursors include an xcursor to the current data item. 
Write txns - * track their cursors and keep them up to date when data moves. - * Exception: An xcursor's pointer to a #P_SUBP page can be stale. - * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). - */ -struct MDB_cursor { -#define MDBX_MC_SIGNATURE (0xFE05D5B1 ^ MDBX_MODE_SALT) -#define MDBX_MC_READY4CLOSE (0x2817A047 ^ MDBX_MODE_SALT) -#define MDBX_MC_WAIT4EOT (0x90E297A7 ^ MDBX_MODE_SALT) - unsigned mc_signature; - /** Next cursor on this DB in this txn */ - MDB_cursor *mc_next; - /** Backup of the original cursor if this cursor is a shadow */ - MDB_cursor *mc_backup; - /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ - struct MDB_xcursor *mc_xcursor; - /** The transaction that owns this cursor */ - MDB_txn *mc_txn; - /** The database handle this cursor operates on */ - MDB_dbi mc_dbi; - /** The database record for this cursor */ - MDB_db *mc_db; - /** The database auxiliary record for this cursor */ - MDB_dbx *mc_dbx; - /** The @ref mt_dbflag for this database */ - unsigned char *mc_dbflag; - unsigned short mc_snum; /**< number of pushed pages */ - unsigned short mc_top; /**< index of top page, normally mc_snum-1 */ - /** @defgroup mdbx_cursor Cursor Flags - * @ingroup internal - * Cursor state flags. - * @{ - */ -#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ -#define C_EOF 0x02 /**< No more data */ -#define C_SUB 0x04 /**< Cursor is a sub-cursor */ -#define C_DEL 0x08 /**< last op was a cursor_del */ -#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ -#define C_RECLAIMING 0x80 /**< FreeDB lookup is prohibited */ - /** @} */ - unsigned mc_flags; /**< @ref mdbx_cursor */ - MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ - indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ -}; - -/** Context for sorted-dup records. - * We could have gone to a fully recursive design, with arbitrarily - * deep nesting of sub-databases. 
But for now we only handle these - * levels - main DB, optional sub-DB, sorted-duplicate DB. - */ -typedef struct MDB_xcursor { - /** A sub-cursor for traversing the Dup DB */ - MDB_cursor mx_cursor; - /** The database record for this Dup DB */ - MDB_db mx_db; - /** The auxiliary DB record for this Dup DB */ - MDB_dbx mx_dbx; - /** The @ref mt_dbflag for this Dup DB */ - unsigned char mx_dbflag; -} MDB_xcursor; - -/** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */ -#define XCURSOR_INITED(mc) \ - ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) - -/** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed - * when the node which contains the sub-page may have moved. Called - * with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top]. - */ -#define XCURSOR_REFRESH(mc, mp, ki) \ - do { \ - MDB_page *xr_pg = (mp); \ - MDB_node *xr_node = NODEPTR(xr_pg, ki); \ - if ((xr_node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) \ - (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ - } while (0) - -/** State of FreeDB old pages, stored in the MDB_env */ -typedef struct MDB_pgstate { - pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ - txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ -} MDB_pgstate; - -/** Context for deferred cleanup of reader's threads. - * to avoid https://github.com/ReOpen/ReOpenLDAP/issues/48 */ -typedef struct MDBX_rthc { - struct MDBX_rthc *rc_next; - pthread_t rc_thread; - MDB_reader *rc_reader; -} MDBX_rthc; - -static MDBX_rthc *mdbx_rthc_get(pthread_key_t key); - -/** The database environment. */ -struct MDB_env { -#define MDBX_ME_SIGNATURE (0x9A899641 ^ MDBX_MODE_SALT) - unsigned me_signature; - HANDLE me_fd; /**< The main data file */ - HANDLE me_lfd; /**< The lock file */ - /** Failed to update the meta page. Probably an I/O error. */ -#define MDB_FATAL_ERROR 0x80000000U -/** Some fields are initialized. 
*/ -#define MDB_ENV_ACTIVE 0x20000000U -/** me_txkey is set */ -#define MDB_ENV_TXKEY 0x10000000U - uint32_t me_flags; /**< @ref mdbx_env */ - unsigned me_psize; /**< DB page size, inited from me_os_psize */ - unsigned me_os_psize; /**< OS page size, from #GET_PAGESIZE */ - unsigned me_maxreaders; /**< size of the reader table */ - /** Max #MDB_txninfo.%mti_numreaders of interest to #mdbx_env_close() */ - unsigned me_close_readers; - MDB_dbi me_numdbs; /**< number of DBs opened */ - MDB_dbi me_maxdbs; /**< size of the DB table */ - pid_t me_pid; /**< process ID of this env */ - char *me_path; /**< path to the DB files */ - char *me_map; /**< the memory map of the data file */ - MDB_txninfo *me_txns; /**< the memory map of the lock file, never NULL */ - void *me_pbuf; /**< scratch area for DUPSORT put() */ - MDB_txn *me_txn; /**< current write transaction */ - MDB_txn *me_txn0; /**< prealloc'd write transaction */ - size_t me_mapsize; /**< size of the data memory map */ - pgno_t me_maxpg; /**< me_mapsize / me_psize */ - MDB_dbx *me_dbxs; /**< array of static DB info */ - uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ - unsigned *me_dbiseqs; /**< array of dbi sequence numbers */ - pthread_key_t me_txkey; /**< thread-key for readers */ - txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ - MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ -#define me_pglast me_pgstate.mf_pglast -#define me_pghead me_pgstate.mf_pghead - MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ - /** IDL of pages that became unused in a write txn */ - MDB_IDL me_free_pgs; - /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. 
*/ - MDB_ID2L me_dirty_list; - /** Max number of freelist items that can fit in a single overflow page */ - unsigned me_maxfree_1pg; - /** Max size of a node on a page */ - unsigned me_nodemax; - unsigned me_maxkey_limit; /**< max size of a key */ - int me_live_reader; /**< have liveness lock in reader table */ - void *me_userctx; /**< User-settable context */ -#if MDB_DEBUG - MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ -#endif - uint64_t me_sync_pending; /**< Total dirty/commited bytes since the last - mdbx_env_sync() */ - uint64_t - me_sync_threshold; /**< Treshold of above to force synchronous flush */ -#if MDBX_MODE_ENABLED - MDBX_oom_func *me_oom_func; /**< Callback for kicking laggard readers */ -#endif -#ifdef USE_VALGRIND - int me_valgrind_handle; -#endif -}; - -/** Nested transaction */ -typedef struct MDB_ntxn { - MDB_txn mnt_txn; /**< the transaction */ - MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ -} MDB_ntxn; - /** max number of pages to commit in one writev() call */ #define MDB_COMMIT_PAGES 64 -#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES +#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ #undef MDB_COMMIT_PAGES #define MDB_COMMIT_PAGES IOV_MAX #endif -/** max bytes to write in one call */ -#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) - /** Check \b txn and \b dbi arguments to a function */ #define TXN_DBI_EXIST(txn, dbi, validity) \ ((dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) @@ -1230,11 +644,6 @@ typedef struct MDB_ntxn { #define TXN_DBI_CHANGED(txn, dbi) \ ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) -#define METAPAGE_1(env) (&((MDB_metabuf *)(env)->me_map)->mb_metabuf.mm_meta) - -#define METAPAGE_2(env) \ - (&((MDB_metabuf *)((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) - static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags); static int mdbx_page_new(MDB_cursor *mc, uint32_t 
flags, int num, MDB_page **mp); @@ -1316,7 +725,6 @@ static void mdbx_xcursor_init1(MDB_cursor *mc, MDB_node *node); static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); static int mdbx_drop0(MDB_cursor *mc, int subs); -static int mdbx_reader_check0(MDB_env *env, int rlocked, int *dead); /** @cond */ static MDB_cmp_func mdbx_cmp_memn, mdbx_cmp_memnr, mdbx_cmp_int_ai, @@ -1324,18 +732,18 @@ static MDB_cmp_func mdbx_cmp_memn, mdbx_cmp_memnr, mdbx_cmp_int_ai, /** @endcond */ #ifdef __SANITIZE_THREAD__ -static pthread_mutex_t tsan_mutex = PTHREAD_MUTEX_INITIALIZER; +static mdbx_mutex_t tsan_mutex = mdbx_mutex_initIALIZER; #endif /** Return the library version info. */ const char *mdbx_version(int *major, int *minor, int *patch) { if (major) - *major = MDB_VERSION_MAJOR; + *major = MDBX_VERSION_MAJOR; if (minor) - *minor = MDB_VERSION_MINOR; + *minor = MDBX_VERSION_MINOR; if (patch) - *patch = MDB_VERSION_PATCH; - return MDB_VERSION_STRING; + *patch = MDBX_VERSION_PATCH; + return MDBX_VERSION_STRING; } static const char *__mdbx_strerr(int errnum) { @@ -1385,57 +793,38 @@ static const char *__mdbx_strerr(int errnum) { const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen) { const char *msg = __mdbx_strerr(errnum); - return msg ? msg : strerror_r(errnum, buf, buflen); + if (!msg) { +#if defined(_WIN32) || defined(_WIN64) + (void)errnum; + (void)buf; + (void)buflen; + msg = FIXME; +#else + msg = strerror_r(errnum, buf, buflen); +#endif + } + return msg; } const char *__cold mdbx_strerror(int errnum) { const char *msg = __mdbx_strerr(errnum); - return msg ? msg : strerror(errnum); + if (!msg) { +#if defined(_WIN32) || defined(_WIN64) + (void)errnum; + msg = FIXME; +#else + msg = strerror(errnum); +#endif + } + return msg; } #if MDBX_MODE_ENABLED static txnid_t mdbx_oomkick(MDB_env *env, txnid_t oldest); #endif /* MDBX_MODE_ENABLED */ -static void mdbx_debug_log(int type, const char *function, int line, - const char *fmt, ...) 
- __attribute__((format(printf, 4, 5))); - -#if MDB_DEBUG -static txnid_t mdbx_debug_edge; - -static void __cold mdbx_assert_fail(MDB_env *env, const char *msg, - const char *func, int line) { - if (env && env->me_assert_func) - env->me_assert_func(env, msg, func, line); - else { - if (mdbx_debug_logger) - mdbx_debug_log(MDBX_DBG_ASSERT, func, line, "assert: %s\n", msg); - __assert_fail(msg, __FILE__, line, func); - } -} - -#define mdbx_assert_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_ASSERT) - -#define mdbx_audit_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_AUDIT) - -#define mdbx_debug_enabled(type) \ - unlikely(mdbx_runtime_flags &(type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA))) - -#else -#ifndef NDEBUG -#define mdbx_debug_enabled(type) (1) -#else -#define mdbx_debug_enabled(type) (0) -#endif -#define mdbx_audit_enabled() (0) -#define mdbx_assert_enabled() (0) -#define mdbx_assert_fail(env, msg, func, line) \ - __assert_fail(msg, __FILE__, line, func) -#endif /* MDB_DEBUG */ - -static void __cold mdbx_debug_log(int type, const char *function, int line, - const char *fmt, ...) { +void __cold mdbx_debug_log(int type, const char *function, int line, + const char *fmt, ...) { va_list args; va_start(args, fmt); @@ -1453,69 +842,24 @@ static void __cold mdbx_debug_log(int type, const char *function, int line, va_end(args); } -#define mdbx_print(fmt, ...) \ - mdbx_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) - -#define mdbx_debug(fmt, ...) \ - do { \ - if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ - mdbx_debug_log(MDBX_DBG_TRACE, __FUNCTION__, __LINE__, fmt "\n", \ - ##__VA_ARGS__); \ - } while (0) - -#define mdbx_debug_print(fmt, ...) \ - do { \ - if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ - mdbx_debug_log(MDBX_DBG_TRACE, NULL, 0, fmt, ##__VA_ARGS__); \ - } while (0) - -#define mdbx_debug_extra(fmt, ...) 
\ - do { \ - if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) \ - mdbx_debug_log(MDBX_DBG_EXTRA, __FUNCTION__, __LINE__, fmt, \ - ##__VA_ARGS__); \ - } while (0) - -#define mdbx_debug_extra_print(fmt, ...) \ - do { \ - if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) \ - mdbx_debug_log(MDBX_DBG_EXTRA, NULL, 0, fmt, ##__VA_ARGS__); \ - } while (0) - -#define mdbx_ensure_msg(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - mdbx_assert_fail(env, msg, __FUNCTION__, __LINE__); \ - } while (0) - -#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) - -/** assert(3) variant in environment context */ -#define mdbx_assert(env, expr) \ - do { \ - if (mdbx_assert_enabled()) \ - mdbx_ensure(env, expr); \ - } while (0) - -/** assert(3) variant in cursor context */ -#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) - -/** assert(3) variant in transaction context */ -#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) - /** Return the page number of \b mp which may be sub-page, for debug output */ -static MDBX_INLINE pgno_t mdbx_dbg_pgno(MDB_page *mp) { +static __inline pgno_t mdbx_dbg_pgno(MDB_page *mp) { pgno_t ret; COPY_PGNO(ret, mp->mp_pgno); return ret; } /** Display a key in hexadecimal and return the address of the result. - * @param[in] key the key to display - * @param[in] buf the buffer to write into. Should always be #DKBUF. - * @return The key in hexadecimal form. - */ +* @param[in] key the key to display +* @param[in] buf the buffer to write into. Should always be #DKBUF. +* @return The key in hexadecimal form. +*/ char *mdbx_dkey(MDB_val *key, char *buf) { +#ifdef _MSC_VER + (void)key; + (void)buf; + return "FIXME: mdbx_dkey()"; +#else char *ptr = buf; unsigned i; @@ -1525,7 +869,7 @@ char *mdbx_dkey(MDB_val *key, char *buf) { if (key->mv_size > DKBUF_MAXKEYSIZE) return "MDB_MAXKEYSIZE"; /* may want to make this a dynamic check: if the key is mostly - * printable characters, print it as-is instead of converting to hex. 
*/ +* printable characters, print it as-is instead of converting to hex. */ #if 1 buf[0] = '\0'; for (i = 0; i < key->mv_size; i++) @@ -1534,6 +878,7 @@ char *mdbx_dkey(MDB_val *key, char *buf) { sprintf(buf, "%.*s", key->mv_size, key->mv_data); #endif return buf; +#endif /* _MSC_VER */ } #if 0 /* LY: debug stuff */ @@ -1742,7 +1087,7 @@ static MDB_page *mdbx_page_malloc(MDB_txn *txn, unsigned num) { * Saves single pages to a list, for future reuse. * (This is not used for multi-page overflow pages.) */ -static MDBX_INLINE void mdbx_page_free(MDB_env *env, MDB_page *mp) { +static __inline void mdbx_page_free(MDB_env *env, MDB_page *mp) { mp->mp_next = env->me_dpages; VALGRIND_MEMPOOL_FREE(env, mp); env->me_dpages = mp; @@ -1763,7 +1108,7 @@ static void mdbx_dpage_free(MDB_env *env, MDB_page *dp) { static void mdbx_dlist_free(MDB_txn *txn) { MDB_env *env = txn->mt_env; MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned i, n = dl[0].mid; + size_t i, n = dl[0].mid; for (i = 1; i <= n; i++) { mdbx_dpage_free(env, dl[i].mptr); @@ -1781,13 +1126,10 @@ static void __cold mdbx_kill_page(MDB_env *env, pgno_t pgno) { VALGRIND_MAKE_MEM_NOACCESS(&mp->mp_pb, env->me_psize - shift); ASAN_POISON_MEMORY_REGION(&mp->mp_pb, env->me_psize - shift); } else { - struct iovec iov[1]; - iov[0].iov_len = env->me_psize - shift; - iov[0].iov_base = alloca(iov[0].iov_len); - memset(iov[0].iov_base, 0x6F /* 'o', 111 */, iov[0].iov_len); - ssize_t rc = pwritev(env->me_fd, iov, 1, offs + shift); - assert(rc == (ssize_t)iov[0].iov_len); - (void)rc; + ssize_t len = env->me_psize - shift; + void *buf = alloca(len); + memset(buf, 0x6F /* 'o', 111 */, len); + (void)mdbx_pwrite(env->me_fd, buf, len, offs + shift); } } @@ -1857,7 +1199,7 @@ static int mdbx_page_loose(MDB_cursor *mc, MDB_page *mp) { * @return 0 on success, non-zero on failure. 
*/ static int mdbx_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { - enum { Mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP }; + const unsigned Mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP; MDB_txn *txn = mc->mc_txn; MDB_cursor *m3, *m0 = mc; MDB_xcursor *mx; @@ -2044,7 +1386,7 @@ bailout: return rc; } -static MDBX_INLINE uint64_t mdbx_meta_sign(MDB_meta *meta) { +static __inline uint64_t mdbx_meta_sign(MDB_meta *meta) { uint64_t sign = MDB_DATASIGN_NONE; #if 0 /* TODO */ sign = hippeus_hash64( @@ -2059,29 +1401,12 @@ static MDBX_INLINE uint64_t mdbx_meta_sign(MDB_meta *meta) { return (sign > MDB_DATASIGN_WEAK) ? sign : ~sign; } -static MDBX_INLINE MDB_meta *mdbx_meta_head_w(MDB_env *env) { - MDB_meta *a = METAPAGE_1(env); - MDB_meta *b = METAPAGE_2(env); - txnid_t head_txnid = env->me_txns->mti_txnid; - - mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); - if (a->mm_txnid == head_txnid) - return a; - if (likely(b->mm_txnid == head_txnid)) - return b; - - mdbx_debug("me_txns->mti_txnid not match meta-pages"); - mdbx_assert(env, head_txnid == a->mm_txnid || head_txnid == b->mm_txnid); - env->me_flags |= MDB_FATAL_ERROR; - return a; -} - static MDB_meta *mdbx_meta_head_r(MDB_env *env) { MDB_meta *a = METAPAGE_1(env); MDB_meta *b = METAPAGE_2(env), *h; #ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); + mdbx_mutex_lock(&tsan_mutex); #endif txnid_t head_txnid = env->me_txns->mti_txnid; @@ -2102,26 +1427,26 @@ static MDB_meta *mdbx_meta_head_r(MDB_env *env) { h = b; } else { /* LY: got a race again, or DB is corrupted */ - int rc = mdbx_mutex_lock(env, MDB_MUTEX(env, w)); + int rc = mdbx_txn_lock(env); h = mdbx_meta_head_w(env); - if (rc == 0) - mdbx_mutex_unlock(env, MDB_MUTEX(env, w)); + if (rc == MDB_SUCCESS) + mdbx_txn_unlock(env); } } #ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); + mdbx_mutex_unlock(&tsan_mutex); #endif return h; } -static MDBX_INLINE MDB_meta *mdbx_env_meta_flipflop(const MDB_env *env, - MDB_meta 
*meta) { +static __inline MDB_meta *mdbx_env_meta_flipflop(const MDB_env *env, + MDB_meta *meta) { return (meta == METAPAGE_1(env)) ? METAPAGE_2(env) : METAPAGE_1(env); } -static MDBX_INLINE int mdbx_meta_lt(MDB_meta *a, MDB_meta *b) { +static __inline int mdbx_meta_lt(MDB_meta *a, MDB_meta *b) { return (META_IS_STEADY(a) == META_IS_STEADY(b)) ? a->mm_txnid < b->mm_txnid : META_IS_STEADY(b); } @@ -2129,7 +1454,7 @@ static MDBX_INLINE int mdbx_meta_lt(MDB_meta *a, MDB_meta *b) { /** Find oldest txnid still referenced. */ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { #ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); + mdbx_mutex_lock(&tsan_mutex); #endif int i, reader; MDB_reader *r = env->me_txns->mti_readers; @@ -2152,7 +1477,7 @@ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { } } #ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); + mdbx_mutex_unlock(&tsan_mutex); #endif if (laggard) @@ -2446,14 +1771,14 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { mdbx_debug("kick-gc: head %zu/%c, tail %zu/%c, oldest %zu, txnid %zu", head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N', tail->mm_txnid, META_IS_WEAK(tail) ? 
'W' : 'N', oldest, - env->me_txns->mt1.mtb.mtb_txnid); + env->me_txns->mti_txnid); - int flags = env->me_flags & MDB_WRITEMAP; + int me_flags = env->me_flags & MDB_WRITEMAP; if ((env->me_flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC) - flags |= MDBX_UTTERLY_NOSYNC; + me_flags |= MDBX_UTTERLY_NOSYNC; mdbx_assert(env, env->me_sync_pending > 0); - if (mdbx_env_sync0(env, flags, &meta) == MDB_SUCCESS) { + if (mdbx_env_sync0(env, me_flags, &meta) == MDB_SUCCESS) { txnid_t snap = mdbx_find_oldest(env, NULL); if (snap > oldest) { continue; @@ -2703,7 +2028,6 @@ fail: int mdbx_env_sync(MDB_env *env, int force) { int rc; - pthread_mutex_t *mutex; MDB_meta *head; unsigned flags; @@ -2733,20 +2057,19 @@ int mdbx_env_sync(MDB_env *env, int force) { /* LY: early sync before acquiring the mutex to reduce writer's latency */ if (env->me_sync_pending > env->me_psize * 16 && (flags & MDB_NOSYNC) == 0) { + assert(((flags ^ env->me_flags) & MDB_WRITEMAP) == 0); if (flags & MDB_WRITEMAP) { size_t used_size = env->me_psize * (head->mm_last_pg + 1); - rc = msync(env->me_map, used_size, - (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC); + rc = mdbx_msync(env->me_map, used_size, flags & MDB_MAPASYNC); } else { - rc = fdatasync(env->me_fd); + rc = mdbx_filesync(env->me_fd, false); } - if (unlikely(rc)) - return errno; + if (unlikely(rc != MDB_SUCCESS)) + return rc; } - mutex = MDB_MUTEX(env, w); - rc = mdbx_mutex_lock(env, mutex); - if (unlikely(rc)) + rc = mdbx_txn_lock(env); + if (unlikely(rc != MDB_SUCCESS)) return rc; /* LY: head may be changed while the mutex has been acquired. */ @@ -2758,7 +2081,7 @@ int mdbx_env_sync(MDB_env *env, int force) { rc = mdbx_env_sync0(env, flags, &meta); } - mdbx_mutex_unlock(env, mutex); + mdbx_txn_unlock(env); return rc; } @@ -2853,29 +2176,6 @@ static void mdbx_cursors_eot(MDB_txn *txn, unsigned merge) { } } -/** Set or check a pid lock. Set returns 0 on success. 
- * Check returns 0 if the process is certainly dead, nonzero if it may - * be alive (the lock exists or an error happened so we do not know). - */ -static int mdbx_reader_pid(MDB_env *env, int op, pid_t pid) { - for (;;) { - int rc; - struct flock lock_info; - memset(&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_WRLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = pid; - lock_info.l_len = 1; - if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) { - if (op == F_GETLK && lock_info.l_type != F_UNLCK) - rc = -1; - } else if ((rc = errno) == EINTR) { - continue; - } - return rc; - } -} - /** Common code for #mdbx_txn_begin() and #mdbx_txn_renew(). * @param[in] txn the transaction handle to initialize * @return 0 on success, non-zero on failure. @@ -2885,50 +2185,43 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { unsigned i, nr; int rc, new_notls = 0; - if (unlikely(env->me_pid != getpid())) { + if (unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDB_FATAL_ERROR; return MDB_PANIC; } if (flags & MDB_TXN_RDONLY) { - MDBX_rthc *rthc = NULL; - MDB_reader *r = NULL; - txn->mt_flags = MDB_TXN_RDONLY; + MDB_reader *r = txn->mt_u.reader; if (likely(env->me_flags & MDB_ENV_TXKEY)) { mdbx_assert(env, !(env->me_flags & MDB_NOTLS)); - rthc = mdbx_rthc_get(env->me_txkey); - if (unlikely(!rthc)) - return ENOMEM; - if (likely(rthc->rc_reader)) { - r = rthc->rc_reader; + r = mdbx_thread_rthc_get(env->me_txkey); + if (likely(r)) { mdbx_assert(env, r->mr_pid == env->me_pid); - mdbx_assert(env, r->mr_tid == pthread_self()); + mdbx_assert(env, r->mr_tid == mdbx_thread_self()); } } else { mdbx_assert(env, env->me_flags & MDB_NOTLS); - r = txn->mt_u.reader; } if (likely(r)) { if (unlikely(r->mr_pid != env->me_pid || r->mr_txnid != ~(txnid_t)0)) return MDB_BAD_RSLOT; } else { - pid_t pid = env->me_pid; - pthread_t tid = pthread_self(); - pthread_mutex_t *rmutex = MDB_MUTEX(env, r); + mdbx_pid_t pid = env->me_pid; + mdbx_tid_t tid = 
mdbx_thread_self(); - rc = mdbx_mutex_lock(env, rmutex); + rc = mdbx_rdt_lock(env); if (unlikely(rc != MDB_SUCCESS)) return rc; - if (unlikely(!env->me_live_reader)) { - rc = mdbx_reader_pid(env, F_SETLK, pid); + if (unlikely(env->me_live_reader != pid)) { + rc = mdbx_rpid_set(env); if (unlikely(rc != MDB_SUCCESS)) { - mdbx_mutex_unlock(env, rmutex); + mdbx_rdt_unlock(env); return rc; } - env->me_live_reader = 1; + env->me_live_reader = pid; } nr = env->me_txns->mti_numreaders; @@ -2936,13 +2229,13 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { if (env->me_txns->mti_readers[i].mr_pid == 0) break; if (unlikely(i == env->me_maxreaders)) { - mdbx_mutex_unlock(env, rmutex); + mdbx_rdt_unlock(env); return MDB_READERS_FULL; } r = &env->me_txns->mti_readers[i]; /* Claim the reader slot, carefully since other code * uses the reader table un-mutexed: First reset the - * slot, next publish it in mti_numreaders. After + * slot, next publish it in mtb.mti_numreaders. After * that, it is safe for mdbx_env_close() to touch it. * When it will be closed, we can finally claim it. 
*/ r->mr_pid = 0; @@ -2950,7 +2243,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { r->mr_tid = tid; mdbx_coherent_barrier(); #ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); + mdbx_mutex_lock(&tsan_mutex); #endif if (i == nr) env->me_txns->mti_numreaders = ++nr; @@ -2958,13 +2251,13 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { env->me_close_readers = nr; r->mr_pid = pid; #ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); + mdbx_mutex_unlock(&tsan_mutex); #endif - mdbx_mutex_unlock(env, rmutex); + mdbx_rdt_unlock(env); - new_notls = MDB_END_SLOT; - if (likely(rthc)) { - rthc->rc_reader = r; + new_notls = MDB_END_SLOT /* == MDB_NOTLS */; + if (likely(env->me_flags & MDB_ENV_TXKEY)) { + mdbx_thread_rthc_set(env->me_txkey, r); new_notls = 0; } } @@ -2993,12 +2286,12 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ } else { /* Not yet touching txn == env->me_txn0, it may be active */ - rc = mdbx_mutex_lock(env, MDB_MUTEX(env, w)); + rc = mdbx_txn_lock(env); if (unlikely(rc)) return rc; #ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); + mdbx_mutex_lock(&tsan_mutex); #endif MDB_meta *meta = mdbx_meta_head_w(env); #if MDBX_MODE_ENABLED @@ -3007,7 +2300,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { txn->mt_txnid = meta->mm_txnid + 1; txn->mt_flags = flags; #ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); + mdbx_mutex_unlock(&tsan_mutex); #endif #if MDB_DEBUG @@ -3094,7 +2387,7 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(env->me_pid != getpid())) { + if (unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDB_FATAL_ERROR; return MDB_PANIC; } @@ -3128,7 +2421,7 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, goto renew; } if (unlikely((txn = calloc(1, size)) == 
NULL)) { - mdbx_debug("calloc: %s", strerror(errno)); + mdbx_debug("calloc: %s", "failed"); return ENOMEM; } txn->mt_dbxs = env->me_dbxs; /* static */ @@ -3243,7 +2536,7 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { MDB_env *env = txn->mt_env; static const char *const names[] = MDB_END_NAMES; - if (unlikely(txn->mt_env->me_pid != getpid())) { + if (unlikely(txn->mt_env->me_pid != mdbx_getpid())) { env->me_flags |= MDB_FATAL_ERROR; return MDB_PANIC; } @@ -3259,7 +2552,7 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { if (txn->mt_u.reader) { #ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); + mdbx_mutex_lock(&tsan_mutex); #endif txn->mt_u.reader->mr_txnid = ~(txnid_t)0; if (!(env->me_flags & MDB_NOTLS)) { @@ -3269,7 +2562,7 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { txn->mt_u.reader = NULL; } /* else txn owns the slot until it does MDB_END_SLOT */ #ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); + mdbx_mutex_unlock(&tsan_mutex); #endif } mdbx_coherent_barrier(); @@ -3306,7 +2599,7 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { mode = 0; /* txn == env->me_txn0, do not free() it */ /* The writer mutex was locked in mdbx_txn_begin. */ - mdbx_mutex_unlock(env, MDB_MUTEX(env, w)); + mdbx_txn_unlock(env); } else { txn->mt_parent->mt_child = NULL; txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; @@ -3366,7 +2659,7 @@ int mdbx_txn_abort(MDB_txn *txn) { return mdbx_txn_end(txn, MDB_END_ABORT | MDB_END_SLOT | MDB_END_FREE); } -static MDBX_INLINE int mdbx_backlog_size(MDB_txn *txn) { +static __inline int mdbx_backlog_size(MDB_txn *txn) { int reclaimed = txn->mt_env->me_pghead ? 
txn->mt_env->me_pghead[0] : 0; return reclaimed + txn->mt_loose_count; } @@ -3629,6 +2922,8 @@ again: rc = MDB_SUCCESS; if (mop_len) { MDB_val key, data; + key.mv_size = data.mv_size = 0; /* avoid MSVC warning */ + key.mv_data = data.mv_data = NULL; mop += mop_len; if (!lifo) { @@ -3723,7 +3018,7 @@ static int mdbx_page_flush(MDB_txn *txn, int keep) { pgno_t pgno = 0; MDB_page *dp = NULL; struct iovec iov[MDB_COMMIT_PAGES]; - ssize_t wpos = 0, wsize = 0, wres; + ssize_t wpos = 0, wsize = 0; size_t next_pos = 1; /* impossible pos, so pos != next_pos */ int n = 0; @@ -3769,17 +3064,11 @@ static int mdbx_page_flush(MDB_txn *txn, int keep) { if (n) { retry: /* Write previous page(s) */ - wres = pwritev(env->me_fd, iov, n, wpos); - if (unlikely(wres != wsize)) { - if (wres < 0) { - rc = errno; - if (rc == EINTR) - goto retry; - mdbx_debug("Write error: %s", strerror(rc)); - } else { - rc = EIO; /* TODO: Use which error code? */ - mdbx_debug("short write, filesystem full?"); - } + rc = mdbx_pwritev(env->me_fd, iov, n, wpos, wsize); + if (unlikely(rc != MDB_SUCCESS)) { + if (rc == EINTR) + goto retry; + mdbx_debug("Write error: %s", strerror(rc)); return rc; } n = 0; @@ -3819,7 +3108,6 @@ done: int mdbx_txn_commit(MDB_txn *txn) { int rc; - unsigned i; if (unlikely(txn == NULL)) return EINVAL; @@ -3828,7 +3116,7 @@ int mdbx_txn_commit(MDB_txn *txn) { return MDBX_EBADSIGN; MDB_env *env = txn->mt_env; - if (unlikely(env->me_pid != getpid())) { + if (unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDB_FATAL_ERROR; return MDB_PANIC; } @@ -3859,7 +3147,7 @@ int mdbx_txn_commit(MDB_txn *txn) { MDB_page **lp; MDB_ID2L dst, src; MDB_IDL pspill; - unsigned x, y, len, ps_len; + unsigned i, x, y, len, ps_len; /* Append our reclaim list to parent's */ if (txn->mt_lifo_reclaimed) { @@ -4084,20 +3372,16 @@ static int __cold mdbx_env_read_header(MDB_env *env, MDB_meta *meta) { MDB_page *p; MDB_meta *m; int i, rc, off; - enum { Size = sizeof(pbuf) }; + 
assert(offsetof(MDB_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); /* We don't know the page size yet, so use a minimum value. - * Read both meta pages so we can use the latest one. - */ + * Read both meta pages so we can use the latest one. */ meta->mm_datasync_sign = MDB_DATASIGN_WEAK; meta->mm_txnid = 0; for (i = off = 0; i < NUM_METAS; i++, off += meta->mm_psize) { - rc = pread(env->me_fd, &pbuf, Size, off); - if (rc != Size) { - if (rc == 0 && off == 0) - return ENOENT; - rc = rc < 0 ? (int)errno : MDB_INVALID; + rc = mdbx_pread(env->me_fd, &pbuf, sizeof(pbuf), off); + if (rc != MDB_SUCCESS) { mdbx_debug("read: %s", mdbx_strerror(rc)); return rc; } @@ -4159,9 +3443,9 @@ static int __cold mdbx_env_init_meta(MDB_env *env, MDB_meta *meta) { MDB_page *p, *q; int rc; unsigned psize; - int len; mdbx_debug("writing new meta page"); + assert(offsetof(MDB_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); psize = env->me_psize; @@ -4177,16 +3461,8 @@ static int __cold mdbx_env_init_meta(MDB_env *env, MDB_meta *meta) { q->mp_flags = P_META; *(MDB_meta *)PAGEDATA(q) = *meta; - do - len = pwrite(env->me_fd, p, psize * NUM_METAS, 0); - while (len == -1 && errno == EINTR); + rc = mdbx_pwrite(env->me_fd, p, psize * NUM_METAS, 0); - if (len < 0) - rc = errno; - else if ((unsigned)len == psize * NUM_METAS) - rc = MDB_SUCCESS; - else - rc = ENOSPC; free(p); return rc; } @@ -4218,17 +3494,16 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { /* LY: step#1 - sync previously written/updated data-pages */ if (env->me_sync_pending && (flags & MDB_NOSYNC) == 0) { - if (env->me_flags & MDB_WRITEMAP) { - int mode = (flags & MDB_MAPASYNC) ? 
MS_ASYNC : MS_SYNC; - if (unlikely(msync(env->me_map, used_size, mode))) { - rc = errno; - /* LY: msync() should never return EINTR */ + assert(((flags ^ env->me_flags) & MDB_WRITEMAP) == 0); + if (flags & MDB_WRITEMAP) { + rc = mdbx_msync(env->me_map, used_size, flags & MDB_MAPASYNC); + if (unlikely(rc != MDB_SUCCESS)) + /* LY: mdbx_msync() should never return EINTR */ goto fail; - } if ((flags & MDB_MAPASYNC) == 0) env->me_sync_pending = 0; } else { - int (*flush)(int fd) = fdatasync; + bool syncmeta = false; if (unlikely(prev_mapsize != pending->mm_mapsize)) { /* LY: It is no reason to use fdatasync() here, even in case * no such bug in a kernel. Because "no-bug" mean that a kernel @@ -4240,10 +3515,10 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { * * For more info about of a corresponding fdatasync() bug * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ - flush = fsync; + syncmeta = true; } - while (unlikely(flush(env->me_fd) < 0)) { - rc = errno; + while ( + unlikely((rc = mdbx_filesync(env->me_fd, syncmeta)) != MDB_SUCCESS)) { if (rc != EINTR) goto fail; } @@ -4271,7 +3546,7 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { mdbx_debug( "writing meta %d (%s, was %zu/%s, stay %s %zu/%s), root %zu, " "txn_id %zu, %s", - offset >= env->me_psize, target == head ? "head" : "tail", + offset >= (off_t)env->me_psize, target == head ? "head" : "tail", target->mm_txnid, META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" : "Legacy", @@ -4283,7 +3558,7 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { if (env->me_flags & MDB_WRITEMAP) { #ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); + mdbx_mutex_lock(&tsan_mutex); #endif /* LY: 'invalidate' the meta, * but mdbx_meta_head_r() will be confused/retired in collision case. 
*/ @@ -4303,28 +3578,18 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { } else { pending->mm_magic = MDB_MAGIC; pending->mm_version = MDB_DATA_VERSION; - pending->mm_address = head->mm_address; - retry: - rc = pwrite(env->me_fd, pending, sizeof(MDB_meta), offset); - if (unlikely(rc != sizeof(MDB_meta))) { - rc = (rc < 0) ? errno : EIO; - if (rc == EINTR) - goto retry; - + rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDB_meta), offset); + if (unlikely(rc != MDB_SUCCESS)) { undo: mdbx_debug("write failed, disk error?"); /* On a failure, the pagecache still contains the new data. - * Write some old data back, to prevent it from being used. */ - if (pwrite(env->me_fd, (void *)target, sizeof(MDB_meta), offset) == - sizeof(MDB_meta)) { - /* LY: take a chance, if write succeeds at a magic ;) */ - goto retry; - } + * Try write some old data back, to prevent it from being used. */ + mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDB_meta), offset); goto fail; } mdbx_invalidate_cache(env->me_map + offset, sizeof(MDB_meta)); #ifdef __SANITIZE_THREAD__ - pthread_mutex_lock(&tsan_mutex); + mdbx_mutex_lock(&tsan_mutex); #endif } @@ -4336,21 +3601,19 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { */ env->me_txns->mti_txnid = pending->mm_txnid; #ifdef __SANITIZE_THREAD__ - pthread_mutex_unlock(&tsan_mutex); + mdbx_mutex_unlock(&tsan_mutex); #endif /* LY: step#3 - sync meta-pages. */ if ((flags & (MDB_NOSYNC | MDB_NOMETASYNC)) == 0) { - if (env->me_flags & MDB_WRITEMAP) { + assert(((flags ^ env->me_flags) & MDB_WRITEMAP) == 0); + if (flags & MDB_WRITEMAP) { char *ptr = env->me_map + (offset & ~(env->me_os_psize - 1)); - int mode = (flags & MDB_MAPASYNC) ? 
MS_ASYNC : MS_SYNC; - if (unlikely(msync(ptr, env->me_os_psize, mode) < 0)) { - rc = errno; + rc = mdbx_msync(ptr, env->me_os_psize, flags & MDB_MAPASYNC); + if (unlikely(rc != MDB_SUCCESS)) goto fail; - } } else { - while (unlikely(fdatasync(env->me_fd) < 0)) { - rc = errno; + while (unlikely((rc = mdbx_filesync(env->me_fd, false)) != MDB_SUCCESS)) { if (rc != EINTR) goto undo; } @@ -4360,15 +3623,13 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { /* LY: currently this can't happen, but... */ if (unlikely(pending->mm_mapsize < prev_mapsize)) { mdbx_assert(env, pending->mm_mapsize == env->me_mapsize); - if (unlikely(mremap(env->me_map, prev_mapsize, pending->mm_mapsize, - MREMAP_FIXED, pending->mm_address) == MAP_FAILED)) { - rc = errno; + rc = mdbx_ftruncate(env->me_fd, pending->mm_mapsize); + if (unlikely(rc != MDB_SUCCESS)) goto fail; - } - if (unlikely(ftruncate(env->me_fd, pending->mm_mapsize) < 0)) { - rc = errno; + rc = mdbx_mremap_size((void **)&env->me_map, prev_mapsize, + pending->mm_mapsize); + if (unlikely(rc != MDB_SUCCESS)) goto fail; - } } return MDB_SUCCESS; @@ -4389,8 +3650,8 @@ int __cold mdbx_env_create(MDB_env **env) { e->me_maxdbs = e->me_numdbs = CORE_DBS; e->me_fd = INVALID_HANDLE_VALUE; e->me_lfd = INVALID_HANDLE_VALUE; - e->me_pid = getpid(); - GET_PAGESIZE(e->me_os_psize); + e->me_pid = mdbx_getpid(); + e->me_os_psize = mdbx_syspagesize(); VALGRIND_CREATE_MEMPOOL(e, 0, 0); e->me_signature = MDBX_ME_SIGNATURE; *env = e; @@ -4399,32 +3660,26 @@ int __cold mdbx_env_create(MDB_env **env) { static int __cold mdbx_env_map(MDB_env *env, void *addr, size_t usedsize) { unsigned flags = env->me_flags; + int rc; - int prot = PROT_READ; if (flags & MDB_WRITEMAP) { - prot |= PROT_WRITE; - if (ftruncate(env->me_fd, env->me_mapsize) < 0) - return errno; + rc = mdbx_ftruncate(env->me_fd, env->me_mapsize); + if (unlikely(rc != MDB_SUCCESS)) + return rc; } - env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED, 
env->me_fd, 0); - if (env->me_map == MAP_FAILED) { + env->me_map = addr; + rc = mdbx_mmap((void **)&env->me_map, env->me_mapsize, flags & MDB_WRITEMAP, + env->me_fd); + if (unlikely(rc != MDB_SUCCESS)) { env->me_map = NULL; - return errno; - } - - /* Can happen because the address argument to mmap() is just a - * hint. mmap() can pick another, e.g. if the range is in use. - * The MAP_FIXED flag would prevent that, but then mmap could - * instead unmap existing pages to make room for the new map. - */ - if (addr && env->me_map != addr) { - errno = 0; /* LY: clean errno as a hit for this case */ - return EBUSY; /* TODO: Make a new MDB_* error code? */ + return rc; } +#ifdef MADV_DONTFORK if (madvise(env->me_map, env->me_mapsize, MADV_DONTFORK)) return errno; +#endif #ifdef MADV_NOHUGEPAGE (void)madvise(env->me_map, env->me_mapsize, MADV_NOHUGEPAGE); @@ -4441,17 +3696,24 @@ static int __cold mdbx_env_map(MDB_env *env, void *addr, size_t usedsize) { (void)madvise(env->me_map + usedsize, env->me_mapsize - usedsize, MADV_REMOVE); } +#else + (void)usedsize; #endif +#if defined(MADV_RANDOM) && defined(MADV_WILLNEED) /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ if (madvise(env->me_map, env->me_mapsize, (flags & MDB_NORDAHEAD) ? MADV_RANDOM : MADV_WILLNEED)) return errno; +#endif /* Lock meta pages to avoid unexpected write, * before the data pages would be synchronized. 
*/ - if ((flags & MDB_WRITEMAP) && mlock(env->me_map, env->me_psize * 2)) - return errno; + if (flags & MDB_WRITEMAP) { + rc = mdbx_mlock(env->me_map, env->me_psize * 2); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + } #ifdef USE_VALGRIND env->me_valgrind_handle = @@ -4477,9 +3739,10 @@ int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { if (env->me_map) { int rc; MDB_meta *meta; - void *old; if (env->me_txn) return EINVAL; + + /* FIXME: lock/unlock */ meta = mdbx_meta_head_w(env); if (!size) size = meta->mm_mapsize; @@ -4487,17 +3750,23 @@ int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; if (size < usedsize) size = usedsize; - munmap(env->me_map, env->me_mapsize); + + mdbx_munmap(env->me_map, env->me_mapsize); #ifdef USE_VALGRIND VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = -1; #endif + + rc = mdbx_ftruncate(env->me_fd, size); + if (unlikely(rc != MDB_SUCCESS)) + return rc; env->me_mapsize = size; - old = (env->me_flags & MDB_FIXEDMAP) ? 
env->me_map : NULL; - rc = mdbx_env_map(env, old, usedsize); - if (rc) + /* FIXME: update meta */ + rc = mdbx_env_map(env, NULL, usedsize); + if (unlikely(rc != MDB_SUCCESS)) return rc; } + env->me_mapsize = size; if (env->me_psize) env->me_maxpg = env->me_mapsize / env->me_psize; @@ -4543,25 +3812,14 @@ int __cold mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers) { return MDB_SUCCESS; } -static int __cold mdbx_fsize(HANDLE fd, size_t *size) { - struct stat st; - - if (fstat(fd, &st)) - return errno; - - *size = st.st_size; - return MDB_SUCCESS; -} - /** Further setup required for opening an LMDB environment */ static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { - unsigned flags = env->me_flags; - int i, newenv = 0, rc; - - if ((i = mdbx_env_read_header(env, meta)) != 0) { - if (i != ENOENT) - return i; + int newenv = 0; + int rc = mdbx_env_read_header(env, meta); + if (unlikely(rc != MDB_SUCCESS)) { + if (rc != ENOENT) + return rc; mdbx_debug("new mdbenv"); newenv = 1; env->me_psize = env->me_os_psize; @@ -4575,48 +3833,36 @@ static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { } /* Was a mapsize configured? */ - if (!env->me_mapsize) { + if (!env->me_mapsize) env->me_mapsize = meta->mm_mapsize; - } - { + else { /* Make sure mapsize >= committed data size. Even when using * mm_mapsize, which could be broken in old files (ITS#7789). */ size_t minsize = (meta->mm_last_pg + 1) * meta->mm_psize; if (env->me_mapsize < minsize) env->me_mapsize = minsize; - } - meta->mm_mapsize = env->me_mapsize; - if (newenv && !(flags & MDB_FIXEDMAP)) { + meta->mm_mapsize = env->me_mapsize; + } + + if (newenv) { /* mdbx_env_map() may grow the datafile. Write the metapages - * first, so the file will be valid if initialization fails. - * Except with FIXEDMAP, since we do not yet know mm_address. 
- * We could fill in mm_address later, but then a different - * program might end up doing that - one with a memory layout - * and map address which does not suit the main program. - */ + * first, so the file will be valid if initialization fails. */ rc = mdbx_env_init_meta(env, meta); - if (rc) + if (unlikely(rc != MDB_SUCCESS)) + return rc; + + rc = mdbx_ftruncate(env->me_fd, env->me_mapsize); + if (unlikely(rc != MDB_SUCCESS)) return rc; - newenv = 0; } const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; - rc = mdbx_env_map(env, (flags & MDB_FIXEDMAP) ? meta->mm_address : NULL, - usedsize); + rc = mdbx_env_map(env, NULL, usedsize); if (rc) return rc; - if (newenv) { - if (flags & MDB_FIXEDMAP) - meta->mm_address = env->me_map; - i = mdbx_env_init_meta(env, meta); - if (i != MDB_SUCCESS) { - return i; - } - } - env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) - sizeof(indx_t); @@ -4631,402 +3877,57 @@ static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { /****************************************************************************/ -#ifndef MDBX_USE_THREAD_ATEXIT -#if __GLIBC_PREREQ(2, 18) -#define MDBX_USE_THREAD_ATEXIT 1 -#else -#define MDBX_USE_THREAD_ATEXIT 0 -#endif -#endif - -static pthread_mutex_t mdbx_rthc_mutex = PTHREAD_MUTEX_INITIALIZER; -static MDBX_rthc *mdbx_rthc_list; -static pthread_key_t mdbx_pthread_crutch_key; - -static __inline void mdbx_rthc_lock(void) { - mdbx_ensure(NULL, pthread_mutex_lock(&mdbx_rthc_mutex) == 0); -} - -static __inline void mdbx_rthc_unlock(void) { - mdbx_ensure(NULL, pthread_mutex_unlock(&mdbx_rthc_mutex) == 0); -} - -/** Release a reader thread's slot in the reader lock table. - * This function is called automatically when a thread exits. - * @param[in] ptr This points to the MDB_rthc of a slot in the reader lock - *table. 
- */ -static __cold void mdbx_rthc_dtor(void) { - /* LY: Основная задача этого деструктора была и есть в освобождении - * слота таблицы читателей при завершении треда, но тут есть пара - * не очевидных сложностей: - * - Таблица читателей располагается в разделяемой памяти, поэтому - * во избежание segfault деструктор не должен что-либо делать после - * или одновременно с mdbx_env_close(). - * - Действительно, mdbx_env_close() вызовет pthread_key_delete() и - * после этого glibc не будет вызывать деструктор. - * - ОДНАКО, это никак не решает проблему гонок между mdbx_env_close() - * и завершающимися тредами. Грубо говоря, при старте mdbx_env_close() - * деструктор уже может выполняться в некоторых тредах, и завершиться - * эти выполнения могут во время или после окончания mdbx_env_close(). - * - БОЛЕЕ ТОГО, схожая проблема возникает при выгрузке dso/dll, - * так как в текущей glibc (2.24) подсистема ld.so ничего не знает о - * TSD-деструкторах и поэтому может выгрузить lib.so до того как - * отработали все деструкторы. - * - Исходное проявление проблемы было зафиксировано - * в https://github.com/ReOpen/ReOpenLDAP/issues/48 - * - * Предыдущее решение посредством выделяемого динамически MDB_rthc - * было не удачным, так как порождало либо утечку памяти, - * либо вероятностное обращение к уже освобожденной памяти - * из этого деструктора. - * - * Текущее решение достаточно "развесисто", но решает все описанные выше - * проблемы без пенальти по производительности. 
- */ - - mdbx_rthc_lock(); - - pid_t pid = getpid(); - pthread_t thread = pthread_self(); - for (MDBX_rthc **ref = &mdbx_rthc_list; *ref;) { - MDBX_rthc *rthc = *ref; - if (rthc->rc_thread == thread) { - if (rthc->rc_reader && rthc->rc_reader->mr_pid == pid) { - rthc->rc_reader->mr_pid = 0; - mdbx_coherent_barrier(); - } - *ref = rthc->rc_next; - free(rthc); - } else { - ref = &(*ref)->rc_next; - } - } - - mdbx_rthc_unlock(); -} - -#if MDBX_USE_THREAD_ATEXIT - -extern void *__dso_handle __attribute__((__weak__)); -extern int __cxa_thread_atexit_impl(void (*dtor)(void *), void *obj, - void *dso_symbol); - -static __cold void mdbx_rthc__thread_atexit(void *ptr) { - mdbx_ensure(NULL, ptr == pthread_getspecific(mdbx_pthread_crutch_key)); - mdbx_ensure(NULL, pthread_setspecific(mdbx_pthread_crutch_key, NULL) == 0); - mdbx_rthc_dtor(); -} - -static __attribute__((constructor)) __cold void mdbx_pthread_crutch_ctor(void) { - mdbx_ensure(NULL, pthread_key_create(&mdbx_pthread_crutch_key, NULL) == 0); -} - -#else /* MDBX_USE_THREAD_ATEXIT */ - -static __cold void mdbx_rthc__thread_key_dtor(void *ptr) { - (void)ptr; - if (mdbx_pthread_crutch_key != (pthread_key_t)-1) - mdbx_rthc_dtor(); -} - -static __attribute__((constructor)) __cold void mdbx_pthread_crutch_ctor(void) { - mdbx_ensure(NULL, pthread_key_create(&mdbx_pthread_crutch_key, - mdbx_rthc__thread_key_dtor) == 0); -} - -static __attribute__((destructor)) __cold void mdbx_pthread_crutch_dtor(void) { - pthread_key_delete(mdbx_pthread_crutch_key); - mdbx_pthread_crutch_key = -1; - - /* LY: Из-за race condition в pthread_key_delete() - * деструкторы уже могли начать выполняться. - * Уступая квант времени сразу после удаления ключа - * мы даем им шанс завершиться. 
*/ - pthread_yield(); - - mdbx_rthc_lock(); - pid_t pid = getpid(); - while (mdbx_rthc_list != NULL) { - MDBX_rthc *rthc = mdbx_rthc_list; - mdbx_rthc_list = mdbx_rthc_list->rc_next; - if (rthc->rc_reader && rthc->rc_reader->mr_pid == pid) { - rthc->rc_reader->mr_pid = 0; - mdbx_coherent_barrier(); - } - free(rthc); - - /* LY: Каждый неудаленный элемент списка - это один - * не отработавший деструктор и потенциальный - * шанс получить segfault после выгрузки lib.so - * Поэтому на каждой итерации уступаем квант времени, - * в надежде что деструкторы успеют отработать. */ - mdbx_rthc_unlock(); - pthread_yield(); - mdbx_rthc_lock(); - } - mdbx_rthc_unlock(); - pthread_yield(); -} -#endif /* MDBX_USE_THREAD_ATEXIT */ - -static __cold MDBX_rthc *mdbx_rthc_add(pthread_key_t key) { - MDBX_rthc *rthc = malloc(sizeof(MDBX_rthc)); - if (unlikely(rthc == NULL)) - goto bailout; - - rthc->rc_next = NULL; - rthc->rc_reader = NULL; - rthc->rc_thread = pthread_self(); - if (unlikely(pthread_setspecific(key, rthc) != 0)) - goto bailout_free; - - mdbx_rthc_lock(); - if (pthread_getspecific(mdbx_pthread_crutch_key) == NULL) { -#if MDBX_USE_THREAD_ATEXIT - void *dso_anchor = - (&__dso_handle && __dso_handle) ? 
__dso_handle : (void *)mdbx_version; - if (unlikely(__cxa_thread_atexit_impl(mdbx_rthc__thread_atexit, rthc, - dso_anchor) != 0)) { - mdbx_rthc_unlock(); - goto bailout_free; - } -#endif /* MDBX_USE_THREAD_ATEXIT */ - mdbx_ensure(NULL, pthread_setspecific(mdbx_pthread_crutch_key, rthc) == 0); - } - rthc->rc_next = mdbx_rthc_list; - mdbx_rthc_list = rthc; - mdbx_rthc_unlock(); - return rthc; - -bailout_free: - free(rthc); -bailout: - return NULL; -} - -static __inline MDBX_rthc *mdbx_rthc_get(pthread_key_t key) { - MDBX_rthc *rthc = pthread_getspecific(key); - if (likely(rthc != NULL)) - return rthc; - return mdbx_rthc_add(key); -} - -static __cold void mdbx_rthc_cleanup(MDB_env *env) { - mdbx_rthc_lock(); - - MDB_reader *begin = env->me_txns->mti_readers; - MDB_reader *end = begin + env->me_close_readers; - for (MDBX_rthc **ref = &mdbx_rthc_list; *ref;) { - MDBX_rthc *rthc = *ref; - if (rthc->rc_reader >= begin && rthc->rc_reader < end) { - if (rthc->rc_reader->mr_pid == env->me_pid) { - rthc->rc_reader->mr_pid = 0; - mdbx_coherent_barrier(); - } - *ref = rthc->rc_next; - free(rthc); - } else { - ref = &(*ref)->rc_next; - } - } - - mdbx_rthc_unlock(); -} - -/****************************************************************************/ - -/** Downgrade the exclusive lock on the region back to shared */ -static __cold int mdbx_env_share_locks(MDB_env *env, int *excl) { - struct flock lock_info; - int rc = 0; - - /* The shared lock replaces the existing lock */ - memset((void *)&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_RDLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = 0; - lock_info.l_len = 1; - while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && - (rc = errno) == EINTR) - ; - *excl = rc ? -1 : 0; /* error may mean we lost the lock */ - - return rc; -} - -/** Try to get exclusive lock, otherwise shared. - * Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive. 
- */ -static int __cold mdbx_env_excl_lock(MDB_env *env, int *excl) { - int rc = 0; - struct flock lock_info; - - memset((void *)&lock_info, 0, sizeof(lock_info)); - lock_info.l_type = F_WRLCK; - lock_info.l_whence = SEEK_SET; - lock_info.l_start = 0; - lock_info.l_len = 1; - while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) && - (rc = errno) == EINTR) - ; - if (!rc) { - *excl = 1; - } else { - lock_info.l_type = F_RDLCK; - while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) && - (rc = errno) == EINTR) - ; - if (rc == 0) - *excl = 0; - } - return rc; -} - -#ifdef MDB_USE_HASH -/* - * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code - * - * @(#) $Revision: 5.1 $ - * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $ - * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $ - * - * http://www.isthe.com/chongo/tech/comp/fnv/index.html - * - *** - * - * Please do not copyright this code. This code is in the public domain. - * - * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO - * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF - * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR - * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR - * PERFORMANCE OF THIS SOFTWARE. - * - * By: - * chongo /\oo/\ - * http://www.isthe.com/chongo/ - * - * Share and Enjoy! :-) - */ - -typedef unsigned long long mdbx_hash_t; -#define MDB_HASH_INIT ((mdbx_hash_t)0xcbf29ce484222325ULL) - -/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer - * @param[in] val value to hash - * @param[in] hval initial value for hash - * @return 64 bit hash - * - * NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the - * hval arg on the first call. 
- */ -static mdbx_hash_t mdbx_hash_val(MDB_val *val, mdbx_hash_t hval) { - unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */ - unsigned char *end = s + val->mv_size; - /* - * FNV-1a hash each octet of the string - */ - while (s < end) { - /* xor the bottom with the current octet */ - hval ^= (mdbx_hash_t)*s++; - - /* multiply by the 64 bit FNV magic prime mod 2^64 */ - hval += (hval << 1) + (hval << 4) + (hval << 5) + (hval << 7) + - (hval << 8) + (hval << 40); - } - /* return our new hash value */ - return hval; -} - -/** Hash the string and output the encoded hash. - * This uses modified RFC1924 Ascii85 encoding to accommodate systems with - * very short name limits. We don't care about the encoding being reversible, - * we just want to preserve as many bits of the input as possible in a - * small printable string. - * @param[in] str string to hash - * @param[out] encbuf an array of 11 chars to hold the hash - */ -static const char mdbx_a85[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghij" - "klmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"; - -static void __cold mdbx_pack85(unsigned long l, char *out) { - int i; - - for (i = 0; i < 5; i++) { - *out++ = mdbx_a85[l % 85]; - l /= 85; - } -} - -static void __cold mdbx_hash_enc(MDB_val *val, char *encbuf) { - mdbx_hash_t h = mdbx_hash_val(val, MDB_HASH_INIT); - - mdbx_pack85(h, encbuf); - mdbx_pack85(h >> 32, encbuf + 5); - encbuf[10] = '\0'; -} -#endif - -/** Open and/or initialize the lock region for the environment. - * @param[in] env The LMDB environment. - * @param[in] lpath The pathname of the file used for the lock region. - * @param[in] mode The Unix permissions for the file, if we create it. - * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive - * @return 0 on success, non-zero on failure. - */ +/* Open and/or initialize the lock region for the environment. 
*/ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) { - int fdflags; - int rc; off_t size, rsize; - void *m; - env->me_lfd = open(lpath, O_RDWR | O_CREAT | O_CLOEXEC, mode); - if (env->me_lfd == INVALID_HANDLE_VALUE) { - rc = errno; + int rc = mdbx_openfile(lpath, O_RDWR | O_CREAT, mode, &env->me_lfd); + if (rc != MDB_SUCCESS) { if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { - return MDB_SUCCESS; + env->me_lfd = INVALID_HANDLE_VALUE; + rc = MDB_SUCCESS; } return rc; } - /* Lose record locks when exec*() */ - if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(env->me_lfd, F_SETFD, fdflags); - - if (!(env->me_flags & MDB_NOTLS)) { - rc = pthread_key_create(&env->me_txkey, NULL); - if (rc) - return rc; - env->me_flags |= MDB_ENV_TXKEY; - } - /* Try to get exclusive lock. If we succeed, then - * nobody is using the lock region and we should initialize it. - */ - if ((rc = mdbx_env_excl_lock(env, excl))) + * nobody is using the lock region and we should initialize it. 
*/ + rc = mdbx_lck_seize(env); + if (rc == MDBX_RESULT_TRUE) + *excl = true; + else if (rc == MDBX_RESULT_FALSE) + *excl = false; + else return rc; - size = lseek(env->me_lfd, 0, SEEK_END); - if (size == -1) - return errno; - rsize = (env->me_maxreaders - 1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); - if (size 0) { - if (ftruncate(env->me_lfd, rsize) != 0) - return errno; + rc = mdbx_filesize(env->me_lfd, &size); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + rsize = (env->me_maxreaders - 1) * sizeof(MDB_reader) + sizeof(MDBX_lockinfo); + if (size != rsize && *excl > 0) { + rc = mdbx_ftruncate(env->me_lfd, rsize); + if (unlikely(rc != MDB_SUCCESS)) + return rc; } else { rsize = size; - size = rsize - sizeof(MDB_txninfo); + size = rsize - sizeof(MDBX_lockinfo); env->me_maxreaders = size / sizeof(MDB_reader) + 1; } - m = mmap(NULL, rsize, PROT_READ | PROT_WRITE, MAP_SHARED, env->me_lfd, 0); - if (m == MAP_FAILED) - return errno; - env->me_txns = m; + void *addr = NULL; + rc = mdbx_mmap(&addr, rsize, true, env->me_lfd); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + env->me_txns = addr; + + if (!(env->me_flags & MDB_NOTLS)) { + rc = mdbx_rthc_alloc(&env->me_txkey, &env->me_txns->mti_readers[0], + &env->me_txns->mti_readers[env->me_maxreaders]); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + env->me_flags |= MDB_ENV_TXKEY; + } #ifdef MADV_NOHUGEPAGE (void)madvise(env->me_txns, rsize, MADV_NOHUGEPAGE); @@ -5036,54 +3937,38 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, (void)madvise(env->me_txns, rsize, MADV_DODUMP); #endif +#ifdef MADV_DONTFORK if (madvise(env->me_txns, rsize, MADV_DONTFORK) < 0) return errno; +#endif +#ifdef MADV_WILLNEED if (madvise(env->me_txns, rsize, MADV_WILLNEED) < 0) return errno; +#endif +#ifdef MADV_RANDOM if (madvise(env->me_txns, rsize, MADV_RANDOM) < 0) return errno; +#endif if (*excl > 0) { - /* Solaris needs this before initing a robust mutex. 
Otherwise - * it may skip the init and return EBUSY "seems someone already - * inited" or EINVAL "it was inited differently". - */ - memset(&env->me_txns->mti_rmutex, 0, sizeof(env->me_txns->mti_rmutex)); - memset(&env->me_txns->mti_wmutex, 0, sizeof(env->me_txns->mti_wmutex)); - - pthread_mutexattr_t mattr; - rc = pthread_mutexattr_init(&mattr); - if (rc) - return rc; - - rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); - -#if MDB_USE_ROBUST - if (!rc) - rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); -#endif /* MDB_USE_ROBUST */ - if (!rc) - rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr); - if (!rc) - rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr); - - pthread_mutexattr_destroy(&mattr); + memset(env->me_txns, 0, sizeof(MDBX_lockinfo)); + rc = mdbx_lck_init(env); if (rc) return rc; env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_format = MDB_LOCK_FORMAT; - env->me_txns->mti_txnid = ~0L; - env->me_txns->mti_numreaders = 0; + env->me_txns->mti_txnid = ~(txnid_t)0; } else { if (env->me_txns->mti_magic != MDB_MAGIC) { mdbx_debug("lock region has invalid magic"); return MDB_INVALID; } if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { - mdbx_debug("lock region has format+version 0x%x, expected 0x%x", + mdbx_debug("lock region has format+version 0x%" PRIx64 + ", expected 0x%" PRIx64, env->me_txns->mti_format, MDB_LOCK_FORMAT); return MDB_VERSION_MISMATCH; } @@ -5106,8 +3991,8 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, (MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC | MDB_NOMEMINIT | \ MDBX_COALESCE | MDBX_PAGEPERTURB) #define CHANGELESS \ - (MDB_FIXEDMAP | MDB_NOSUBDIR | MDB_RDONLY | MDB_WRITEMAP | MDB_NOTLS | \ - MDB_NORDAHEAD | MDBX_LIFORECLAIM) + (MDB_NOSUBDIR | MDB_RDONLY | MDB_WRITEMAP | MDB_NOTLS | MDB_NORDAHEAD | \ + MDBX_LIFORECLAIM) #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE | CHANGELESS) #error "Persistent DB flags & env flags overlap, but both go in 
mm_flags" @@ -5137,6 +4022,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, lpath = malloc(rc); if (!lpath) return ENOMEM; + if (flags & MDB_NOSUBDIR) { dpath = lpath + len + sizeof(LOCKSUFF); sprintf(lpath, "%s" LOCKSUFF, path); @@ -5161,15 +4047,15 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, } env->me_flags = flags |= MDB_ENV_ACTIVE; if (rc) - goto leave; + goto bailout; - env->me_path = strdup(path); + env->me_path = mdbx_strdup(path); env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned)); if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { rc = ENOMEM; - goto leave; + goto bailout; } env->me_dbxs[FREE_DBI].md_cmp = mdbx_cmp_int_ai; /* aligned MDB_INTEGERKEY */ @@ -5177,7 +4063,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, if (!(flags & MDB_RDONLY)) { rc = mdbx_env_setup_locks(env, lpath, mode, &excl); if (rc) - goto leave; + goto bailout; } if (F_ISSET(flags, MDB_RDONLY)) @@ -5185,33 +4071,29 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, else oflags = O_RDWR | O_CREAT; - env->me_fd = open(dpath, oflags | O_CLOEXEC, mode); - if (env->me_fd == INVALID_HANDLE_VALUE) { - rc = errno; - goto leave; - } - - int fdflags; - if ((fdflags = fcntl(env->me_fd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(env->me_fd, F_SETFD, fdflags); + rc = mdbx_openfile(dpath, oflags, mode, &env->me_fd); + if (rc != MDB_SUCCESS) + goto bailout; if (flags & MDB_RDONLY) { rc = mdbx_env_setup_locks(env, lpath, mode, &excl); if (rc) - goto leave; + goto bailout; } MDB_meta meta; - if ((rc = mdbx_env_open2(env, &meta)) == MDB_SUCCESS) { + rc = mdbx_env_open2(env, &meta); + if (rc == MDB_SUCCESS) { mdbx_debug("opened dbenv %p", (void *)env); if (excl > 0) { env->me_txns->mti_txnid = meta.mm_txnid; if (exclusive == NULL || 
*exclusive < 2) { /* LY: downgrade lock only if exclusive access not requested. * in case exclusive==1, just leave value as is. */ - rc = mdbx_env_share_locks(env, &excl); - if (rc) - goto leave; + rc = mdbx_lck_downgrade(env); + if (rc != MDB_SUCCESS) + goto bailout; + excl = 0; } } else if (exclusive) { /* LY: just indicate that is not an exclusive access. */ @@ -5257,7 +4139,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, } #endif -leave: +bailout: if (rc) mdbx_env_close0(env); free(lpath); @@ -5276,6 +4158,7 @@ static void __cold mdbx_env_close0(MDB_env *env) { if (!(env->me_flags & MDB_ENV_ACTIVE)) return; env->me_flags &= ~MDB_ENV_ACTIVE; + mdbx_lck_destroy(env); /* Doing this here since me_dbxs may not exist during mdbx_env_close */ if (env->me_dbxs) { @@ -5295,38 +4178,28 @@ static void __cold mdbx_env_close0(MDB_env *env) { mdbx_midl_free(env->me_free_pgs); if (env->me_flags & MDB_ENV_TXKEY) { - mdbx_ensure(env, pthread_key_delete(env->me_txkey) == 0); + mdbx_rthc_remove(env->me_txkey); env->me_flags &= ~MDB_ENV_TXKEY; } if (env->me_map) { - munmap(env->me_map, env->me_mapsize); + mdbx_munmap(env->me_map, env->me_mapsize); #ifdef USE_VALGRIND VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = -1; #endif } if (env->me_fd != INVALID_HANDLE_VALUE) - (void)close(env->me_fd); + (void)mdbx_closefile(env->me_fd); - /* Clearing readers is done in this function because - * me_txkey with its destructor must be disabled first. - * - * We skip the the reader mutex, so we touch only - * data owned by this process (me_close_readers and - * our readers), and clear each reader atomically. 
- */ - if (env->me_pid == getpid()) - mdbx_rthc_cleanup(env); - - munmap((void *)env->me_txns, - (env->me_maxreaders - 1) * sizeof(MDB_reader) + sizeof(MDB_txninfo)); + mdbx_munmap((void *)env->me_txns, + (env->me_maxreaders - 1) * sizeof(MDB_reader) + + sizeof(MDBX_lockinfo)); env->me_txns = NULL; env->me_pid = 0; - if (env->me_lfd != INVALID_HANDLE_VALUE) { - (void)close(env->me_lfd); - } + if (env->me_lfd != INVALID_HANDLE_VALUE) + (void)mdbx_closefile(env->me_lfd); } int __cold mdbx_env_close_ex(MDB_env *env, int dont_sync) { @@ -5390,7 +4263,7 @@ static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { mdbx_assert(NULL, a->mv_size == b->mv_size); mdbx_assert(NULL, 0 == (uintptr_t)a->mv_data % sizeof(uint16_t) && 0 == (uintptr_t)b->mv_data % sizeof(uint16_t)); -#ifdef MISALIGNED_OK +#if MISALIGNED_OK if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) return mdbx_cmp2int(*(size_t *)a->mv_data, *(size_t *)b->mv_data); @@ -5402,19 +4275,19 @@ static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { int diff; const uint16_t *pa, *pb, *end; -#if BYTE_ORDER == LITTLE_ENDIAN +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ end = (const uint16_t *)a->mv_data; pa = (const uint16_t *)((char *)a->mv_data + a->mv_size); pb = (const uint16_t *)((char *)b->mv_data + a->mv_size); do { diff = *--pa - *--pb; -#else /* BYTE_ORDER */ +#else /* __BYTE_ORDER__ */ end = (const uint16_t *)((char *)a->mv_data + a->mv_size); pa = (const uint16_t *)a->mv_data; pb = (const uint16_t *)b->mv_data; do { diff = *pa++ - *pb++; -#endif /* BYTE_ORDER */ +#endif /* __BYTE_ORDER__ */ if (likely(diff != 0)) break; } while (pa != end); @@ -5437,7 +4310,7 @@ static int __hot mdbx_cmp_int_ua(const MDB_val *a, const MDB_val *b) { return mdbx_cmp2int(*(unsigned *)a->mv_data, *(unsigned *)b->mv_data); #else mdbx_assert(NULL, a->mv_size == sizeof(int) || a->mv_size == sizeof(size_t)); -#if BYTE_ORDER == LITTLE_ENDIAN +#if __BYTE_ORDER__ == 
__ORDER_LITTLE_ENDIAN__ { int diff; const uint8_t *pa, *pb; @@ -5452,9 +4325,9 @@ static int __hot mdbx_cmp_int_ua(const MDB_val *a, const MDB_val *b) { } while (pa != a->mv_data); return diff; } -#else /* BYTE_ORDER */ +#else /* __BYTE_ORDER__ */ return memcmp(a->mv_data, b->mv_data, a->mv_size); -#endif /* BYTE_ORDER */ +#endif /* __BYTE_ORDER__ */ #endif /* MISALIGNED_OK */ } @@ -5664,9 +4537,9 @@ static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, goto mapped; } if (dl[0].mid) { - unsigned x = mdbx_mid2l_search(dl, pgno); - if (x <= dl[0].mid && dl[x].mid == pgno) { - p = dl[x].mptr; + unsigned y = mdbx_mid2l_search(dl, pgno); + if (y <= dl[0].mid && dl[y].mid == pgno) { + p = dl[y].mptr; goto done; } } @@ -5825,7 +4698,6 @@ static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { { MDB_val data; int exact = 0; - uint16_t flags; MDB_node *leaf = mdbx_node_search(&mc2, &mc->mc_dbx->md_name, &exact); if (!exact) return MDB_NOTFOUND; @@ -5834,12 +4706,14 @@ static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { rc = mdbx_node_read(&mc2, leaf, &data); if (rc) return rc; - memcpy(&flags, ((char *)data.mv_data + offsetof(MDB_db, md_flags)), + + uint16_t md_flags; + memcpy(&md_flags, ((char *)data.mv_data + offsetof(MDB_db, md_flags)), sizeof(uint16_t)); /* The txn may not know this DBI, or another process may * have dropped and recreated the DB with other flags. */ - if (unlikely((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)) + if (unlikely((mc->mc_db->md_flags & PERSISTENT_FLAGS) != md_flags)) return MDB_INCOMPATIBLE; memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); } @@ -5953,8 +4827,8 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDB_page *mp) { * @param[out] data Updated to point to the node's data. * @return 0 on success, non-zero on failure. 
*/ -static MDBX_INLINE int mdbx_node_read(MDB_cursor *mc, MDB_node *leaf, - MDB_val *data) { +static __inline int mdbx_node_read(MDB_cursor *mc, MDB_node *leaf, + MDB_val *data) { MDB_page *omp; /* overflow page */ pgno_t pgno; int rc; @@ -6710,8 +5584,7 @@ static int mdbx_cursor_touch(MDB_cursor *mc) { int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, unsigned flags) { MDB_env *env; - MDB_node *leaf = NULL; - MDB_page *fp, *mp, *sub_root = NULL; + MDB_page *fp, *sub_root = NULL; uint16_t fp_flags; MDB_val xdata, *rdata, dkey, olddata; MDB_db dummy; @@ -6883,7 +5756,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, */ fp_flags = P_LEAF | P_DIRTY; fp = env->me_pbuf; - fp->mp_leaf2_ksize = data->mv_size; /* used if MDB_DUPFIXED */ + fp->mp_leaf2_ksize = (uint16_t)data->mv_size; /* used if MDB_DUPFIXED */ fp->mp_lower = fp->mp_upper = (PAGEHDRSZ - PAGEBASE); olddata.mv_size = PAGEHDRSZ; goto prep_subDB; @@ -6902,7 +5775,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, * update branch key if there is a parent page */ if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - unsigned short dtop = 1; + unsigned dtop = 1; mc->mc_top--; /* slot 0 is always an empty key, find real slot */ while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { @@ -6920,8 +5793,8 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return MDB_SUCCESS; } - more: - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + more:; + MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); olddata.mv_size = NODEDSZ(leaf); olddata.mv_data = NODEDATA(leaf); @@ -6933,7 +5806,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, * size. xdata: node data with new page or DB. 
*/ unsigned i, offset = 0; - mp = fp = xdata.mv_data = env->me_pbuf; + MDB_page *mp = fp = xdata.mv_data = env->me_pbuf; mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; /* Was a single item before, must convert now */ @@ -6964,13 +5837,13 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; if (mc->mc_db->md_flags & MDB_DUPFIXED) { fp->mp_flags |= P_LEAF2; - fp->mp_leaf2_ksize = data->mv_size; + fp->mp_leaf2_ksize = (uint16_t)data->mv_size; xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ } else { xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + (dkey.mv_size & 1) + (data->mv_size & 1); } - fp->mp_upper = xdata.mv_size - PAGEBASE; + fp->mp_upper = (uint16_t)(xdata.mv_size - PAGEBASE); olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ } else if (leaf->mn_flags & F_SUBDATA) { /* Data is on sub-DB, just store it */ @@ -7105,7 +5978,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, * Copy end of page, adjusting alignment so * compiler may copy words instead of bytes. */ - off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); + off = (PAGEHDRSZ + data->mv_size) & -(ssize_t)sizeof(size_t); memcpy((size_t *)((char *)np + off), (size_t *)((char *)omp + off), sz - off); sz = PAGEHDRSZ; @@ -7190,7 +6063,7 @@ new_sub: put_sub: xdata.mv_size = 0; xdata.mv_data = ""; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (flags & MDB_CURRENT) { xflags = (flags & MDB_NODUPDATA) ? MDB_CURRENT | MDB_NOOVERWRITE | MDB_NOSPILL @@ -7435,8 +6308,8 @@ static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, * @param[in] data The data for the node. * @return The number of bytes needed to store the node. 
*/ -static MDBX_INLINE size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, - MDB_val *data) { +static __inline size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, + MDB_val *data) { size_t sz; sz = LEAFSIZE(key, data); @@ -7458,7 +6331,7 @@ static MDBX_INLINE size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, * @param[in] key The key for the node. * @return The number of bytes needed to store the node. */ -static MDBX_INLINE size_t mdbx_branch_size(MDB_env *env, MDB_val *key) { +static __inline size_t mdbx_branch_size(MDB_env *env, MDB_val *key) { size_t sz; sz = INDXSIZE(key); @@ -7494,7 +6367,7 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, unsigned i; size_t node_size = NODESIZE; ssize_t room; - indx_t ofs; + unsigned ofs; MDB_node *node; MDB_page *mp = mc->mc_pg[mc->mc_top]; MDB_page *ofp = NULL; /* overflow page */ @@ -7565,13 +6438,13 @@ update: /* Adjust free space offsets. */ ofs = mp->mp_upper - node_size; mdbx_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); - mp->mp_ptrs[indx] = ofs; - mp->mp_upper = ofs; + mp->mp_ptrs[indx] = (uint16_t)ofs; + mp->mp_upper = (uint16_t)ofs; mp->mp_lower += sizeof(indx_t); /* Write the node data. */ node = NODEPTR(mp, indx); - node->mn_ksize = (key == NULL) ? 0 : key->mv_size; + node->mn_ksize = (key == NULL) ? 
0 : (uint16_t)key->mv_size; node->mn_flags = flags; if (IS_LEAF(mp)) SETDSZ(node, data->mv_size); @@ -7674,7 +6547,7 @@ static void mdbx_node_shrink(MDB_page *mp, indx_t indx) { MDB_node *node; MDB_page *sp, *xp; char *base; - indx_t delta, nsize, len, ptr; + unsigned nsize, delta, len, ptr; int i; node = NODEPTR(mp, indx); @@ -8071,7 +6944,7 @@ static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { /* But even if no shift was needed, update ksize */ if (node->mn_ksize != key->mv_size) - node->mn_ksize = key->mv_size; + node->mn_ksize = (uint16_t)key->mv_size; if (key->mv_size) memcpy(NODEKEY(node), key->mv_data, key->mv_size); @@ -8084,11 +6957,11 @@ static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); /** Perform \b act while tracking temporary cursor \b mn */ #define WITH_CURSOR_TRACKING(mn, act) \ do { \ - MDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ + MDB_cursor mc_dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ if ((mn).mc_flags & C_SUB) { \ - dummy.mc_flags = C_INITIALIZED; \ - dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ - tracked = &dummy; \ + mc_dummy.mc_flags = C_INITIALIZED; \ + mc_dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ + tracked = &mc_dummy; \ } else { \ tracked = &(mn); \ } \ @@ -8106,7 +6979,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { pgno_t srcpg; MDB_cursor mn; int rc; - unsigned short flags; + unsigned flags; DKBUF; @@ -9316,14 +8189,14 @@ int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, typedef struct mdbx_copy { MDB_env *mc_env; MDB_txn *mc_txn; - pthread_mutex_t mc_mutex; - pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ + mdbx_mutex_t mc_mutex; + mdbx_cond_t mc_cond; /**< Condition variable for #mc_new */ char *mc_wbuf[2]; char *mc_over[2]; int mc_wlen[2]; int mc_olen[2]; pgno_t mc_next_pgno; - HANDLE mc_fd; + mdbx_filehandle_t mc_fd; int mc_toggle; /**< Buffer number in provider */ int mc_new; /**< 
(0-2 buffers to write) | (#MDB_EOF at end) */ /** Error code. Never cleared if set. Both threads can set nonzero @@ -9333,57 +8206,26 @@ typedef struct mdbx_copy { } mdbx_copy; /** Dedicated writer thread for compacting copy. */ -static void *__cold mdbx_env_copythr(void *arg) { +static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { mdbx_copy *my = arg; char *ptr; - int toggle = 0, wsize, rc = 0; - int len; + int toggle = 0, wsize; -#ifdef SIGPIPE - sigset_t set; - sigemptyset(&set); - sigaddset(&set, SIGPIPE); - if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) - my->mc_error = rc; -#endif - - pthread_mutex_lock(&my->mc_mutex); - for (;;) { + mdbx_mutex_lock(&my->mc_mutex); + while (!my->mc_error) { while (!my->mc_new) - pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + mdbx_cond_wait(&my->mc_cond, &my->mc_mutex); if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ break; wsize = my->mc_wlen[toggle]; ptr = my->mc_wbuf[toggle]; again: - rc = MDB_SUCCESS; - while (wsize > 0 && !my->mc_error) { - len = write(my->mc_fd, ptr, wsize); - if (len < 0) { - rc = errno; -#ifdef SIGPIPE - if (rc == EPIPE) { - /* Collect the pending SIGPIPE, otherwise at least OS X - * gives it to the process on thread-exit (ITS#8504). 
- */ - int tmp; - sigwait(&set, &tmp); - } -#endif - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - wsize -= len; - continue; - } else { - rc = EIO; - break; - } - } - if (rc) { - my->mc_error = rc; + if (wsize > 0 && !my->mc_error) { + int rc = mdbx_write(my->mc_fd, ptr, wsize); + if (rc != MDB_SUCCESS) + my->mc_error = rc; } + /* If there's an overflow page tail, write it too */ if (my->mc_olen[toggle]) { wsize = my->mc_olen[toggle]; @@ -9395,10 +8237,10 @@ static void *__cold mdbx_env_copythr(void *arg) { toggle ^= 1; /* Return the empty buffer to provider */ my->mc_new--; - pthread_cond_signal(&my->mc_cond); + mdbx_cond_signal(&my->mc_cond); } - pthread_mutex_unlock(&my->mc_mutex); - return NULL; + mdbx_mutex_unlock(&my->mc_mutex); + return (THREAD_RESULT)0; } /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. @@ -9407,12 +8249,12 @@ static void *__cold mdbx_env_copythr(void *arg) { * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). 
*/ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { - pthread_mutex_lock(&my->mc_mutex); + mdbx_mutex_lock(&my->mc_mutex); my->mc_new += adjust; - pthread_cond_signal(&my->mc_cond); + mdbx_cond_signal(&my->mc_cond); while (my->mc_new & 2) /* both buffers in use */ - pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - pthread_mutex_unlock(&my->mc_mutex); + mdbx_cond_wait(&my->mc_cond, &my->mc_mutex); + mdbx_mutex_unlock(&my->mc_mutex); my->mc_toggle ^= (adjust & 1); /* Both threads reset mc_wlen, to be safe from threading errors */ @@ -9474,7 +8316,6 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { ni = NODEPTR(mp, i); if (ni->mn_flags & F_BIGDATA) { MDB_page *omp; - pgno_t pg; /* Need writable leaf */ if (mp != leaf) { @@ -9484,9 +8325,10 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { ni = NODEPTR(mp, i); } - memcpy(&pg, NODEDATA(ni), sizeof(pg)); + pgno_t pgno; + memcpy(&pgno, NODEDATA(ni), sizeof(pgno)); memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); - rc = mdbx_page_get(&mc, pg, &omp, NULL); + rc = mdbx_page_get(&mc, pgno, &omp, NULL); if (rc) goto done; if (my->mc_wlen[toggle] >= MDB_WBUF) { @@ -9532,11 +8374,11 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { } else { mc.mc_ki[mc.mc_top]++; if (mc.mc_ki[mc.mc_top] < n) { - pgno_t pg; + pgno_t pgno; again: ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); - pg = NODEPGNO(ni); - rc = mdbx_page_get(&mc, pg, &mp, NULL); + pgno = NODEPGNO(ni); + rc = mdbx_page_get(&mc, pgno, &mp, NULL); if (rc) goto done; mc.mc_top++; @@ -9580,31 +8422,31 @@ done: } /** Copy environment with compaction. 
*/ -static int __cold mdbx_env_copyfd1(MDB_env *env, HANDLE fd) { +static int __cold mdbx_env_copyfd1(MDB_env *env, mdbx_filehandle_t fd) { MDB_meta *mm; MDB_page *mp; mdbx_copy my; MDB_txn *txn = NULL; - pthread_t thr; + mdbx_thread_t thr; pgno_t root, new_root; - int rc = MDB_SUCCESS; + int rc; memset(&my, 0, sizeof(my)); - if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) + if ((rc = mdbx_mutex_init(&my.mc_mutex)) != 0) return rc; - if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) + if ((rc = mdbx_cond_init(&my.mc_cond)) != 0) goto done2; - my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF * 2); - if (my.mc_wbuf[0] == NULL) { - rc = errno; + rc = mdbx_memalign_alloc(env->me_os_psize, MDB_WBUF * 2, + (void **)&my.mc_wbuf[0]); + if (rc != MDB_SUCCESS) goto done; - } + memset(my.mc_wbuf[0], 0, MDB_WBUF * 2); my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; my.mc_next_pgno = NUM_METAS; my.mc_env = env; my.mc_fd = fd; - rc = pthread_create(&thr, NULL, mdbx_env_copythr, &my); + rc = mdbx_thread_create(&thr, mdbx_env_copythr, &my); if (rc) goto done; @@ -9618,7 +8460,6 @@ static int __cold mdbx_env_copyfd1(MDB_env *env, HANDLE fd) { mp->mp_flags = P_META; mm = (MDB_meta *)PAGEDATA(mp); mdbx_env_init_meta0(env, mm); - mm->mm_address = METAPAGE_1(env)->mm_address; mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); mp->mp_pgno = 1; @@ -9669,124 +8510,71 @@ finish: if (rc) my.mc_error = rc; mdbx_env_cthr_toggle(&my, 1 | MDB_EOF); - rc = pthread_join(thr, NULL); + rc = mdbx_thread_join(thr); mdbx_txn_abort(txn); done: - free(my.mc_wbuf[0]); - pthread_cond_destroy(&my.mc_cond); + mdbx_memalign_free(my.mc_wbuf[0]); + mdbx_cond_destroy(&my.mc_cond); done2: - pthread_mutex_destroy(&my.mc_mutex); + mdbx_mutex_destroy(&my.mc_mutex); return rc ? rc : my.mc_error; } /** Copy environment as-is. 
*/ -static int __cold mdbx_env_copyfd0(MDB_env *env, HANDLE fd) { +static int __cold mdbx_env_copyfd0(MDB_env *env, mdbx_filehandle_t fd) { MDB_txn *txn = NULL; - pthread_mutex_t *wmutex = NULL; int rc; - size_t wsize; - char *ptr; - ssize_t len; - size_t w2; /* Do the lock/unlock of the reader mutex before starting the - * write txn. Otherwise other read txns could block writers. - */ + * write txn. Otherwise other read txns could block writers. */ rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); - if (rc) + if (unlikely(rc)) return rc; /* We must start the actual read txn after blocking writers */ rc = mdbx_txn_end(txn, MDB_END_RESET_TMP); - if (rc) - return rc; + if (unlikely(rc)) + goto bailout; /* FIXME: or just return? */ /* Temporarily block writers until we snapshot the meta pages */ - wmutex = MDB_MUTEX(env, w); - rc = mdbx_mutex_lock(env, wmutex); + rc = mdbx_txn_lock(env); if (unlikely(rc)) - goto leave; + goto bailout; rc = mdbx_txn_renew0(txn, MDB_RDONLY); if (rc) { - mdbx_mutex_unlock(env, wmutex); - goto leave; + mdbx_txn_unlock(env); + goto bailout; } - wsize = env->me_psize * NUM_METAS; - ptr = env->me_map; - w2 = wsize; - while (w2 > 0) { - len = write(fd, ptr, w2); - if (len < 0) { - rc = errno; - break; - } else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - w2 -= len; - continue; - } else { - /* Non-blocking or async handles are not supported */ - rc = EIO; - break; - } - } - mdbx_mutex_unlock(env, wmutex); + rc = mdbx_write(fd, env->me_map, env->me_psize * NUM_METAS); + mdbx_txn_unlock(env); - if (rc) - goto leave; + if (rc == MDB_SUCCESS) + rc = mdbx_ftruncate(fd, txn->mt_next_pgno * env->me_psize); - w2 = txn->mt_next_pgno * env->me_psize; - { - size_t fsize = 0; - if ((rc = mdbx_fsize(env->me_fd, &fsize))) - goto leave; - if (w2 > fsize) - w2 = fsize; - } - wsize = w2 - wsize; - while (wsize > 0) { - if (wsize > MAX_WRITE) - w2 = MAX_WRITE; - else - w2 = wsize; - len = write(fd, ptr, w2); - if (len < 0) { - rc = errno; - break; - } 
else if (len > 0) { - rc = MDB_SUCCESS; - ptr += len; - wsize -= len; - continue; - } else { - rc = EIO; - break; - } - } - -leave: +bailout: mdbx_txn_abort(txn); return rc; } -int __cold mdbx_env_copyfd2(MDB_env *env, HANDLE fd, unsigned flags) { +int __cold mdbx_env_copyfd2(MDB_env *env, mdbx_filehandle_t fd, + unsigned flags) { if (flags & MDB_CP_COMPACT) return mdbx_env_copyfd1(env, fd); else return mdbx_env_copyfd0(env, fd); } -int __cold mdbx_env_copyfd(MDB_env *env, HANDLE fd) { +int __cold mdbx_env_copyfd(MDB_env *env, mdbx_filehandle_t fd) { return mdbx_env_copyfd2(env, fd, 0); } int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) { int rc, len; char *lpath; - HANDLE newfd = INVALID_HANDLE_VALUE; + mdbx_filehandle_t newfd = INVALID_HANDLE_VALUE; if (env->me_flags & MDB_NOSUBDIR) { lpath = (char *)path; @@ -9801,36 +8589,29 @@ int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) { /* The destination path must exist, but the destination file must not. * We don't want the OS to cache the writes, since the source data is - * already in the OS cache. - */ - newfd = open(lpath, O_WRONLY | O_CREAT | O_EXCL | O_CLOEXEC, 0666); - if (newfd == INVALID_HANDLE_VALUE) { - rc = errno; - goto leave; - } - - int fdflags; - if ((fdflags = fcntl(newfd, F_GETFD) | FD_CLOEXEC) >= 0) - fcntl(newfd, F_SETFD, fdflags); - - if (env->me_psize >= env->me_os_psize) { + * already in the OS cache. 
*/ + rc = mdbx_openfile(lpath, O_WRONLY | O_CREAT | O_EXCL, 0666, &newfd); + if (rc == MDB_SUCCESS) { + if (env->me_psize >= env->me_os_psize) { #ifdef F_NOCACHE /* __APPLE__ */ - (void)fcntl(newfd, F_NOCACHE, 1); -#elif defined O_DIRECT - /* Set O_DIRECT if the file system supports it */ - if ((rc = fcntl(newfd, F_GETFL)) != -1) - (void)fcntl(newfd, F_SETFL, rc | O_DIRECT); + (void)fcntl(newfd, F_NOCACHE, 1); +#elif defined(O_DIRECT) && defined(F_GETFL) + /* Set O_DIRECT if the file system supports it */ + if ((rc = fcntl(newfd, F_GETFL)) != -1) + (void)fcntl(newfd, F_SETFL, rc | O_DIRECT); #endif + } + rc = mdbx_env_copyfd2(env, newfd, flags); } - rc = mdbx_env_copyfd2(env, newfd, flags); - -leave: if (!(env->me_flags & MDB_NOSUBDIR)) free(lpath); - if (newfd != INVALID_HANDLE_VALUE) - if (close(newfd) < 0 && rc == MDB_SUCCESS) - rc = errno; + + if (newfd != INVALID_HANDLE_VALUE) { + int err = mdbx_closefile(newfd); + if (rc == MDB_SUCCESS && err != rc) + rc = err; + } return rc; } @@ -9843,8 +8624,7 @@ int __cold mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff) { if (unlikely(flags & ~CHANGEABLE)) return EINVAL; - pthread_mutex_t *mutex = MDB_MUTEX(env, w); - int rc = mdbx_mutex_lock(env, mutex); + int rc = mdbx_txn_lock(env); if (unlikely(rc)) return rc; @@ -9853,7 +8633,7 @@ int __cold mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff) { else env->me_flags &= ~flags; - mdbx_mutex_unlock(env, mutex); + mdbx_txn_unlock(env); return MDB_SUCCESS; } @@ -9896,7 +8676,7 @@ int __cold mdbx_env_get_path(MDB_env *env, const char **arg) { return MDB_SUCCESS; } -int __cold mdbx_env_get_fd(MDB_env *env, int *arg) { +int __cold mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *arg) { if (unlikely(!env || !arg)) return EINVAL; @@ -9917,7 +8697,6 @@ static int __cold mdbx_stat0(MDB_env *env, MDB_db *db, MDBX_stat *arg) { arg->ms_leaf_pages = db->md_leaf_pages; arg->ms_overflow_pages = db->md_overflow_pages; arg->ms_entries = db->md_entries; - return 
MDB_SUCCESS; } @@ -9961,7 +8740,6 @@ int __cold mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) { arg->me_meta1_sign != m1->mm_datasync_sign || arg->me_meta2_sign != m2->mm_datasync_sign)); - arg->me_mapaddr = meta->mm_address; arg->me_mapsize = env->me_mapsize; arg->me_maxreaders = env->me_maxreaders; arg->me_numreaders = env->me_txns->mti_numreaders; @@ -10013,7 +8791,6 @@ int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_val key, data; MDB_dbi i; MDB_cursor mc; - MDB_db dummy; int rc, dbflag, exact; unsigned unused = 0, seq; char *namedup; @@ -10092,16 +8869,17 @@ int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, } /* Done here so we cannot fail after creating a new DB */ - if (unlikely((namedup = strdup(name)) == NULL)) + if (unlikely((namedup = mdbx_strdup(name)) == NULL)) return ENOMEM; if (unlikely(rc)) { + MDB_db db_dummy; /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ - data.mv_size = sizeof(MDB_db); - data.mv_data = &dummy; - memset(&dummy, 0, sizeof(dummy)); - dummy.md_root = P_INVALID; - dummy.md_flags = flags & PERSISTENT_FLAGS; + memset(&db_dummy, 0, sizeof(db_dummy)); + db_dummy.md_root = P_INVALID; + db_dummy.md_flags = flags & PERSISTENT_FLAGS; + data.mv_size = sizeof(db_dummy); + data.mv_data = &db_dummy; WITH_CURSOR_TRACKING(mc, rc = mdbx_cursor_put(&mc, &key, &data, F_SUBDATA)); dbflag |= DB_DIRTY; } @@ -10371,7 +9149,7 @@ int __cold mdbx_env_get_maxkeysize(MDB_env *env) { } int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { - unsigned i, rdrs; + unsigned i, snap_nreaders; MDB_reader *mr; char buf[64]; int rc = 0, first = 1; @@ -10382,16 +9160,17 @@ int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - rdrs = env->me_txns->mti_numreaders; + snap_nreaders = env->me_txns->mti_numreaders; mr = env->me_txns->mti_readers; - for (i = 0; i < rdrs; i++) { + for (i = 0; i < 
snap_nreaders; i++) { if (mr[i].mr_pid) { txnid_t txnid = mr[i].mr_txnid; if (txnid == ~(txnid_t)0) - sprintf(buf, "%10d %zx -\n", (int)mr[i].mr_pid, (size_t)mr[i].mr_tid); + snprintf(buf, sizeof(buf), "%10d %zx -\n", (int)mr[i].mr_pid, + (size_t)mr[i].mr_tid); else - sprintf(buf, "%10d %zx %zu\n", (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, - txnid); + snprintf(buf, sizeof(buf), "%10d %zx %zu\n", (int)mr[i].mr_pid, + (size_t)mr[i].mr_tid, txnid); if (first) { first = 0; @@ -10413,7 +9192,7 @@ int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { /** Insert pid into list if not already present. * return -1 if already present. */ -static int __cold mdbx_pid_insert(pid_t *ids, pid_t pid) { +static int __cold mdbx_pid_insert(mdbx_pid_t *ids, mdbx_pid_t pid) { /* binary search of pid in list */ unsigned base = 0; unsigned cursor = 1; @@ -10454,44 +9233,55 @@ int __cold mdbx_reader_check(MDB_env *env, int *dead) { return mdbx_reader_check0(env, 0, dead); } -/** As #mdbx_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ -static int __cold mdbx_reader_check0(MDB_env *env, int rlocked, int *dead) { - pthread_mutex_t *rmutex = rlocked ? 
NULL : MDB_MUTEX(env, r); - unsigned i, j, rdrs; - MDB_reader *mr; - pid_t *pids, pid; +int __cold mdbx_reader_check0(MDB_env *env, int rlocked, int *dead) { + assert(rlocked >= 0); + unsigned i, j; + mdbx_pid_t *pids, pid; int rc = MDB_SUCCESS, count = 0; - if (unlikely(env->me_pid != getpid())) { + if (unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDB_FATAL_ERROR; return MDB_PANIC; } - rdrs = env->me_txns->mti_numreaders; - pids = malloc((rdrs + 1) * sizeof(pid_t)); + unsigned snap_nreaders = env->me_txns->mti_numreaders; + pids = malloc((snap_nreaders + 1) * sizeof(mdbx_pid_t)); if (!pids) return ENOMEM; + pids[0] = 0; - mr = env->me_txns->mti_readers; - for (i = 0; i < rdrs; i++) { + MDB_reader *mr = env->me_txns->mti_readers; + for (i = 0; i < snap_nreaders; i++) { pid = mr[i].mr_pid; if (pid && pid != env->me_pid) { if (mdbx_pid_insert(pids, pid) == 0) { - if (!mdbx_reader_pid(env, F_GETLK, pid)) { - /* Stale reader found */ + rc = mdbx_rpid_check(env, pid); + if (rc == MDBX_RESULT_FALSE) { + /* stale reader found */ j = i; - if (rmutex) { - if ((rc = pthread_mutex_lock(rmutex)) != 0) { - if ((rc = mdbx_mutex_failed(env, rmutex, rc))) - break; - rdrs = 0; /* the above checked all readers */ + if (!rlocked) { + rlocked = -1; + rc = mdbx_rdt_lock(env); + if (rc != MDB_SUCCESS) { + if (rc != MDBX_RESULT_TRUE) { + break; /* lock failed */ + } else { + /* recovered after mutex owner died */ + snap_nreaders = 0; /* the above checked all readers */ + } } else { - /* Recheck, a new process may have reused pid */ - if (mdbx_reader_pid(env, F_GETLK, pid)) - j = rdrs; + /* a other process may have clean and reused slot, recheck */ + rc = mdbx_rpid_check(env, pid); + if (rc != MDBX_RESULT_FALSE) { + if (rc != MDBX_RESULT_TRUE) + break; /* mdbx_rpid_check() failed */ + /* the race with other process, slot reused */ + rc = MDB_SUCCESS; + continue; + } } } - for (; j < rdrs; j++) { + for (; j < snap_nreaders; j++) { if (mr[j].mr_pid == pid) { mdbx_debug("clear 
stale reader pid %u txn %zd", (unsigned)pid, mr[j].mr_txnid); @@ -10499,81 +9289,21 @@ static int __cold mdbx_reader_check0(MDB_env *env, int rlocked, int *dead) { count++; } } - if (rmutex) - mdbx_mutex_unlock(env, rmutex); - } + } else if (rc != MDBX_RESULT_TRUE) + break; /* mdbx_rpid_check() failed */ } } } + + if (rlocked < 0) + mdbx_rdt_unlock(env); free(pids); + if (dead) *dead = count; return rc; } -static int __cold mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, - int rc) { -#if MDB_USE_ROBUST - if (unlikely(rc == EOWNERDEAD)) { - int rlocked, rc2; - - /* We own the mutex. Clean up after dead previous owner. */ - rc = MDB_SUCCESS; - rlocked = (mutex == MDB_MUTEX(env, r)); - if (!rlocked) { - /* Keep mti_txnid updated, otherwise next writer can - * overwrite data which latest meta page refers to. - * - * LY: Hm, how this can happen, if the mti_txnid - * is updating only at the finish of a successful commit ? - */ - - MDB_meta *meta = mdbx_meta_head_w(env); - assert(env->me_txns->mti_txnid == meta->mm_txnid); - (void)meta; - /* env is hosed if the dead thread was ours */ - if (env->me_txn) { - env->me_flags |= MDB_FATAL_ERROR; - env->me_txn = NULL; - rc = MDB_PANIC; - } - } - mdbx_debug("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), - (rc ? 
"this process' env is hosed" : "recovering")); - rc2 = mdbx_reader_check0(env, rlocked, NULL); - if (rc2 == 0) - rc2 = pthread_mutex_consistent(mutex); - if (rc || (rc = rc2)) { - mdbx_debug("mutex recovery failed, %s", mdbx_strerror(rc)); - pthread_mutex_unlock(mutex); - } - } -#endif /* MDB_USE_ROBUST */ - if (unlikely(rc)) { - mdbx_debug("lock mutex failed, %s", mdbx_strerror(rc)); - if (rc != EDEADLK) { - env->me_flags |= MDB_FATAL_ERROR; - rc = MDB_PANIC; - } - } - - return rc; -} - -static int mdbx_mutex_lock(MDB_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_lock(mutex); - if (unlikely(rc)) - rc = mdbx_mutex_failed(env, mutex, rc); - return rc; -} - -static void mdbx_mutex_unlock(MDB_env *env, pthread_mutex_t *mutex) { - int rc = pthread_mutex_unlock(mutex); - mdbx_assert(env, rc == 0); - (void)env; - (void)rc; -} - static unsigned __hot mdbx_midl_search(MDB_IDL ids, MDB_ID id) { /* * binary search of id in ids @@ -10894,8 +9624,8 @@ static txnid_t __cold mdbx_oomkick(MDB_env *env, txnid_t oldest) { } MDB_reader *r; - pthread_t tid; - pid_t pid; + mdbx_tid_t tid; + mdbx_pid_t pid; int rc; if (!env->me_oom_func) @@ -10907,7 +9637,7 @@ static txnid_t __cold mdbx_oomkick(MDB_env *env, txnid_t oldest) { if (r->mr_txnid != oldest || pid <= 0) continue; - rc = env->me_oom_func(env, pid, (void *)tid, oldest, + rc = env->me_oom_func(env, pid, tid, oldest, mdbx_meta_head_w(env)->mm_txnid - oldest, retry); if (rc < 0) break; @@ -10951,10 +9681,12 @@ MDBX_oom_func *__cold mdbx_env_get_oomfunc(MDB_env *env) { : NULL; } -ATTRIBUTE_NO_SANITIZE_THREAD /* LY: avoid tsan-trap by me_txn, mm_last_pg and - mt_next_pgno */ - int - mdbx_txn_straggler(MDB_txn *txn, int *percent) { +#ifdef __SANITIZE_THREAD__ +/* LY: avoid tsan-trap by me_txn, mm_last_pg and mt_next_pgno */ +__attribute__((no_sanitize_thread, noinline)) +#endif +int mdbx_txn_straggler(MDB_txn *txn, int *percent) +{ MDB_env *env; MDB_meta *meta; txnid_t lag; @@ -10978,7 +9710,7 @@ 
ATTRIBUTE_NO_SANITIZE_THREAD /* LY: avoid tsan-trap by me_txn, mm_last_pg and *percent = (last * 100ull + maxpg / 2) / maxpg; } lag = meta->mm_txnid - txn->mt_u.reader->mr_txnid; - return (0 > (long)lag) ? ~0u >> 1 : lag; + return (lag > INT_MAX) ? INT_MAX : (int)lag; } typedef struct mdbx_walk_ctx { @@ -11100,7 +9832,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, if (!(node->mn_flags & F_DUPDATA)) { name = NODEKEY(node); - int namelen = (char *)db - name; + ptrdiff_t namelen = (char *)db - name; name = memcpy(alloca(namelen + 1), name, namelen); name[namelen] = 0; } @@ -11534,9 +10266,10 @@ int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, int rc = mdbx_dbi_open(txn, name, flags, pdbi); if (likely(rc == MDB_SUCCESS)) { MDB_dbi dbi = *pdbi; - unsigned flags = txn->mt_dbs[dbi].md_flags; - txn->mt_dbxs[dbi].md_cmp = keycmp ? keycmp : mdbx_default_keycmp(flags); - txn->mt_dbxs[dbi].md_dcmp = datacmp ? datacmp : mdbx_default_datacmp(flags); + unsigned md_flags = txn->mt_dbs[dbi].md_flags; + txn->mt_dbxs[dbi].md_cmp = keycmp ? keycmp : mdbx_default_keycmp(md_flags); + txn->mt_dbxs[dbi].md_dcmp = + datacmp ? datacmp : mdbx_default_datacmp(md_flags); } return rc; } diff --git a/src/midl.h b/src/midl.h index eccc6099..b59e024e 100644 --- a/src/midl.h +++ b/src/midl.h @@ -1,19 +1,19 @@ -/** A generic unsigned ID number. These were entryIDs in back-bdb. - * Preferably it should have the same size as a pointer. +/* + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . */ -typedef size_t MDB_ID; - -/** An IDL is an ID List, a sorted array of IDs. 
The first - * element of the array is a counter for how many actual - * IDs are in the list. In the original back-bdb code, IDLs are - * sorted in ascending order. For libmdb IDLs are sorted in - * descending order. - */ -typedef MDB_ID *MDB_IDL; /* IDL sizes - likely should be even bigger - * limiting factors: sizeof(ID), thread stack size - */ + * limiting factors: sizeof(ID), thread stack size */ #define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ #define MDB_IDL_DB_SIZE (1 << MDB_IDL_LOGN) #define MDB_IDL_UM_SIZE (1 << (MDB_IDL_LOGN + 1)) @@ -27,27 +27,12 @@ typedef MDB_ID *MDB_IDL; #define MDB_IDL_FIRST(ids) ((ids)[1]) #define MDB_IDL_LAST(ids) ((ids)[(ids)[0]]) -/** Current max length of an #mdbx_midl_alloc()ed IDL */ +/* Current max length of an #mdbx_midl_alloc()ed IDL */ #define MDB_IDL_ALLOCLEN(ids) ((ids)[-1]) -/** Append ID to IDL. The IDL must be big enough. */ +/* Append ID to IDL. The IDL must be big enough. */ #define mdbx_midl_xappend(idl, id) \ do { \ MDB_ID *xidl = (idl), xlen = ++(xidl[0]); \ xidl[xlen] = (id); \ } while (0) - -/** An ID2 is an ID/pointer pair. - */ -typedef struct MDB_ID2 { - MDB_ID mid; /**< The ID */ - void *mptr; /**< The pointer */ -} MDB_ID2; - -/** An ID2L is an ID2 List, a sorted array of ID2s. - * The first element's \b mid member is a count of how many actual - * elements are in the array. The \b mptr member of the first element is - * unused. - * The array is sorted in ascending order by \b mid. - */ -typedef MDB_ID2 *MDB_ID2L; diff --git a/src/osal.c b/src/osal.c new file mode 100644 index 00000000..7061204a --- /dev/null +++ b/src/osal.c @@ -0,0 +1,625 @@ +/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ + +/* + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "./bits.h" + +#if defined(_WIN32) || defined(_WIN64) +static int waitfor2errcode(DWORD result) { + switch (result) { + case WAIT_OBJECT_0: + return MDB_SUCCESS; + case WAIT_FAILED: + return GetLastError(); + case WAIT_ABANDONED: + return ERROR_ABANDONED_WAIT_0; + case WAIT_IO_COMPLETION: + return ERROR_USER_APC; + case WAIT_TIMEOUT: + return ERROR_TIMEOUT; + default: + return ERROR_UNHANDLED_ERROR; + } +} +#endif /* _WIN32 || _WIN64 */ + +/*----------------------------------------------------------------------------*/ + +#ifndef _MSC_VER +/* Prototype should match libc runtime. ISO POSIX (2003) & LSB 3.1 */ +__nothrow __noreturn void __assert_fail(const char *assertion, const char *file, + unsigned line, const char *function); +#else +__extern_C __declspec(dllimport) void __cdecl _assert(char const *message, + char const *filename, + unsigned line); +#endif /* _MSC_VER */ + +#ifndef mdbx_assert_fail +void __cold mdbx_assert_fail(MDB_env *env, const char *msg, const char *func, + int line) { +#if MDB_DEBUG + if (env && env->me_assert_func) { + env->me_assert_func(env, msg, func, line); + return; + } +#else + (void)env; +#endif /* MDB_DEBUG */ + + if (mdbx_debug_logger) + mdbx_debug_log(MDBX_DBG_ASSERT, func, line, "assert: %s\n", msg); +#ifndef _MSC_VER + __assert_fail(msg, "mdbx", line, func); +#else + _assert(msg, func, line); +#endif /* _MSC_VER */ +} +#endif /* mdbx_assert_fail */ + +__cold void mdbx_panic(const char *fmt, ...) 
{ + va_list ap; + va_start(ap, fmt); +#ifdef _MSC_VER + if (IsDebuggerPresent()) { + OutputDebugString("\r\n" FIXME "\r\n"); + FatalExit(ERROR_UNHANDLED_ERROR); + } +#elif _XOPEN_SOURCE >= 700 || _POSIX_C_SOURCE >= 200809L || \ + (__GLIBC_PREREQ(1, 0) && !__GLIBC_PREREQ(2, 10) && defined(_GNU_SOURCE)) + vdprintf(STDERR_FILENO, fmt, ap); +#else +#error FIXME +#endif + va_end(ap); + abort(); +} + +/*----------------------------------------------------------------------------*/ + +#ifndef mdbx_asprintf +int mdbx_asprintf(char **strp, const char *fmt, ...) { + va_list ap, ones; + + va_start(ap, fmt); + va_copy(ones, ap); +#ifdef _MSC_VER + int needed = _vscprintf(fmt, ap); +#elif defined(_BSD_SOURCE) || _XOPEN_SOURCE >= 500 || \ + defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L + int needed = vsnprintf(nullptr, 0, fmt, ap); +#else +#error FIXME +#endif + va_end(ap); + + if (unlikely(needed < 0 || needed >= INT_MAX)) { + *strp = NULL; + va_end(ones); + return needed; + } + + *strp = malloc(needed + 1); + if (unlikely(*strp == NULL)) { + va_end(ones); + return -ENOMEM; + } + +#ifdef _MSC_VER + int actual = vsnprintf_s(*strp, needed + 1, _TRUNCATE, fmt, ones); +#elif defined(_BSD_SOURCE) || _XOPEN_SOURCE >= 500 || \ + defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L + int actual = vsnprintf(*strp, needed + 1, fmt, ones); +#else +#error FIXME +#endif + va_end(ones); + + assert(actual == needed); + if (unlikely(actual < 0)) { + free(*strp); + *strp = NULL; + } + return actual; +} +#endif /* mdbx_asprintf */ + +#ifndef mdbx_memalign_alloc +int mdbx_memalign_alloc(size_t alignment, size_t bytes, void **result) { +#if _MSC_VER + *result = _aligned_malloc(bytes, alignment); + return *result ? MDB_SUCCESS : ERROR_OUTOFMEMORY; +#elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L + *result = memalign(alignment, bytes); + return *result ? 
MDB_SUCCESS : errno; +#elif _POSIX_VERSION >= 200112L + *result = NULL; + return posix_memalign(result, alignment, bytes); +#else +#error FIXME +#endif +} +#endif /* mdbx_memalign_alloc */ + +#ifndef mdbx_memalign_free +void mdbx_memalign_free(void *ptr) { +#if _MSC_VER + _aligned_free(ptr); +#else + free(ptr); +#endif +} +#endif /* mdbx_memalign_free */ + +/*----------------------------------------------------------------------------*/ + +int mdbx_mutex_init(mdbx_mutex_t *mutex) { +#if defined(_WIN32) || defined(_WIN64) + *mutex = CreateMutex(NULL, FALSE, NULL); + return *mutex ? MDB_SUCCESS : GetLastError(); +#else + return pthread_mutex_init(mutex, NULL); +#endif +} + +int mdbx_mutex_destroy(mdbx_mutex_t *mutex) { +#if defined(_WIN32) || defined(_WIN64) + return CloseHandle(*mutex) ? MDB_SUCCESS : GetLastError(); +#else + return pthread_mutex_destroy(mutex); +#endif +} + +int mdbx_mutex_lock(mdbx_mutex_t *mutex) { +#if defined(_WIN32) || defined(_WIN64) + DWORD code = WaitForSingleObject(*mutex, INFINITE); + return waitfor2errcode(code); +#else + return pthread_mutex_lock(mutex); +#endif +} + +int mdbx_mutex_unlock(mdbx_mutex_t *mutex) { +#if defined(_WIN32) || defined(_WIN64) + return ReleaseMutex(*mutex) ? MDB_SUCCESS : GetLastError(); +#else + return pthread_mutex_unlock(mutex); +#endif +} + +/*----------------------------------------------------------------------------*/ + +int mdbx_cond_init(mdbx_cond_t *cond) { +#if defined(_WIN32) || defined(_WIN64) + *cond = CreateEvent(NULL, FALSE, FALSE, NULL); + return *cond ? MDB_SUCCESS : GetLastError(); +#else + return pthread_cond_init(cond, NULL); +#endif +} + +#ifndef mdbx_cond_destroy +int mdbx_cond_destroy(mdbx_cond_t *cond) { +#if defined(_WIN32) || defined(_WIN64) + return CloseHandle(*cond) ? 
MDB_SUCCESS : GetLastError(); +#else + return pthread_cond_destroy(cond); +#endif +} +#endif /* mdbx_cond_destroy */ + +int mdbx_cond_signal(mdbx_cond_t *cond) { +#if defined(_WIN32) || defined(_WIN64) + return SetEvent(*cond) ? MDB_SUCCESS : GetLastError(); +#else + return pthread_cond_signal(cond); +#endif +} + +int mdbx_cond_wait(mdbx_cond_t *cond, mdbx_mutex_t *mutex) { +#if defined(_WIN32) || defined(_WIN64) + DWORD code = SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); + if (code == WAIT_OBJECT_0) + code = WaitForSingleObject(*mutex, INFINITE); + return waitfor2errcode(code); +#else + return pthread_cond_wait(cond, mutex); +#endif +} + +/*----------------------------------------------------------------------------*/ + +int mdbx_openfile(const char *pathname, int flags, mode_t mode, + mdbx_filehandle_t *fd) { + *fd = INVALID_HANDLE_VALUE; +#if defined(_WIN32) || defined(_WIN64) + (void)mode; + + DWORD DesiredAccess; + DWORD ShareMode = FILE_SHARE_READ | FILE_SHARE_WRITE; + DWORD FlagsAndAttributes = FILE_ATTRIBUTE_NORMAL; + switch (flags & (O_RDONLY | O_WRONLY | O_RDWR)) { + default: + return ERROR_INVALID_PARAMETER; + case O_RDONLY: + DesiredAccess = GENERIC_READ; + break; + case O_WRONLY: /* assume for mdb_env_copy() and friends output */ + DesiredAccess = GENERIC_WRITE; + ShareMode = 0; + FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH; + break; + case O_RDWR: + DesiredAccess = GENERIC_READ | GENERIC_WRITE; + break; + } + + DWORD CreationDisposition; + switch (flags & (O_EXCL | O_CREAT)) { + default: + return ERROR_INVALID_PARAMETER; + case 0: + CreationDisposition = OPEN_EXISTING; + break; + case O_EXCL | O_CREAT: + CreationDisposition = CREATE_NEW; + FlagsAndAttributes |= FILE_ATTRIBUTE_NOT_CONTENT_INDEXED; + break; + case O_CREAT: + CreationDisposition = OPEN_ALWAYS; + FlagsAndAttributes |= FILE_ATTRIBUTE_NOT_CONTENT_INDEXED; + break; + } + + *fd = CreateFileA(pathname, DesiredAccess, ShareMode, NULL, + CreationDisposition, 
FlagsAndAttributes, NULL); + + if (*fd == INVALID_HANDLE_VALUE) + return GetLastError(); + if ((flags & O_CREAT) && GetLastError() != ERROR_ALREADY_EXISTS) { + /* set FILE_ATTRIBUTE_NOT_CONTENT_INDEXED for new file */ + DWORD FileAttributes = GetFileAttributesA(pathname); + if (FileAttributes == INVALID_FILE_ATTRIBUTES || + !SetFileAttributesA(pathname, FileAttributes | + FILE_ATTRIBUTE_NOT_CONTENT_INDEXED)) { + int rc = GetLastError(); + CloseHandle(*fd); + *fd = INVALID_HANDLE_VALUE; + return rc; + } + } +#else + +#ifdef O_CLOEXEC + flags |= O_CLOEXEC; +#endif + *fd = open(pathname, flags, mode); + if (*fd < 0) + return errno; +#if defined(FD_CLOEXEC) && defined(F_GETFD) + flags = fcntl(*fd, F_GETFD); + if (flags >= 0) + (void)fcntl(*fd, F_SETFD, flags | FD_CLOEXEC); +#endif +#endif + return MDB_SUCCESS; +} + +int mdbx_closefile(mdbx_filehandle_t fd) { +#if defined(_WIN32) || defined(_WIN64) + return CloseHandle(fd) ? MDB_SUCCESS : GetLastError(); +#else + return (close(fd) == 0) ? MDB_SUCCESS : errno; +#endif +} + +int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) { +#if defined(_WIN32) || defined(_WIN64) + if (bytes > MAX_WRITE) + return ERROR_INVALID_PARAMETER; + + OVERLAPPED ov; + ov.hEvent = 0; + ov.Offset = (DWORD)offset; + ov.OffsetHigh = HIGH_DWORD(offset); + + DWORD read; + if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) { + int rc = GetLastError(); + if (rc == ERROR_HANDLE_EOF && read == 0 && offset == 0) + return ENOENT; + return rc; + } + return (read == bytes) ? MDB_SUCCESS : ERROR_READ_FAULT; +#else + ssize_t read = pread(fd, buf, bytes, offset); + if (likely(bytes == (size_t)read)) + return MDB_SUCCESS; + if (read < 0) + return errno; + return (read == 0 && offset == 0) ? 
ENOENT : EIO; +#endif +} + +/* Write exactly `bytes` bytes from `buf` into `fd` at `offset`. + * Returns MDB_SUCCESS only when the whole buffer was written, otherwise an + * OS error code; the POSIX branch retries while errno is EINTR. */ +int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, + off_t offset) { +#if defined(_WIN32) || defined(_WIN64) + if (bytes > MAX_WRITE) + return ERROR_INVALID_PARAMETER; + + OVERLAPPED ov; + ov.hEvent = 0; + ov.Offset = (DWORD)offset; + ov.OffsetHigh = HIGH_DWORD(offset); + + DWORD written; + if (likely(WriteFile(fd, buf, (DWORD)bytes, &written, &ov))) + return (bytes == written) ? MDB_SUCCESS : ERROR_WRITE_FAULT; + return GetLastError(); +#else + int rc; + ssize_t written; + do { + written = pwrite(fd, buf, bytes, offset); + if (likely(bytes == (size_t)written)) + return MDB_SUCCESS; + rc = errno; + } while (rc == EINTR); + return (written < 0) ? rc : EIO /* Use which error code (ENOSPC)? */; +#endif +} + +/* Gathering positional write of `iovcnt` buffers starting at `offset`. + * Returns MDB_SUCCESS only when the total written equals `expected_written`. + * On Windows this is emulated with one mdbx_pwrite() per iovec entry. */ +int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, + off_t offset, size_t expected_written) { +#if defined(_WIN32) || defined(_WIN64) + size_t written = 0; + /* walk every entry in order, advancing the file offset past each one */ + for (int i = 0; i < iovcnt; ++i) { + int rc = mdbx_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + written += iov[i].iov_len; + offset += iov[i].iov_len; + } + return (expected_written == written) ? MDB_SUCCESS : ERROR_WRITE_FAULT; +#else + int rc; + ssize_t written; + do { + written = pwritev(fd, iov, iovcnt, offset); + if (likely(expected_written == (size_t)written)) + return MDB_SUCCESS; + rc = errno; + } while (rc == EINTR); + return (written < 0) ? rc : EIO /* Use which error code? */; +#endif +} + +int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { +#ifdef SIGPIPE + sigset_t set, old; + sigemptyset(&set); + sigaddset(&set, SIGPIPE); + int rc = pthread_sigmask(SIG_BLOCK, &set, &old); + if (rc != 0) + return rc; +#endif + + const char *ptr = buf; + for (;;) { + size_t chunk = (MAX_WRITE < bytes) ? 
MAX_WRITE : bytes; +#if defined(_WIN32) || defined(_WIN64) + DWORD written; + if (unlikely(!WriteFile(fd, ptr, (DWORD)chunk, &written, NULL))) + return GetLastError(); +#else + ssize_t written = write(fd, ptr, chunk); + if (written < 0) { + int rc = errno; +#ifdef SIGPIPE + if (rc == EPIPE) { + /* Collect the pending SIGPIPE, otherwise at least OS X + * gives it to the process on thread-exit (ITS#8504). + */ + int tmp; + sigwait(&set, &tmp); + written = 0; + continue; + } + pthread_sigmask(SIG_SETMASK, &old, NULL); +#endif + return rc; + } +#endif + if (likely(bytes == (size_t)written)) { +#ifdef SIGPIPE + pthread_sigmask(SIG_SETMASK, &old, NULL); +#endif + return MDB_SUCCESS; + } + ptr += written; + bytes -= written; + } +} + +/* Flush `fd` to the storage device. When `syncmeta` is false and fdatasync() + * is available only the data is flushed, otherwise a full fsync() is issued; + * the Windows branch ignores `syncmeta`. Returns MDB_SUCCESS or an OS error. */ +int mdbx_filesync(mdbx_filehandle_t fd, bool syncmeta) { +#if defined(_WIN32) || defined(_WIN64) + (void)syncmeta; + return FlushFileBuffers(fd) ? MDB_SUCCESS : GetLastError(); +#elif __GLIBC_PREREQ(2, 16) || _BSD_SOURCE || _XOPEN_SOURCE || \ + (__GLIBC_PREREQ(2, 8) && _POSIX_C_SOURCE >= 200112L) +#if _POSIX_C_SOURCE >= 199309L || _XOPEN_SOURCE >= 500 || \ + defined(_POSIX_SYNCHRONIZED_IO) + if (!syncmeta) + return (fdatasync(fd) == 0) ? MDB_SUCCESS : errno; +#endif + (void)syncmeta; + return (fsync(fd) == 0) ? MDB_SUCCESS : errno; +#else +#error FIXME +#endif +} + +/* Store the current size of `fd`, in bytes, into `*length`. + * Returns MDB_SUCCESS or an OS error code (`*length` is then untouched). */ +int mdbx_filesize(mdbx_filehandle_t fd, off_t *length) { +#if defined(_WIN32) || defined(_WIN64) + BY_HANDLE_FILE_INFORMATION info; + if (!GetFileInformationByHandle(fd, &info)) + return GetLastError(); + /* size is split into two 32-bit halves: nFileSizeHigh / nFileSizeLow */ + *length = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32; +#else + struct stat st; + + if (fstat(fd, &st)) + return errno; + + *length = st.st_size; +#endif + return MDB_SUCCESS; +} + +int mdbx_ftruncate(mdbx_filehandle_t fd, off_t length) { +#if defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER li; + li.QuadPart = length; + return (SetFilePointerEx(fd, li, NULL, FILE_BEGIN) && SetEndOfFile(fd)) + ? MDB_SUCCESS + : GetLastError(); +#else + return ftruncate(fd, length) == 0 ? 
MDB_SUCCESS : errno; +#endif +} + +/*----------------------------------------------------------------------------*/ + +int mdbx_thread_key_create(mdbx_thread_key_t *key) { +#if defined(_WIN32) || defined(_WIN64) + *key = TlsAlloc(); + return (*key != TLS_OUT_OF_INDEXES) ? MDB_SUCCESS : GetLastError(); +#else + return pthread_key_create(key, mdbx_rthc_dtor); +#endif +} + +void mdbx_thread_key_delete(mdbx_thread_key_t key) { +#if defined(_WIN32) || defined(_WIN64) + mdbx_ensure(NULL, TlsFree(key)); +#else + mdbx_ensure(NULL, pthread_key_delete(key) == 0); +#endif +} + +void *mdbx_thread_rthc_get(mdbx_thread_key_t key) { +#if defined(_WIN32) || defined(_WIN64) + return TlsGetValue(key); +#else + return pthread_getspecific(key); +#endif +} + +void mdbx_thread_rthc_set(mdbx_thread_key_t key, const void *value) { +#if defined(_WIN32) || defined(_WIN64) + mdbx_ensure(NULL, TlsSetValue(key, (void *)value)); +#else + mdbx_ensure(NULL, pthread_setspecific(key, value) == 0); +#endif +} + +mdbx_tid_t mdbx_thread_self(void) { +#if defined(_WIN32) || defined(_WIN64) + return GetCurrentThreadId(); +#else + return pthread_self(); +#endif +} + +int mdbx_thread_create(mdbx_thread_t *thread, + THREAD_RESULT(THREAD_CALL *start_routine)(void *), + void *arg) { +#if defined(_WIN32) || defined(_WIN64) + *thread = CreateThread(NULL, 0, start_routine, arg, 0, NULL); + return *thread ? 
MDB_SUCCESS : GetLastError(); +#else + return pthread_create(thread, NULL, start_routine, arg); +#endif +} + +/* Block until `thread` terminates; its exit value is discarded. */ +int mdbx_thread_join(mdbx_thread_t thread) { +#if defined(_WIN32) || defined(_WIN64) + DWORD code = WaitForSingleObject(thread, INFINITE); + return waitfor2errcode(code); +#else + void *unused_retval = &unused_retval; + return pthread_join(thread, &unused_retval); +#endif +} + +/*----------------------------------------------------------------------------*/ + +/* Flush the mapped range [addr, addr+length). With `async` set the POSIX + * branch uses MS_ASYNC and the Windows branch returns without flushing. */ +int mdbx_msync(void *addr, size_t length, int async) { +#if defined(_WIN32) || defined(_WIN64) + if (async) + return MDB_SUCCESS; + return FlushViewOfFile(addr, length) ? 0 : GetLastError(); +#else + return (msync(addr, length, async ? MS_ASYNC : MS_SYNC) == 0) ? MDB_SUCCESS + : errno; +#endif +} + +/* Resize the mapping at `*address` from `old_size` to `new_size` bytes. + * `*address` is overwritten with the result (MAP_FAILED on failure). + * Not supported on Windows, where ERROR_NOT_SUPPORTED is returned. */ +int mdbx_mremap_size(void **address, size_t old_size, size_t new_size) { +#if defined(_WIN32) || defined(_WIN64) + *address = MAP_FAILED; + (void)old_size; + (void)new_size; + return ERROR_NOT_SUPPORTED; +#else + /* flags == 0: no new_address argument is consumed by mremap() */ + *address = mremap(*address, old_size, new_size, 0); + return (*address != MAP_FAILED) ? MDB_SUCCESS : errno; +#endif +} + +/* Map the first `length` bytes of `fd` as a shared mapping, read-write when + * `rw` is non-zero and read-only otherwise. On entry `*address` is the + * requested placement (may be NULL); on return it holds the mapping address + * or MAP_FAILED. Returns MDB_SUCCESS or an OS error code. */ +int mdbx_mmap(void **address, size_t length, int rw, mdbx_filehandle_t fd) { +#if defined(_WIN32) || defined(_WIN64) + HANDLE h = CreateFileMapping(fd, NULL, rw ? PAGE_READWRITE : PAGE_READONLY, + HIGH_DWORD(length), (DWORD)length, NULL); + if (!h) + return GetLastError(); + *address = MapViewOfFileEx(h, rw ? FILE_MAP_WRITE : FILE_MAP_READ, 0, 0, + length, *address); + int rc = (*address != MAP_FAILED) ? MDB_SUCCESS : GetLastError(); + CloseHandle(h); + return rc; +#else + /* pass the caller's requested address, not the out-parameter's own address */ + *address = mmap(*address, length, rw ? PROT_READ | PROT_WRITE : PROT_READ, + MAP_SHARED, fd, 0); + return (*address != MAP_FAILED) ? MDB_SUCCESS : errno; +#endif +} + +int mdbx_munmap(void *address, size_t length) { +#if defined(_WIN32) || defined(_WIN64) + (void)length; + return UnmapViewOfFile(address) ? MDB_SUCCESS : GetLastError(); +#else + return (munmap(address, length) == 0) ? 
MDB_SUCCESS : errno; +#endif +} + +int mdbx_mlock(const void *address, size_t length) { +#if defined(_WIN32) || defined(_WIN64) + return VirtualLock((void *)address, length) ? MDB_SUCCESS : GetLastError(); +#else + return (mlock(address, length) == 0) ? MDB_SUCCESS : errno; +#endif +} diff --git a/src/osal.h b/src/osal.h new file mode 100644 index 00000000..469ca341 --- /dev/null +++ b/src/osal.h @@ -0,0 +1,423 @@ +/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ + +/* + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#pragma once + +#ifdef _MSC_VER +#pragma warning(push, 1) +#pragma warning(disable : 4530) /* C++ exception handler used, but \ + unwind semantics are not enabled. Specify \ + /EHsc */ +#pragma warning(disable : 4577) /* 'noexcept' used with no exception \ + handling mode specified; termination on \ + exception is not guaranteed. Specify /EHsc \ + */ +#endif /* _MSC_VER (warnings) */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef _POSIX_C_SOURCE +#ifdef _POSIX_SOURCE +#define _POSIX_C_SOURCE 1 +#else +#define _POSIX_C_SOURCE 0 +#endif +#endif + +#ifndef _XOPEN_SOURCE +#define _XOPEN_SOURCE 0 +#endif + +#if defined(_WIN32) || defined(_WIN64) +#include +#include +#define HAVE_SYS_STAT_H +#define HAVE_SYS_TYPES_H +typedef HANDLE mdbx_mutex_t; +typedef HANDLE mdbx_cond_t; +typedef HANDLE mdbx_thread_t; +typedef unsigned mdbx_thread_key_t; +typedef SSIZE_T ssize_t; +#define MAP_FAILED NULL +#define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? 
((uint64_t)(v) >> 32) : 0)) +#define THREAD_CALL WINAPI +#define THREAD_RESULT DWORD +#else +#include +#include +#include +#include +#include +#include +typedef pthread_mutex_t mdbx_mutex_t; +typedef pthread_cond_t mdbx_cond_t; +typedef pthread_t mdbx_thread_t; +typedef pthread_key_t mdbx_thread_key_t; +#define INVALID_HANDLE_VALUE (-1) +#define THREAD_CALL +#define THREAD_RESULT void * +#endif /* Platform */ + +#ifndef SSIZE_MAX +#define SSIZE_MAX INTPTR_MAX +#endif + +#ifdef HAVE_SYS_STAT_H +#include +#endif +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#ifdef HAVE_SYS_FILE_H +#include +#endif + +/*----------------------------------------------------------------------------*/ + +#ifdef _MSC_VER + +#if _MSC_FULL_VER < 190024215 +#if _MSC_FULL_VER < 180040629 && defined(_M_IX86) +#error Please use Visual Studio 2015 (MSC 19.0) or newer for 32-bit target. +#else +#pragma message( \ + "It is recommended to use Visual Studio 2015 (MSC 19.0) or newer.") +#endif +#endif + +#include + +#elif __GNUC_PREREQ(4, 4) || defined(__clang__) +#if defined(__i386__) || defined(__x86_64__) +#include +#include +#endif +#elif defined(__INTEL_COMPILER) +#include +#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) +#include +#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ + (defined(HP_IA64) || defined(__ia64)) +#include +#elif defined(__IBMC__) && defined(__powerpc) +#include +#elif defined(_AIX) +#include +#include +#elif (defined(__osf__) && defined(__DECC)) || defined(__alpha) +#include +#include +#elif defined(__MWERKS__) +/* CodeWarrior - troubles ? */ +#pragma gcc_extensions +#elif defined(__SNC__) +/* Sony PS3 - troubles ? 
*/ +#else +#error Unknown C compiler, please use GNU C 5.x or newer +#endif /* Compiler */ + +/*----------------------------------------------------------------------------*/ +/* Byteorder */ + +#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ + !defined(__ORDER_BIG_ENDIAN__) + +#if defined(HAVE_ENDIAN_H) +#include +#elif defined(HAVE_SYS_PARAM_H) +#include /* for endianness */ +#elif defined(HAVE_NETINET_IN_H) && defined(HAVE_RESOLV_H) +#include +#include /* defines BYTE_ORDER on HPUX and Solaris */ +#endif + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) +#define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN +#define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN +#define __BYTE_ORDER__ __BYTE_ORDER +#else +#define __ORDER_LITTLE_ENDIAN__ 1234 +#define __ORDER_BIG_ENDIAN__ 4321 +#if defined(__LITTLE_ENDIAN__) || defined(_LITTLE_ENDIAN) || \ + defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \ + defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \ + defined(__i386) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) || defined(i386) || defined(_X86_) || defined(__i386__) || \ + defined(_X86_64_) || defined(_M_ARM) || defined(__e2k__) +#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#elif defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN) || defined(__ARMEB__) || \ + defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(__MIPSEB__) || \ + defined(_MIPSEB) || defined(__MIPSEB) +#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ +#else +#error __BYTE_ORDER__ should be defined. +#endif +#endif +#endif + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ && \ + __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ +#error Unsupported byte order. 
+#endif + +/*----------------------------------------------------------------------------*/ +/* Cache coherence */ + +#if defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64) || \ + defined(_M_IX86) || defined(__i386) || defined(__amd64) || \ + defined(i386) || defined(__x86_64) || defined(_AMD64_) || defined(_M_X64) +#define MDBX_CACHE_IS_COHERENT 1 +#elif defined(__hppa) || defined(__hppa__) +#define MDBX_CACHE_IS_COHERENT 1 +#endif + +#ifndef MDBX_CACHE_IS_COHERENT +#define MDBX_CACHE_IS_COHERENT 0 +#endif + +#ifndef MDBX_CACHELINE_SIZE +#if defined(SYSTEM_CACHE_ALIGNMENT_SIZE) +#define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE +#elif defined(__ia64__) || defined(__ia64) || defined(_M_IA64) +#define MDBX_CACHELINE_SIZE 128 +#else +#define MDBX_CACHELINE_SIZE 64 +#endif +#endif /* MDBX_CACHELINE_SIZE */ + +#ifndef __cache_aligned +#define __cache_aligned __aligned(MDBX_CACHELINE_SIZE) +#endif + +/*----------------------------------------------------------------------------*/ +/* Memory/Compiler barriers */ + +static __inline void mdbx_compiler_barrier(void) { +#if defined(__clang__) || defined(__GNUC__) + __asm__ __volatile__("" ::: "memory"); +#elif defined(_MSC_VER) + _ReadWriteBarrier(); +#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ + __memory_barrier(); + if (type > MDBX_BARRIER_COMPILER) +#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) + __mf(); +#elif defined(__i386__) || defined(__x86_64__) + _mm_mfence(); +#else +#error "Unknown target for Intel Compiler, please report to us." +#endif +#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) + __compiler_barrier(); +#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ + (defined(HP_IA64) || defined(__ia64)) + _Asm_sched_fence(/* LY: no-arg meaning 'all expect ALU', e.g. 
0x3D3D */); +#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) || \ + defined(__ppc64__) || defined(__powerpc64__) + __fence(); +#else +#error "Could not guess the kind of compiler, please report to us." +#endif +} + +static __inline void mdbx_memory_barrier(void) { +#if __has_extension(c_atomic) || __has_extension(cxx_atomic) + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#elif defined(__ATOMIC_SEQ_CST) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#elif defined(__clang__) || defined(__GNUC__) + __sync_synchronize(); +#elif defined(_MSC_VER) + MemoryBarrier(); +#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ +#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) + __mf(); +#elif defined(__i386__) || defined(__x86_64__) + _mm_mfence(); +#else +#error "Unknown target for Intel Compiler, please report to us." +#endif +#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) + __machine_rw_barrier(); +#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ + (defined(HP_IA64) || defined(__ia64)) + _Asm_mf(); +#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) || \ + defined(__ppc64__) || defined(__powerpc64__) + __lwsync(); +#else +#error "Could not guess the kind of compiler, please report to us." +#endif +} + +#if MDBX_CACHE_IS_COHERENT +#define mdbx_coherent_barrier() mdbx_compiler_barrier() +#else +#define mdbx_coherent_barrier() mdbx_memory_barrier() +#endif + +#if defined(__mips) && defined(__linux) +/* Only MIPS has explicit cache control */ +#include +#endif + +static __inline void mdbx_invalidate_cache(void *addr, size_t nbytes) { + mdbx_coherent_barrier(); +#if defined(__mips) && defined(__linux) + /* MIPS has cache coherency issues. + * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ + cacheflush(addr, nbytes, DCACHE); +#elif defined(_M_MRX000) || defined(_MIPS_) +#error "Sorry, cacheflush() for MIPS not implemented" +#else + /* LY: assume no relevant mmap/dcache issues. */ + (void)addr; + (void)nbytes; +#endif +} + +/*----------------------------------------------------------------------------*/ + +/* max bytes to write in one call */ +#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) + +/* Get the size of a memory page for the system. + * This is the basic size that the platform's memory manager uses, and is + * fundamental to the use of memory-mapped files. */ +static __inline size_t mdbx_syspagesize(void) { +#if defined(_WIN32) || defined(_WIN64) + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwPageSize; +#else + return sysconf(_SC_PAGE_SIZE); +#endif +} + +static __inline char *mdbx_strdup(const char *str) { +#ifdef _MSC_VER + return _strdup(str); +#else + return strdup(str); +#endif +} + +int mdbx_memalign_alloc(size_t alignment, size_t bytes, void **result); +void mdbx_memalign_free(void *ptr); + +int mdbx_mutex_init(mdbx_mutex_t *mutex); +int mdbx_mutex_destroy(mdbx_mutex_t *mutex); +int mdbx_mutex_lock(mdbx_mutex_t *mutex); +int mdbx_mutex_unlock(mdbx_mutex_t *mutex); + +int mdbx_cond_init(mdbx_cond_t *cond); +int mdbx_cond_destroy(mdbx_cond_t *cond); +int mdbx_cond_signal(mdbx_cond_t *cond); +int mdbx_cond_wait(mdbx_cond_t *cond, mdbx_mutex_t *mutex); + +int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, + off_t offset, size_t expected_written); +int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, off_t offset); +int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, + off_t offset); +int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t count); + +int mdbx_msync(void *addr, size_t length, int async); + +int mdbx_thread_create(mdbx_thread_t *thread, + THREAD_RESULT(THREAD_CALL *start_routine)(void *), + void *arg); +int mdbx_thread_join(mdbx_thread_t thread); 
+mdbx_tid_t mdbx_thread_self(void); +int mdbx_thread_key_create(mdbx_thread_key_t *key); +void mdbx_thread_key_delete(mdbx_thread_key_t key); +void *mdbx_thread_rthc_get(mdbx_thread_key_t key); +void mdbx_thread_rthc_set(mdbx_thread_key_t key, const void *value); + +int mdbx_filesync(mdbx_filehandle_t fd, bool syncmeta); +int mdbx_ftruncate(mdbx_filehandle_t fd, off_t length); +int mdbx_filesize(mdbx_filehandle_t fd, off_t *length); +int mdbx_openfile(const char *pathname, int flags, mode_t mode, + mdbx_filehandle_t *fd); +int mdbx_closefile(mdbx_filehandle_t fd); + +int mdbx_mremap_size(void **address, size_t old_size, size_t new_size); +int mdbx_mmap(void **address, size_t length, int rw, mdbx_filehandle_t fd); +int mdbx_munmap(void *address, size_t length); +int mdbx_mlock(const void *address, size_t length); + +static __inline mdbx_pid_t mdbx_getpid(void) { +#if defined(_WIN32) || defined(_WIN64) + return GetCurrentProcessId(); +#else + return getpid(); +#endif +} + +/*----------------------------------------------------------------------------*/ + +#ifndef mdbx_assert_fail +void mdbx_assert_fail(MDB_env *env, const char *msg, const char *func, + int line); +#endif /* mdbx_assert_fail */ + +#if __GLIBC_PREREQ(2, 1) +#define mdbx_asprintf asprintf +#else +int mdbx_asprintf(char **strp, const char *fmt, ...); +#endif + +/*----------------------------------------------------------------------------*/ + +#if defined(_WIN32) || defined(_WIN64) +#undef MDBX_OSAL_LOCK +#define MDBX_OSAL_LOCK_SIGN MDBX_TETRAD('f', 'l', 'c', 'k') +#else +#define MDBX_OSAL_LOCK pthread_mutex_t +#define MDBX_OSAL_LOCK_SIGN MDBX_TETRAD('P', 'T', 'M', 'X') +#endif + +int mdbx_lck_init(MDB_env *env); + +int mdbx_lck_seize(MDB_env *env); +int mdbx_lck_downgrade(MDB_env *env); +void mdbx_lck_destroy(MDB_env *env); + +int mdbx_rdt_lock(MDB_env *env); +void mdbx_rdt_unlock(MDB_env *env); + +int mdbx_txn_lock(MDB_env *env); +void mdbx_txn_unlock(MDB_env *env); + +int mdbx_rpid_set(MDB_env *env); 
+int mdbx_rpid_clear(MDB_env *env); +int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid); diff --git a/src/reopen.h b/src/reopen.h deleted file mode 100644 index ee828b94..00000000 --- a/src/reopen.h +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#pragma once -/* *INDENT-OFF* */ -/* clang-format off */ - -#ifndef __CLANG_PREREQ -# ifdef __clang__ -# define __CLANG_PREREQ(maj,min) \ - ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min)) -# else -# define __CLANG_PREREQ(maj,min) (0) -# endif -#endif /* __CLANG_PREREQ */ - -#ifndef __has_attribute -# define __has_attribute(x) (0) -#endif - -#if !defined(__thread) && (defined(_MSC_VER) || defined(__DMC__)) -# define __thread __declspec(thread) -#endif - -#ifndef __forceinline -# if defined(__GNUC__) || defined(__clang__) -# define __forceinline __inline __attribute__((always_inline)) -# elif ! 
defined(_MSC_VER) -# define __forceinline -# endif -#endif /* __forceinline */ - -#ifndef __noinline -# if defined(__GNUC__) || defined(__clang__) -# define __noinline __attribute__((noinline)) -# elif defined(_MSC_VER) -# define __noinline __declspec(noinline) -# endif -#endif /* __noinline */ - -#ifndef __must_check_result -# if defined(__GNUC__) || defined(__clang__) -# define __must_check_result __attribute__((warn_unused_result)) -# else -# define __must_check_result -# endif -#endif /* __must_check_result */ - -#ifndef __hot -# if defined(__OPTIMIZE__) && (defined(__GNUC__) && !defined(__clang__)) -# define __hot __attribute__((hot, optimize("O3"))) -# elif defined(__GNUC__) - /* cland case, just put frequently used functions in separate section */ -# define __hot __attribute__((section("text.hot"))) -# else -# define __hot -# endif -#endif /* __hot */ - -#ifndef __cold -# if defined(__OPTIMIZE__) && (defined(__GNUC__) && !defined(__clang__)) -# define __cold __attribute__((cold, optimize("Os"))) -# elif defined(__GNUC__) - /* cland case, just put infrequently used functions in separate section */ -# define __cold __attribute__((section("text.unlikely"))) -# else -# define __cold -# endif -#endif /* __cold */ - -#ifndef __flatten -# if defined(__OPTIMIZE__) && (defined(__GNUC__) || defined(__clang__)) -# define __flatten __attribute__((flatten)) -# else -# define __flatten -# endif -#endif /* __flatten */ - -#ifndef __aligned -# if defined(__GNUC__) || defined(__clang__) -# define __aligned(N) __attribute__((aligned(N))) -# elif defined(__MSC_VER) -# define __aligned(N) __declspec(align(N)) -# else -# define __aligned(N) -# endif -#endif /* __align */ - -#ifndef __noreturn -# if defined(__GNUC__) || defined(__clang__) -# define __noreturn __attribute__((noreturn)) -# elif defined(__MSC_VER) -# define __noreturn __declspec(noreturn) -# else -# define __noreturn -# endif -#endif - -#ifndef __nothrow -# if defined(__GNUC__) || defined(__clang__) -# define 
__nothrow __attribute__((nothrow)) -# elif defined(__MSC_VER) -# define __nothrow __declspec(nothrow) -# else -# define __nothrow -# endif -#endif - -#ifndef CACHELINE_SIZE -# if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) -# define CACHELINE_SIZE 128 -# else -# define CACHELINE_SIZE 64 -# endif -#endif - -#ifndef __cache_aligned -# define __cache_aligned __aligned(CACHELINE_SIZE) -#endif - -#ifndef likely -# if defined(__GNUC__) || defined(__clang__) -# ifdef __cplusplus - /* LY: workaround for "pretty" boost */ - static __inline __attribute__((always_inline)) - bool likely(bool cond) { return __builtin_expect(cond, 1); } -# else -# define likely(cond) __builtin_expect(!!(cond), 1) -# endif -# else -# define likely(x) (x) -# endif -#endif /* likely */ - -#ifndef unlikely -# if defined(__GNUC__) || defined(__clang__) -# ifdef __cplusplus - /* LY: workaround for "pretty" boost */ - static __inline __attribute__((always_inline)) - bool unlikely(bool cond) { return __builtin_expect(cond, 0); } -# else -# define unlikely(cond) __builtin_expect(!!(cond), 0) -# endif -# else -# define unlikely(x) (x) -# endif -#endif /* unlikely */ - -#ifndef __extern_C -# ifdef __cplusplus -# define __extern_C extern "C" -# else -# define __extern_C -# endif -#endif - -#ifndef __noop -# define __noop() do {} while (0) -#endif - -/* -------------------------------------------------------------------------- */ - -#include - -/* Prototype should match libc runtime. 
ISO POSIX (2003) & LSB 3.1 */ -__extern_C void __assert_fail( - const char* assertion, - const char* file, - unsigned line, - const char* function) __nothrow __noreturn; - -/* -------------------------------------------------------------------------- */ - -#if defined(HAVE_VALGRIND) || defined(USE_VALGRIND) - /* Get debugging help from Valgrind */ -# include -# ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE - /* LY: available since Valgrind 3.10 */ -# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# endif -#else -# define VALGRIND_CREATE_MEMPOOL(h,r,z) -# define VALGRIND_DESTROY_MEMPOOL(h) -# define VALGRIND_MEMPOOL_TRIM(h,a,s) -# define VALGRIND_MEMPOOL_ALLOC(h,a,s) -# define VALGRIND_MEMPOOL_FREE(h,a) -# define VALGRIND_MEMPOOL_CHANGE(h,a,b,s) -# define VALGRIND_MAKE_MEM_NOACCESS(a,s) -# define VALGRIND_MAKE_MEM_DEFINED(a,s) -# define VALGRIND_MAKE_MEM_UNDEFINED(a,s) -# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0) -# define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0) -#endif /* ! 
USE_VALGRIND */ - -#if defined(__has_feature) -# if __has_feature(thread_sanitizer) -# define __SANITIZE_THREAD__ 1 -# endif -#endif - -#ifdef __SANITIZE_THREAD__ -# define ATTRIBUTE_NO_SANITIZE_THREAD __attribute__((no_sanitize_thread, noinline)) -#else -# define ATTRIBUTE_NO_SANITIZE_THREAD -#endif - -#if defined(__has_feature) -# if __has_feature(address_sanitizer) -# define __SANITIZE_ADDRESS__ 1 -# endif -#endif - -#ifdef __SANITIZE_ADDRESS__ -# include -# define ATTRIBUTE_NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address, noinline)) -#else -# define ASAN_POISON_MEMORY_REGION(addr, size) \ - ((void)(addr), (void)(size)) -# define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ - ((void)(addr), (void)(size)) -# define ATTRIBUTE_NO_SANITIZE_ADDRESS -#endif /* __SANITIZE_ADDRESS__ */ diff --git a/src/mdbx_chk.c b/src/tools/mdbx_chk.c similarity index 97% rename from src/mdbx_chk.c rename to src/tools/mdbx_chk.c index 6c1f6454..5122bedb 100644 --- a/src/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -1,24 +1,17 @@ /* mdbx_chk.c - memory-mapped database check tool */ /* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2015,2016 Peter-Service R&D LLC. + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. * - * This file is part of libmdbx. - * - * libmdbx is free software; you can redistribute it and/or modify it under - * the terms of the GNU Affero General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * libmdbx is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
+ * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . */ #include @@ -32,8 +25,8 @@ #include #include -#include "mdbx.h" -#include "midl.h" +#include "../../mdbx.h" +#include "../midl.h" typedef struct flagbit { int bit; @@ -624,7 +617,7 @@ int main(int argc, char *argv[]) { while ((i = getopt(argc, argv, "Vvqnwcds:")) != EOF) { switch (i) { case 'V': - printf("%s\n", MDB_VERSION_STRING); + printf("%s\n", MDBX_VERSION_STRING); exit(EXIT_SUCCESS); break; case 'v': diff --git a/src/mdbx_copy.1 b/src/tools/mdbx_copy.1 similarity index 100% rename from src/mdbx_copy.1 rename to src/tools/mdbx_copy.1 diff --git a/src/mdbx_copy.c b/src/tools/mdbx_copy.c similarity index 89% rename from src/mdbx_copy.c rename to src/tools/mdbx_copy.c index b80b70a5..aea8be5f 100644 --- a/src/mdbx_copy.c +++ b/src/tools/mdbx_copy.c @@ -1,9 +1,8 @@ /* mdbx_copy.c - memory-mapped database backup tool */ /* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2012-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -15,10 +14,11 @@ * . 
*/ -#include "mdbx.h" +#include "../../mdbx.h" #include #include #include +#include static void sighandle(int sig) { (void)sig; } @@ -35,7 +35,7 @@ int main(int argc, char *argv[]) { else if (argv[1][1] == 'c' && argv[1][2] == '\0') cpflags |= MDB_CP_COMPACT; else if (argv[1][1] == 'V' && argv[1][2] == '\0') { - printf("%s\n", MDB_VERSION_STRING); + printf("%s\n", MDBX_VERSION_STRING); exit(0); } else argc = 0; diff --git a/src/mdbx_dump.1 b/src/tools/mdbx_dump.1 similarity index 100% rename from src/mdbx_dump.1 rename to src/tools/mdbx_dump.1 diff --git a/src/mdbx_dump.c b/src/tools/mdbx_dump.c similarity index 97% rename from src/mdbx_dump.c rename to src/tools/mdbx_dump.c index 16543d09..79680fcb 100644 --- a/src/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -1,9 +1,8 @@ /* mdbx_dump.c - memory-mapped database dump tool */ /* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -15,7 +14,7 @@ * . */ -#include "mdbx.h" +#include "../../mdbx.h" #include #include #include @@ -178,7 +177,7 @@ int main(int argc, char *argv[]) { while ((i = getopt(argc, argv, "af:lnps:V")) != EOF) { switch (i) { case 'V': - printf("%s\n", MDB_VERSION_STRING); + printf("%s\n", MDBX_VERSION_STRING); exit(0); break; case 'l': diff --git a/src/mdbx_load.1 b/src/tools/mdbx_load.1 similarity index 100% rename from src/mdbx_load.1 rename to src/tools/mdbx_load.1 diff --git a/src/mdbx_load.c b/src/tools/mdbx_load.c similarity index 98% rename from src/mdbx_load.c rename to src/tools/mdbx_load.c index a211b24e..d9c62d59 100644 --- a/src/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -1,9 +1,8 @@ /* mdbx_load.c - memory-mapped database load tool */ /* - * Copyright 2015-2017 Leonid Yuriev . 
- * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -15,7 +14,7 @@ * . */ -#include "mdbx.h" +#include "../../mdbx.h" #include #include #include @@ -314,7 +313,7 @@ int main(int argc, char *argv[]) { while ((i = getopt(argc, argv, "f:ns:NTV")) != EOF) { switch (i) { case 'V': - printf("%s\n", MDB_VERSION_STRING); + printf("%s\n", MDBX_VERSION_STRING); exit(0); break; case 'f': @@ -365,8 +364,10 @@ int main(int argc, char *argv[]) { if (info.me_mapsize) mdbx_env_set_mapsize(env, info.me_mapsize); +#ifdef MDB_FIXEDMAP if (info.me_mapaddr) envflags |= MDB_FIXEDMAP; +#endif rc = mdbx_env_open(env, envname, envflags, 0664); if (rc) { diff --git a/src/mdbx_stat.1 b/src/tools/mdbx_stat.1 similarity index 100% rename from src/mdbx_stat.1 rename to src/tools/mdbx_stat.1 diff --git a/src/mdbx_stat.c b/src/tools/mdbx_stat.c similarity index 97% rename from src/mdbx_stat.c rename to src/tools/mdbx_stat.c index ca72b290..fb920a13 100644 --- a/src/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -1,9 +1,8 @@ /* mdbx_stat.c - memory-mapped database status tool */ /* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -15,7 +14,7 @@ * . 
*/ -#include "mdbx.h" +#include "../../mdbx.h" #include #include #include @@ -65,7 +64,7 @@ int main(int argc, char *argv[]) { while ((i = getopt(argc, argv, "Vaefnrs:")) != EOF) { switch (i) { case 'V': - printf("%s\n", MDB_VERSION_STRING); + printf("%s\n", MDBX_VERSION_STRING); exit(0); break; case 'a': diff --git a/test/test0.c b/test/test0.c index 29037618..68919b22 100644 --- a/test/test0.c +++ b/test/test0.c @@ -15,7 +15,7 @@ * . */ -#include "mdbx.h" +#include "../mdbx.h" #include #include #include @@ -79,7 +79,7 @@ int main(int argc, char *argv[]) { E(stat("/proc/self/exe", &exe_stat) ? errno : 0); E(stat(DBPATH "/.", &db_stat) ? errno : 0); - env_oflags = MDB_FIXEDMAP | MDB_NOSYNC; + env_oflags = MDB_NOSYNC; if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { /* LY: Assume running inside a CI-environment: * 1) don't use FIXEDMAP to avoid EBUSY in case collision, diff --git a/test/test1.c b/test/test1.c index 58be6928..e0c8f10c 100644 --- a/test/test1.c +++ b/test/test1.c @@ -14,7 +14,7 @@ /* Based on mtest2.c - memory-mapped database tester/toy */ -#include "mdbx.h" +#include "../mdbx.h" #include #include #include @@ -65,7 +65,7 @@ int main(int argc, char *argv[]) { E(stat("/proc/self/exe", &exe_stat) ? errno : 0); E(stat(DBPATH "/.", &db_stat) ? errno : 0); - env_oflags = MDB_FIXEDMAP | MDB_NOSYNC; + env_oflags = MDB_NOSYNC; if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { /* LY: Assume running inside a CI-environment: * 1) don't use FIXEDMAP to avoid EBUSY in case collision, diff --git a/test/test2.c b/test/test2.c index 2244c7b3..f35bca9b 100644 --- a/test/test2.c +++ b/test/test2.c @@ -17,7 +17,7 @@ /* Just like mtest.c, but using a subDB instead of the main DB */ -#include "mdbx.h" +#include "../mdbx.h" #include #include #include @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) { E(stat("/proc/self/exe", &exe_stat) ? errno : 0); E(stat(DBPATH "/.", &db_stat) ? 
errno : 0); - env_oflags = MDB_FIXEDMAP | MDB_NOSYNC; + env_oflags = MDB_NOSYNC; if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { /* LY: Assume running inside a CI-environment: * 1) don't use FIXEDMAP to avoid EBUSY in case collision, diff --git a/test/test3.c b/test/test3.c index 1f1cacac..2dac03d0 100644 --- a/test/test3.c +++ b/test/test3.c @@ -16,7 +16,7 @@ */ /* Tests for sorted duplicate DBs */ -#include "mdbx.h" +#include "../mdbx.h" #include #include #include @@ -70,7 +70,7 @@ int main(int argc, char *argv[]) { E(stat("/proc/self/exe", &exe_stat) ? errno : 0); E(stat(DBPATH "/.", &db_stat) ? errno : 0); - env_oflags = MDB_FIXEDMAP | MDB_NOSYNC; + env_oflags = MDB_NOSYNC; if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { /* LY: Assume running inside a CI-environment: * 1) don't use FIXEDMAP to avoid EBUSY in case collision, diff --git a/test/test4.c b/test/test4.c index 423535c7..aedec134 100644 --- a/test/test4.c +++ b/test/test4.c @@ -16,7 +16,7 @@ */ /* Tests for sorted duplicate DBs with fixed-size keys */ -#include "mdbx.h" +#include "../mdbx.h" #include #include #include @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) { E(stat("/proc/self/exe", &exe_stat) ? errno : 0); E(stat(DBPATH "/.", &db_stat) ? errno : 0); - env_oflags = MDB_FIXEDMAP | MDB_NOSYNC; + env_oflags = MDB_NOSYNC; if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { /* LY: Assume running inside a CI-environment: * 1) don't use FIXEDMAP to avoid EBUSY in case collision, diff --git a/test/test5.c b/test/test5.c index a482274a..c1018c64 100644 --- a/test/test5.c +++ b/test/test5.c @@ -16,7 +16,7 @@ */ /* Tests for sorted duplicate DBs using cursor_put */ -#include "mdbx.h" +#include "../mdbx.h" #include #include #include @@ -70,7 +70,7 @@ int main(int argc, char *argv[]) { E(stat("/proc/self/exe", &exe_stat) ? errno : 0); E(stat(DBPATH "/.", &db_stat) ? 
errno : 0); - env_oflags = MDB_FIXEDMAP | MDB_NOSYNC; + env_oflags = MDB_NOSYNC; if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { /* LY: Assume running inside a CI-environment: * 1) don't use FIXEDMAP to avoid EBUSY in case collision, diff --git a/test/test6.c b/test/test6.c index ccd6c93a..6f14ce4e 100644 --- a/test/test6.c +++ b/test/test6.c @@ -16,7 +16,7 @@ */ /* Tests for DB splits and merges */ -#include "mdbx.h" +#include "../mdbx.h" #include #include #include @@ -61,7 +61,7 @@ int main(int argc, char *argv[]) { E(stat("/proc/self/exe", &exe_stat) ? errno : 0); E(stat(DBPATH "/.", &db_stat) ? errno : 0); - env_oflags = MDB_FIXEDMAP | MDB_NOSYNC; + env_oflags = MDB_NOSYNC; if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { /* LY: Assume running inside a CI-environment: * 1) don't use FIXEDMAP to avoid EBUSY in case collision, diff --git a/test/test_bench.c b/test/test_bench.c index 95dc60d6..377fbf70 100644 --- a/test/test_bench.c +++ b/test/test_bench.c @@ -22,7 +22,7 @@ #include #include -#include "mdbx.h" +#include "../mdbx.h" #define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) #define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) diff --git a/test/test_yota1.c b/test/test_yota1.c index 701d748c..28744185 100644 --- a/test/test_yota1.c +++ b/test/test_yota1.c @@ -22,7 +22,7 @@ #include #include -#include "mdbx.h" +#include "../mdbx.h" #include #include #include diff --git a/test/test_yota2.c b/test/test_yota2.c index 69d41c7c..84cb0479 100644 --- a/test/test_yota2.c +++ b/test/test_yota2.c @@ -22,7 +22,7 @@ #include #include -#include "mdbx.h" +#include "../mdbx.h" #include #include #include From bfac83a3b8b0d433cf757f214a69effa2440c83a Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 17 Mar 2017 11:35:16 +0300 Subject: [PATCH 018/303] mdbx: MSVC project/solution for DLL. 
--- .gitignore | 5 +- mdbx-dll.sln | 28 +++++++++ mdbx-dll.vcxproj | 157 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 188 insertions(+), 2 deletions(-) create mode 100644 mdbx-dll.sln create mode 100644 mdbx-dll.vcxproj diff --git a/.gitignore b/.gitignore index 9507e7d7..7a7d1485 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ mdbx_chk core core.* valgrind.* -man/ -html/ yota_test* +mdbx-dll.VC.db +mdbx-dll.VC.VC.opendb +mdbx-dll.vcxproj.filters diff --git a/mdbx-dll.sln b/mdbx-dll.sln new file mode 100644 index 00000000..b94111d6 --- /dev/null +++ b/mdbx-dll.sln @@ -0,0 +1,28 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 14 +VisualStudioVersion = 14.0.25420.1 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mdbx-dll", "mdbx-dll.vcxproj", "{6D19209B-ECE7-4B9C-941C-0AA2B484F199}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x64.ActiveCfg = Debug|x64 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x64.Build.0 = Debug|x64 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x86.ActiveCfg = Debug|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x86.Build.0 = Debug|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x64.ActiveCfg = Release|x64 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x64.Build.0 = Release|x64 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x86.ActiveCfg = Release|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x86.Build.0 = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/mdbx-dll.vcxproj b/mdbx-dll.vcxproj new file mode 100644 index 00000000..ccdd2cb1 --- 
/dev/null +++ b/mdbx-dll.vcxproj @@ -0,0 +1,157 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + {6D19209B-ECE7-4B9C-941C-0AA2B484F199} + Win32Proj + 8.1 + + + + DynamicLibrary + true + v140 + + + DynamicLibrary + false + v140 + + + DynamicLibrary + true + v140 + + + DynamicLibrary + false + v140 + + + + + + + + + + + + + + + + + + + + + true + + + false + + + false + + + + WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions) + MultiThreadedDebugDLL + Level3 + ProgramDatabase + Disabled + true + + + MachineX86 + true + Windows + + + + + WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions) + MultiThreadedDLL + Level3 + ProgramDatabase + true + Full + AnySuitable + true + Size + true + true + + + MachineX86 + true + Windows + true + true + UseLinkTimeCodeGeneration + + + + + ProgramDatabase + + + + + Level3 + WIN64;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions) + MultiThreadedDebugDLL + true + + + + + WIN64;NDEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions) + MultiThreadedDLL + true + Full + AnySuitable + true + Size + true + true + + + UseLinkTimeCodeGeneration + + + + + + + + + + + + + + + + + + + \ No newline at end of file From 12954bc49bc51b9d64ed3aab341a0431df85aa76 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 20 Mar 2017 13:43:40 +0300 Subject: [PATCH 019/303] mdbx: Qt-Creator project. --- libmdbx.config | 2 ++ libmdbx.creator | 1 + libmdbx.files | 33 +++++++++++++++++++++++++++++++++ libmdbx.includes | 4 ++++ 4 files changed, 40 insertions(+) create mode 100644 libmdbx.config create mode 100644 libmdbx.creator create mode 100644 libmdbx.files create mode 100644 libmdbx.includes diff --git a/libmdbx.config b/libmdbx.config new file mode 100644 index 00000000..e0284f42 --- /dev/null +++ b/libmdbx.config @@ -0,0 +1,2 @@ +// Add predefined macros for your project here. 
For example: +// #define THE_ANSWER 42 diff --git a/libmdbx.creator b/libmdbx.creator new file mode 100644 index 00000000..e94cbbd3 --- /dev/null +++ b/libmdbx.creator @@ -0,0 +1 @@ +[General] diff --git a/libmdbx.files b/libmdbx.files new file mode 100644 index 00000000..b7fa32d3 --- /dev/null +++ b/libmdbx.files @@ -0,0 +1,33 @@ +AUTHORS +README.md +mdbx.h +src/bits.h +src/defs.h +src/lck-posix.c +src/lck-windows.c +src/mdbx.c +src/midl.h +src/osal.c +src/osal.h +src/tools/mdbx_chk.c +src/tools/mdbx_copy.1 +src/tools/mdbx_copy.c +src/tools/mdbx_dump.1 +src/tools/mdbx_dump.c +src/tools/mdbx_load.1 +src/tools/mdbx_load.c +src/tools/mdbx_stat.1 +src/tools/mdbx_stat.c +test/test0.c +test/test1.c +test/test2.c +test/test3.c +test/test4.c +test/test5.c +test/test6.c +test/test_bench.c +test/test_yota1.c +test/test_yota2.c +tutorial/README.md +tutorial/sample-bdb.txt +tutorial/sample-mdb.txt diff --git a/libmdbx.includes b/libmdbx.includes new file mode 100644 index 00000000..eb512a01 --- /dev/null +++ b/libmdbx.includes @@ -0,0 +1,4 @@ +. +src +src/tools +test From c25934a8dd0042177d6dd5df72e881558c6a2c8c Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 23 Mar 2017 20:37:24 +0000 Subject: [PATCH 020/303] mdbx: backport - fix xcursor after cursor_del (ITS#8622). Re-fix 6b1df0e4c7fadd21d1233d7157229b2d89ccaa04 from ITS#8406 Change-Id: I177bef20cfa4b464a38cb42d66b7134f84434490 --- src/mdbx.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index ee2ab1fe..ef7afcbb 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -7617,14 +7617,16 @@ static int mdbx_cursor_del0(MDB_cursor *mc) { if (mc->mc_db->md_flags & MDB_DUPSORT) { MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node is a fake page, it needs to be reinited - * because its data has moved. But just reset mc_pg[0] - * if the xcursor is already live. 
+ /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not initd it must be reinited. + * Else if node points to a subDB, nothing is needed. */ - if ((node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - else + if (node->mn_flags & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node->mn_flags & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } else mdbx_xcursor_init1(m3, node); } } From e9ea16a54e1591c308017ddf65b14fd7bcba36bf Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 22 Mar 2017 16:03:45 +0300 Subject: [PATCH 021/303] mdbx: adds test7 by Klaus Malorny --- AUTHORS | 1 + Makefile | 4 +- libmdbx.files | 3 + test/test7.c | 246 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 252 insertions(+), 2 deletions(-) create mode 100644 test/test7.c diff --git a/AUTHORS b/AUTHORS index b65b8080..0c7ca23f 100644 --- a/AUTHORS +++ b/AUTHORS @@ -12,6 +12,7 @@ Howard Chu , Ignacio Casal Quinteiro Jean-Christophe DUBOIS John Hewson +Klaus Malorny Kurt Zeilenga Leonid Yuriev , Lorenz Bauer diff --git a/Makefile b/Makefile index 2dbfbeea..9da7d70e 100644 --- a/Makefile +++ b/Makefile @@ -41,8 +41,8 @@ HEADERS := mdbx.h LIBRARIES := libmdbx.a libmdbx.so TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 -TESTS := test0 test1 test2 test3 test4 test5 test6 test_bench \ - test_yota1 test_yota2 +TESTS := test0 test1 test2 test3 test4 test5 test6 test7 \ + test_bench test_yota1 test_yota2 MDBX_SRC := mdbx.h $(addprefix src/, mdbx.c osal.c lck-posix.c defs.h bits.h osal.h midl.h) diff --git a/libmdbx.files b/libmdbx.files index b7fa32d3..1cb076a5 100644 --- a/libmdbx.files +++ b/libmdbx.files @@ -1,4 +1,6 @@ AUTHORS +LICENSE +Makefile README.md mdbx.h src/bits.h @@ 
-25,6 +27,7 @@ test/test3.c test/test4.c test/test5.c test/test6.c +test/test7.c test/test_bench.c test/test_yota1.c test/test_yota2.c diff --git a/test/test7.c b/test/test7.c new file mode 100644 index 00000000..61110d5c --- /dev/null +++ b/test/test7.c @@ -0,0 +1,246 @@ +/* + * Copyright 2017 Klaus Malorny + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include +#include +#include +#include +#include + +#include "../mdbx.h" + +static const char *fileName = "/dev/shm/test.mdbx"; +static const char *dbName = "test"; +static long size = 1500000000; +static int recordCount = 33000000; +static int majorIdCount = 6000; +static int minorIdCount = 1000000; +static unsigned int seed = 1; +static long *majorIds; + +typedef struct { + long majorId; + long minorId; +} KeyType; + +typedef struct { long refId; } DataType; + +typedef struct { + KeyType key; + DataType data; +} KeyDataType; + +void check(const char *op, int error) { + if (error != 0) { + fprintf(stderr, "%s: unexpected error %d: %s\n", op, error, + mdbx_strerror(error)); + exit(1); + } +} + +void shuffle(void *data, int recordSize, int recordCount) { + char *ptr = (char *)data; + char *swapBuf = malloc(recordSize); + + for (int i = recordCount - 2; i >= 0; i--) { + int j = (int)(random() % (recordCount - i)); + + if (j > 0) { + char *ptr1 = ptr + i * recordSize; + char *ptr2 = ptr + (i + j) * recordSize; + + memcpy(swapBuf, ptr1, recordSize); + memcpy(ptr1, ptr2, recordSize); + memcpy(ptr2, swapBuf, recordSize); + } + } + + free(swapBuf); +} + +void fill(MDB_env *env, MDB_dbi dbi) { + KeyType key; + DataType data; + + MDB_val keyRef; + MDB_val dataRef; + MDB_txn *txn; + + 
printf("generating data\n"); + + srandom(seed); + + majorIds = (long *)malloc(majorIdCount * sizeof(long)); + + if (!majorIds) { + fprintf(stderr, "out of memory\n"); + exit(1); + } + + for (int i = 0; i < majorIdCount; i++) + majorIds[i] = i; + + // now shuffle (for later deletion test) + shuffle((void *)majorIds, sizeof(long), majorIdCount); + + KeyDataType *records = malloc(sizeof(KeyDataType) * recordCount); + KeyDataType *ptr = records; + int remaining = recordCount; + long refId = 0; + + for (int i = 0; i < minorIdCount; i++) { + long majorId = random() % majorIdCount; + long minorId = i; + + int max = remaining / (minorIdCount - i + 1); + int use; + + if (i == minorIdCount - 1 || max < 2) { + use = max; + + } else { + long rand1 = random() % max; + long rand2 = random() % max; + use = (int)((rand1 * rand2 / (max - 1))) + 1; // non-linear distribution + } + + // printf ("%d %d %d\n", i, max, use); + + while (use-- > 0) { + ptr->key.majorId = majorId; + ptr->key.minorId = minorId; + ptr->data.refId = ++refId; + ptr++; + remaining--; + } + } + + shuffle((void *)records, sizeof(KeyDataType), recordCount); + + printf("writing data\n"); + + check("txn_begin", mdbx_txn_begin(env, NULL, 0, &txn)); + + ptr = records; + + for (int i = recordCount; i > 0; i--) { + + key.majorId = htobe64(ptr->key.majorId); + key.minorId = htobe64(ptr->key.minorId); + data.refId = htobe64(ptr->data.refId); + + keyRef.mv_size = sizeof(key); + keyRef.mv_data = (void *)&key; + dataRef.mv_size = sizeof(data); + dataRef.mv_data = (void *)&data; + + check("mdbx_put", mdbx_put(txn, dbi, &keyRef, &dataRef, 0)); + + ptr++; + } + + check("txn_commit", mdbx_txn_commit(txn)); + + printf("%d records written\n", recordCount); +} + +void deleteRange(MDB_env *env, MDB_dbi dbi, MDB_txn *txn, KeyType *startKey, + KeyType *endKey, int endIsInclusive) { + MDB_cursor *cursor; + MDB_val curKeyRef; + MDB_val endKeyRef; + MDB_val curDataRef; + (void)env; + + check("cursor_open", mdbx_cursor_open(txn, dbi, 
&cursor)); + + curKeyRef.mv_size = sizeof(KeyType); + curKeyRef.mv_data = (void *)startKey; + endKeyRef.mv_size = sizeof(KeyType); + endKeyRef.mv_data = (void *)endKey; + curDataRef.mv_size = 0; + curDataRef.mv_data = NULL; + + int error = mdbx_cursor_get(cursor, &curKeyRef, &curDataRef, MDB_SET_RANGE); + + while (error != MDB_NOTFOUND) { + check("mdbx_cursor_get", error); + + int compResult = mdbx_cmp(txn, dbi, &curKeyRef, &endKeyRef); + + if (compResult > 0 || (!compResult && !endIsInclusive)) + break; + + check("mdbx_cursor_del", mdbx_cursor_del(cursor, MDB_NODUPDATA)); + + error = mdbx_cursor_get(cursor, &curKeyRef, &curDataRef, MDB_NEXT); + } + + mdbx_cursor_close(cursor); +} + +void testDelete(MDB_env *env, MDB_dbi dbi) { + MDB_txn *txn; + KeyType startKey; + KeyType endKey; + + printf("testing\n"); + + check("txn_begin", mdbx_txn_begin(env, NULL, 0, &txn)); + + long majorId; + + for (int i = 0; i < majorIdCount; i++) { + majorId = majorIds[i]; + startKey.majorId = htobe64(majorId); + startKey.minorId = htobe64(1); + endKey.majorId = htobe64(majorId); + endKey.minorId = htobe64((long)(~0UL >> 1)); + + deleteRange(env, dbi, txn, &startKey, &endKey, 1); + } + + check("txn_commit", mdbx_txn_commit(txn)); +} + +int main(int argc, char *argv[]) { + MDB_env *env; + MDB_dbi dbi; + MDB_txn *txn; + (void)argc; + (void)argv; + + printf("LMDB version: %s\n", MDBX_VERSION_STRING); + + unlink(fileName); + check("env_create", mdbx_env_create(&env)); + check("env_set_mapsize", mdbx_env_set_mapsize(env, size)); + check("env_set_maxdbs", mdbx_env_set_maxdbs(env, 2)); + + check("env_open", + mdbx_env_open(env, fileName, MDB_NOSUBDIR | MDB_WRITEMAP, 0666)); + + check("txn_begin", mdbx_txn_begin(env, NULL, 0, &txn)); + + check("dbi_open", mdbx_dbi_open(txn, dbName, MDB_CREATE | MDB_DUPSORT, &dbi)); + + check("txn_commit", mdbx_txn_commit(txn)); + + fill(env, dbi); + testDelete(env, dbi); + + mdbx_env_close(env); + + printf("done.\n"); +} From 
69a6d10a8a695c379361b4a068c3bf266e9b6ed9 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 28 Mar 2017 17:50:49 +0300 Subject: [PATCH 022/303] mdbx: crutch for Windows's UnlockFile(). --- src/lck-windows.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/lck-windows.c b/src/lck-windows.c index 46525198..e2fd071c 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -104,12 +104,38 @@ static BOOL flock(mdbx_filehandle_t fd, DWORD flags, off_t offset, ov.hEvent = 0; ov.Offset = (DWORD)offset; ov.OffsetHigh = HIGH_DWORD(offset); + +#ifdef MDBX_WINDOWS_UnlockFile_CRUTCH + if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) + return true; + + if ((flags & LOCKFILE_FAIL_IMMEDIATELY) == 0) + return false; + + int rc = GetLastError(); + if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) + return false; + + /* FIXME: Windows kernel is ugly and mad... */ + SwitchToThread(); + Sleep(42); + SwitchToThread(); +#endif /* MDBX_WINDOWS_UnlockFile_CRUTCH */ return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov); } static BOOL funlock(mdbx_filehandle_t fd, off_t offset, size_t bytes) { - return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, - HIGH_DWORD(bytes)); + if (!UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, + HIGH_DWORD(bytes))) + return false; + +#ifdef MDBX_WINDOWS_UnlockFile_CRUTCH + /* FIXME: Windows kernel is ugly and mad... */ + SwitchToThread(); + Sleep(42); + SwitchToThread(); +#endif /* MDBX_WINDOWS_UnlockFile_CRUTCH */ + return true; } /*----------------------------------------------------------------------------*/ From 9731e07120aadfb9b4725c555c959ddea1b7da5b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 28 Mar 2017 21:00:13 +0300 Subject: [PATCH 023/303] mdbx: uint32/uint64 for INTEGER_KEY. 
--- src/mdbx.c | 58 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index ef7afcbb..fb3af0ed 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4250,12 +4250,16 @@ static int __hot mdbx_cmp_int_ai(const MDB_val *a, const MDB_val *b) { mdbx_assert(NULL, a->mv_size == b->mv_size); mdbx_assert(NULL, 0 == (uintptr_t)a->mv_data % sizeof(int) && 0 == (uintptr_t)b->mv_data % sizeof(int)); - - if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) - return mdbx_cmp2int(*(size_t *)a->mv_data, *(size_t *)b->mv_data); - - mdbx_assert(NULL, a->mv_size == sizeof(int)); - return mdbx_cmp2int(*(unsigned *)a->mv_data, *(unsigned *)b->mv_data); + switch (a->mv_size) { + case 4: + return mdbx_cmp2int(*(uint32_t *)a->mv_data, *(uint32_t *)b->mv_data); + case 8: + return mdbx_cmp2int(*(uint64_t *)a->mv_data, *(uint64_t *)b->mv_data); + default: + mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", mdbx_func_, + __LINE__); + return 0; + } } /** Compare two items pointing at 2-byte aligned unsigned int's. 
*/ @@ -4264,11 +4268,16 @@ static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { mdbx_assert(NULL, 0 == (uintptr_t)a->mv_data % sizeof(uint16_t) && 0 == (uintptr_t)b->mv_data % sizeof(uint16_t)); #if MISALIGNED_OK - if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) - return mdbx_cmp2int(*(size_t *)a->mv_data, *(size_t *)b->mv_data); - - mdbx_assert(NULL, a->mv_size == sizeof(int)); - return mdbx_cmp2int(*(unsigned *)a->mv_data, *(unsigned *)b->mv_data); + switch (a->mv_size) { + case 4: + return mdbx_cmp2int(*(uint32_t *)a->mv_data, *(uint32_t *)b->mv_data); + case 8: + return mdbx_cmp2int(*(uint64_t *)a->mv_data, *(uint64_t *)b->mv_data); + default: + mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", mdbx_func_, + __LINE__); + return 0; + } #else mdbx_assert(NULL, 0 == a->mv_size % sizeof(uint16_t)); { @@ -4303,11 +4312,16 @@ static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { static int __hot mdbx_cmp_int_ua(const MDB_val *a, const MDB_val *b) { mdbx_assert(NULL, a->mv_size == b->mv_size); #if MISALIGNED_OK - if (sizeof(int) != sizeof(size_t) && likely(a->mv_size == sizeof(size_t))) - return mdbx_cmp2int(*(size_t *)a->mv_data, *(size_t *)b->mv_data); - - mdbx_assert(NULL, a->mv_size == sizeof(int)); - return mdbx_cmp2int(*(unsigned *)a->mv_data, *(unsigned *)b->mv_data); + switch (a->mv_size) { + case 4: + return mdbx_cmp2int(*(uint32_t *)a->mv_data, *(uint32_t *)b->mv_data); + case 8: + return mdbx_cmp2int(*(uint64_t *)a->mv_data, *(uint64_t *)b->mv_data); + default: + mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", mdbx_func_, + __LINE__); + return 0; + } #else mdbx_assert(NULL, a->mv_size == sizeof(int) || a->mv_size == sizeof(size_t)); #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ @@ -5115,8 +5129,8 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, DKBUF; if ((mc->mc_db->md_flags & MDB_INTEGERKEY) && - unlikely(key->mv_size != sizeof(unsigned) 
&& - key->mv_size != sizeof(size_t))) { + unlikely(key->mv_size != sizeof(uint32_t) && + key->mv_size != sizeof(uint64_t))) { mdbx_cassert(mc, !"key-size is invalid for MDB_INTEGERKEY"); return MDB_BAD_VALSIZE; } @@ -5639,15 +5653,15 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, #endif if ((mc->mc_db->md_flags & MDB_INTEGERKEY) && - unlikely(key->mv_size != sizeof(unsigned) && - key->mv_size != sizeof(size_t))) { + unlikely(key->mv_size != sizeof(uint32_t) && + key->mv_size != sizeof(uint64_t))) { mdbx_cassert(mc, !"key-size is invalid for MDB_INTEGERKEY"); return MDB_BAD_VALSIZE; } if ((mc->mc_db->md_flags & MDB_INTEGERDUP) && - unlikely(data->mv_size != sizeof(unsigned) && - data->mv_size != sizeof(size_t))) { + unlikely(data->mv_size != sizeof(uint32_t) && + data->mv_size != sizeof(uint64_t))) { mdbx_cassert(mc, !"data-size is invalid MDB_INTEGERDUP"); return MDB_BAD_VALSIZE; } From 5f7e5547c33fca7926395e01ad42de70fb8a4187 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Wed, 29 Mar 2017 01:11:33 +0300 Subject: [PATCH 024/303] mdbx: add .appveyor.yml --- .appveyor.yml | 18 ++++++++++++++++++ README.md | 1 + mdbx-dll.sln => mdbx.sln | 0 3 files changed, 19 insertions(+) create mode 100644 .appveyor.yml rename mdbx-dll.sln => mdbx.sln (100%) diff --git a/.appveyor.yml b/.appveyor.yml new file mode 100644 index 00000000..aa190ea7 --- /dev/null +++ b/.appveyor.yml @@ -0,0 +1,18 @@ +version: '{build}' +max_jobs: 1 + +platform: + - x86 + - x64 + +configuration: + - Release + - Debug + +build: + verbosity: minimal + project: mdbx.sln + +# test_script: +# - ps: | +# & "C:\projects\mdbx\$env:PLATFORM\$env:CONFIGURATION\test\test.exe" diff --git a/README.md b/README.md index 56f5ec31..1cf2ea8b 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ Extended LMDB, aka "Расширенная LMDB". *The Future will Positive. 
Всё будет хорошо.* [![Build Status](https://travis-ci.org/ReOpen/libmdbx.svg?branch=devel)](https://travis-ci.org/ReOpen/libmdbx) +[![Build status](https://ci.appveyor.com/api/projects/status/v21jlh5kfmk85r7t/branch/devel?svg=true)](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/devel) English version by Google [is here](https://translate.googleusercontent.com/translate_c?act=url&ie=UTF8&sl=ru&tl=en&u=https://github.com/ReOpen/libmdbx/tree/devel). diff --git a/mdbx-dll.sln b/mdbx.sln similarity index 100% rename from mdbx-dll.sln rename to mdbx.sln From 40f8f53b0ecb8639f4cfe8a6008e26003f4079dd Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 29 Mar 2017 15:41:53 +0300 Subject: [PATCH 025/303] mdbx: add TODO.md --- TODO.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 TODO.md diff --git a/TODO.md b/TODO.md new file mode 100644 index 00000000..9fb41a2b --- /dev/null +++ b/TODO.md @@ -0,0 +1,16 @@ +- [x] CI посредством AppVeyor +- [ ] uint32/uint64 в структурах +- [ ] правки API (много...) +- [ ] инкрементальный mmap +- [ ] возврат выделенных страниц в unallocated tail +- [ ] устранение всех предупреждений +- [ ] перевод mdbx-tools на С++ и сборка для Windows +- [ ] тест конкурентного доступа +- [ ] тест основного функционала (заменить текущий треш) +- [ ] базовый бенчмарк +- [ ] переработка формата: заголовки страниц, meta, clk... +- [ ] зачистка Doxygen и бесполезных коментариев +- [ ] сборка через CMake +- [ ] актуализация README.md +- [ ] возможность хранения ключей внутри data (libfptu) +- [ ] асинхронная фиксация (https://github.com/ReOpen/libmdbx/issues/5) From 7d351f74c4bbd1b0d5cf4277ffa1c522924ed22c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 29 Mar 2017 18:51:22 +0300 Subject: [PATCH 026/303] mdbx: rename mdbx_stat() to mdbx_dbi_stat(). 
--- mdbx.h | 6 +++--- src/mdbx.c | 5 +++-- src/tools/mdbx_chk.c | 4 ++-- src/tools/mdbx_dump.c | 2 +- src/tools/mdbx_stat.c | 14 ++++++++------ test/test6.c | 2 +- test/test_yota1.c | 2 +- test/test_yota2.c | 2 +- 8 files changed, 20 insertions(+), 17 deletions(-) diff --git a/mdbx.h b/mdbx.h index 90c1564c..b1da10a6 100644 --- a/mdbx.h +++ b/mdbx.h @@ -421,7 +421,7 @@ typedef enum MDB_cursor_op { /* Statistics for a database in the environment */ typedef struct MDBX_stat { unsigned ms_psize; /* Size of a database page. - This is currently the + This is currently the same for all databases. */ unsigned ms_depth; /* Depth (height) of the B-tree */ size_t ms_branch_pages; /* Number of internal (non-leaf) pages */ @@ -1184,8 +1184,8 @@ LIBMDBX_API int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, * errors are: * - EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, - size_t bytes); +LIBMDBX_API int mdbx_dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, + size_t bytes); /* Retrieve the DB flags for a database handle. * diff --git a/src/mdbx.c b/src/mdbx.c index fb3af0ed..bc503c2a 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -8700,7 +8700,7 @@ int __cold mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *arg) { return MDB_SUCCESS; } -/** Common code for #mdbx_stat() and #mdbx_env_stat(). +/** Common code for #mdbx_dbi_stat() and #mdbx_env_stat(). * @param[in] env the environment to operate in. * @param[in] db the #MDB_db record containing the stats to return. * @param[out] arg the address of an #MDB_stat structure to receive the stats. 
@@ -8925,7 +8925,8 @@ int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, return rc; } -int __cold mdbx_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *arg, size_t bytes) { +int __cold mdbx_dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *arg, + size_t bytes) { if (unlikely(!arg || !txn)) return EINVAL; diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 5122bedb..1aaaaff2 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -442,9 +442,9 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { return rc; } - rc = mdbx_stat(txn, dbi, &ms, sizeof(ms)); + rc = mdbx_dbi_stat(txn, dbi, &ms, sizeof(ms)); if (rc) { - error(" - mdbx_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); + error(" - mdbx_dbi_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); return rc; } diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index 79680fcb..c217c503 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -96,7 +96,7 @@ static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) { if (rc) return rc; - rc = mdbx_stat(txn, dbi, &ms, sizeof(ms)); + rc = mdbx_dbi_stat(txn, dbi, &ms, sizeof(ms)); if (rc) return rc; diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index fb920a13..565d58bb 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -170,9 +170,10 @@ int main(int argc, char *argv[]) { mdbx_strerror(rc)); goto txn_abort; } - rc = mdbx_stat(txn, dbi, &mst, sizeof(mst)); + rc = mdbx_dbi_stat(txn, dbi, &mst, sizeof(mst)); if (rc) { - fprintf(stderr, "mdbx_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); + fprintf(stderr, "mdbx_dbi_stat failed, error %d %s\n", rc, + mdbx_strerror(rc)); goto txn_abort; } prstat(&mst); @@ -248,9 +249,10 @@ int main(int argc, char *argv[]) { goto txn_abort; } - rc = mdbx_stat(txn, dbi, &mst, sizeof(mst)); + rc = mdbx_dbi_stat(txn, dbi, &mst, sizeof(mst)); if (rc) { - fprintf(stderr, "mdbx_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); + 
fprintf(stderr, "mdbx_dbi_stat failed, error %d %s\n", rc, + mdbx_strerror(rc)); goto txn_abort; } printf("Status of %s\n", subname ? subname : "Main DB"); @@ -280,9 +282,9 @@ int main(int argc, char *argv[]) { free(str); if (rc) continue; - rc = mdbx_stat(txn, db2, &mst, sizeof(mst)); + rc = mdbx_dbi_stat(txn, db2, &mst, sizeof(mst)); if (rc) { - fprintf(stderr, "mdbx_stat failed, error %d %s\n", rc, + fprintf(stderr, "mdbx_dbi_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); goto txn_abort; } diff --git a/test/test6.c b/test/test6.c index 6f14ce4e..e9d6dd3c 100644 --- a/test/test6.c +++ b/test/test6.c @@ -78,7 +78,7 @@ int main(int argc, char *argv[]) { E(mdbx_drop(txn, dbi, 1)); E(mdbx_dbi_open(txn, "id6", MDB_CREATE | MDB_INTEGERKEY, &dbi)); E(mdbx_cursor_open(txn, dbi, &cursor)); - E(mdbx_stat(txn, dbi, &mst, sizeof(mst))); + E(mdbx_dbi_stat(txn, dbi, &mst, sizeof(mst))); sval = calloc(1, mst.ms_psize / 4); key.mv_size = sizeof(long); diff --git a/test/test_yota1.c b/test/test_yota1.c index 28744185..7d036f0f 100644 --- a/test/test_yota1.c +++ b/test/test_yota1.c @@ -203,7 +203,7 @@ static void get_db_stat(const char *db, int64_t *ms_branch_pages, LMDB_CHECK(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); LMDB_CHECK(mdbx_dbi_open(txn, db, MDB_CREATE, &dbi)); - LMDB_CHECK(mdbx_stat(txn, dbi, &stat, sizeof(stat))); + LMDB_CHECK(mdbx_dbi_stat(txn, dbi, &stat, sizeof(stat))); mdbx_txn_abort(txn); printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", db, stat.ms_branch_pages, stat.ms_depth, stat.ms_entries, diff --git a/test/test_yota2.c b/test/test_yota2.c index 84cb0479..79c72880 100644 --- a/test/test_yota2.c +++ b/test/test_yota2.c @@ -230,7 +230,7 @@ static void get_db_stat(const char *db, int64_t *ms_branch_pages, LMDB_CHECK(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); LMDB_CHECK(mdbx_dbi_open(txn, db, MDB_CREATE, &dbi)); - LMDB_CHECK(mdbx_stat(txn, dbi, &stat, sizeof(stat))); + LMDB_CHECK(mdbx_dbi_stat(txn, dbi, &stat, sizeof(stat))); 
mdbx_txn_abort(txn); printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", db, stat.ms_branch_pages, stat.ms_depth, stat.ms_entries, From cb6ac8912f9fdd8e4b6b1c9a71aada33d9661dce Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 31 Mar 2017 12:44:19 +0300 Subject: [PATCH 027/303] mdbx: now MDBX_MODE_ENABLED always. --- mdbx.h | 4 --- src/bits.h | 24 +++---------- src/mdbx.c | 99 ++++++++---------------------------------------------- 3 files changed, 19 insertions(+), 108 deletions(-) diff --git a/mdbx.h b/mdbx.h index b1da10a6..db7427f6 100644 --- a/mdbx.h +++ b/mdbx.h @@ -250,14 +250,10 @@ typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); #define MDB_NORDAHEAD 0x800000u /* don't initialize malloc'd memory before writing to datafile */ #define MDB_NOMEMINIT 0x1000000u - -#if MDBX_MODE_ENABLED /* aim to coalesce FreeDB records */ #define MDBX_COALESCE 0x2000000u /* LIFO policy for reclaiming FreeDB records */ #define MDBX_LIFORECLAIM 0x4000000u -#endif /* MDBX_MODE_ENABLED */ - /* make a steady-sync only on close and explicit env-sync */ #define MDBX_UTTERLY_NOSYNC (MDB_NOSYNC | MDB_MAPASYNC) /* debuging option, fill/perturb released pages */ diff --git a/src/bits.h b/src/bits.h index 29c1663c..5537aa5f 100644 --- a/src/bits.h +++ b/src/bits.h @@ -279,10 +279,7 @@ typedef struct MDB_meta { volatile uint64_t mm_datasync_sign; #define META_IS_WEAK(meta) ((meta)->mm_datasync_sign == MDB_DATASIGN_WEAK) #define META_IS_STEADY(meta) ((meta)->mm_datasync_sign > MDB_DATASIGN_WEAK) - -#if MDBX_MODE_ENABLED volatile mdbx_canary mm_canary; -#endif } MDB_meta; /** Common header for all page types. The page type depends on #mp_flags. @@ -394,17 +391,11 @@ typedef struct MDB_dbx { MDB_cmp_func *md_dcmp; /**< function for comparing data items */ } MDB_dbx; -#if MDBX_MODE_ENABLED -#define MDBX_MODE_SALT 0 -#else -#error !? -#endif - /** A database transaction. * Every operation requires a transaction handle. 
*/ struct MDB_txn { -#define MDBX_MT_SIGNATURE (0x93D53A31 ^ MDBX_MODE_SALT) +#define MDBX_MT_SIGNATURE (0x93D53A31) unsigned mt_signature; MDB_txn *mt_parent; /**< parent of a nested txn */ /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ @@ -494,10 +485,7 @@ struct MDB_txn { * dirty_list into mt_parent after freeing hidden mt_parent pages. */ unsigned mt_dirty_room; - -#if MDBX_MODE_ENABLED mdbx_canary mt_canary; -#endif }; /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. @@ -517,9 +505,9 @@ struct MDB_xcursor; * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). */ struct MDB_cursor { -#define MDBX_MC_SIGNATURE (0xFE05D5B1 ^ MDBX_MODE_SALT) -#define MDBX_MC_READY4CLOSE (0x2817A047 ^ MDBX_MODE_SALT) -#define MDBX_MC_WAIT4EOT (0x90E297A7 ^ MDBX_MODE_SALT) +#define MDBX_MC_SIGNATURE (0xFE05D5B1) +#define MDBX_MC_READY4CLOSE (0x2817A047) +#define MDBX_MC_WAIT4EOT (0x90E297A7) unsigned mc_signature; /** Next cursor on this DB in this txn */ MDB_cursor *mc_next; @@ -606,7 +594,7 @@ typedef struct MDB_pgstate { /** The database environment. 
*/ struct MDB_env { -#define MDBX_ME_SIGNATURE (0x9A899641 ^ MDBX_MODE_SALT) +#define MDBX_ME_SIGNATURE (0x9A899641) unsigned me_signature; mdbx_filehandle_t me_fd; /**< The main data file */ mdbx_filehandle_t me_lfd; /**< The lock file */ @@ -660,9 +648,7 @@ struct MDB_env { mdbx_env_sync() */ uint64_t me_sync_threshold; /**< Treshold of above to force synchronous flush */ -#if MDBX_MODE_ENABLED MDBX_oom_func *me_oom_func; /**< Callback for kicking laggard readers */ -#endif #ifdef USE_VALGRIND int me_valgrind_handle; #endif diff --git a/src/mdbx.c b/src/mdbx.c index bc503c2a..977683a2 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -819,9 +819,7 @@ const char *__cold mdbx_strerror(int errnum) { return msg; } -#if MDBX_MODE_ENABLED static txnid_t mdbx_oomkick(MDB_env *env, txnid_t oldest); -#endif /* MDBX_MODE_ENABLED */ void __cold mdbx_debug_log(int type, const char *function, int line, const char *fmt, ...) { @@ -1787,12 +1785,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { } if (rc == MDB_MAP_FULL) { -#if MDBX_MODE_ENABLED txnid_t snap = mdbx_oomkick(env, oldest); -#else - mdbx_debug("DB size maxed out"); - txnid_t snap = mdbx_find_oldest(env, NULL); -#endif /* MDBX_MODE_ENABLED */ if (snap > oldest) { oldest = snap; continue; @@ -2153,7 +2146,6 @@ static void mdbx_cursors_eot(MDB_txn *txn, unsigned merge) { if ((mx = mc->mc_xcursor) != NULL) *mx = *(MDB_xcursor *)(bk + 1); } -#if MDBX_MODE_ENABLED bk->mc_signature = 0; free(bk); } @@ -2164,13 +2156,6 @@ static void mdbx_cursors_eot(MDB_txn *txn, unsigned merge) { mc->mc_signature = MDBX_MC_READY4CLOSE; mc->mc_flags = 0 /* reset C_UNTRACK */; } -#else - mc = bk; - } - /* Only malloced cursors are permanently tracked. 
*/ - mc->mc_signature = 0; - free(mc); -#endif } cursors[i] = NULL; } @@ -2275,9 +2260,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { txn->mt_next_pgno = meta->mm_last_pg + 1; /* Copy the DB info and flags */ memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); -#if MDBX_MODE_ENABLED txn->mt_canary = meta->mm_canary; -#endif break; } } @@ -2294,9 +2277,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { mdbx_mutex_lock(&tsan_mutex); #endif MDB_meta *meta = mdbx_meta_head_w(env); -#if MDBX_MODE_ENABLED txn->mt_canary = meta->mm_canary; -#endif txn->mt_txnid = meta->mm_txnid + 1; txn->mt_flags = flags; #ifdef __SANITIZE_THREAD__ @@ -2631,12 +2612,8 @@ int mdbx_txn_reset(MDB_txn *txn) { if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY))) return EINVAL; -#if MDBX_MODE_ENABLED /* LY: don't close DBI-handles in MDBX mode */ return mdbx_txn_end(txn, MDB_END_RESET | MDB_END_UPDATE); -#else - return mdbx_txn_end(txn, MDB_END_RESET); -#endif /* MDBX_MODE_ENABLED */ } int mdbx_txn_abort(MDB_txn *txn) { @@ -2646,12 +2623,10 @@ int mdbx_txn_abort(MDB_txn *txn) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; -#if MDBX_MODE_ENABLED if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) /* LY: don't close DBI-handles in MDBX mode */ return mdbx_txn_end(txn, MDB_END_ABORT | MDB_END_UPDATE | MDB_END_SLOT | MDB_END_FREE); -#endif /* MDBX_MODE_ENABLED */ if (txn->mt_child) mdbx_txn_abort(txn->mt_child); @@ -3343,9 +3318,7 @@ int mdbx_txn_commit(MDB_txn *txn) { meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; meta.mm_last_pg = txn->mt_next_pgno - 1; meta.mm_txnid = txn->mt_txnid; -#if MDBX_MODE_ENABLED meta.mm_canary = txn->mt_canary; -#endif rc = mdbx_env_sync0(env, env->me_flags | txn->mt_flags, &meta); } @@ -3569,9 +3542,7 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; target->mm_last_pg = 
pending->mm_last_pg; -#if MDBX_MODE_ENABLED target->mm_canary = pending->mm_canary; -#endif /* LY: 'commit' the meta */ target->mm_txnid = pending->mm_txnid; target->mm_datasync_sign = pending->mm_datasync_sign; @@ -5673,7 +5644,6 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (flags & MDB_CURRENT) { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return EINVAL; -#if MDBX_MODE_ENABLED if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5688,7 +5658,6 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } } -#endif /* MDBX_MODE_ENABLED */ rc = MDB_SUCCESS; } else if (mc->mc_db->md_root == P_INVALID) { /* new database, cursor has nothing to point to */ @@ -5969,8 +5938,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (omp->mp_flags & P_DIRTY) { /* yes, overwrite it. Note in this case we don't * bother to try shrinking the page if the new data - * is smaller than the overflow threshold. - */ + * is smaller than the overflow threshold. */ if (unlikely(level > 1)) { /* It is writable only in a parent txn */ MDB_page *np = mdbx_page_malloc(mc->mc_txn, ovpages); @@ -5982,22 +5950,18 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, /* Note - this page is already counted in parent's dirty_room */ rc2 = mdbx_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); mdbx_cassert(mc, rc2 == 0); + /* Currently we make the page look as with put() in the * parent txn, in case the user peeks at MDB_RESERVEd - * or unused parts. Some users treat ovpages specially. - */ - size_t sz = (size_t)env->me_psize * ovpages, off; - if (MDBX_MODE_ENABLED || !(flags & MDB_RESERVE)) { - /* Skip the part where LMDB will put *data. - * Copy end of page, adjusting alignment so - * compiler may copy words instead of bytes. 
- */ - off = (PAGEHDRSZ + data->mv_size) & -(ssize_t)sizeof(size_t); - memcpy((size_t *)((char *)np + off), - (size_t *)((char *)omp + off), sz - off); - sz = PAGEHDRSZ; - } - memcpy(np, omp, sz); /* Copy whole or header of page */ + * or unused parts. Some users treat ovpages specially. */ + size_t whole = (size_t)env->me_psize * ovpages; + /* Skip the part where LMDB will put *data. + * Copy end of page, adjusting alignment so + * compiler may copy words instead of bytes. */ + size_t off = (PAGEHDRSZ + data->mv_size) & -(ssize_t)sizeof(size_t); + memcpy((size_t *)((char *)np + off), (size_t *)((char *)omp + off), + whole - off); + memcpy(np, omp, PAGEHDRSZ); /* Copy header of page */ omp = np; } SETDSZ(leaf, data->mv_size); @@ -6784,16 +6748,12 @@ int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { return EINVAL; if (unlikely((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)) { -#if MDBX_MODE_ENABLED MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; if (*prev == mc) *prev = mc->mc_next; mc->mc_signature = MDBX_MC_READY4CLOSE; -#else - return EINVAL; -#endif } if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) @@ -6817,7 +6777,6 @@ int mdbx_cursor_count(MDB_cursor *mc, size_t *countp) { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return EINVAL; -#if MDBX_MODE_ENABLED if (!mc->mc_snum) { *countp = 0; return MDB_NOTFOUND; @@ -6838,26 +6797,6 @@ int mdbx_cursor_count(MDB_cursor *mc, size_t *countp) { *countp = mc->mc_xcursor->mx_db.md_entries; } } -#else - if (unlikely(mc->mc_xcursor == NULL)) - return MDB_INCOMPATIBLE; - - if (!mc->mc_snum) - return MDB_NOTFOUND; - - MDB_page *mp = mc->mc_pg[mc->mc_top]; - if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)) - return MDB_NOTFOUND; - - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - *countp = 1; - } else { - if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) - return EINVAL; 
- *countp = mc->mc_xcursor->mx_db.md_entries; - } -#endif /* MDBX_MODE_ENABLED */ return MDB_SUCCESS; } @@ -7668,13 +7607,6 @@ int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; -#if !MDBX_MODE_ENABLED - if (!F_ISSET(txn->mt_dbs[dbi].md_flags, MDB_DUPSORT)) { - /* must ignore any data */ - data = NULL; - } -#endif - return mdbx_del0(txn, dbi, key, data, 0); } @@ -8160,10 +8092,7 @@ int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, if (unlikely(flags & ~(MDB_NOOVERWRITE | MDB_NODUPDATA | MDB_RESERVE | MDB_APPEND | - MDB_APPENDDUP - /* LY: MDB_CURRENT indicates explicit overwrite (update) - for MDBX */ - | (MDBX_MODE_ENABLED ? MDB_CURRENT : 0)))) + MDB_APPENDDUP | MDB_CURRENT))) return EINVAL; if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) @@ -8172,8 +8101,8 @@ int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, mdbx_cursor_init(&mc, txn, dbi, &mx); mc.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &mc; + int rc = MDB_SUCCESS; -#if MDBX_MODE_ENABLED /* LY: support for update (explicit overwrite) */ if (flags & MDB_CURRENT) { rc = mdbx_cursor_get(&mc, key, NULL, MDB_SET); @@ -8188,7 +8117,7 @@ int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, } } } -#endif /* MDBX_MODE_ENABLED */ + if (likely(rc == MDB_SUCCESS)) rc = mdbx_cursor_put(&mc, key, data, flags); txn->mt_cursors[dbi] = mc.mc_next; From f8eb858ef3d53a6e78db1d7951a75d57f7c164e4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 31 Mar 2017 16:20:05 +0300 Subject: [PATCH 028/303] mdbx: drop MDB_MAXKEYSIZE. 
--- mdbx.h | 2 -- src/bits.h | 2 +- src/mdbx.c | 77 ++++++++++++++------------------------------ src/tools/mdbx_chk.c | 23 ++++++------- 4 files changed, 35 insertions(+), 69 deletions(-) diff --git a/mdbx.h b/mdbx.h index db7427f6..3ebc59b3 100644 --- a/mdbx.h +++ b/mdbx.h @@ -921,8 +921,6 @@ LIBMDBX_API int mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); /* Get the maximum size of keys and MDB_DUPSORT data we can write. * - * Depends on the compile-time constant MDB_MAXKEYSIZE. Default 511. - * See MDB_val. * [in] env An environment handle returned by mdbx_env_create() * Returns The maximum size of a key we can write */ diff --git a/src/bits.h b/src/bits.h index 5537aa5f..45c0d69c 100644 --- a/src/bits.h +++ b/src/bits.h @@ -693,7 +693,7 @@ void mdbx_panic(const char *fmt, ...) #define mdbx_debug_enabled(type) (1) #else #define mdbx_debug_enabled(type) (0) -#endif +#endif /* NDEBUG */ #define mdbx_audit_enabled() (0) #define mdbx_assert_enabled() (0) #endif /* MDB_DEBUG */ diff --git a/src/mdbx.c b/src/mdbx.c index 977683a2..2f4e17fb 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -317,48 +317,17 @@ txnid_t mdbx_debug_edge; /** The version number for a database's lockfile format. */ #define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) -/** @brief The max size of a key we can write, or 0 for computed max. - * - * This macro should normally be left alone or set to 0. - * Note that a database with big keys or dupsort data cannot be - * reliably modified by a liblmdb which uses a smaller max. - * The default is 511 for backwards compat, or 0 when #MDB_DEVEL. - * - * Other values are allowed, for backwards compat. However: - * A value bigger than the computed max can break if you do not - * know what you are doing, and liblmdb <= 0.9.10 can break when - * modifying a DB with keys/dupsort data bigger than its max. - * - * Data items in an #MDB_DUPSORT database are also limited to - * this size, since they're actually keys of a sub-DB. 
Keys and - * #MDB_DUPSORT data items must fit on a node in a regular page. - */ -#ifndef MDB_MAXKEYSIZE -#define MDB_MAXKEYSIZE ((MDB_DEVEL) ? 0 : 511) -#endif - -/** The maximum size of a key we can write to the environment. */ -#if MDB_MAXKEYSIZE -#define ENV_MAXKEY(env) (MDB_MAXKEYSIZE) -#else -#define ENV_MAXKEY(env) ((env)->me_maxkey_limit) -#endif /* MDB_MAXKEYSIZE */ - /** @brief The maximum size of a data item. * * We only store a 32 bit value for node sizes. */ #define MAXDATASIZE 0xffffffffUL -/** Key size which fits in a #DKBUF. - * @ingroup debug - */ -#define DKBUF_MAXKEYSIZE ((MDB_MAXKEYSIZE) > 0 ? (MDB_MAXKEYSIZE) : 511) -/** A key buffer. - * @ingroup debug - * This is used for printing a hex dump of a key's contents. - */ -#define DKBUF char kbuf[DKBUF_MAXKEYSIZE * 2 + 1] +#define DKBUF_MAXKEYSIZE 511 /* FIXME */ + /** Key size which fits in a #DKBUF. + * @ingroup debug + */ +#define DKBUF char kbuf[DKBUF_MAXKEYSIZE] /** Display a key in hex. * @ingroup debug * Invoke a function to display a key in hex. 
@@ -3610,6 +3579,16 @@ fail: return rc; } +static void __cold mdbx_env_setup_limits(MDB_env *env, size_t pagesize) { + env->me_maxfree_1pg = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + env->me_nodemax = + (((pagesize - PAGEHDRSZ) / MDB_MINKEYS) & -2) - sizeof(indx_t); + env->me_maxkey_limit = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); + assert(env->me_maxkey_limit > 42 && env->me_maxkey_limit < pagesize); + + env->me_maxpg = env->me_mapsize / pagesize; +} + int __cold mdbx_env_create(MDB_env **env) { MDB_env *e; @@ -3622,10 +3601,11 @@ int __cold mdbx_env_create(MDB_env **env) { e->me_fd = INVALID_HANDLE_VALUE; e->me_lfd = INVALID_HANDLE_VALUE; e->me_pid = mdbx_getpid(); - e->me_os_psize = mdbx_syspagesize(); + mdbx_env_setup_limits(e, e->me_os_psize = mdbx_syspagesize()); VALGRIND_CREATE_MEMPOOL(e, 0, 0); e->me_signature = MDBX_ME_SIGNATURE; *env = e; + return MDB_SUCCESS; } @@ -3783,8 +3763,7 @@ int __cold mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers) { return MDB_SUCCESS; } -/** Further setup required for opening an LMDB environment - */ +/* Further setup required for opening an LMDB environment */ static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { int newenv = 0; int rc = mdbx_env_read_header(env, meta); @@ -3834,15 +3813,7 @@ static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { if (rc) return rc; - env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - env->me_nodemax = - (((env->me_psize - PAGEHDRSZ) / MDB_MINKEYS) & -2) - sizeof(indx_t); - env->me_maxkey_limit = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); - env->me_maxpg = env->me_mapsize / env->me_psize; - - if (MDB_MAXKEYSIZE > env->me_maxkey_limit) - return MDB_BAD_VALSIZE; - + mdbx_env_setup_limits(env, env->me_psize); return MDB_SUCCESS; } @@ -5609,17 +5580,17 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) return (mc->mc_txn->mt_flags & 
MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - if (unlikely(key->mv_size > ENV_MAXKEY(env))) + if (unlikely(key->mv_size > env->me_maxkey_limit)) return MDB_BAD_VALSIZE; #if SIZE_MAX > MAXDATASIZE if (unlikely(data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) - ? ENV_MAXKEY(env) + ? env->me_maxkey_limit : MAXDATASIZE))) return MDB_BAD_VALSIZE; #else if ((mc->mc_db->md_flags & MDB_DUPSORT) && - unlikely(data->mv_size > ENV_MAXKEY(env))) + unlikely(data->mv_size > env->me_maxkey_limit)) return MDB_BAD_VALSIZE; #endif @@ -9089,9 +9060,9 @@ int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { } int __cold mdbx_env_get_maxkeysize(MDB_env *env) { - if (!env || env->me_signature != MDBX_ME_SIGNATURE) + if (!env || env->me_signature != MDBX_ME_SIGNATURE || !env->me_maxkey_limit) return EINVAL; - return ENV_MAXKEY(env); + return env->me_maxkey_limit; } int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 1aaaaff2..cc0efdd9 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -671,14 +671,6 @@ int main(int argc, char *argv[]) { return rc < 0 ? 
EXIT_FAILURE_MDB : EXIT_FAILURE_SYS; } - rc = mdbx_env_get_maxkeysize(env); - if (rc < 0) { - error("mdbx_env_get_maxkeysize failed, error %d %s\n", rc, - mdbx_strerror(rc)); - goto bailout; - } - maxkeysize = rc; - rc = mdbx_env_set_maxdbs(env, MAX_DBI); if (rc < 0) { error("mdbx_env_set_maxdbs failed, error %d %s\n", rc, mdbx_strerror(rc)); @@ -702,6 +694,14 @@ int main(int argc, char *argv[]) { } } + rc = mdbx_env_get_maxkeysize(env); + if (rc < 0) { + error("mdbx_env_get_maxkeysize failed, error %d %s\n", rc, + mdbx_strerror(rc)); + goto bailout; + } + maxkeysize = rc; + rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); if (rc) { error("mdbx_txn_begin(read-only) failed, error %d %s\n", rc, @@ -734,11 +734,8 @@ int main(int argc, char *argv[]) { sf[i]); if (info.me_mapaddr) print(" - mapaddr %p\n", info.me_mapaddr); - print(" - pagesize %u, max keysize %zu (%s), max readers %u\n", - stat.ms_psize, maxkeysize, - (maxkeysize == 511) ? "default" : (maxkeysize == 0) ? "devel" - : "custom", - info.me_maxreaders); + print(" - pagesize %u, max keysize %zu, max readers %u\n", stat.ms_psize, + maxkeysize, info.me_maxreaders); print(" - transactions: last %zu, bottom %zu, lag reading %zi\n", info.me_last_txnid, info.me_tail_txnid, info.me_last_txnid - info.me_tail_txnid); From 5c5ef256b39665d1f882f12c76f54cbac6c807d0 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 31 Mar 2017 16:20:38 +0300 Subject: [PATCH 029/303] mdbx: change mdbx_dkey(). 
--- mdbx.h | 2 +- src/mdbx.c | 35 +++++++++++++++++++++-------------- test/test6.c | 5 +++-- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/mdbx.h b/mdbx.h index 3ebc59b3..301854e6 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1625,7 +1625,7 @@ LIBMDBX_API int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); */ LIBMDBX_API int mdbx_reader_check(MDB_env *env, int *dead); -LIBMDBX_API char *mdbx_dkey(MDB_val *key, char *buf); +LIBMDBX_API char *mdbx_dkey(MDB_val *key, char *buf, const size_t bufsize); LIBMDBX_API int mdbx_env_close_ex(MDB_env *env, int dont_sync); diff --git a/src/mdbx.c b/src/mdbx.c index 2f4e17fb..8abf7666 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -332,7 +332,7 @@ txnid_t mdbx_debug_edge; * @ingroup debug * Invoke a function to display a key in hex. */ -#define DKEY(x) mdbx_dkey(x, kbuf) +#define DKEY(x) mdbx_dkey(x, kbuf, sizeof(kbuf)) /** An invalid page number. * Mainly used to denote an empty tree. @@ -821,7 +821,7 @@ static __inline pgno_t mdbx_dbg_pgno(MDB_page *mp) { * @param[in] buf the buffer to write into. Should always be #DKBUF. * @return The key in hexadecimal form. */ -char *mdbx_dkey(MDB_val *key, char *buf) { +char *mdbx_dkey(MDB_val *key, char *buf, const size_t bufsize) { #ifdef _MSC_VER (void)key; (void)buf; @@ -833,17 +833,24 @@ char *mdbx_dkey(MDB_val *key, char *buf) { if (!key) return ""; - if (key->mv_size > DKBUF_MAXKEYSIZE) - return "MDB_MAXKEYSIZE"; -/* may want to make this a dynamic check: if the key is mostly -* printable characters, print it as-is instead of converting to hex. 
*/ -#if 1 - buf[0] = '\0'; - for (i = 0; i < key->mv_size; i++) - ptr += sprintf(ptr, "%02x", ((unsigned char *)key->mv_data)[i]); -#else - sprintf(buf, "%.*s", key->mv_size, key->mv_data); -#endif + const uint8_t *const data = key->mv_data; + bool is_ascii = true; + for (i = 0; is_ascii && i < key->mv_size; i++) + if (data[i] < ' ' || data[i] > 127) + is_ascii = false; + + if (is_ascii) + snprintf(buf, bufsize, "%.*s", + (key->mv_size > INT_MAX) ? INT_MAX : (int)key->mv_size, data); + else { + buf[0] = '\0'; + for (i = 0; i < key->mv_size; i++) { + int len = snprintf(ptr, bufsize - (ptr - buf), "%02x", data[i]); + if (len < 1) + break; + ptr += len; + } + } return buf; #endif /* _MSC_VER */ } @@ -6833,7 +6840,7 @@ static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { k2.mv_data = NODEKEY(node); k2.mv_size = node->mn_ksize; mdbx_debug("update key %u (ofs %u) [%s] to [%s] on page %zu", indx, ptr, - mdbx_dkey(&k2, kbuf2), DKEY(key), mp->mp_pgno); + mdbx_dkey(&k2, kbuf2, sizeof(kbuf2)), DKEY(key), mp->mp_pgno); } /* Sizes must be 2-byte aligned. */ diff --git a/test/test6.c b/test/test6.c index e9d6dd3c..03b6f7d1 100644 --- a/test/test6.c +++ b/test/test6.c @@ -113,8 +113,9 @@ int main(int argc, char *argv[]) { E(mdbx_cursor_get(cursor, &key, &data, MDB_FIRST)); do { - printf("key: %p %s, data: %p %.*s\n", key.mv_data, mdbx_dkey(&key, dkbuf), - data.mv_data, (int)data.mv_size, (char *)data.mv_data); + printf("key: %p %s, data: %p %.*s\n", key.mv_data, + mdbx_dkey(&key, dkbuf, sizeof(dkbuf)), data.mv_data, + (int)data.mv_size, (char *)data.mv_data); } while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0); CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); mdbx_cursor_close(cursor); From a2593625cc2ac5f033c4b5f262fa5fb29656e93f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 31 Mar 2017 17:51:42 +0300 Subject: [PATCH 030/303] mdbx: add mdbx_get_maxkeysize(). 
--- mdbx.h | 1 + src/mdbx.c | 44 +++++++++++++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/mdbx.h b/mdbx.h index 301854e6..594e488c 100644 --- a/mdbx.h +++ b/mdbx.h @@ -925,6 +925,7 @@ LIBMDBX_API int mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); * Returns The maximum size of a key we can write */ LIBMDBX_API int mdbx_env_get_maxkeysize(MDB_env *env); +LIBMDBX_API int mdbx_get_maxkeysize(size_t pagesize); /* Set application information associated with the MDB_env. * diff --git a/src/mdbx.c b/src/mdbx.c index 8abf7666..552a424f 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3586,14 +3586,42 @@ fail: return rc; } +int __cold mdbx_env_get_maxkeysize(MDB_env *env) { + if (!env || env->me_signature != MDBX_ME_SIGNATURE || !env->me_maxkey_limit) + return EINVAL; + return env->me_maxkey_limit; +} + +static __inline ssize_t mdbx_calc_nodemax(ssize_t pagesize) { + assert(pagesize > 0); + return (((pagesize - PAGEHDRSZ) / MDB_MINKEYS) & -(ssize_t)2) - + sizeof(indx_t); +} + +static __inline ssize_t mdbx_calc_maxkey(ssize_t nodemax) { + assert(nodemax > 0); + return nodemax - (NODESIZE + sizeof(MDB_db)); +} + +int mdbx_get_maxkeysize(size_t pagesize) { + if (pagesize == 0) + pagesize = mdbx_syspagesize(); + + ssize_t nodemax = mdbx_calc_nodemax(pagesize); + if (nodemax < 0) + return -EINVAL; + + ssize_t maxkey = mdbx_calc_maxkey(nodemax); + return (maxkey > 0 && maxkey < INT_MAX) ? 
(int)maxkey : -EINVAL; +} + static void __cold mdbx_env_setup_limits(MDB_env *env, size_t pagesize) { env->me_maxfree_1pg = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - env->me_nodemax = - (((pagesize - PAGEHDRSZ) / MDB_MINKEYS) & -2) - sizeof(indx_t); - env->me_maxkey_limit = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); - assert(env->me_maxkey_limit > 42 && env->me_maxkey_limit < pagesize); - env->me_maxpg = env->me_mapsize / pagesize; + + env->me_nodemax = mdbx_calc_nodemax(pagesize); + env->me_maxkey_limit = mdbx_calc_maxkey(env->me_nodemax); + assert(env->me_maxkey_limit > 42 && env->me_maxkey_limit < pagesize); } int __cold mdbx_env_create(MDB_env **env) { @@ -9066,12 +9094,6 @@ int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { return MDB_SUCCESS; } -int __cold mdbx_env_get_maxkeysize(MDB_env *env) { - if (!env || env->me_signature != MDBX_ME_SIGNATURE || !env->me_maxkey_limit) - return EINVAL; - return env->me_maxkey_limit; -} - int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { unsigned i, snap_nreaders; MDB_reader *mr; From ab4597cedc0634b088525b9d00d72aa37f532e76 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 31 Mar 2017 17:59:12 +0300 Subject: [PATCH 031/303] mdbx: add MDBX_MAXDATASIZE. --- mdbx.h | 4 ++++ src/mdbx.c | 14 +------------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/mdbx.h b/mdbx.h index 594e488c..2d30b2b8 100644 --- a/mdbx.h +++ b/mdbx.h @@ -225,6 +225,10 @@ typedef struct iovec MDB_val; #define mv_size iov_len #define mv_data iov_base +/* The maximum size of a data item. + * MDBX only store a 32 bit value for node sizes. 
*/ +#define MDBX_MAXDATASIZE INT32_MAX + /* A callback function used to compare two keys in a database */ typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); diff --git a/src/mdbx.c b/src/mdbx.c index 552a424f..384ecfda 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -317,12 +317,6 @@ txnid_t mdbx_debug_edge; /** The version number for a database's lockfile format. */ #define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) -/** @brief The maximum size of a data item. - * - * We only store a 32 bit value for node sizes. - */ -#define MAXDATASIZE 0xffffffffUL - #define DKBUF_MAXKEYSIZE 511 /* FIXME */ /** Key size which fits in a #DKBUF. * @ingroup debug @@ -5618,16 +5612,10 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (unlikely(key->mv_size > env->me_maxkey_limit)) return MDB_BAD_VALSIZE; -#if SIZE_MAX > MAXDATASIZE if (unlikely(data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? env->me_maxkey_limit - : MAXDATASIZE))) + : MDBX_MAXDATASIZE))) return MDB_BAD_VALSIZE; -#else - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - unlikely(data->mv_size > env->me_maxkey_limit)) - return MDB_BAD_VALSIZE; -#endif if ((mc->mc_db->md_flags & MDB_INTEGERKEY) && unlikely(key->mv_size != sizeof(uint32_t) && From 6d507233cc9e7fb04bca9b714a30f4d9fc9d6661 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Apr 2017 15:17:24 +0300 Subject: [PATCH 032/303] mdbx: snprintf/vsnprintf stub for MSC. --- src/osal.c | 6 ++---- src/osal.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/osal.c b/src/osal.c index 7061204a..2bac64e1 100644 --- a/src/osal.c +++ b/src/osal.c @@ -97,7 +97,7 @@ int mdbx_asprintf(char **strp, const char *fmt, ...) 
{ va_copy(ones, ap); #ifdef _MSC_VER int needed = _vscprintf(fmt, ap); -#elif defined(_BSD_SOURCE) || _XOPEN_SOURCE >= 500 || \ +#elif defined(vsnprintf) || defined(_BSD_SOURCE) || _XOPEN_SOURCE >= 500 || \ defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L int needed = vsnprintf(nullptr, 0, fmt, ap); #else @@ -117,9 +117,7 @@ int mdbx_asprintf(char **strp, const char *fmt, ...) { return -ENOMEM; } -#ifdef _MSC_VER - int actual = vsnprintf_s(*strp, needed + 1, _TRUNCATE, fmt, ones); -#elif defined(_BSD_SOURCE) || _XOPEN_SOURCE >= 500 || \ +#if defined(vsnprintf) || defined(_BSD_SOURCE) || _XOPEN_SOURCE >= 500 || \ defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L int actual = vsnprintf(*strp, needed + 1, fmt, ones); #else diff --git a/src/osal.h b/src/osal.h index 469ca341..4000475b 100644 --- a/src/osal.h +++ b/src/osal.h @@ -421,3 +421,17 @@ void mdbx_txn_unlock(MDB_env *env); int mdbx_rpid_set(MDB_env *env); int mdbx_rpid_clear(MDB_env *env); int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid); + +/*----------------------------------------------------------------------------*/ + +#ifdef _MSC_VER +#ifndef snprintf +#define snprintf(buffer, buffer_size, format, ...) \ + _snprintf_s(buffer, buffer_size, _TRUNCATE, format, __VA_ARGS__) +#endif /* snprintf */ + +#ifndef vsnprintf +#define vsnprintf(buffer, buffer_size, format, args) \ + _vsnprintf_s(buffer, buffer_size, _TRUNCATE, format, args) +#endif /* vsnprintf */ +#endif /* _MSC_VER */ From f51c8dae727f6fa575f590e07914c8b764691a25 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Apr 2017 15:42:30 +0300 Subject: [PATCH 033/303] mdbx: add VS2013/2012/2010 for appveyor matrix. 
--- .appveyor.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.appveyor.yml b/.appveyor.yml index aa190ea7..aa374141 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,6 +1,16 @@ version: '{build}' max_jobs: 1 +image: Visual Studio 2015 + +environment: + matrix: +# - Toolset: v141 + - Toolset: v140 + - Toolset: v120 + - Toolset: v110 + - Toolset: v100 + platform: - x86 - x64 From fcb63130d657f0cbc92b72a870308e4b0e7496b6 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 5 Apr 2017 16:34:19 +0300 Subject: [PATCH 034/303] mdbx: fix posix mdbx_lck_destroy(). --- src/lck-posix.c | 2 +- src/mdbx.c | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index 7c78f707..82b23934 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -89,7 +89,7 @@ bailout: void mdbx_lck_destroy(MDB_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* try get exclusive access */ - if (mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, 0) == 0) { + if (env->me_txns && mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, 0) == 0) { /* got exclusive, drown mutexes */ int rc = pthread_mutex_destroy(&env->me_txns->mti_rmutex); if (rc == 0) diff --git a/src/mdbx.c b/src/mdbx.c index 384ecfda..de228e5a 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4160,8 +4160,10 @@ static void __cold mdbx_env_close0(MDB_env *env) { env->me_valgrind_handle = -1; #endif } - if (env->me_fd != INVALID_HANDLE_VALUE) + if (env->me_fd != INVALID_HANDLE_VALUE) { (void)mdbx_closefile(env->me_fd); + env->me_fd = INVALID_HANDLE_VALUE; + } mdbx_munmap((void *)env->me_txns, (env->me_maxreaders - 1) * sizeof(MDB_reader) + @@ -4169,8 +4171,10 @@ static void __cold mdbx_env_close0(MDB_env *env) { env->me_txns = NULL; env->me_pid = 0; - if (env->me_lfd != INVALID_HANDLE_VALUE) + if (env->me_lfd != INVALID_HANDLE_VALUE) { (void)mdbx_closefile(env->me_lfd); + env->me_lfd = INVALID_HANDLE_VALUE; + } } int __cold mdbx_env_close_ex(MDB_env *env, int dont_sync) { From 
0b400c5dd0830b5ddbe45dec38bbcc6dedaaf0ea Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 5 Apr 2017 18:19:26 +0300 Subject: [PATCH 035/303] mdbx: fix posix mdbx_mmap(). --- src/osal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osal.c b/src/osal.c index 2bac64e1..40e93178 100644 --- a/src/osal.c +++ b/src/osal.c @@ -599,7 +599,7 @@ int mdbx_mmap(void **address, size_t length, int rw, mdbx_filehandle_t fd) { CloseHandle(h); return rc; #else - *address = mmap(address, length, rw ? PROT_READ | PROT_WRITE : PROT_READ, + *address = mmap(NULL, length, rw ? PROT_READ | PROT_WRITE : PROT_READ, MAP_SHARED, fd, 0); return (*address != MAP_FAILED) ? MDB_SUCCESS : errno; #endif From b558757cf16fbd03bd4eb7af6f981ae522e8e7fe Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 5 Apr 2017 18:18:46 +0300 Subject: [PATCH 036/303] mdbx: roundup readers/lck-filesize. --- src/mdbx.c | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index de228e5a..af89676a 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3848,10 +3848,17 @@ static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { /****************************************************************************/ +static __inline bool is_powerof2(size_t x) { return (x & (x - 1)) == 0; } + +static __inline size_t roundup2(size_t value, size_t granularity) { + assert(is_powerof2(granularity)); + return (value + granularity - 1) & ~(granularity - 1); +} + /* Open and/or initialize the lock region for the environment. 
*/ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) { - off_t size, rsize; + off_t size; int rc = mdbx_openfile(lpath, O_RDWR | O_CREAT, mode, &env->me_lfd); if (rc != MDB_SUCCESS) { @@ -3875,19 +3882,22 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, rc = mdbx_filesize(env->me_lfd, &size); if (unlikely(rc != MDB_SUCCESS)) return rc; - rsize = (env->me_maxreaders - 1) * sizeof(MDB_reader) + sizeof(MDBX_lockinfo); - if (size != rsize && *excl > 0) { - rc = mdbx_ftruncate(env->me_lfd, rsize); - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } else { - rsize = size; - size = rsize - sizeof(MDBX_lockinfo); - env->me_maxreaders = size / sizeof(MDB_reader) + 1; + + if (*excl > 0) { + off_t wanna = roundup2((env->me_maxreaders - 1) * sizeof(MDB_reader) + + sizeof(MDBX_lockinfo), + env->me_os_psize); + if (size != wanna) { + rc = mdbx_ftruncate(env->me_lfd, wanna); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + size = wanna; + } } + env->me_maxreaders = (size - sizeof(MDBX_lockinfo)) / sizeof(MDB_reader) + 1; void *addr = NULL; - rc = mdbx_mmap(&addr, rsize, true, env->me_lfd); + rc = mdbx_mmap(&addr, size, true, env->me_lfd); if (unlikely(rc != MDB_SUCCESS)) return rc; env->me_txns = addr; @@ -3901,25 +3911,25 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, } #ifdef MADV_NOHUGEPAGE - (void)madvise(env->me_txns, rsize, MADV_NOHUGEPAGE); + (void)madvise(env->me_txns, size, MADV_NOHUGEPAGE); #endif #ifdef MADV_DODUMP - (void)madvise(env->me_txns, rsize, MADV_DODUMP); + (void)madvise(env->me_txns, size, MADV_DODUMP); #endif #ifdef MADV_DONTFORK - if (madvise(env->me_txns, rsize, MADV_DONTFORK) < 0) + if (madvise(env->me_txns, size, MADV_DONTFORK) < 0) return errno; #endif #ifdef MADV_WILLNEED - if (madvise(env->me_txns, rsize, MADV_WILLNEED) < 0) + if (madvise(env->me_txns, size, MADV_WILLNEED) < 0) return errno; #endif #ifdef MADV_RANDOM - if (madvise(env->me_txns, rsize, 
MADV_RANDOM) < 0) + if (madvise(env->me_txns, size, MADV_RANDOM) < 0) return errno; #endif @@ -4129,7 +4139,6 @@ static void __cold mdbx_env_close0(MDB_env *env) { if (!(env->me_flags & MDB_ENV_ACTIVE)) return; env->me_flags &= ~MDB_ENV_ACTIVE; - mdbx_lck_destroy(env); /* Doing this here since me_dbxs may not exist during mdbx_env_close */ if (env->me_dbxs) { @@ -4171,6 +4180,7 @@ static void __cold mdbx_env_close0(MDB_env *env) { env->me_txns = NULL; env->me_pid = 0; + mdbx_lck_destroy(env); if (env->me_lfd != INVALID_HANDLE_VALUE) { (void)mdbx_closefile(env->me_lfd); env->me_lfd = INVALID_HANDLE_VALUE; From 095cd25e2337d00f866d95b81dfe6fc29bc99f59 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 5 Apr 2017 18:33:19 +0300 Subject: [PATCH 037/303] mdbx: check sys-pagesize for power-of-2. --- src/bits.h | 7 +++++++ src/mdbx.c | 9 ++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/bits.h b/src/bits.h index 45c0d69c..57551a48 100644 --- a/src/bits.h +++ b/src/bits.h @@ -780,3 +780,10 @@ void mdbx_rthc_unlock(void); int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDB_reader *begin, MDB_reader *end); void mdbx_rthc_remove(mdbx_thread_key_t key); void mdbx_rthc_cleanup(void); + +static __inline bool is_power2(size_t x) { return (x & (x - 1)) == 0; } + +static __inline size_t roundup2(size_t value, size_t granularity) { + assert(is_power2(granularity)); + return (value + granularity - 1) & ~(granularity - 1); +} diff --git a/src/mdbx.c b/src/mdbx.c index af89676a..1af8aec8 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3631,6 +3631,8 @@ int __cold mdbx_env_create(MDB_env **env) { e->me_lfd = INVALID_HANDLE_VALUE; e->me_pid = mdbx_getpid(); mdbx_env_setup_limits(e, e->me_os_psize = mdbx_syspagesize()); + if (!is_power2(e->me_os_psize)) + return MDB_INCOMPATIBLE; VALGRIND_CREATE_MEMPOOL(e, 0, 0); e->me_signature = MDBX_ME_SIGNATURE; *env = e; @@ -3848,13 +3850,6 @@ static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { 
/****************************************************************************/ -static __inline bool is_powerof2(size_t x) { return (x & (x - 1)) == 0; } - -static __inline size_t roundup2(size_t value, size_t granularity) { - assert(is_powerof2(granularity)); - return (value + granularity - 1) & ~(granularity - 1); -} - /* Open and/or initialize the lock region for the environment. */ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) { From a3644aa6d02fc677920ec67465056aeba2c1d2aa Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 30 Mar 2017 18:54:57 +0300 Subject: [PATCH 038/303] mdbx: new testset (initial, stub). Initial stub for https://github.com/ReOpen/libmdbx/issues/8 --- .appveyor.yml | 7 +- .gitignore | 45 ++-- Makefile | 42 +-- mdbx-dll.vcxproj => dll.vcxproj | 0 libmdbx.files | 31 ++- mdbx.h | 2 +- mdbx.sln | 12 +- src/bits.h | 8 +- src/defs.h | 7 + src/lck-windows.c | 2 + src/mdbx.c | 52 ++-- src/osal.c | 8 +- src/osal.h | 5 +- test/base.h | 61 +++++ test/cases.cc | 69 +++++ test/config.cc | 446 ++++++++++++++++++++++++++++++++ test/config.h | 149 +++++++++++ test/dead.cc | 61 +++++ test/hill.cc | 37 +++ test/jitter.cc | 33 +++ test/keygen.cc | 72 ++++++ test/keygen.h | 123 +++++++++ test/log.cc | 129 +++++++++ test/log.h | 61 +++++ test/main.cc | 311 ++++++++++++++++++++++ test/osal-unix.cc | 230 ++++++++++++++++ test/osal-windows.cc | 262 +++++++++++++++++++ test/osal.h | 28 ++ test/test.cc | 192 ++++++++++++++ test/test.h | 145 +++++++++++ test/test.vcxproj | 182 +++++++++++++ test/test0.c | 220 ---------------- test/test1.c | 199 -------------- test/test2.c | 153 ----------- test/test3.c | 162 ------------ test/test4.c | 196 -------------- test/test5.c | 164 ------------ test/test6.c | 175 ------------- test/test7.c | 246 ------------------ test/test_bench.c | 260 ------------------- test/test_yota1.c | 277 -------------------- test/test_yota2.c | 335 ------------------------ test/utils.cc | 90 +++++++ 
test/utils.h | 312 ++++++++++++++++++++++ 44 files changed, 3120 insertions(+), 2481 deletions(-) rename mdbx-dll.vcxproj => dll.vcxproj (100%) create mode 100644 test/base.h create mode 100644 test/cases.cc create mode 100644 test/config.cc create mode 100644 test/config.h create mode 100644 test/dead.cc create mode 100644 test/hill.cc create mode 100644 test/jitter.cc create mode 100644 test/keygen.cc create mode 100644 test/keygen.h create mode 100644 test/log.cc create mode 100644 test/log.h create mode 100644 test/main.cc create mode 100644 test/osal-unix.cc create mode 100644 test/osal-windows.cc create mode 100644 test/osal.h create mode 100644 test/test.cc create mode 100644 test/test.h create mode 100644 test/test.vcxproj delete mode 100644 test/test0.c delete mode 100644 test/test1.c delete mode 100644 test/test2.c delete mode 100644 test/test3.c delete mode 100644 test/test4.c delete mode 100644 test/test5.c delete mode 100644 test/test6.c delete mode 100644 test/test7.c delete mode 100644 test/test_bench.c delete mode 100644 test/test_yota1.c delete mode 100644 test/test_yota2.c create mode 100644 test/utils.cc create mode 100644 test/utils.h diff --git a/.appveyor.yml b/.appveyor.yml index aa374141..a164b1c8 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,3 @@ -version: '{build}' max_jobs: 1 image: Visual Studio 2015 @@ -23,6 +22,6 @@ build: verbosity: minimal project: mdbx.sln -# test_script: -# - ps: | -# & "C:\projects\mdbx\$env:PLATFORM\$env:CONFIGURATION\test\test.exe" +test_script: +- ps: | + & "C:\projects\mdbx\$env:PLATFORM\$env:CONFIGURATION\test\test.exe" --pathname=tmp.db --basic --dont-cleanup-after diff --git a/.gitignore b/.gitignore index 7a7d1485..84d48914 100644 --- a/.gitignore +++ b/.gitignore @@ -1,26 +1,33 @@ -mtest[0123456] -wbench -testdb -mdbx_copy -mdbx_stat -mdbx_dump -mdbx_load -mdbx_chk -*.lo -*.[ao] -*.so -*.exe *[~#] +*.[ao] *.bak -*.orig -*.rej -*.gcov -*.gcda -*.gcno core core.* -valgrind.* -yota_test* +*.exe 
+*.gcda +*.gcno +*.gcov +libmdbx.creator.user +*.lo +mdbx_chk +mdbx_copy mdbx-dll.VC.db mdbx-dll.VC.VC.opendb mdbx-dll.vcxproj.filters +mdbx_dump +mdbx_load +mdbx_stat +*.orig +*.rej +*.so +/test/test +test/test.vcxproj.user +test/tmp.db +test/tmp.db-lock +tmp.db +tmp.db-lock +valgrind.* +.vs/ +Win32/ +x64/ +x86/ diff --git a/Makefile b/Makefile index 9da7d70e..e88007b1 100644 --- a/Makefile +++ b/Makefile @@ -23,10 +23,13 @@ mandir ?= $(prefix)/man suffix ?= CC ?= gcc +CXX ?= g++ XCFLAGS ?= -DNDEBUG=1 -DMDB_DEBUG=0 -DMDBX_EXPORTS=1 CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden CFLAGS += -D_GNU_SOURCE=1 -std=gnu99 -pthread $(XCFLAGS) -COVER ?= -coverage -fprofile-arcs -ftest-coverage -O0 +# COVER ?= -coverage -fprofile-arcs -ftest-coverage -Og + +CXXFLAGS = -std=c++11 $(filter-out -std=gnu99,$(CFLAGS)) # LY: for ability to built with modern glibc, # but then run with the old @@ -41,14 +44,12 @@ HEADERS := mdbx.h LIBRARIES := libmdbx.a libmdbx.so TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 -TESTS := test0 test1 test2 test3 test4 test5 test6 test7 \ - test_bench test_yota1 test_yota2 MDBX_SRC := mdbx.h $(addprefix src/, mdbx.c osal.c lck-posix.c defs.h bits.h osal.h midl.h) -.PHONY: mdbx all install clean check tests coverage +.PHONY: mdbx all install clean check coverage -all: $(LIBRARIES) $(TOOLS) +all: $(LIBRARIES) $(TOOLS) test/test mdbx: libmdbx.a libmdbx.so @@ -65,20 +66,10 @@ install: $(LIBRARIES) $(TOOLS) $(HEADERS) && cp -t $(SANDBOX)$(mandir)/man1 $(MANPAGES) clean: - rm -rf $(TOOLS) $(TESTS) @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err + rm -rf $(TOOLS) test/test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err -tests: $(TESTS) - -check: tests - [ -d tmp.db ] || mkdir tmp.db && rm -f tmp.db/* \ - && echo "*** LMDB-TEST-0" && ./test0 && ./mdbx_chk -v tmp.db \ - && echo "*** LMDB-TEST-1" && ./test1 && ./mdbx_chk -v tmp.db \ - && echo "*** 
LMDB-TEST-2" && ./test2 && ./mdbx_chk -v tmp.db \ - && echo "*** LMDB-TEST-3" && ./test3 && ./mdbx_chk -v tmp.db \ - && echo "*** LMDB-TEST-4" && ./test4 && ./mdbx_chk -v tmp.db \ - && echo "*** LMDB-TEST-5" && ./test5 && ./mdbx_chk -v tmp.db \ - && echo "*** LMDB-TEST-6" && ./test6 && ./mdbx_chk -v tmp.db \ - && echo "*** LMDB-TESTs - all done" +check: test/test + test/test --pathname=tmp.db --basic --dont-cleanup-after && ./mdbx_chk -vn tmp.db mdbx.o: $(MDBX_SRC) Makefile $(CC) $(CFLAGS) -c src/mdbx.c -o $@ @@ -98,19 +89,8 @@ libmdbx.so: mdbx.o osal.o lck-posix.o mdbx_%: src/tools/mdbx_%.c libmdbx.a $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ -test%: test/test%.c libmdbx.a - $(CC) $(CFLAGS) $(LDFLAGS) -Isrc -o $@ $^ - -gcov-mdbx.o: $(MDBX_SRC) Makefile - $(CC) $(CFLAGS) $(COVER) -c src/mdbx.c -o $@ - -# Seem this useless :( -coverage: gcov-mdbx.o - for t in test/test[0-9]*.c; do x=`basename \$$t .c`; \ - $(CC) $(CFLAGS) $(COVER) -Isrc $$t -o gcov-$$x $^; \ - rm -rf tmp.db; mkdir tmp.db; ./gcov-$$x; \ - done - gcov *.gcno +test/test: $(wildcard test/*.h) $(filter-out test/osal-windows.cc, $(wildcard test/*.cc)) libmdbx.a + $(CXX) $(CXXFLAGS) $(LDFLAGS) -Isrc -o $@ $(filter-out %.h, $^) ifneq ($(wildcard $(IOARENA)),) diff --git a/mdbx-dll.vcxproj b/dll.vcxproj similarity index 100% rename from mdbx-dll.vcxproj rename to dll.vcxproj diff --git a/libmdbx.files b/libmdbx.files index 1cb076a5..c0b049fa 100644 --- a/libmdbx.files +++ b/libmdbx.files @@ -20,17 +20,26 @@ src/tools/mdbx_load.1 src/tools/mdbx_load.c src/tools/mdbx_stat.1 src/tools/mdbx_stat.c -test/test0.c -test/test1.c -test/test2.c -test/test3.c -test/test4.c -test/test5.c -test/test6.c -test/test7.c -test/test_bench.c -test/test_yota1.c -test/test_yota2.c +test/actor.cc +test/base.h +test/config.h +test/dead.cc +test/hill.cc +test/jitter.cc +test/keygen.cc +test/keygen.h +test/log.cc +test/log.h +test/main.cc +test/config.cc +test/cases.cc +test/osal-unix.cc +test/osal-windows.cc +test/osal.h 
+test/test.cc +test/test.h +test/utils.cc +test/utils.h tutorial/README.md tutorial/sample-bdb.txt tutorial/sample-mdb.txt diff --git a/mdbx.h b/mdbx.h index 2d30b2b8..75688438 100644 --- a/mdbx.h +++ b/mdbx.h @@ -90,7 +90,7 @@ #if defined(LIBMDBX_EXPORTS) # define LIBMDBX_API __dll_export -#elif defined(MDBX_IMPORTS) +#elif defined(LIBMDBX_IMPORTS) # define LIBMDBX_API __dll_import #else # define LIBMDBX_API diff --git a/mdbx.sln b/mdbx.sln index b94111d6..aa2025d8 100644 --- a/mdbx.sln +++ b/mdbx.sln @@ -3,7 +3,9 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 14 VisualStudioVersion = 14.0.25420.1 MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mdbx-dll", "mdbx-dll.vcxproj", "{6D19209B-ECE7-4B9C-941C-0AA2B484F199}" +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "test\test.vcxproj", "{30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "dll", "dll.vcxproj", "{6D19209B-ECE7-4B9C-941C-0AA2B484F199}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -21,6 +23,14 @@ Global {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x64.Build.0 = Release|x64 {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x86.ActiveCfg = Release|Win32 {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x86.Build.0 = Release|Win32 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x64.ActiveCfg = Debug|x64 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x64.Build.0 = Debug|x64 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x86.ActiveCfg = Debug|Win32 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x86.Build.0 = Debug|Win32 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x64.ActiveCfg = Release|x64 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x64.Build.0 = Release|x64 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x86.ActiveCfg = Release|Win32 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x86.Build.0 = Release|Win32 
EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/src/bits.h b/src/bits.h index 57551a48..70cfaa48 100644 --- a/src/bits.h +++ b/src/bits.h @@ -96,11 +96,11 @@ #endif #if defined(__i386) || defined(__x86_64) || defined(_M_IX86) -# define MISALIGNED_OK 1 /* TODO */ +# define UNALIGNED_OK 1 /* TODO */ #endif -#ifndef MISALIGNED_OK -# define MISALIGNED_OK 0 -#endif /* MISALIGNED_OK */ +#ifndef UNALIGNED_OK +# define UNALIGNED_OK 0 +#endif /* UNALIGNED_OK */ #if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF # error "Sanity checking failed: Two's complement, reasonably sized integer types" diff --git a/src/defs.h b/src/defs.h index a00fce4a..24c67cc2 100644 --- a/src/defs.h +++ b/src/defs.h @@ -283,6 +283,13 @@ # endif #endif /* unlikely */ +#if !defined(__noop) && !defined(_MSC_VER) + static __inline int __do_noop(void* crutch, ...) { + (void) crutch; return 0; + } +# define __noop(...) __do_noop(0, __VA_ARGS__) +#endif /* __noop */ + /*----------------------------------------------------------------------------*/ /* Wrapper around __func__, which is a C99 feature */ diff --git a/src/lck-windows.c b/src/lck-windows.c index e2fd071c..a037dbda 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -287,12 +287,14 @@ void mdbx_lck_destroy(MDB_env *env) { /* double `unlock` for robustly remove overlapped shared/exclusive locks */ while (funlock(env->me_lfd, LCK_LOWER)) ; + rc = GetLastError(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); while (funlock(env->me_lfd, LCK_UPPER)) ; + rc = GetLastError(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); diff --git a/src/mdbx.c b/src/mdbx.c index 1af8aec8..19424aba 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -538,7 +538,7 @@ static __inline MDB_node *NODEPTR(MDB_page *p, unsigned i) { #define NODEKSZ(node) ((node)->mn_ksize) /** Copy a page number from src to dst */ -#if MISALIGNED_OK +#if 
UNALIGNED_OK #define COPY_PGNO(dst, src) dst = src #elif SIZE_MAX > 4294967295UL #define COPY_PGNO(dst, src) \ @@ -560,7 +560,7 @@ static __inline MDB_node *NODEPTR(MDB_page *p, unsigned i) { *d++ = *s++; \ *d = *s; \ } while (0) -#endif /* MISALIGNED_OK */ +#endif /* UNALIGNED_OK */ /** The address of a key in a LEAF2 page. * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate @@ -754,16 +754,31 @@ static const char *__mdbx_strerr(int errnum) { } } -const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen) { +const char *__cold mdbx_strerror_r(int errnum, char *buf, size_t buflen) { const char *msg = __mdbx_strerr(errnum); if (!msg) { -#if defined(_WIN32) || defined(_WIN64) - (void)errnum; - (void)buf; - (void)buflen; - msg = FIXME; -#else + if (!buflen) + return NULL; +#ifdef _MSC_VER + int rc = strerror_s(buf, buflen, errnum); + assert(rc == 0); + (void)rc; + return buf; +#elif defined(_GNU_SOURCE) + /* GNU-specific */ msg = strerror_r(errnum, buf, buflen); +#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) + /* XSI-compliant */ + int rc = strerror_r(errnum, buf, buflen); + if (rc) { + rc = snprintf(buf, buflen, "error %d", errnum); + assert(rc > 0); + } + return buf; +#else + strncpy(buf, strerror(errnum), buflen); + buf[buflen - 1] = '\0'; + return buf; #endif } return msg; @@ -772,9 +787,12 @@ const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen) { const char *__cold mdbx_strerror(int errnum) { const char *msg = __mdbx_strerr(errnum); if (!msg) { -#if defined(_WIN32) || defined(_WIN64) - (void)errnum; - msg = FIXME; +#ifdef _MSC_VER + static __thread char buffer[1024]; + int rc = strerror_s(buffer, sizeof(buffer), errnum); + assert(rc == 0); + (void)rc; + msg = buffer; #else msg = strerror(errnum); #endif @@ -3776,7 +3794,7 @@ int __cold mdbx_env_set_maxreaders(MDB_env *env, unsigned readers) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(env->me_map)) + if (unlikely(env->me_map || 
readers > INT16_MAX)) return EINVAL; env->me_maxreaders = readers; @@ -4247,7 +4265,7 @@ static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { mdbx_assert(NULL, a->mv_size == b->mv_size); mdbx_assert(NULL, 0 == (uintptr_t)a->mv_data % sizeof(uint16_t) && 0 == (uintptr_t)b->mv_data % sizeof(uint16_t)); -#if MISALIGNED_OK +#if UNALIGNED_OK switch (a->mv_size) { case 4: return mdbx_cmp2int(*(uint32_t *)a->mv_data, *(uint32_t *)b->mv_data); @@ -4282,7 +4300,7 @@ static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { } while (pa != end); return diff; } -#endif /* MISALIGNED_OK */ +#endif /* UNALIGNED_OK */ } /** Compare two items pointing at unsigneds of unknown alignment. @@ -4291,7 +4309,7 @@ static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { */ static int __hot mdbx_cmp_int_ua(const MDB_val *a, const MDB_val *b) { mdbx_assert(NULL, a->mv_size == b->mv_size); -#if MISALIGNED_OK +#if UNALIGNED_OK switch (a->mv_size) { case 4: return mdbx_cmp2int(*(uint32_t *)a->mv_data, *(uint32_t *)b->mv_data); @@ -4322,7 +4340,7 @@ static int __hot mdbx_cmp_int_ua(const MDB_val *a, const MDB_val *b) { #else /* __BYTE_ORDER__ */ return memcmp(a->mv_data, b->mv_data, a->mv_size); #endif /* __BYTE_ORDER__ */ -#endif /* MISALIGNED_OK */ +#endif /* UNALIGNED_OK */ } /** Compare two items lexically */ diff --git a/src/osal.c b/src/osal.c index 40e93178..12cf7782 100644 --- a/src/osal.c +++ b/src/osal.c @@ -17,7 +17,7 @@ #include "./bits.h" #if defined(_WIN32) || defined(_WIN64) -static int waitfor2errcode(DWORD result) { +static int waitstatus2errcode(DWORD result) { switch (result) { case WAIT_OBJECT_0: return MDB_SUCCESS; @@ -183,7 +183,7 @@ int mdbx_mutex_destroy(mdbx_mutex_t *mutex) { int mdbx_mutex_lock(mdbx_mutex_t *mutex) { #if defined(_WIN32) || defined(_WIN64) DWORD code = WaitForSingleObject(*mutex, INFINITE); - return waitfor2errcode(code); + return waitstatus2errcode(code); #else return pthread_mutex_lock(mutex); #endif @@ 
-231,7 +231,7 @@ int mdbx_cond_wait(mdbx_cond_t *cond, mdbx_mutex_t *mutex) { DWORD code = SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); if (code == WAIT_OBJECT_0) code = WaitForSingleObject(*mutex, INFINITE); - return waitfor2errcode(code); + return waitstatus2errcode(code); #else return pthread_cond_wait(cond, mutex); #endif @@ -555,7 +555,7 @@ int mdbx_thread_create(mdbx_thread_t *thread, int mdbx_thread_join(mdbx_thread_t thread) { #if defined(_WIN32) || defined(_WIN64) DWORD code = WaitForSingleObject(thread, INFINITE); - return waitfor2errcode(code); + return waitstatus2errcode(code); #else void *unused_retval = &unused_retval; return pthread_join(thread, &unused_retval); diff --git a/src/osal.h b/src/osal.h index 4000475b..e0dc9244 100644 --- a/src/osal.h +++ b/src/osal.h @@ -172,11 +172,12 @@ typedef pthread_key_t mdbx_thread_key_t; defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \ defined(__i386) || defined(__x86_64__) || defined(_M_IX86) || \ defined(_M_X64) || defined(i386) || defined(_X86_) || defined(__i386__) || \ - defined(_X86_64_) || defined(_M_ARM) || defined(__e2k__) + defined(_X86_64_) || defined(_M_ARM) || defined(_M_ARM64) || \ + defined(__e2k__) #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ #elif defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN) || defined(__ARMEB__) || \ defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(__MIPSEB__) || \ - defined(_MIPSEB) || defined(__MIPSEB) + defined(_MIPSEB) || defined(__MIPSEB) || defined(_M_IA64) #define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ #else #error __BYTE_ORDER__ should be defined. diff --git a/test/base.h b/test/base.h new file mode 100644 index 00000000..155cd98f --- /dev/null +++ b/test/base.h @@ -0,0 +1,61 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#pragma once + +#ifndef NOMINMAX +#define NOMINMAX +#endif + +#if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) +/* If you wish to build your application for a previous Windows platform, + * include WinSDKVer.h and set the _WIN32_WINNT macro to the platform you + * wish to support before including SDKDDKVer.h. + * + * TODO: #define _WIN32_WINNT WIN32_MUSTDIE */ +#include +#endif /* WINDOWS */ + +#include +#include +#include +#include +#include +#include + +#ifdef _BSD_SOURCE +#include +#endif + +#include +#include +#include // for PRId64, PRIu64 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#include +#endif + +#include "../mdbx.h" +#include "../src/defs.h" diff --git a/test/cases.cc b/test/cases.cc new file mode 100644 index 00000000..a24838c6 --- /dev/null +++ b/test/cases.cc @@ -0,0 +1,69 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#include "test.h" + +void configure_actor(unsigned &lastid, const actor_testcase testcase, + const char *id_cstr, const actor_params &params) { + unsigned wait4id = 0; + + if (params.waitfor_nops) { + for (auto i = global::actors.rbegin(); i != global::actors.rend(); ++i) { + if (i->is_waitable(params.waitfor_nops)) { + if (i->signal_nops && i->signal_nops != params.waitfor_nops) + failure("Previous waitable actor (id=%u) already linked on %u-ops\n", + i->id, i->signal_nops); + wait4id = i->id; + i->signal_nops = params.waitfor_nops; + break; + } + } + if (!wait4id) + failure("No previous waitable actor for %u-ops\n", params.waitfor_nops); + } + + unsigned long id = 0; + if (!id_cstr || strcmp(id_cstr, "auto") == 0) + id = lastid + 1; + else { + char *end = nullptr; + errno = 0; + id = strtoul(id_cstr, &end, 0); + if (errno) + failure_perror("Expects an integer value for actor-id\n", errno); + if (end && *end) + failure("The '%s' is unexpected for actor-id\n", end); + } + + if (id < 1 || id > ACTOR_ID_MAX) + failure("Invalid actor-id %lu\n", id); + lastid = id; + + global::actors.emplace_back(actor_config(testcase, params, id, wait4id)); + global::databases.insert(params.pathname_db); +} + +bool testcase_setup(const char *casename, const actor_params &params, + unsigned &lastid) { + log_notice("testcase_setup(%s): TODO", casename); + + if (strcmp(casename, "basic") == 0) { + configure_actor(lastid, ac_hill, nullptr, params); + return true; + } + + return false; +} + +/* TODO */ diff --git a/test/config.cc b/test/config.cc new file mode 100644 index 00000000..fd9bc732 --- /dev/null +++ b/test/config.cc @@ -0,0 +1,446 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License.
+ * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "test.h" + +#if defined(_MSC_VER) && !defined(strcasecmp) +#define strcasecmp(str, len) _stricmp(str, len) +#endif /* _MSC_VER && strcasecmp() */ + +namespace config { + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + const char **value, const char *default_value) { + assert(narg < argc); + const char *current = argv[narg]; + const size_t optlen = strlen(option); + + if (strncmp(current, "--", 2) || strncmp(current + 2, option, optlen)) + return false; + + if (!value) { + if (current[optlen + 2] == '=') + failure("Option '--%s' doen't accept any value\n", option); + narg += 1; + return true; + } + + *value = nullptr; + if (current[optlen + 2] == '=') { + *value = &current[optlen + 3]; + narg += 1; + return true; + } + + if (narg + 1 < argc && strncmp("--", argv[narg + 1], 2)) { + *value = argv[narg + 1]; + narg += 2; + return true; + } + + if (default_value) { + *value = default_value; + narg += 1; + return true; + } + + failure("No value given for '--%s' option\n", option); +} + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + std::string &value, bool allow_empty) { + const char *value_cstr; + if (!parse_option(argc, argv, narg, option, &value_cstr, + allow_empty ?
"" : nullptr)) + return false; + + if (!allow_empty && strlen(value_cstr) == 0) + failure("Value for option '--%s' could't be empty\n", option); + + value = value_cstr; + return true; +} + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + size_t &mask, const option_verb *verbs) { + const char *list; + if (!parse_option(argc, argv, narg, option, &list)) + return false; + + mask = 0; + while (*list) { + if (*list == ',' || *list == ' ' || *list == '\t') { + ++list; + continue; + } + + const char *const comma = strchr(list, ','); + const size_t len = (comma) ? comma - list : strlen(list); + const option_verb *scan = verbs; + while (true) { + if (!scan->verb) + failure("Unknown verb '%.*s', for option '==%s'\n", (int)len, list, + option); + if (strlen(scan->verb) == len && strncmp(list, scan->verb, len) == 0) { + mask |= scan->mask; + list += len; + break; + } + ++scan; + } + } + + return true; +} + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + uint64_t &value, const scale_mode scale, + const uint64_t minval, const uint64_t maxval) { + + const char *value_cstr; + if (!parse_option(argc, argv, narg, option, &value_cstr)) + return false; + + char *suffix = nullptr; + errno = 0; + unsigned long raw = strtoul(value_cstr, &suffix, 0); + if (errno) + failure("Option '--%s' expects a numeric value (%s)\n", option, + test_strerror(errno)); + + uint64_t multipler = 1; + if (suffix && *suffix) { + if (scale == no_scale) + failure("Option '--%s' doen't accepts suffixes, so '%s' is unexpected\n", + option, suffix); + if (strcmp(suffix, "K") == 0 || strcasecmp(suffix, "Kilo") == 0) + multipler = (scale == decimal) ? UINT64_C(1000) : UINT64_C(1024); + else if (strcmp(suffix, "M") == 0 || strcasecmp(suffix, "Mega") == 0) + multipler = + (scale == decimal) ? UINT64_C(1000) * 1000 : UINT64_C(1024) * 1024; + else if (strcmp(suffix, "G") == 0 || strcasecmp(suffix, "Giga") == 0) + multipler = (scale == decimal) ? 
UINT64_C(1000) * 1000 * 1000 + : UINT64_C(1024) * 1024 * 1024; + else if (strcmp(suffix, "T") == 0 || strcasecmp(suffix, "Tera") == 0) + multipler = (scale == decimal) ? UINT64_C(1000) * 1000 * 1000 * 1000 + : UINT64_C(1024) * 1024 * 1024 * 1024; + else if (scale == duration && + (strcmp(suffix, "s") == 0 || strcasecmp(suffix, "Seconds") == 0)) + multipler = 1; + else if (scale == duration && + (strcmp(suffix, "m") == 0 || strcasecmp(suffix, "Minutes") == 0)) + multipler = 60; + else if (scale == duration && + (strcmp(suffix, "h") == 0 || strcasecmp(suffix, "Hours") == 0)) + multipler = 3600; + else if (scale == duration && + (strcmp(suffix, "d") == 0 || strcasecmp(suffix, "Days") == 0)) + multipler = 3600 * 24; + else + failure( + "Option '--%s' expects a numeric value with Kilo/Mega/Giga/Tera %s" + "suffixes, but '%s' is unexpected\n", + option, (scale == duration) ? "or Seconds/Minutes/Hours/Days " : "", + suffix); + } + + if (raw >= UINT64_MAX / multipler) + failure("The value for option '--%s' is too huge\n", option); + + value = raw * multipler; + if (maxval && value > maxval) + failure("The maximal value for option '--%s' is %" PRIu64 "\n", option, + maxval); + if (value < minval) + failure("The minimal value for option '--%s' is %" PRIu64 "\n", option, + minval); + return true; +} + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + unsigned &value, const scale_mode scale, + const unsigned minval, const unsigned maxval) { + + uint64_t huge; + if (!parse_option(argc, argv, narg, option, huge, scale, minval, maxval)) + return false; + value = (unsigned)huge; + return true; +} + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + bool &value) { + const char *value_cstr = NULL; + if (!parse_option(argc, argv, narg, option, &value_cstr, "yes")) { + const char *current = argv[narg]; + if (strncmp(current, "--no-", 5) || strcmp(current + 5, option)) + return false; + value = false; + narg += 1; + return 
true; + } + + if (!value_cstr) { + value = true; + return true; + } + + if (strcasecmp(value_cstr, "yes") == 0 || strcasecmp(value_cstr, "1") == 0) { + value = true; + return true; + } + + if (strcasecmp(value_cstr, "no") == 0 || strcasecmp(value_cstr, "0") == 0) { + value = false; + return true; + } + + failure( + "Option '--%s' expects a 'boolean' value Yes/No, so '%s' is unexpected\n", + option, value_cstr); +} + +//----------------------------------------------------------------------------- + +const struct option_verb mode_bits[] = { + {"rdonly", MDB_RDONLY}, {"mapasync", MDB_MAPASYNC}, + {"utterly", MDBX_UTTERLY_NOSYNC}, {"nosubdir", MDB_NOSUBDIR}, + {"nosync", MDB_NOSYNC}, {"nometasync", MDB_NOMETASYNC}, + {"writemap", MDB_WRITEMAP}, {"notls", MDB_NOTLS}, + {"nordahead", MDB_NORDAHEAD}, {"nomeminit", MDB_NOMEMINIT}, + {"coasesce", MDBX_COALESCE}, {"lifo", MDBX_LIFORECLAIM}, + {"parturb", MDBX_PAGEPERTURB}, {nullptr, 0}}; + +const struct option_verb table_bits[] = { + {"key.reverse", MDB_REVERSEKEY}, + {"key.integer", MDB_INTEGERKEY}, + {"data.integer", MDB_INTEGERDUP | MDB_DUPFIXED | MDB_DUPSORT}, + {"data.fixed", MDB_DUPFIXED | MDB_DUPSORT}, + {"data.reverse", MDB_REVERSEDUP | MDB_DUPSORT}, + {"data.dups", MDB_DUPSORT}, + {nullptr, 0}}; + +static void dump_verbs(FILE *out, const char *caption, size_t bits, + const struct option_verb *verbs) { + fprintf(out, "%s: (%" PRIu64 ")", caption, (uint64_t)bits); + + while (verbs->mask && bits) { + if ((bits & verbs->mask) == verbs->mask) { + fprintf(out, ", %s", verbs->verb); + bits -= verbs->mask; + } + ++verbs; + } + + fprintf(out, "\n"); +} + +static void dump_duration(FILE *out, const char *caption, unsigned duration) { + fprintf(out, "%s: ", caption); + if (duration) { + if (duration > 24 * 3600) + fprintf(out, "%u_", duration / (24 * 3600)); + if (duration > 3600) + fprintf(out, "%02u:", (duration % (24 * 3600)) / 3600); + fprintf(out, "%02u:%02u", (duration % 3600) / 60, duration % 60); + } else + 
fprintf(out, "INFINITE"); + fprintf(out, "\n"); +} + +void dump(FILE *out) { + for (auto i = global::actors.begin(); i != global::actors.end(); ++i) { + fprintf(out, "testcase %s\n", testcase2str(i->testcase)); + if (i->id) + fprintf(out, "\tid/table %u\n", i->id); + + if (i->params.loglevel) { + fprintf(out, "\tlog: level %u, %s\n", i->params.loglevel, + i->params.pathname_log.empty() ? "console" + : i->params.pathname_log.c_str()); + } + + fprintf(out, "\tdatabase: %s, size %" PRIu64 "\n", + i->params.pathname_db.c_str(), i->params.size); + + dump_verbs(out, "\tmode", i->params.mode_flags, mode_bits); + dump_verbs(out, "\ttable", i->params.table_flags, table_bits); + + fprintf(out, "\tseed %u\n", i->params.seed); + + if (i->params.test_nrecords) + fprintf(out, "\trecords %u\n", i->params.test_nrecords); + else + dump_duration(out, "\tduration", i->params.test_duration); + + if (i->params.nrepeat) + fprintf(out, "\trepeat %u\n", i->params.nrepeat); + else + fprintf(out, "\trepeat ETERNALLY\n"); + + fprintf(out, "\tthreads %u\n", i->params.nthreads); + + fprintf(out, "\tkey: minlen %u, maxlen %u\n", i->params.keylen_min, + i->params.keylen_max); + fprintf(out, "\tdata: minlen %u, maxlen %u\n", i->params.datalen_min, + i->params.datalen_max); + + fprintf(out, "\tbatch: read %u, write %u\n", i->params.batch_read, + i->params.batch_write); + + if (i->params.waitfor_nops) + fprintf(out, "\twait: actor %u for %u ops\n", i->wait4id, + i->params.waitfor_nops); + else if (i->params.delaystart) + dump_duration(out, "\tdelay", i->params.delaystart); + else + fprintf(out, "\tno-delay\n"); + + fprintf(out, "\tlimits: readers %u, tables %u\n", i->params.max_readers, + i->params.max_tables); + + fprintf(out, "\tdrop table: %s\n", i->params.drop_table ? "Yes" : "No"); + + fprintf(out, "\t#---\n"); + } + + dump_duration(out, "timeout", global::config::timeout); + fprintf(out, "cleanup: before %s, after %s\n", + global::config::dont_cleanup_before ?
"No" : "Yes", + global::config::dont_cleanup_after ? "No" : "Yes"); +} + +} /* namespace config */ + +//----------------------------------------------------------------------------- + +using namespace config; + +actor_config::actor_config(actor_testcase testcase, const actor_params &params, + unsigned id, unsigned wait4id) + : params(params) { + this->id = id; + this->order = (unsigned)global::actors.size(); + this->testcase = testcase; + this->wait4id = wait4id; + signal_nops = 0; +} + +const std::string actor_config::serialize(const char *prefix) const { + simple_checksum checksum; + + std::string result; + if (prefix) + result.append(prefix); + + checksum.push(params.pathname_db); + result.append(params.pathname_db); + result.append("|"); + + checksum.push(params.pathname_log); + result.append(params.pathname_log); + result.append("|"); + + static_assert(std::is_pod::value, + "actor_params_pod should by POD"); + result.append(data2hex(static_cast(&params), + sizeof(actor_params_pod), checksum)); + result.append("|"); + + static_assert(std::is_pod::value, + "actor_config_pod should by POD"); + result.append(data2hex(static_cast(this), + sizeof(actor_config_pod), checksum)); + result.append("|"); + + result.append(osal_serialize(checksum)); + result.append("|"); + + result.append(std::to_string(checksum.value)); + return result; +} + +bool actor_config::deserialize(const char *str, actor_config &config) { + simple_checksum checksum; + + TRACE(">> actor_config::deserialize: %s\n", str); + + const char *slash = strchr(str, '|'); + if (!slash) { + TRACE("<< actor_config::deserialize: slash-1\n"); + return false; + } + config.params.pathname_db.assign(str, slash - str); + checksum.push(config.params.pathname_db); + str = slash + 1; + + slash = strchr(str, '|'); + if (!slash) { + TRACE("<< actor_config::deserialize: slash-2\n"); + return false; + } + config.params.pathname_log.assign(str, slash - str); + checksum.push(config.params.pathname_log); + str = slash + 1; + + slash =
strchr(str, '|'); + if (!slash) { + TRACE("<< actor_config::deserialize: slash-3\n"); + return false; + } + static_assert(std::is_pod::value, + "actor_params_pod should by POD"); + if (!hex2data(str, slash, static_cast(&config.params), + sizeof(actor_params_pod), checksum)) { + TRACE("<< actor_config::deserialize: actor_params_pod(%.*s)\n", + (int)(slash - str), str); + return false; + } + str = slash + 1; + + slash = strchr(str, '|'); + if (!slash) { + TRACE("<< actor_config::deserialize: slash-4\n"); + return false; + } + static_assert(std::is_pod::value, + "actor_config_pod should by POD"); + if (!hex2data(str, slash, static_cast(&config), + sizeof(actor_config_pod), checksum)) { + TRACE("<< actor_config::deserialize: actor_config_pod(%.*s)\n", + (int)(slash - str), str); + return false; + } + str = slash + 1; + + slash = strchr(str, '|'); + if (!slash) { + TRACE("<< actor_config::deserialize: slash-5\n"); + return false; + } + if (!config.osal_deserialize(str, slash, checksum)) { + TRACE("<< actor_config::deserialize: osal\n"); + return false; + } + str = slash + 1; + + uint64_t verify = std::stoull(std::string(str)); + if (checksum.value != verify) { + TRACE("<< actor_config::deserialize: checksum mismatch\n"); + return false; + } + + TRACE("<< actor_config::deserialize: OK\n"); + return true; +} diff --git a/test/config.h b/test/config.h new file mode 100644 index 00000000..c16eca9a --- /dev/null +++ b/test/config.h @@ -0,0 +1,149 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#pragma once + +#include "base.h" +#include "log.h" +#include "utils.h" + +#define ACTOR_ID_MAX INT16_MAX + +enum actor_testcase { ac_none, ac_hill, ac_deadread, ac_deadwrite, ac_jitter }; + +enum actor_status { + as_unknown, + as_debuging, + as_running, + as_successful, + as_killed, + as_failed +}; + +const char *testcase2str(const actor_testcase); +const char *status2str(actor_status status); + +//----------------------------------------------------------------------------- + +namespace config { + +enum scale_mode { no_scale, decimal, binary, duration }; + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + const char **value, const char *default_value = nullptr); + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + std::string &value, bool allow_empty = false); + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + bool &value); + +struct option_verb { + const char *const verb; + unsigned mask; +}; + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + size_t &mask, const option_verb *verbs); + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + uint64_t &value, const scale_mode scale, + const uint64_t minval = 0, const uint64_t maxval = INT64_MAX); + +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + unsigned &value, const scale_mode scale, + const unsigned minval = 0, const unsigned maxval = INT32_MAX); + +//----------------------------------------------------------------------------- + +#pragma pack(push, 1) + +struct actor_params_pod { + unsigned loglevel; + + size_t mode_flags; + size_t table_flags; + uint64_t size; + unsigned seed; + + unsigned test_duration; + unsigned test_nrecords; + unsigned nrepeat; + unsigned nthreads; + + unsigned keylen_min, keylen_max; + unsigned datalen_min, datalen_max; + + unsigned batch_read; + unsigned batch_write; + + unsigned delaystart; + 
unsigned waitfor_nops; + + bool drop_table; + + unsigned max_readers; + unsigned max_tables; +}; + +struct actor_config_pod { + unsigned id, order; + actor_testcase testcase; + unsigned wait4id; + unsigned signal_nops; +}; + +#pragma pack(pop) + +extern const struct option_verb mode_bits[]; +extern const struct option_verb table_bits[]; +void dump(FILE *out); + +} /* namespace config */ + +struct actor_params : public config::actor_params_pod { + std::string pathname_log; + std::string pathname_db; + void set_defaults(void); +}; + +struct actor_config : public config::actor_config_pod { + actor_params params; + + bool wanna_event4signalling() const { return true /* TODO ? */; } + + actor_config(actor_testcase testcase, const actor_params ¶ms, unsigned id, + unsigned wait4id); + + actor_config(const char *str) { + if (!deserialize(str, *this)) + failure("Invalid internal parameter '%s'\n", str); + } + + const std::string osal_serialize(simple_checksum &) const; + bool osal_deserialize(const char *str, const char *end, simple_checksum &); + + const std::string serialize(const char *prefix) const; + static bool deserialize(const char *str, actor_config &config); + + bool is_waitable(size_t nops) const { + switch (testcase) { + case ac_hill: + if (!params.test_nrecords || params.test_nrecords >= nops) + return true; + default: + return false; + } + } +}; diff --git a/test/dead.cc b/test/dead.cc new file mode 100644 index 00000000..7afa042d --- /dev/null +++ b/test/dead.cc @@ -0,0 +1,61 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#include "test.h" + +bool testcase_deadread::setup() { + log_trace(">> setup"); + if (!inherited::setup()) + return false; + + log_trace("<< setup"); + return true; +} + +bool testcase_deadread::run() { + /* TODO */ + return true; +} + +bool testcase_deadread::teardown() { + log_trace(">> teardown"); + cursor_guard.release(); + txn_guard.release(); + db_guard.release(); + return true; +} + +//----------------------------------------------------------------------------- + +bool testcase_deadwrite::setup() { + log_trace(">> setup"); + if (!inherited::setup()) + return false; + + log_trace("<< setup"); + return true; +} + +bool testcase_deadwrite::run() { + /* TODO */ + return true; +} + +bool testcase_deadwrite::teardown() { + log_trace(">> teardown"); + cursor_guard.release(); + txn_guard.release(); + db_guard.release(); + return true; +} diff --git a/test/hill.cc b/test/hill.cc new file mode 100644 index 00000000..98a7b82c --- /dev/null +++ b/test/hill.cc @@ -0,0 +1,37 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#include "test.h" + +bool testcase_hill::setup() { + log_trace(">> setup"); + if (!inherited::setup()) + return false; + + /* TODO */ + + log_trace("<< setup"); + return true; +} + +bool testcase_hill::run() { + mdbx_open(); + /* TODO */ + return true; +} + +bool testcase_hill::teardown() { + log_trace(">> teardown"); + return inherited::teardown(); +} diff --git a/test/jitter.cc b/test/jitter.cc new file mode 100644 index 00000000..00385362 --- /dev/null +++ b/test/jitter.cc @@ -0,0 +1,33 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "test.h" + +bool testcase_jitter::setup() { + log_trace(">> setup"); + if (!inherited::setup()) + return false; + + /* TODO */ + + log_trace("<< setup"); + return true; +} + +bool testcase_jitter::run() { return true; } + +bool testcase_jitter::teardown() { + log_trace(">> teardown"); + return inherited::teardown(); +} diff --git a/test/keygen.cc b/test/keygen.cc new file mode 100644 index 00000000..20c80a2a --- /dev/null +++ b/test/keygen.cc @@ -0,0 +1,72 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#include "test.h" + +namespace keygen { + +size_t ffs_fallback(serial_t serial) { + size_t bit = sizeof(serial_t) * 8 - 1; + auto mask = (serial_t)1u << bit; + do { + if (serial & mask) + return bit; + --bit; + } while (mask >>= 1); + return 0; +} + +void __hot make(const serial_t serial, const params_t ¶ms, result_t &out) { + assert(out.limit >= params.maxlen); + assert(params.maxlen >= params.minlen); + assert(params.maxlen >= length(serial)); + + out.value.mv_data = out.bytes; + out.value.mv_size = params.minlen; + + if (params.flags & (MDB_INTEGERKEY | MDB_INTEGERDUP)) { + assert(params.maxlen == params.minlen); + assert(params.minlen == 4 || params.minlen == 8); + if (is_byteorder_le() || params.minlen == 8) + out.u64 = serial; + else + out.u32 = (uint32_t)serial; + } else if (params.flags & (MDB_REVERSEKEY | MDB_REVERSEDUP)) { + if (out.value.mv_size > 8) { + memset(out.bytes, '\0', out.value.mv_size - 8); + unaligned::store(out.bytes + out.value.mv_size - 8, htobe64(serial)); + } else { + out.u64 = htobe64(serial); + if (out.value.mv_size < 8) { + out.value.mv_size = std::max(length(serial), out.value.mv_size); + out.value.mv_data = out.bytes + 8 - out.value.mv_size; + } + } + } else { + out.u64 = htole64(serial); + if (out.value.mv_size > 8) + memset(out.bytes + 8, '\0', out.value.mv_size - 8); + else + out.value.mv_size = std::max(length(serial), out.value.mv_size); + } + + assert(out.value.mv_size >= params.minlen); + assert(out.value.mv_size <= params.maxlen); + assert(out.value.mv_size >= length(serial)); + assert(out.value.mv_data >= out.bytes); + assert((uint8_t *)out.value.mv_data + out.value.mv_size <= + out.bytes + out.limit); +} + +} /* namespace keygen */ diff --git a/test/keygen.h b/test/keygen.h new file mode 100644 index 00000000..58db2633 --- /dev/null +++ b/test/keygen.h @@ -0,0 +1,123 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#pragma once + +#include "base.h" +#include "log.h" +#include "utils.h" + +namespace keygen { + +/* Под "генерацией ключей" здесь понимается генерация обоих значений для + * пар key-value, т.е. не только ключей, но и ассоциированных с ними данных. + */ + +/* Генерацию ключей нельзя отнести к простым задачам, так как требования + * примерно следующие: + * - генерация разного количества уникальных ключей различной длины + * в задаваемом диапазоне; + * - возможность выбора как псевдо-случайного порядка ключей, + * так и по некоторым специфическим законам (ограниченными упорядоченными + * последовательностями, в шахматном порядке по граница диапазона и т.д.); + * - возможность генерации дубликатов с задаваемым законом распределения; + * - возможность генерации непересекающимися кластерами для параллельного + * использования в нескольких потоках; + * - использовать минимум ресурсов, как CPU, так и RAM, в том числе + * включая cache pollution и ram bandwidth. 
+ * + * При этом заведомо известно, что для MDBX не имеет значения: + * - используемый алфавит (значения байтов); + * - частотное распределение по алфавиту; + * - абсолютное значение ключей или разность между отдельными значениями; + * + * Соответственно, схема генерации следующая: + * - для ключей вводится плоская одномерная "координата" uint64_t; + * - все преобразования (назначение диапазонов, переупорядочивание, + * коррекция распределения) выполняются только над "координатой"; + * - итоговая "координата" преобразуется в 8-байтное суррогатное значение + * ключа, при этом опционально суррогат может усекаться до ненулевых байт; + * - для получения ключей длиной более 8 байт суррогат дополняется + * фиксированной последовательностью; + */ + +typedef uint64_t serial_t; + +struct params_t { + uint8_t minlen; + uint8_t flags; + uint16_t maxlen; +}; + +struct result_t { + MDB_val value; + size_t limit; + union { + uint8_t bytes[sizeof(uint64_t)]; + uint32_t u32; + uint64_t u64; + }; +}; + +void make(const serial_t serial, const params_t ¶ms, result_t &out); + +static __inline void make(const serial_t serial, const params_t ¶ms, + result_t &out, size_t limit) { + out.limit = limit; + make(serial, params, out); +} + +size_t ffs_fallback(serial_t serial); + +static __inline size_t ffs(serial_t serial) { + size_t rc; +#ifdef __GNUC__ + if (sizeof(serial) <= sizeof(int)) + rc = __builtin_ffs((int)serial); + else if (sizeof(serial) == sizeof(long)) + rc = __builtin_ffsl((long)serial); + else if (sizeof(serial) == sizeof(long long)) + rc = __builtin_ffsll((long long)serial); + else + return ffs_fallback(serial); +#elif defined(_MSC_VER) + unsigned long index; + if (sizeof(serial) <= sizeof(unsigned long)) + rc = _BitScanReverse(&index, (unsigned long)serial) ? index : 0; + else if (sizeof(serial) <= sizeof(unsigned __int64)) { +#if defined(_M_ARM64) || defined(_M_X64) + rc = _BitScanReverse64(&index, (unsigned __int64)serial) ? 
index : 0; +#else + size_t base = 0; + unsigned long value = (unsigned long)serial; + if ((unsigned __int64)serial > ULONG_MAX) { + base = 32; + value = (unsigned long)(serial >> 32); + } + rc = (_BitScanReverse(&index, value) ? index : 0) + base; +#endif /* _M_ARM64 || _M_X64 */ + } else + return ffs_fallback(serial); +#else + return ffs_fallback(serial); +#endif + assert(rc == ffs_fallback(serial)); + return rc; +} + +static __inline size_t length(const serial_t serial) { + return (ffs(serial) + 7) >> 3; +} + +} /* namespace keygen */ diff --git a/test/log.cc b/test/log.cc new file mode 100644 index 00000000..d149393e --- /dev/null +++ b/test/log.cc @@ -0,0 +1,129 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "test.h" + +void failure(const char *fmt, ...) { + va_list ap; + fflush(NULL); + va_start(ap, fmt); + loggging::output(loggging::failure, fmt, ap); + va_end(ap); + fflush(NULL); + exit(EXIT_FAILURE); +} + +const char *test_strerror(int errnum) { + static __thread char buf[1024]; + return mdbx_strerror_r(errnum, buf, sizeof(buf)); +} + +void __noreturn failure_perror(const char *what, int errnum) { + failure("%s failed: %s (%d)\n", what, test_strerror(errnum), errnum); +} + +//----------------------------------------------------------------------------- + +namespace loggging { + +static std::string prefix; +static loglevel level; + +void setup(loglevel _level, const std::string &_prefix) { + level = (_level > error) ? 
failure : _level; + prefix = _prefix; +} + +void setup(const std::string &_prefix) { prefix = _prefix; } + +const char *level2str(const loglevel level) { + switch (level) { + default: + return "invalid/unknown"; + case trace: + return "trace"; + case info: + return "info"; + case notice: + return "notice"; + case warning: + return "warning"; + case error: + return "error"; + case failure: + return "failure"; + } +} + +void output(loglevel priority, const char *format, va_list ap) { + if (priority >= level) { + fprintf(stderr, "[ %u %-10s %6s ] " /* TODO */, osal_getpid(), + prefix.c_str(), level2str(priority)); + vfprintf(stderr, format, ap); + size_t len = strlen(format); + if (len && format[len - 1] != '\n') + putc('\n', stderr); + } +} + +} /* namespace log */ + +void log_trace(const char *msg, ...) { + if (loggging::trace >= loggging::level) { + va_list ap; + va_start(ap, msg); + loggging::output(loggging::trace, msg, ap); + va_end(ap); + } +} + +void log_info(const char *msg, ...) { + if (loggging::info >= loggging::level) { + va_list ap; + va_start(ap, msg); + loggging::output(loggging::info, msg, ap); + va_end(ap); + } +} + +void log_notice(const char *msg, ...) { + if (loggging::notice >= loggging::level) { + va_list ap; + va_start(ap, msg); + loggging::output(loggging::notice, msg, ap); + va_end(ap); + } +} + +void log_warning(const char *msg, ...) { + if (loggging::warning >= loggging::level) { + va_list ap; + va_start(ap, msg); + loggging::output(loggging::warning, msg, ap); + va_end(ap); + } +} + +void log_error(const char *msg, ...) 
{ + if (loggging::error >= loggging::level) { + va_list ap; + va_start(ap, msg); + loggging::output(loggging::error, msg, ap); + va_end(ap); + } +} + +void log_touble(const char *where, const char *what, int errnum) { + log_error("%s: %s %s", where, what, test_strerror(errnum)); +} diff --git a/test/log.h b/test/log.h new file mode 100644 index 00000000..627a11a0 --- /dev/null +++ b/test/log.h @@ -0,0 +1,61 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#pragma once + +#include "base.h" + +void __noreturn usage(void); + +void __noreturn +#ifdef __GNUC__ + __attribute__((format(printf, 1, 2))) +#endif + failure(const char *fmt, ...); + +void __noreturn failure_perror(const char *what, int errnum); +const char *test_strerror(int errnum); + +namespace loggging { + +enum loglevel { + trace, + info, + notice, + warning, + error, + failure, +}; + +const char *level2str(const loglevel level); +void setup(loglevel level, const std::string &prefix); +void setup(const std::string &prefix); + +void output(loglevel priority, const char *format, va_list ap); + +} /* namespace log */ + +void log_trace(const char *msg, ...); +void log_info(const char *msg, ...); +void log_notice(const char *msg, ...); +void log_warning(const char *msg, ...); +void log_error(const char *msg, ...); + +void log_touble(const char *where, const char *what, int errnum); + +#ifdef _DEBUG +#define TRACE(...) log_trace(__VA_ARGS__) +#else +#define TRACE(...) 
__noop(__VA_ARGS__) +#endif diff --git a/test/main.cc b/test/main.cc new file mode 100644 index 00000000..929f2094 --- /dev/null +++ b/test/main.cc @@ -0,0 +1,311 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "test.h" + +void __noreturn usage(void) { + printf("usage:\n" + "\tFIXME\n"); + exit(EXIT_FAILURE); +} + +//----------------------------------------------------------------------------- + +void actor_params::set_defaults(void) { + pathname_log = ""; + loglevel = +#ifdef NDEBUG + loggging::notice; +#else + loggging::trace; +#endif + + pathname_db = +#ifdef __linux__ + "/dev/shm/test_tmpdb.mdbx"; +#else + "test_tmpdb.mdbx"; +#endif + mode_flags = MDB_NOSUBDIR | MDB_WRITEMAP | MDB_MAPASYNC | MDB_NORDAHEAD | + MDB_NOMEMINIT | MDBX_COALESCE | MDBX_LIFORECLAIM; + table_flags = MDB_DUPSORT; + size = 1024 * 1024; + seed = 1; + + test_duration = 0; + test_nrecords = 1000; + nrepeat = 1; + nthreads = 1; + + keylen_min = 0; + keylen_max = 42; + datalen_min = 0; + datalen_max = 256; + + batch_read = 4; + batch_write = 4; + + delaystart = 0; + waitfor_nops = 0; + + drop_table = false; + + max_readers = 42; + max_tables = 42; +} + +namespace global { + +std::vector actors; +std::unordered_map events; +std::unordered_map pid2actor; +std::set databases; +unsigned nactors; + +namespace config { +unsigned timeout; +bool dump_config; +bool dont_cleanup_before; +bool dont_cleanup_after; +} /* namespace config */ + +} /* namespace global */ + +//----------------------------------------------------------------------------- + +const char global::thunk_param_prefix[] = "--execute="; + 
+std::string thunk_param(const actor_config &config) { + return config.serialize(global::thunk_param_prefix); +} + +void cleanup() { + log_trace(">> osal_setup"); + /* TODO: remove each database */ + log_trace("<< osal_setup"); +} + +int main(int argc, char *const argv[]) { + +#ifdef _DEBUG + log_trace("#argc = %d", argc); + for (int i = 0; i < argc; ++i) + log_trace("#argv[%d] = %s", i, argv[i]); +#endif /* _DEBUG */ + + if (argc < 2) + failure("No parameters given\n"); + + if (argc == 2 && + strncmp(argv[1], global::thunk_param_prefix, + strlen(global::thunk_param_prefix)) == 0) + return test_execute( + actor_config(argv[1] + strlen(global::thunk_param_prefix))) + ? EXIT_SUCCESS + : EXIT_FAILURE; + + actor_params params; + params.set_defaults(); + global::config::dump_config = true; + loggging::setup((loggging::loglevel)params.loglevel, "main"); + unsigned lastid = 0; + + if (argc == 2 && strncmp(argv[1], "--case=", 7) == 0) { + const char *casename = argv[1] + 7; + if (!testcase_setup(casename, params, lastid)) + failure("unknown testcase `%s`", casename); + } else { + for (int i = 1; i < argc;) { + const char *value = nullptr; + if (config::parse_option(argc, argv, i, "basic", nullptr)) { + bool ok = testcase_setup("basic", params, lastid); + assert(ok); + (void)ok; + } else if (config::parse_option(argc, argv, i, "race", nullptr)) { + bool ok = testcase_setup("race", params, lastid); + assert(ok); + (void)ok; + } else if (config::parse_option(argc, argv, i, "bench", nullptr)) { + bool ok = testcase_setup("bench", params, lastid); + assert(ok); + (void)ok; + } else if (config::parse_option(argc, argv, i, "pathname", + params.pathname_db) || + config::parse_option(argc, argv, i, "mode", params.mode_flags, + config::mode_bits) || + config::parse_option(argc, argv, i, "table", + params.table_flags, config::table_bits) || + config::parse_option(argc, argv, i, "size", params.size, + config::binary, 4096 * 4) || + config::parse_option(argc, argv, i, "seed", 
params.seed, + config::no_scale) || + config::parse_option(argc, argv, i, "repeat", params.nrepeat, + config::no_scale) || + config::parse_option(argc, argv, i, "threads", params.nthreads, + config::no_scale, 1, 64) || + config::parse_option(argc, argv, i, "timeout", + global::config::timeout, config::duration, + 1) || + config::parse_option(argc, argv, i, "keylen.min", + params.keylen_min, config::no_scale, 0, + params.keylen_max) || + config::parse_option(argc, argv, i, "keylen.max", + params.keylen_max, config::no_scale, + params.keylen_min, + mdbx_get_maxkeysize(0)) || + config::parse_option(argc, argv, i, "datalen.min", + params.datalen_min, config::no_scale, 0, + params.datalen_max) || + config::parse_option(argc, argv, i, "datalen.max", + params.datalen_max, config::no_scale, + params.datalen_min, MDBX_MAXDATASIZE) || + config::parse_option(argc, argv, i, "batch.read", + params.batch_read, config::no_scale, 1) || + config::parse_option(argc, argv, i, "batch.write", + params.batch_write, config::no_scale, + 1) || + config::parse_option(argc, argv, i, "delay", params.delaystart, + config::duration) || + config::parse_option(argc, argv, i, "wait4ops", + params.waitfor_nops, config::decimal) || + config::parse_option(argc, argv, i, "drop", + params.drop_table) || + config::parse_option(argc, argv, i, "dump-config", + global::config::dump_config) || + config::parse_option(argc, argv, i, "dont-cleanup-before", + global::config::dont_cleanup_before) || + config::parse_option(argc, argv, i, "dont-cleanup-after", + global::config::dont_cleanup_after) || + config::parse_option(argc, argv, i, "max-readers", + params.max_readers, config::no_scale, 1, + 255) || + config::parse_option(argc, argv, i, "max-tables", + params.max_tables, config::no_scale, 1, + INT16_MAX) || + false) { + continue; + } else if (config::parse_option(argc, argv, i, "no-delay", nullptr)) { + params.delaystart = 0; + } else if (config::parse_option(argc, argv, i, "no-wait", nullptr)) { + 
params.waitfor_nops = 0; + } else if (config::parse_option(argc, argv, i, "duration", + params.test_duration, config::duration, + 1)) { + params.test_nrecords = 0; + continue; + } else if (config::parse_option(argc, argv, i, "records", + params.test_nrecords, config::decimal, + 1)) { + params.test_duration = 0; + continue; + } else if (config::parse_option(argc, argv, i, "hill", &value)) { + configure_actor(lastid, ac_hill, value, params); + continue; + } else if (config::parse_option(argc, argv, i, "jitter", nullptr)) { + configure_actor(lastid, ac_jitter, value, params); + continue; + } else if (config::parse_option(argc, argv, i, "dead.reader", nullptr)) { + configure_actor(lastid, ac_deadread, value, params); + continue; + } else if (config::parse_option(argc, argv, i, "dead.writer", nullptr)) { + configure_actor(lastid, ac_deadwrite, value, params); + continue; + } else { + failure("Unknown option '%s'\n", argv[i]); + } + } + } + + if (global::config::dump_config) + config::dump(stdout); + + bool failed = false; + if (global::actors.size()) { + loggging::setup("overlord"); + + if (!global::config::dont_cleanup_before) + cleanup(); + + log_trace(">> osal_setup"); + osal_setup(global::actors); + log_trace("<< osal_setup"); + + for (auto &a : global::actors) { + mdbx_pid_t pid; + log_trace(">> actor_start"); + int rc = osal_actor_start(a, pid); + log_trace("<< actor_start"); + if (rc) { + log_trace(">> killall_actors"); + osal_killall_actors(); + log_trace("<< killall_actors"); + failure("Failed to start actor #%u (%s)\n", a.order, test_strerror(rc)); + } + global::pid2actor[pid] = &a; + } + + atexit(osal_killall_actors); + log_trace(">> wait4barrier"); + osal_wait4barrier(); + log_trace("<< wait4barrier"); + } + + time_t timestamp_start = time(nullptr); + size_t left = global::actors.size(); + + while (left > 0) { + unsigned timeout = INT_MAX; + if (global::config::timeout) { + time_t timestamp_now = time(nullptr); + if (timestamp_now - timestamp_start > 
global::config::timeout) + timeout = 0; + else + timeout = global::config::timeout - + (unsigned)(timestamp_now - timestamp_start); + } + + mdbx_pid_t pid; + int rc = osal_actor_poll(pid, timeout); + if (rc) + failure("Poll error: %s (%d)\n", test_strerror(rc), rc); + + if (pid) { + actor_status status = osal_actor_info(pid); + actor_config *actor = global::pid2actor.at(pid); + if (!actor) + continue; + + log_info("actor #%u, id %d, pid %u: %s\n", actor->order, actor->id, pid, + status2str(status)); + if (status > as_running) { + left -= 1; + if (status != as_successful) + failed = true; + } + } else { + if (global::config::timeout && + time(nullptr) - timestamp_start > global::config::timeout) + failure("Timeout\n"); + } + } + + log_notice("OVERALL: %s\n", failed ? "Failed" : "Successful"); + if (!global::config::dont_cleanup_before) { + if (failed) + log_info("skip cleanup"); + else + cleanup(); + } + return failed ? EXIT_FAILURE : EXIT_SUCCESS; +} diff --git a/test/osal-unix.cc b/test/osal-unix.cc new file mode 100644 index 00000000..5131181c --- /dev/null +++ b/test/osal-unix.cc @@ -0,0 +1,230 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#include "test.h" + +#include +#include +#include +#include +#include +#include + +struct shared_t { + pthread_barrier_t barrier; + pthread_mutex_t mutex; + pthread_cond_t conds[0]; +}; + +static shared_t *shared; +static std::unordered_map events; + +void osal_wait4barrier(void) { + assert(shared != nullptr && shared != MAP_FAILED); + int rc = pthread_barrier_wait(&shared->barrier); + if (rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD) { + failure_perror("pthread_barrier_wait(shared)", rc); + } +} + +void osal_setup(const std::vector &actors) { + assert(shared == nullptr); + + pthread_mutexattr_t mutexattr; + int rc = pthread_mutexattr_init(&mutexattr); + if (rc) + failure_perror("pthread_mutexattr_init()", rc); + rc = pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED); + if (rc) + failure_perror("pthread_mutexattr_setpshared()", rc); + + pthread_barrierattr_t barrierattr; + rc = pthread_barrierattr_init(&barrierattr); + if (rc) + failure_perror("pthread_barrierattr_init()", rc); + rc = pthread_barrierattr_setpshared(&barrierattr, PTHREAD_PROCESS_SHARED); + if (rc) + failure_perror("pthread_barrierattr_setpshared()", rc); + + pthread_condattr_t condattr; + rc = pthread_condattr_init(&condattr); + if (rc) + failure_perror("pthread_condattr_init()", rc); + rc = pthread_condattr_setpshared(&condattr, PTHREAD_PROCESS_SHARED); + if (rc) + failure_perror("pthread_condattr_setpshared()", rc); + + size_t n = 0; + for (const auto &a : actors) + if (a.wanna_event4signalling()) + ++n; + + shared = (shared_t *)mmap( + nullptr, sizeof(shared_t) + n * sizeof(pthread_cond_t), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (MAP_FAILED == (void *)shared) + failure_perror("mmap(shared_conds)", errno); + + rc = pthread_mutex_init(&shared->mutex, &mutexattr); + if (rc) + failure_perror("pthread_mutex_init(shared)", rc); + + rc = pthread_barrier_init(&shared->barrier, &barrierattr, actors.size() + 1); + if (rc) + 
failure_perror("pthread_barrier_init(shared)", rc); + + auto a = actors.begin(); + for (size_t i = 0; i < n; ++i) { + pthread_cond_t *event = &shared->conds[i]; + rc = pthread_cond_init(event, &condattr); + if (rc) + failure_perror("pthread_cond_init(shared)", rc); + + unsigned id = 0; + while (a != actors.end()) { + if (a->wanna_event4signalling()) { + id = a->id; + break; + } + ++a; + } + assert(id != 0); + events[id] = event; + } + + pthread_barrierattr_destroy(&barrierattr); + pthread_condattr_destroy(&condattr); + pthread_mutexattr_destroy(&mutexattr); +} + +void osal_broadcast(unsigned id) { + assert(shared != nullptr && shared != MAP_FAILED); + int rc = pthread_cond_broadcast(events.at(id)); + if (rc) + failure_perror("sem_post(shared)", rc); +} + +int osal_waitfor(unsigned id) { + assert(shared != nullptr && shared != MAP_FAILED); + + int rc = pthread_mutex_lock(&shared->mutex); + if (rc != 0) + failure_perror("pthread_mutex_lock(shared)", rc); + + rc = pthread_cond_wait(events.at(id), &shared->mutex); + if (rc && rc != EINTR) + failure_perror("pthread_cond_wait(shared)", rc); + + rc = pthread_mutex_unlock(&shared->mutex); + if (rc != 0) + failure_perror("pthread_mutex_unlock(shared)", rc); + + return (rc == 0) ? 
true : false; +} + +//----------------------------------------------------------------------------- + +const std::string +actor_config::osal_serialize(simple_checksum &checksum) const { + (void)checksum; + /* not used in workload, but just for testing */ + return "unix.fork"; +} + +bool actor_config::osal_deserialize(const char *str, const char *end, + simple_checksum &checksum) { + (void)checksum; + /* not used in workload, but just for testing */ + return strncmp(str, "unix.fork", 9) == 0 && str + 9 == end; +} + +//----------------------------------------------------------------------------- + +static std::unordered_map childs; + +static void handler_SIGCHLD(int unused) { (void)unused; } + +mdbx_pid_t osal_getpid(void) { return getpid(); } + +int osal_actor_start(const actor_config &config, mdbx_pid_t &pid) { + if (childs.empty()) + signal(SIGCHLD, handler_SIGCHLD); + + pid = fork(); + + if (pid == 0) { + const bool result = test_execute(config); + exit(result ? EXIT_SUCCESS : EXIT_FAILURE); + } + + if (pid < 0) + return errno; + + childs[pid] = as_running; + return 0; +} + +actor_status osal_actor_info(const mdbx_pid_t pid) { return childs.at(pid); } + +void osal_killall_actors(void) { + for (auto &pair : childs) { + kill(pair.first, SIGKILL); + pair.second = as_killed; + } +} + +int osal_actor_poll(mdbx_pid_t &pid, unsigned timeout) { +retry: + int status, options = WNOHANG; +#ifdef WUNTRACED + options |= WUNTRACED; +#endif +#ifdef WCONTINUED + options |= WCONTINUED; +#endif + pid = waitpid(0, &status, options); + + if (pid > 0) { + if (WIFEXITED(status)) + childs[pid] = + (WEXITSTATUS(status) == EXIT_SUCCESS) ? 
as_successful : as_failed; + else if (WIFSIGNALED(status) || WCOREDUMP(status)) + childs[pid] = as_killed; + else if (WIFSTOPPED(status)) + childs[pid] = as_debuging; + else if (WIFCONTINUED(status)) + childs[pid] = as_running; + else { + assert(false); + } + return 0; + } + + if (pid == 0) { + if (timeout && sleep(timeout)) + goto retry; + return 0; + } + + switch (errno) { + case EINTR: + pid = 0; + return 0; + + case ECHILD: + default: + pid = 0; + return errno; + } +} diff --git a/test/osal-windows.cc b/test/osal-windows.cc new file mode 100644 index 00000000..d4083c81 --- /dev/null +++ b/test/osal-windows.cc @@ -0,0 +1,262 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#include "test.h" + +static std::unordered_map events; +static HANDLE hBarrierSemaphore, hBarrierEvent; + +static int waitstatus2errcode(DWORD result) { + switch (result) { + case WAIT_OBJECT_0: + return MDB_SUCCESS; + case WAIT_FAILED: + return GetLastError(); + case WAIT_ABANDONED: + return ERROR_ABANDONED_WAIT_0; + case WAIT_IO_COMPLETION: + return ERROR_USER_APC; + case WAIT_TIMEOUT: + return ERROR_TIMEOUT; + default: + return ERROR_UNHANDLED_ERROR; + } +} + +void osal_wait4barrier(void) { + DWORD rc = WaitForSingleObject(hBarrierSemaphore, 0); + switch (rc) { + default: + failure_perror("WaitForSingleObject(BarrierSemaphore)", + waitstatus2errcode(rc)); + case WAIT_OBJECT_0: + rc = WaitForSingleObject(hBarrierEvent, INFINITE); + if (rc != WAIT_OBJECT_0) + failure_perror("WaitForSingleObject(BarrierEvent)", + waitstatus2errcode(rc)); + break; + case WAIT_TIMEOUT: + if (!SetEvent(hBarrierEvent)) + failure_perror("SetEvent(BarrierEvent)", GetLastError()); + break; + } +} + +static HANDLE make_inharitable(HANDLE hHandle) { + assert(hHandle != NULL && hHandle != INVALID_HANDLE_VALUE); + if (!DuplicateHandle(GetCurrentProcess(), hHandle, GetCurrentProcess(), + &hHandle, 0, TRUE, + DUPLICATE_CLOSE_SOURCE | DUPLICATE_SAME_ACCESS)) + failure_perror("DuplicateHandle()", GetLastError()); + return hHandle; +} + +void osal_setup(const std::vector &actors) { + size_t n = 0; + for (const auto &a : actors) { + if (a.wanna_event4signalling()) { + HANDLE hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + if (!hEvent) + failure_perror("CreateEvent()", GetLastError()); + hEvent = make_inharitable(hEvent); + events[a.id] = hEvent; + } + } + + hBarrierSemaphore = CreateSemaphore(NULL, 0, (LONG)actors.size(), NULL); + if (!hBarrierSemaphore) + failure_perror("CreateSemaphore(BarrierSemaphore)", GetLastError()); + hBarrierSemaphore = make_inharitable(hBarrierSemaphore); + + hBarrierEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + if (!hBarrierEvent) + 
failure_perror("CreateEvent(BarrierEvent)", GetLastError()); + hBarrierEvent = make_inharitable(hBarrierEvent); +} + +void osal_broadcast(unsigned id) { + if (!SetEvent(events.at(id))) + failure_perror("SetEvent()", GetLastError()); +} + +int osal_waitfor(unsigned id) { + DWORD rc = WaitForSingleObject(events.at(id), INFINITE); + return waitstatus2errcode(rc); +} + +mdbx_pid_t osal_getpid(void) { return GetCurrentProcessId(); } + +//----------------------------------------------------------------------------- + +const std::string +actor_config::osal_serialize(simple_checksum &checksum) const { + checksum.push(hBarrierSemaphore); + checksum.push(hBarrierEvent); + + HANDLE hWait = INVALID_HANDLE_VALUE; + if (wait4id) { + hWait = events.at(wait4id); + checksum.push(hWait); + } + + HANDLE hSignal = INVALID_HANDLE_VALUE; + if (wanna_event4signalling()) { + hSignal = events.at(id); + checksum.push(hSignal); + } + + return format("%p.%p.%p.%p", hBarrierSemaphore, hBarrierEvent, hWait, + hSignal); +} + +bool actor_config::osal_deserialize(const char *str, const char *end, + simple_checksum &checksum) { + + std::string copy(str, end - str); + TRACE(">> osal_deserialize(%s)\n", copy.c_str()); + + assert(hBarrierSemaphore == 0); + assert(hBarrierEvent == 0); + assert(events.empty()); + + HANDLE hWait, hSignal; + if (sscanf_s(copy.c_str(), "%p.%p.%p.%p", &hBarrierSemaphore, &hBarrierEvent, + &hWait, &hSignal) != 4) { + TRACE("<< osal_deserialize: failed\n"); + return false; + } + + checksum.push(hBarrierSemaphore); + checksum.push(hBarrierEvent); + + if (wait4id) { + checksum.push(hWait); + events[wait4id] = hWait; + } + + if (wanna_event4signalling()) { + checksum.push(hSignal); + events[id] = hSignal; + } + + TRACE("<< osal_deserialize: OK\n"); + return true; +} + +//----------------------------------------------------------------------------- + +typedef std::pair child; +static std::unordered_map childs; + +int osal_actor_start(const actor_config &config, mdbx_pid_t &pid) { 
+ if (childs.size() == MAXIMUM_WAIT_OBJECTS) + failure("Could't manage more that %u actors on Windows\n", + MAXIMUM_WAIT_OBJECTS); + + _flushall(); + + STARTUPINFOA StartupInfo; + GetStartupInfoA(&StartupInfo); + + char exename[_MAX_PATH]; + DWORD exename_size = sizeof(exename); + if (!QueryFullProcessImageNameA(GetCurrentProcess(), 0, exename, + &exename_size)) + failure_perror("QueryFullProcessImageName()", GetLastError()); + + std::string cmdline = "test_mdbx.child " + thunk_param(config); + + PROCESS_INFORMATION ProcessInformation; + if (!CreateProcessA(exename, const_cast(cmdline.c_str()), + NULL, // Retuned process handle is not inheritable. + NULL, // Retuned thread handle is not inheritable. + TRUE, // Child inherits all inheritable handles. + NORMAL_PRIORITY_CLASS | INHERIT_PARENT_AFFINITY, + NULL, // Inherit the parent's environment. + NULL, // Inherit the parent's current directory. + &StartupInfo, &ProcessInformation)) + return GetLastError(); + + CloseHandle(ProcessInformation.hThread); + pid = ProcessInformation.dwProcessId; + childs[pid] = std::make_pair(ProcessInformation.hProcess, as_running); + return 0; +} + +actor_status osal_actor_info(const mdbx_pid_t pid) { + actor_status status = childs.at(pid).second; + if (status > as_running) + return status; + + DWORD ExitCode; + if (!GetExitCodeProcess(childs.at(pid).first, &ExitCode)) + failure_perror("GetExitCodeProcess()", GetLastError()); + + switch (ExitCode) { + case STILL_ACTIVE: + return as_running; + case EXIT_SUCCESS: + status = as_successful; + break; + // case EXCEPTION_BREAKPOINT: + case EXCEPTION_SINGLE_STEP: + status = as_debuging; + break; + case STATUS_CONTROL_C_EXIT: + case EXCEPTION_NONCONTINUABLE_EXCEPTION: + status = as_killed; + break; + default: + status = as_failed; + break; + } + + childs.at(pid).second = status; + return status; +} + +void osal_killall_actors(void) { + for (auto &pair : childs) + TerminateProcess(pair.second.first, STATUS_CONTROL_C_EXIT); +} + +int 
osal_actor_poll(mdbx_pid_t &pid, unsigned timeout) { + std::vector handles; + handles.reserve(childs.size()); + for (const auto &pair : childs) + if (pair.second.second <= as_running) + handles.push_back(pair.second.first); + + DWORD rc = + MsgWaitForMultipleObjectsEx((DWORD)handles.size(), &handles[0], + (timeout > 60) ? 60 * 1000 : timeout * 1000, + QS_ALLINPUT | QS_ALLPOSTMESSAGE, 0); + + if (rc >= WAIT_OBJECT_0 && rc < WAIT_OBJECT_0 + handles.size()) { + pid = 0; + for (const auto &pair : childs) + if (pair.second.first == handles[rc - WAIT_OBJECT_0]) { + pid = pair.first; + break; + } + return 0; + } + + if (rc == WAIT_TIMEOUT) { + pid = 0; + return 0; + } + + return waitstatus2errcode(rc); +} diff --git a/test/osal.h b/test/osal.h new file mode 100644 index 00000000..1e5de123 --- /dev/null +++ b/test/osal.h @@ -0,0 +1,28 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#pragma once + +#include "base.h" + +void osal_setup(const std::vector &actors); +void osal_broadcast(unsigned id); +int osal_waitfor(unsigned id); + +mdbx_pid_t osal_getpid(void); +int osal_actor_start(const actor_config &config, mdbx_pid_t &pid); +actor_status osal_actor_info(const mdbx_pid_t pid); +void osal_killall_actors(void); +int osal_actor_poll(mdbx_pid_t &pid, unsigned timeout); +void osal_wait4barrier(void); diff --git a/test/test.cc b/test/test.cc new file mode 100644 index 00000000..104fe93f --- /dev/null +++ b/test/test.cc @@ -0,0 +1,192 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "test.h" + +const char *testcase2str(const actor_testcase testcase) { + switch (testcase) { + default: + return "?!"; + case ac_none: + return "none"; + case ac_hill: + return "hill"; + case ac_deadread: + return "deadread"; + case ac_deadwrite: + return "deadwrite"; + case ac_jitter: + return "jitter"; + } +} + +const char *status2str(actor_status status) { + switch (status) { + default: + assert(false); + return "?!"; + case as_debuging: + return "debuging"; + case as_running: + return "running"; + case as_successful: + return "successful"; + case as_killed: + return "killed"; + case as_failed: + return "failed"; + } +} + +//----------------------------------------------------------------------------- + +void testcase::mdbx_prepare() { + log_trace(">> mdbx_prepare"); + + MDB_env *env = nullptr; + int rc = mdbx_env_create(&env); + if (rc != MDB_SUCCESS) + failure_perror("mdbx_env_create()", rc); + + assert(env != nullptr); + db_guard.reset(env); + + rc = mdbx_env_set_userctx(env, this); + if (rc != MDB_SUCCESS) + failure_perror("mdbx_env_set_userctx()", rc); + + rc = mdbx_env_set_maxreaders(env, config.params.max_readers); + if (rc != MDB_SUCCESS) + failure_perror("mdbx_env_set_maxreaders()", rc); + + rc = mdbx_env_set_maxdbs(env, config.params.max_tables); + if (rc != MDB_SUCCESS) + failure_perror("mdbx_env_set_maxdbs()", rc); + + rc = mdbx_env_set_mapsize(env, (size_t)config.params.size); + if (rc != MDB_SUCCESS) + failure_perror("mdbx_env_set_mapsize()", rc); + + log_trace("<< mdbx_prepare"); +} + +void testcase::mdbx_open() { + log_trace(">> mdbx_open"); + int rc = mdbx_env_open(db_guard.get(), config.params.pathname_db.c_str(), + 
(unsigned)config.params.mode_flags, 0640); + if (rc != MDB_SUCCESS) + failure_perror("mdbx_env_open()", rc); + log_trace("<< mdbx_open"); +} + +void testcase::mdbx_close() { + log_trace(">> mdbx_close"); + cursor_guard.reset(); + txn_guard.reset(); + db_guard.reset(); + log_trace("<< mdbx_close"); +} + +bool testcase::wait4start() { + if (config.wait4id) { + log_trace(">> wait4start(%u)", config.wait4id); + int rc = osal_waitfor(config.wait4id); + if (rc) { + log_trace("<< wait4start(%u), failed %s", test_strerror(rc)); + return false; + } + return true; + } else { + log_trace("== wait4start(not needed)"); + return true; + } +} + +void testcase::report(size_t nops_done) { + if (config.signal_nops && !signalled && config.signal_nops <= nops_done) { + log_trace(">> signal(n-ops %zu)", nops_done); + osal_broadcast(config.id); + signalled = true; + log_trace("<< signal(n-ops %zu)", nops_done); + } +} + +void testcase::signal() { + if (!signalled) { + log_trace(">> signal(forced)"); + osal_broadcast(config.id); + signalled = true; + log_trace("<< signal(forced)"); + } +} + +bool testcase::setup() { + mdbx_prepare(); + return wait4start(); +} + +bool testcase::teardown() { + log_trace(">> testcase::teardown"); + signal(); + mdbx_close(); + log_trace("<< testcase::teardown"); + return true; +} + +//----------------------------------------------------------------------------- + +bool test_execute(const actor_config &config) { + const mdbx_pid_t pid = osal_getpid(); + loggging::setup((loggging::loglevel)config.params.loglevel, + format("child_%u.%u", config.order, config.id)); + + log_trace(">> wait4barrier"); + osal_wait4barrier(); + log_trace("<< wait4barrier"); + + try { + std::unique_ptr test; + switch (config.testcase) { + case ac_hill: + test.reset(new testcase_hill(config, pid)); + break; + case ac_deadread: + test.reset(new testcase_deadread(config, pid)); + break; + case ac_deadwrite: + test.reset(new testcase_deadwrite(config, pid)); + break; + case ac_jitter: + 
test.reset(new testcase_jitter(config, pid)); + break; + default: + test.reset(new testcase(config, pid)); + break; + } + + if (!test->setup()) + log_notice("test setup failed"); + else if (!test->run()) + log_notice("test failed"); + else if (!test->teardown()) + log_notice("test teardown failed"); + else { + log_info("test successed"); + return true; + } + } catch (const std::exception &pipets) { + failure("Exception: %s", pipets.what()); + } + return false; +} diff --git a/test/test.h b/test/test.h new file mode 100644 index 00000000..b1ce82af --- /dev/null +++ b/test/test.h @@ -0,0 +1,145 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#pragma once + +#include "base.h" +#include "config.h" +#include "keygen.h" +#include "log.h" +#include "osal.h" +#include "utils.h" + +bool test_execute(const actor_config &config); +std::string thunk_param(const actor_config &config); +bool testcase_setup(const char *casename, const actor_params ¶ms, + unsigned &lastid); +void configure_actor(unsigned &lastid, const actor_testcase testcase, + const char *id_cstr, const actor_params ¶ms); + +namespace global { + +extern const char thunk_param_prefix[]; +extern std::vector actors; +extern std::unordered_map events; +extern std::unordered_map pid2actor; +extern std::set databases; + +namespace config { +extern unsigned timeout; +extern bool dump_config; +extern bool dont_cleanup_before; +extern bool dont_cleanup_after; +} /* namespace config */ + +} /* namespace global */ + +//----------------------------------------------------------------------------- + +struct db_deleter : public std::unary_function { + void operator()(MDB_env *env) const { mdbx_env_close(env); } +}; + +struct txn_deleter : public std::unary_function { + void operator()(MDB_txn *txn) const { + int rc = mdbx_txn_abort(txn); + if (rc) + log_touble(__func__, "mdbx_txn_abort()", rc); + } +}; + +struct cursor_deleter : public std::unary_function { + void operator()(MDB_cursor *cursor) const { mdbx_cursor_close(cursor); } +}; + +typedef std::unique_ptr scoped_db_guard; +typedef std::unique_ptr scoped_txn_guard; +typedef std::unique_ptr scoped_cursor_guard; + +//----------------------------------------------------------------------------- + +class testcase { +protected: + const actor_config &config; + const mdbx_pid_t pid; + + scoped_db_guard db_guard; + scoped_txn_guard txn_guard; + scoped_cursor_guard cursor_guard; + bool signalled; + + void mdbx_prepare(); + void mdbx_open(); + void mdbx_close(); + + bool wait4start(); + void report(size_t nops_done); + void signal(); + +public: + testcase(const actor_config &config, const mdbx_pid_t pid) + : 
config(config), pid(pid) { + loggging::setup(format("%s_%u.%u", testcase2str(config.testcase), + config.order, config.id)); + } + + virtual bool setup(); + virtual bool run() { return true; } + virtual bool teardown(); + virtual ~testcase() {} +}; + +class testcase_hill : public testcase { + typedef testcase inherited; + +public: + testcase_hill(const actor_config &config, const mdbx_pid_t pid) + : testcase(config, pid) {} + bool setup(); + bool run(); + bool teardown(); +}; + +class testcase_deadread : public testcase { + typedef testcase inherited; + +public: + testcase_deadread(const actor_config &config, const mdbx_pid_t pid) + : testcase(config, pid) {} + bool setup(); + bool run(); + bool teardown(); +}; + +class testcase_deadwrite : public testcase { + typedef testcase inherited; + +public: + testcase_deadwrite(const actor_config &config, const mdbx_pid_t pid) + : testcase(config, pid) {} + bool setup(); + bool run(); + bool teardown(); +}; + +class testcase_jitter : public testcase { + typedef testcase inherited; + +public: + testcase_jitter(const actor_config &config, const mdbx_pid_t pid) + : testcase(config, pid) {} + bool setup(); + bool run(); + bool teardown(); +}; diff --git a/test/test.vcxproj b/test/test.vcxproj new file mode 100644 index 00000000..331963a8 --- /dev/null +++ b/test/test.vcxproj @@ -0,0 +1,182 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + + {6d19209b-ece7-4b9c-941c-0aa2b484f199} + + + + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31} + Win32Proj + mdbxtest + 8.1 + + + + Application + true + v140 + Unicode + + + Application + false + v140 + true + Unicode + + + Application + true + v140 + Unicode + + + Application + false + v140 + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + test.h + + + Console + true + + + + + Use + Level3 + Disabled + 
_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + test.h + + + Console + true + + + + + Level3 + Use + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + test.h + + + Console + true + true + true + + + + + Level3 + Use + MaxSpeed + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + test.h + + + Console + true + true + true + + + + + + + + + + + + + + + + + + + + + Create + Create + Create + Create + + + + + + + + diff --git a/test/test0.c b/test/test0.c deleted file mode 100644 index 68919b22..00000000 --- a/test/test0.c +++ /dev/null @@ -1,220 +0,0 @@ -/* mtest.c - memory-mapped database tester/toy */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -#include "../mdbx.h" -#include -#include -#include -#include -#include -#include - -#include - -#define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) -#define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) \ - ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ - __LINE__, msg, mdbx_strerror(rc)), \ - abort())) - -#ifndef DBPATH -#define DBPATH "./tmp.db" -#endif - -void *thread_entry(void *ctx) { - MDB_env *env = ctx; - MDB_txn *txn; - int rc; - - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - mdbx_txn_abort(txn); - - return NULL; -} - -int main(int argc, char *argv[]) { - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDBX_stat mst; - MDB_cursor *cursor, *cur2; - MDB_cursor_op op; - int count; - int *values; - char sval[32] = ""; - int env_oflags; - struct stat db_stat, exe_stat; - - (void)argc; - (void)argv; - srand(time(NULL)); - - count = (rand() % 384) + 64; - values = (int *)malloc(count * sizeof(int)); - - for (i = 0; i < count; i++) { - values[i] = rand() % 1024; - } - - E(mdbx_env_create(&env)); - E(mdbx_env_set_maxreaders(env, 42)); - E(mdbx_env_set_mapsize(env, 10485760)); - - E(stat("/proc/self/exe", &exe_stat) ? errno : 0); - E(stat(DBPATH "/.", &db_stat) ? errno : 0); - env_oflags = MDB_NOSYNC; - if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { - /* LY: Assume running inside a CI-environment: - * 1) don't use FIXEDMAP to avoid EBUSY in case collision, - * which could be inspired by address space randomisation feature. - * 2) drop MDB_NOSYNC expecting that DBPATH is at a tmpfs or some - * dedicated storage. 
- */ - env_oflags = 0; - } - E(mdbx_env_open(env, DBPATH, env_oflags, 0664)); - - E(mdbx_txn_begin(env, NULL, 0, &txn)); - E(mdbx_dbi_open(txn, NULL, 0, &dbi)); - - key.mv_size = sizeof(int); - key.mv_data = sval; - - printf("Adding %d values\n", count); - for (i = 0; i < count; i++) { - sprintf(sval, "%03x %d foo bar", values[i], values[i]); - /* Set in each iteration, since MDB_NOOVERWRITE may modify it */ - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_KEYEXIST, mdbx_put(txn, dbi, &key, &data, MDB_NOOVERWRITE))) { - j++; - data.mv_size = sizeof(sval); - data.mv_data = sval; - } - } - if (j) - printf("%d duplicates skipped\n", j); - E(mdbx_txn_commit(txn)); - E(mdbx_env_stat(env, &mst, sizeof(mst))); - - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, - (char *)key.mv_data, data.mv_data, (int)data.mv_size, - (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - j = 0; - key.mv_data = sval; - for (i = count - 1; i > -1; i -= (rand() % 5)) { - j++; - txn = NULL; - E(mdbx_txn_begin(env, NULL, 0, &txn)); - sprintf(sval, "%03x ", values[i]); - if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, NULL))) { - j--; - mdbx_txn_abort(txn); - } else { - E(mdbx_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); - - E(mdbx_env_stat(env, &mst, sizeof(mst))); - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - printf("Cursor last\n"); - E(mdbx_cursor_get(cursor, &key, &data, 
MDB_LAST)); - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - printf("Cursor prev\n"); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - printf("Cursor last/prev\n"); - E(mdbx_cursor_get(cursor, &key, &data, MDB_LAST)); - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - E(mdbx_cursor_get(cursor, &key, &data, MDB_PREV)); - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - printf("Deleting with cursor\n"); - E(mdbx_txn_begin(env, NULL, 0, &txn)); - E(mdbx_cursor_open(txn, dbi, &cur2)); - for (i = 0; i < 50; i++) { - if (RES(MDB_NOTFOUND, mdbx_cursor_get(cur2, &key, &data, MDB_NEXT))) - break; - printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, - (char *)key.mv_data, data.mv_data, (int)data.mv_size, - (char *)data.mv_data); - E(mdbx_del(txn, dbi, &key, NULL)); - } - - printf("Restarting cursor in txn\n"); - for (op = MDB_FIRST, i = 0; i <= 32; op = MDB_NEXT, i++) { - if (RES(MDB_NOTFOUND, mdbx_cursor_get(cur2, &key, &data, op))) - break; - printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, - (char *)key.mv_data, data.mv_data, (int)data.mv_size, - (char *)data.mv_data); - } - mdbx_cursor_close(cur2); - E(mdbx_txn_commit(txn)); - - for (i = 0; i < 41; ++i) { - pthread_t thread; - pthread_create(&thread, NULL, thread_entry, env); - } - - printf("Restarting cursor outside txn\n"); - E(mdbx_txn_begin(env, NULL, 0, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - for (op = MDB_FIRST, i = 0; i <= 32; op = MDB_NEXT, i++) { - if (RES(MDB_NOTFOUND, mdbx_cursor_get(cursor, &key, 
&data, op))) - break; - printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, - (char *)key.mv_data, data.mv_data, (int)data.mv_size, - (char *)data.mv_data); - } - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - mdbx_dbi_close(env, dbi); - mdbx_env_close(env); - - return 0; -} diff --git a/test/test1.c b/test/test1.c deleted file mode 100644 index e0c8f10c..00000000 --- a/test/test1.c +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -/* Based on mtest2.c - memory-mapped database tester/toy */ - -#include "../mdbx.h" -#include -#include -#include -#include -#include -#include - -#define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) -#define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) \ - ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ - __LINE__, msg, mdbx_strerror(rc)), \ - abort())) - -#ifndef DBPATH -#define DBPATH "./tmp.db" -#endif - -int main(int argc, char *argv[]) { - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDBX_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[32] = ""; - int env_oflags; - struct stat db_stat, exe_stat; - - (void)argc; - (void)argv; - srand(time(NULL)); - - count = (rand() % 384) + 64; - values = (int *)malloc(count * sizeof(int)); - - for (i = 0; i < count; i++) { - values[i] = rand() % 1024; - } - - E(mdbx_env_create(&env)); - E(mdbx_env_set_maxreaders(env, 1)); - E(mdbx_env_set_mapsize(env, 10485760)); - E(mdbx_env_set_maxdbs(env, 4)); - - E(stat("/proc/self/exe", &exe_stat) ? errno : 0); - E(stat(DBPATH "/.", &db_stat) ? errno : 0); - env_oflags = MDB_NOSYNC; - if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { - /* LY: Assume running inside a CI-environment: - * 1) don't use FIXEDMAP to avoid EBUSY in case collision, - * which could be inspired by address space randomisation feature. - * 2) drop MDB_NOSYNC expecting that DBPATH is at a tmpfs or some - * dedicated storage. - */ - env_oflags = 0; - } - /* LY: especially here we always needs MDB_NOSYNC - * for testing mdbx_env_close_ex() and "redo-to-steady" on open. 
*/ - env_oflags |= MDB_NOSYNC; - E(mdbx_env_open(env, DBPATH, env_oflags, 0664)); - - E(mdbx_txn_begin(env, NULL, 0, &txn)); - if (mdbx_dbi_open(txn, "id1", MDB_CREATE, &dbi) == MDB_SUCCESS) - E(mdbx_drop(txn, dbi, 1)); - E(mdbx_dbi_open(txn, "id1", MDB_CREATE, &dbi)); - - key.mv_size = sizeof(int); - key.mv_data = sval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - - printf("Adding %d values\n", count); - for (i = 0; i < count; i++) { - sprintf(sval, "%03x %d foo bar", values[i], values[i]); - if (RES(MDB_KEYEXIST, mdbx_put(txn, dbi, &key, &data, MDB_NOOVERWRITE))) - j++; - } - if (j) - printf("%d duplicates skipped\n", j); - E(mdbx_txn_commit(txn)); - E(mdbx_env_stat(env, &mst, sizeof(mst))); - - printf("check-preset-a\n"); - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - int present_a = 0; - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, - (char *)key.mv_data, data.mv_data, (int)data.mv_size, - (char *)data.mv_data); - ++present_a; - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - CHECK(present_a == count - j, "mismatch"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - mdbx_env_sync(env, 1); - - int deleted = 0; - key.mv_data = sval; - for (i = count - 1; i > -1; i -= (rand() % 5)) { - txn = NULL; - E(mdbx_txn_begin(env, NULL, 0, &txn)); - sprintf(sval, "%03x ", values[i]); - if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, NULL))) { - mdbx_txn_abort(txn); - } else { - E(mdbx_txn_commit(txn)); - deleted++; - } - } - free(values); - printf("Deleted %d values\n", deleted); - - printf("check-preset-b.cursor-next\n"); - E(mdbx_env_stat(env, &mst, sizeof(mst))); - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - int present_b = 0; - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char 
*)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - ++present_b; - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - CHECK(present_b == present_a - deleted, "mismatch"); - - printf("check-preset-b.cursor-prev\n"); - j = 1; - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - ++j; - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - CHECK(present_b == j, "mismatch"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - mdbx_dbi_close(env, dbi); - /********************* LY: kept DB dirty ****************/ - mdbx_env_close_ex(env, 1); - E(mdbx_env_create(&env)); - E(mdbx_env_set_maxdbs(env, 4)); - E(mdbx_env_open(env, DBPATH, env_oflags, 0664)); - - printf("check-preset-c.cursor-next\n"); - E(mdbx_env_stat(env, &mst, sizeof(mst))); - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_dbi_open(txn, "id1", 0, &dbi)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - int present_c = 0; - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - ++present_c; - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - printf("Rolled back %d deletion(s)\n", present_c - (present_a - deleted)); - CHECK(present_c > present_a - deleted, "mismatch"); - - printf("check-preset-d.cursor-prev\n"); - j = 1; - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - ++j; - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - CHECK(present_c == j, "mismatch"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - mdbx_dbi_close(env, dbi); - mdbx_env_close_ex(env, 0); - - return 0; -} diff --git a/test/test2.c b/test/test2.c deleted file mode 100644 index f35bca9b..00000000 --- 
a/test/test2.c +++ /dev/null @@ -1,153 +0,0 @@ -/* mtest2.c - memory-mapped database tester/toy */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -/* Just like mtest.c, but using a subDB instead of the main DB */ - -#include "../mdbx.h" -#include -#include -#include -#include -#include -#include - -#define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) -#define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) \ - ((test) ? (void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ - __LINE__, msg, mdbx_strerror(rc)), \ - abort())) - -#ifndef DBPATH -#define DBPATH "./tmp.db" -#endif - -int main(int argc, char *argv[]) { - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDBX_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[32] = ""; - int env_oflags; - struct stat db_stat, exe_stat; - - (void)argc; - (void)argv; - srand(time(NULL)); - - count = (rand() % 384) + 64; - values = (int *)malloc(count * sizeof(int)); - - for (i = 0; i < count; i++) { - values[i] = rand() % 1024; - } - - E(mdbx_env_create(&env)); - E(mdbx_env_set_maxreaders(env, 1)); - E(mdbx_env_set_mapsize(env, 10485760)); - E(mdbx_env_set_maxdbs(env, 4)); - - E(stat("/proc/self/exe", &exe_stat) ? errno : 0); - E(stat(DBPATH "/.", &db_stat) ? 
errno : 0); - env_oflags = MDB_NOSYNC; - if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { - /* LY: Assume running inside a CI-environment: - * 1) don't use FIXEDMAP to avoid EBUSY in case collision, - * which could be inspired by address space randomisation feature. - * 2) drop MDB_NOSYNC expecting that DBPATH is at a tmpfs or some - * dedicated storage. - */ - env_oflags = 0; - } - E(mdbx_env_open(env, DBPATH, env_oflags, 0664)); - - E(mdbx_txn_begin(env, NULL, 0, &txn)); - if (mdbx_dbi_open(txn, "id2", MDB_CREATE, &dbi) == MDB_SUCCESS) - E(mdbx_drop(txn, dbi, 1)); - E(mdbx_dbi_open(txn, "id2", MDB_CREATE, &dbi)); - - key.mv_size = sizeof(int); - key.mv_data = sval; - - printf("Adding %d values\n", count); - for (i = 0; i < count; i++) { - sprintf(sval, "%03x %d foo bar", values[i], values[i]); - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_KEYEXIST, mdbx_put(txn, dbi, &key, &data, MDB_NOOVERWRITE))) - j++; - } - if (j) - printf("%d duplicates skipped\n", j); - E(mdbx_txn_commit(txn)); - E(mdbx_env_stat(env, &mst, sizeof(mst))); - - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, - (char *)key.mv_data, data.mv_data, (int)data.mv_size, - (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - j = 0; - key.mv_data = sval; - for (i = count - 1; i > -1; i -= (rand() % 5)) { - j++; - txn = NULL; - E(mdbx_txn_begin(env, NULL, 0, &txn)); - sprintf(sval, "%03x ", values[i]); - if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, NULL))) { - j--; - mdbx_txn_abort(txn); - } else { - E(mdbx_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); - - E(mdbx_env_stat(env, &mst, sizeof(mst))); - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, 
&cursor)); - printf("Cursor next\n"); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - mdbx_dbi_close(env, dbi); - mdbx_env_close(env); - return 0; -} diff --git a/test/test3.c b/test/test3.c deleted file mode 100644 index 2dac03d0..00000000 --- a/test/test3.c +++ /dev/null @@ -1,162 +0,0 @@ -/* mtest3.c - memory-mapped database tester/toy */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -/* Tests for sorted duplicate DBs */ -#include "../mdbx.h" -#include -#include -#include -#include -#include -#include -#include - -#define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) -#define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) \ - ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ - __LINE__, msg, mdbx_strerror(rc)), \ - abort())) - -#ifndef DBPATH -#define DBPATH "./tmp.db" -#endif - -int main(int argc, char *argv[]) { - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDBX_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[32]; - char kval[sizeof(int)]; - int env_oflags; - struct stat db_stat, exe_stat; - - (void)argc; - (void)argv; - srand(time(NULL)); - - memset(sval, 0, sizeof(sval)); - - count = (rand() % 384) + 64; - values = (int *)malloc(count * sizeof(int)); - - for (i = 0; i < count; i++) { - values[i] = rand() % 1024; - } - - E(mdbx_env_create(&env)); - E(mdbx_env_set_mapsize(env, 10485760)); - E(mdbx_env_set_maxdbs(env, 4)); - - E(stat("/proc/self/exe", &exe_stat) ? errno : 0); - E(stat(DBPATH "/.", &db_stat) ? errno : 0); - env_oflags = MDB_NOSYNC; - if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { - /* LY: Assume running inside a CI-environment: - * 1) don't use FIXEDMAP to avoid EBUSY in case collision, - * which could be inspired by address space randomisation feature. - * 2) drop MDB_NOSYNC expecting that DBPATH is at a tmpfs or some - * dedicated storage. 
- */ - env_oflags = 0; - } - E(mdbx_env_open(env, DBPATH, env_oflags, 0664)); - - E(mdbx_txn_begin(env, NULL, 0, &txn)); - if (mdbx_dbi_open(txn, "id3", MDB_CREATE, &dbi) == MDB_SUCCESS) - E(mdbx_drop(txn, dbi, 1)); - E(mdbx_dbi_open(txn, "id3", MDB_CREATE | MDB_DUPSORT, &dbi)); - - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - - printf("Adding %d values\n", count); - for (i = 0; i < count; i++) { - if (!(i & 0x0f)) - sprintf(kval, "%03x", values[i]); - sprintf(sval, "%03x %d foo bar", values[i], values[i]); - if (RES(MDB_KEYEXIST, mdbx_put(txn, dbi, &key, &data, MDB_NODUPDATA))) - j++; - } - if (j) - printf("%d duplicates skipped\n", j); - E(mdbx_txn_commit(txn)); - E(mdbx_env_stat(env, &mst, sizeof(mst))); - - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, - (char *)key.mv_data, data.mv_data, (int)data.mv_size, - (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - j = 0; - - for (i = count - 1; i > -1; i -= (rand() % 5)) { - j++; - txn = NULL; - E(mdbx_txn_begin(env, NULL, 0, &txn)); - sprintf(kval, "%03x", values[i & ~0x0f]); - sprintf(sval, "%03x %d foo bar", values[i], values[i]); - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, &data))) { - j--; - mdbx_txn_abort(txn); - } else { - E(mdbx_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); - - E(mdbx_env_stat(env, &mst, sizeof(mst))); - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", 
(int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - mdbx_dbi_close(env, dbi); - mdbx_env_close(env); - return 0; -} diff --git a/test/test4.c b/test/test4.c deleted file mode 100644 index aedec134..00000000 --- a/test/test4.c +++ /dev/null @@ -1,196 +0,0 @@ -/* mtest4.c - memory-mapped database tester/toy */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -/* Tests for sorted duplicate DBs with fixed-size keys */ -#include "../mdbx.h" -#include -#include -#include -#include -#include -#include -#include - -#define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) -#define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) \ - ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ - __LINE__, msg, mdbx_strerror(rc)), \ - abort())) - -#ifndef DBPATH -#define DBPATH "./tmp.db" -#endif - -int main(int argc, char *argv[]) { - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDBX_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[8]; - char kval[sizeof(int)]; - int env_oflags; - struct stat db_stat, exe_stat; - - (void)argc; - (void)argv; - memset(sval, 0, sizeof(sval)); - - count = 510; - values = (int *)malloc(count * sizeof(int)); - - for (i = 0; i < count; i++) { - values[i] = i * 5; - } - - E(mdbx_env_create(&env)); - E(mdbx_env_set_mapsize(env, 10485760)); - E(mdbx_env_set_maxdbs(env, 4)); - - E(stat("/proc/self/exe", &exe_stat) ? errno : 0); - E(stat(DBPATH "/.", &db_stat) ? errno : 0); - env_oflags = MDB_NOSYNC; - if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { - /* LY: Assume running inside a CI-environment: - * 1) don't use FIXEDMAP to avoid EBUSY in case collision, - * which could be inspired by address space randomisation feature. - * 2) drop MDB_NOSYNC expecting that DBPATH is at a tmpfs or some - * dedicated storage. - */ - env_oflags = 0; - } - E(mdbx_env_open(env, DBPATH, env_oflags, 0664)); - - E(mdbx_txn_begin(env, NULL, 0, &txn)); - if (mdbx_dbi_open(txn, "id4", MDB_CREATE, &dbi) == MDB_SUCCESS) - E(mdbx_drop(txn, dbi, 1)); - E(mdbx_dbi_open(txn, "id4", MDB_CREATE | MDB_DUPSORT | MDB_DUPFIXED, &dbi)); - - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - - printf("Adding %d values\n", count); - strcpy(kval, "001"); - for (i = 0; i < count; i++) { - sprintf(sval, "%07x", values[i]); - if (RES(MDB_KEYEXIST, mdbx_put(txn, dbi, &key, &data, MDB_NODUPDATA))) - j++; - } - if (j) - printf("%d duplicates skipped\n", j); - E(mdbx_txn_commit(txn)); - E(mdbx_env_stat(env, &mst, sizeof(mst))); - - /* there should be one full page of dups now. 
- */ - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, - (char *)key.mv_data, data.mv_data, (int)data.mv_size, - (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - /* test all 3 branches of split code: - * 1: new key in lower half - * 2: new key at split point - * 3: new key in upper half - */ - - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - - sprintf(sval, "%07x", values[3] + 1); - E(mdbx_txn_begin(env, NULL, 0, &txn)); - (void)RES(MDB_KEYEXIST, mdbx_put(txn, dbi, &key, &data, MDB_NODUPDATA)); - mdbx_txn_abort(txn); - - sprintf(sval, "%07x", values[255] + 1); - E(mdbx_txn_begin(env, NULL, 0, &txn)); - (void)RES(MDB_KEYEXIST, mdbx_put(txn, dbi, &key, &data, MDB_NODUPDATA)); - mdbx_txn_abort(txn); - - sprintf(sval, "%07x", values[500] + 1); - E(mdbx_txn_begin(env, NULL, 0, &txn)); - (void)RES(MDB_KEYEXIST, mdbx_put(txn, dbi, &key, &data, MDB_NODUPDATA)); - E(mdbx_txn_commit(txn)); - - /* Try MDB_NEXT_MULTIPLE */ - E(mdbx_txn_begin(env, NULL, 0, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT_MULTIPLE)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - j = 0; - - for (i = count - 1; i > -1; i -= (rand() % 3)) { - j++; - txn = NULL; - E(mdbx_txn_begin(env, NULL, 0, &txn)); - sprintf(sval, "%07x", values[i]); - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, &data))) { - j--; - mdbx_txn_abort(txn); - } 
else { - E(mdbx_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); - - E(mdbx_env_stat(env, &mst, sizeof(mst))); - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - mdbx_dbi_close(env, dbi); - mdbx_env_close(env); - return 0; -} diff --git a/test/test5.c b/test/test5.c deleted file mode 100644 index c1018c64..00000000 --- a/test/test5.c +++ /dev/null @@ -1,164 +0,0 @@ -/* mtest5.c - memory-mapped database tester/toy */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -/* Tests for sorted duplicate DBs using cursor_put */ -#include "../mdbx.h" -#include -#include -#include -#include -#include -#include -#include - -#define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) -#define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) \ - ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ - __LINE__, msg, mdbx_strerror(rc)), \ - abort())) - -#ifndef DBPATH -#define DBPATH "./tmp.db" -#endif - -int main(int argc, char *argv[]) { - int i = 0, j = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data; - MDB_txn *txn; - MDBX_stat mst; - MDB_cursor *cursor; - int count; - int *values; - char sval[32]; - char kval[sizeof(int)]; - int env_oflags; - struct stat db_stat, exe_stat; - - (void)argc; - (void)argv; - srand(time(NULL)); - - memset(sval, 0, sizeof(sval)); - - count = (rand() % 384) + 64; - values = (int *)malloc(count * sizeof(int)); - - for (i = 0; i < count; i++) { - values[i] = rand() % 1024; - } - - E(mdbx_env_create(&env)); - E(mdbx_env_set_mapsize(env, 10485760)); - E(mdbx_env_set_maxdbs(env, 4)); - - E(stat("/proc/self/exe", &exe_stat) ? errno : 0); - E(stat(DBPATH "/.", &db_stat) ? errno : 0); - env_oflags = MDB_NOSYNC; - if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { - /* LY: Assume running inside a CI-environment: - * 1) don't use FIXEDMAP to avoid EBUSY in case collision, - * which could be inspired by address space randomisation feature. - * 2) drop MDB_NOSYNC expecting that DBPATH is at a tmpfs or some - * dedicated storage. 
- */ - env_oflags = 0; - } - E(mdbx_env_open(env, DBPATH, env_oflags, 0664)); - - E(mdbx_txn_begin(env, NULL, 0, &txn)); - if (mdbx_dbi_open(txn, "id5", MDB_CREATE, &dbi) == MDB_SUCCESS) - E(mdbx_drop(txn, dbi, 1)); - E(mdbx_dbi_open(txn, "id5", MDB_CREATE | MDB_DUPSORT, &dbi)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - - printf("Adding %d values\n", count); - for (i = 0; i < count; i++) { - if (!(i & 0x0f)) - sprintf(kval, "%03x", values[i]); - sprintf(sval, "%03x %d foo bar", values[i], values[i]); - if (RES(MDB_KEYEXIST, mdbx_cursor_put(cursor, &key, &data, MDB_NODUPDATA))) - j++; - } - if (j) - printf("%d duplicates skipped\n", j); - mdbx_cursor_close(cursor); - E(mdbx_txn_commit(txn)); - E(mdbx_env_stat(env, &mst, sizeof(mst))); - - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int)key.mv_size, - (char *)key.mv_data, data.mv_data, (int)data.mv_size, - (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - j = 0; - - for (i = count - 1; i > -1; i -= (rand() % 5)) { - j++; - txn = NULL; - E(mdbx_txn_begin(env, NULL, 0, &txn)); - sprintf(kval, "%03x", values[i & ~0x0f]); - sprintf(sval, "%03x %d foo bar", values[i], values[i]); - key.mv_size = sizeof(int); - key.mv_data = kval; - data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, &data))) { - j--; - mdbx_txn_abort(txn); - } else { - E(mdbx_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); - - E(mdbx_env_stat(env, &mst, sizeof(mst))); - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = 
mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", (int)key.mv_size, (char *)key.mv_data, - (int)data.mv_size, (char *)data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - mdbx_dbi_close(env, dbi); - mdbx_env_close(env); - return 0; -} diff --git a/test/test6.c b/test/test6.c deleted file mode 100644 index 03b6f7d1..00000000 --- a/test/test6.c +++ /dev/null @@ -1,175 +0,0 @@ -/* mtest6.c - memory-mapped database tester/toy */ - -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2011-2017 Howard Chu, Symas Corp. - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -/* Tests for DB splits and merges */ -#include "../mdbx.h" -#include -#include -#include -#include -#include -#include -#include - -#define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) -#define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) \ - ((test) ? 
(void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ - __LINE__, msg, mdbx_strerror(rc)), \ - abort())) - -#ifndef DBPATH -#define DBPATH "./tmp.db" -#endif - -char dkbuf[1024]; - -int main(int argc, char *argv[]) { - int i = 0, rc; - MDB_env *env; - MDB_dbi dbi; - MDB_val key, data, sdata; - MDB_txn *txn; - MDBX_stat mst; - MDB_cursor *cursor; - long kval; - char *sval; - int env_oflags; - struct stat db_stat, exe_stat; - - (void)argc; - (void)argv; - srand(time(NULL)); - - E(mdbx_env_create(&env)); - E(mdbx_env_set_mapsize(env, 10485760)); - E(mdbx_env_set_maxdbs(env, 4)); - - E(stat("/proc/self/exe", &exe_stat) ? errno : 0); - E(stat(DBPATH "/.", &db_stat) ? errno : 0); - env_oflags = MDB_NOSYNC; - if (major(db_stat.st_dev) != major(exe_stat.st_dev)) { - /* LY: Assume running inside a CI-environment: - * 1) don't use FIXEDMAP to avoid EBUSY in case collision, - * which could be inspired by address space randomisation feature. - * 2) drop MDB_NOSYNC expecting that DBPATH is at a tmpfs or some - * dedicated storage. 
- */ - env_oflags = 0; - } - E(mdbx_env_open(env, DBPATH, env_oflags, 0664)); - - E(mdbx_txn_begin(env, NULL, 0, &txn)); - if (mdbx_dbi_open(txn, "id6", MDB_CREATE, &dbi) == MDB_SUCCESS) - E(mdbx_drop(txn, dbi, 1)); - E(mdbx_dbi_open(txn, "id6", MDB_CREATE | MDB_INTEGERKEY, &dbi)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - E(mdbx_dbi_stat(txn, dbi, &mst, sizeof(mst))); - - sval = calloc(1, mst.ms_psize / 4); - key.mv_size = sizeof(long); - key.mv_data = &kval; - sdata.mv_size = mst.ms_psize / 4 - 30; - sdata.mv_data = sval; - - printf("Adding 12 values, should yield 3 splits\n"); - for (i = 0; i < 12; i++) { - kval = i * 5; - sprintf(sval, "%08lx", kval); - data = sdata; - (void)RES(MDB_KEYEXIST, - mdbx_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); - } - printf("Adding 12 more values, should yield 3 splits\n"); - for (i = 0; i < 12; i++) { - kval = i * 5 + 4; - sprintf(sval, "%08lx", kval); - data = sdata; - (void)RES(MDB_KEYEXIST, - mdbx_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); - } - printf("Adding 12 more values, should yield 3 splits\n"); - for (i = 0; i < 12; i++) { - kval = i * 5 + 1; - sprintf(sval, "%08lx", kval); - data = sdata; - (void)RES(MDB_KEYEXIST, - mdbx_cursor_put(cursor, &key, &data, MDB_NOOVERWRITE)); - } - E(mdbx_cursor_get(cursor, &key, &data, MDB_FIRST)); - - do { - printf("key: %p %s, data: %p %.*s\n", key.mv_data, - mdbx_dkey(&key, dkbuf, sizeof(dkbuf)), data.mv_data, - (int)data.mv_size, (char *)data.mv_data); - } while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0); - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - mdbx_cursor_close(cursor); - mdbx_txn_commit(txn); - -#if 0 - int j=0; - int count = 333; - int *values = alloca(sizeof(int) * count); - - for (i= count - 1; i > -1; i-= (rand()%5)) { - j++; - txn=NULL; - E(mdbx_txn_begin(env, NULL, 0, &txn)); - sprintf(kval, "%03x", values[i & ~0x0f]); - sprintf(sval, "%03x %d foo bar", values[i], values[i]); - key.mv_size = sizeof(int); - key.mv_data = kval; - 
data.mv_size = sizeof(sval); - data.mv_data = sval; - if (RES(MDB_NOTFOUND, mdbx_del(txn, dbi, &key, &data))) { - j--; - mdbx_txn_abort(txn); - } else { - E(mdbx_txn_commit(txn)); - } - } - free(values); - printf("Deleted %d values\n", j); - - E(mdbx_env_stat(env, &mst, sizeof(mst))); - E(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - E(mdbx_cursor_open(txn, dbi, &cursor)); - printf("Cursor next\n"); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - printf("Cursor prev\n"); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_PREV)) == 0) { - printf("key: %.*s, data: %.*s\n", - (int) key.mv_size, (char *) key.mv_data, - (int) data.mv_size, (char *) data.mv_data); - } - CHECK(rc == MDB_NOTFOUND, "mdbx_cursor_get"); - mdbx_cursor_close(cursor); - mdbx_txn_abort(txn); - - mdbx_dbi_close(env, dbi); -#endif - mdbx_env_close(env); - free(sval); - - return 0; -} diff --git a/test/test7.c b/test/test7.c deleted file mode 100644 index 61110d5c..00000000 --- a/test/test7.c +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright 2017 Klaus Malorny - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . 
- */ - -#include -#include -#include -#include -#include - -#include "../mdbx.h" - -static const char *fileName = "/dev/shm/test.mdbx"; -static const char *dbName = "test"; -static long size = 1500000000; -static int recordCount = 33000000; -static int majorIdCount = 6000; -static int minorIdCount = 1000000; -static unsigned int seed = 1; -static long *majorIds; - -typedef struct { - long majorId; - long minorId; -} KeyType; - -typedef struct { long refId; } DataType; - -typedef struct { - KeyType key; - DataType data; -} KeyDataType; - -void check(const char *op, int error) { - if (error != 0) { - fprintf(stderr, "%s: unexpected error %d: %s\n", op, error, - mdbx_strerror(error)); - exit(1); - } -} - -void shuffle(void *data, int recordSize, int recordCount) { - char *ptr = (char *)data; - char *swapBuf = malloc(recordSize); - - for (int i = recordCount - 2; i >= 0; i--) { - int j = (int)(random() % (recordCount - i)); - - if (j > 0) { - char *ptr1 = ptr + i * recordSize; - char *ptr2 = ptr + (i + j) * recordSize; - - memcpy(swapBuf, ptr1, recordSize); - memcpy(ptr1, ptr2, recordSize); - memcpy(ptr2, swapBuf, recordSize); - } - } - - free(swapBuf); -} - -void fill(MDB_env *env, MDB_dbi dbi) { - KeyType key; - DataType data; - - MDB_val keyRef; - MDB_val dataRef; - MDB_txn *txn; - - printf("generating data\n"); - - srandom(seed); - - majorIds = (long *)malloc(majorIdCount * sizeof(long)); - - if (!majorIds) { - fprintf(stderr, "out of memory\n"); - exit(1); - } - - for (int i = 0; i < majorIdCount; i++) - majorIds[i] = i; - - // now shuffle (for later deletion test) - shuffle((void *)majorIds, sizeof(long), majorIdCount); - - KeyDataType *records = malloc(sizeof(KeyDataType) * recordCount); - KeyDataType *ptr = records; - int remaining = recordCount; - long refId = 0; - - for (int i = 0; i < minorIdCount; i++) { - long majorId = random() % majorIdCount; - long minorId = i; - - int max = remaining / (minorIdCount - i + 1); - int use; - - if (i == minorIdCount - 1 || 
max < 2) { - use = max; - - } else { - long rand1 = random() % max; - long rand2 = random() % max; - use = (int)((rand1 * rand2 / (max - 1))) + 1; // non-linear distribution - } - - // printf ("%d %d %d\n", i, max, use); - - while (use-- > 0) { - ptr->key.majorId = majorId; - ptr->key.minorId = minorId; - ptr->data.refId = ++refId; - ptr++; - remaining--; - } - } - - shuffle((void *)records, sizeof(KeyDataType), recordCount); - - printf("writing data\n"); - - check("txn_begin", mdbx_txn_begin(env, NULL, 0, &txn)); - - ptr = records; - - for (int i = recordCount; i > 0; i--) { - - key.majorId = htobe64(ptr->key.majorId); - key.minorId = htobe64(ptr->key.minorId); - data.refId = htobe64(ptr->data.refId); - - keyRef.mv_size = sizeof(key); - keyRef.mv_data = (void *)&key; - dataRef.mv_size = sizeof(data); - dataRef.mv_data = (void *)&data; - - check("mdbx_put", mdbx_put(txn, dbi, &keyRef, &dataRef, 0)); - - ptr++; - } - - check("txn_commit", mdbx_txn_commit(txn)); - - printf("%d records written\n", recordCount); -} - -void deleteRange(MDB_env *env, MDB_dbi dbi, MDB_txn *txn, KeyType *startKey, - KeyType *endKey, int endIsInclusive) { - MDB_cursor *cursor; - MDB_val curKeyRef; - MDB_val endKeyRef; - MDB_val curDataRef; - (void)env; - - check("cursor_open", mdbx_cursor_open(txn, dbi, &cursor)); - - curKeyRef.mv_size = sizeof(KeyType); - curKeyRef.mv_data = (void *)startKey; - endKeyRef.mv_size = sizeof(KeyType); - endKeyRef.mv_data = (void *)endKey; - curDataRef.mv_size = 0; - curDataRef.mv_data = NULL; - - int error = mdbx_cursor_get(cursor, &curKeyRef, &curDataRef, MDB_SET_RANGE); - - while (error != MDB_NOTFOUND) { - check("mdbx_cursor_get", error); - - int compResult = mdbx_cmp(txn, dbi, &curKeyRef, &endKeyRef); - - if (compResult > 0 || (!compResult && !endIsInclusive)) - break; - - check("mdbx_cursor_del", mdbx_cursor_del(cursor, MDB_NODUPDATA)); - - error = mdbx_cursor_get(cursor, &curKeyRef, &curDataRef, MDB_NEXT); - } - - mdbx_cursor_close(cursor); -} - -void 
testDelete(MDB_env *env, MDB_dbi dbi) { - MDB_txn *txn; - KeyType startKey; - KeyType endKey; - - printf("testing\n"); - - check("txn_begin", mdbx_txn_begin(env, NULL, 0, &txn)); - - long majorId; - - for (int i = 0; i < majorIdCount; i++) { - majorId = majorIds[i]; - startKey.majorId = htobe64(majorId); - startKey.minorId = htobe64(1); - endKey.majorId = htobe64(majorId); - endKey.minorId = htobe64((long)(~0UL >> 1)); - - deleteRange(env, dbi, txn, &startKey, &endKey, 1); - } - - check("txn_commit", mdbx_txn_commit(txn)); -} - -int main(int argc, char *argv[]) { - MDB_env *env; - MDB_dbi dbi; - MDB_txn *txn; - (void)argc; - (void)argv; - - printf("LMDB version: %s\n", MDBX_VERSION_STRING); - - unlink(fileName); - check("env_create", mdbx_env_create(&env)); - check("env_set_mapsize", mdbx_env_set_mapsize(env, size)); - check("env_set_maxdbs", mdbx_env_set_maxdbs(env, 2)); - - check("env_open", - mdbx_env_open(env, fileName, MDB_NOSUBDIR | MDB_WRITEMAP, 0666)); - - check("txn_begin", mdbx_txn_begin(env, NULL, 0, &txn)); - - check("dbi_open", mdbx_dbi_open(txn, dbName, MDB_CREATE | MDB_DUPSORT, &dbi)); - - check("txn_commit", mdbx_txn_commit(txn)); - - fill(env, dbi); - testDelete(env, dbi); - - mdbx_env_close(env); - - printf("done.\n"); -} diff --git a/test/test_bench.c b/test/test_bench.c deleted file mode 100644 index 377fbf70..00000000 --- a/test/test_bench.c +++ /dev/null @@ -1,260 +0,0 @@ -/* - * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2015,2016 Peter-Service R&D LLC. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../mdbx.h" - -#define E(expr) CHECK((rc = (expr)) == MDB_SUCCESS, #expr) -#define RES(err, expr) ((rc = expr) == (err) || (CHECK(!rc, #expr), 0)) -#define CHECK(test, msg) \ - ((test) ? (void)0 : ((void)fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, \ - __LINE__, msg, mdbx_strerror(rc)), \ - abort())) - -#ifndef DBPATH -#define DBPATH "./tmp.db" -#endif - -struct t0 { - struct rusage ru; - struct timespec ts; -}; - -void t0(struct t0 *t0) { - int rc; - E(getrusage(RUSAGE_SELF, &t0->ru)); - E(clock_gettime(CLOCK_MONOTONIC_RAW, &t0->ts)); -} - -struct info { - double wall_s, cpu_sys_s, cpu_user_s; - long iops_r, iops_w, iops_pf; -}; - -double delta_s(const struct timeval *begin, const struct timeval *end) { - return end->tv_sec - begin->tv_sec + - (end->tv_usec - begin->tv_usec) / 1000000.0; -} - -double delta2_s(const struct timespec *begin, const struct timespec *end) { - return end->tv_sec - begin->tv_sec + - (end->tv_nsec - begin->tv_nsec) / 1000000000.0; -} - -void measure(const struct t0 *t0, struct info *i) { - struct t0 t1; - int rc; - - E(clock_gettime(CLOCK_MONOTONIC_RAW, &t1.ts)); - E(getrusage(RUSAGE_SELF, &t1.ru)); - - i->wall_s = delta2_s(&t0->ts, &t1.ts); - i->cpu_user_s = delta_s(&t0->ru.ru_utime, &t1.ru.ru_utime); - i->cpu_sys_s = delta_s(&t0->ru.ru_stime, &t1.ru.ru_stime); - i->iops_r = t1.ru.ru_inblock - t0->ru.ru_inblock; - i->iops_w = t1.ru.ru_oublock - t0->ru.ru_oublock; - i->iops_pf = - t1.ru.ru_majflt - t0->ru.ru_majflt + t1.ru.ru_minflt - t0->ru.ru_minflt; -} - -void print(struct info *i) { - printf("wall-clock %.3f, iops: %lu reads, %lu writes, %lu page-faults, " - "cpu: %.3f user, %.3f sys\n", - i->wall_s, i->iops_r, i->iops_w, i->iops_pf, i->cpu_user_s, - i->cpu_sys_s); -} - -static void wbench(int flags, int mb, int count, int salt) { - MDB_env *env; - MDB_dbi dbi; - MDB_txn *txn; - MDB_val key, data; - unsigned key_value = salt; - 
char data_value[777]; - int i, rc; - struct t0 start; - struct info ra, rd, rs, rt; - - mkdir(DBPATH, 0755); - unlink(DBPATH "/data.mdb"); - unlink(DBPATH "/lock.mdb"); - - printf("\nProbing %d Mb, %d items, flags:", mb, count); - if (flags & MDB_NOSYNC) - printf(" NOSYNC"); - if (flags & MDB_NOMETASYNC) - printf(" NOMETASYNC"); - if (flags & MDB_WRITEMAP) - printf(" WRITEMAP"); - if (flags & MDB_MAPASYNC) - printf(" MAPASYNC"); -#if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) - if (flags & MDBX_COALESCE) - printf(" COALESCE"); - if (flags & MDBX_LIFORECLAIM) - printf(" LIFO"); -#endif - printf(" 0x%X\n", flags); - - E(mdbx_env_create(&env)); - E(mdbx_env_set_mapsize(env, (1ull << 20) * mb)); - E(mdbx_env_open(env, DBPATH, flags, 0664)); - - key.mv_size = sizeof(key_value); - key.mv_data = &key_value; - data.mv_size = sizeof(data_value); - data.mv_data = &data_value; - - printf("\tAdding %d values...", count); - fflush(stdout); - key_value = salt; - t0(&start); - for (i = 0; i < count; ++i) { - E(mdbx_txn_begin(env, NULL, 0, &txn)); - E(mdbx_dbi_open(txn, NULL, 0, &dbi)); - - snprintf(data_value, sizeof(data_value), "value=%u", key_value); - E(mdbx_put(txn, dbi, &key, &data, MDB_NOOVERWRITE)); - E(mdbx_txn_commit(txn)); - - key_value = key_value * 1664525 + 1013904223; - } - measure(&start, &ra); - print(&ra); - - printf("\tDeleting %d values...", count); - fflush(stdout); - key_value = salt; - t0(&start); - for (i = 0; i < count; ++i) { - E(mdbx_txn_begin(env, NULL, 0, &txn)); - E(mdbx_dbi_open(txn, NULL, 0, &dbi)); - - E(mdbx_del(txn, dbi, &key, NULL)); - E(mdbx_txn_commit(txn)); - - key_value = key_value * 1664525 + 1013904223; - } - measure(&start, &rd); - print(&rd); - - printf("\tCheckpoint..."); - fflush(stdout); - t0(&start); - mdbx_env_sync(env, 1); - measure(&start, &rs); - print(&rs); - - mdbx_env_close(env); - rt.wall_s = ra.wall_s + rd.wall_s + rs.wall_s; - rt.cpu_sys_s = ra.cpu_sys_s + rd.cpu_sys_s + rs.cpu_sys_s; - rt.cpu_user_s = 
ra.cpu_user_s + rd.cpu_user_s + rs.cpu_user_s; - rt.iops_r = ra.iops_r + rd.iops_r + rs.iops_r; - rt.iops_w = ra.iops_w + rd.iops_w + rs.iops_w; - rt.iops_pf = ra.iops_pf + rd.iops_pf + rs.iops_pf; - printf("Total "); - print(&rt); - - fprintf(stderr, "flags: "); - if (flags & MDB_NOSYNC) - fprintf(stderr, " NOSYNC"); - if (flags & MDB_NOMETASYNC) - fprintf(stderr, " NOMETASYNC"); - if (flags & MDB_WRITEMAP) - fprintf(stderr, " WRITEMAP"); - if (flags & MDB_MAPASYNC) - fprintf(stderr, " MAPASYNC"); -#if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) - if (flags & MDBX_COALESCE) - fprintf(stderr, " COALESCE"); - if (flags & MDBX_LIFORECLAIM) - fprintf(stderr, " LIFO"); -#endif - fprintf(stderr, "\t%.3f\t%.3f\t%.3f\t%.3f\n", rt.iops_w / 1000.0, - rt.cpu_user_s, rt.cpu_sys_s, rt.wall_s); -} - -int main(int argc, char *argv[]) { - (void)argc; - (void)argv; - -#define SALT 1 -#define COUNT 10000 -#define SIZE 12 - - printf("\nDefault 'sync' mode..."); - wbench(0, SIZE, COUNT, SALT); -#if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) - // wbench(MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); -// wbench(MDBX_LIFORECLAIM, SIZE, COUNT, SALT); -#endif - - printf("\nno-meta-sync hack..."); - wbench(MDB_NOMETASYNC, SIZE, COUNT, SALT); -#if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) - // wbench(MDB_NOMETASYNC | MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDB_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); -// wbench(MDB_NOMETASYNC | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); -#endif - - printf("\nno-sync..."); - wbench(MDB_NOSYNC, SIZE, COUNT, SALT); -#if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) -// wbench(MDB_NOSYNC | MDBX_COALESCE, SIZE, COUNT, SALT); -// wbench(MDB_NOSYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, -// SALT); -// wbench(MDB_NOSYNC | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); -#endif - - printf("\nr/w-map..."); - wbench(MDB_WRITEMAP, SIZE, COUNT, SALT); -#if 
defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) - // wbench(MDB_WRITEMAP | MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDB_WRITEMAP | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); -// wbench(MDB_WRITEMAP | MDBX_LIFORECLAIM, SIZE, COUNT, SALT); -#endif - - printf("\nasync..."); - wbench(MDB_WRITEMAP | MDB_MAPASYNC, SIZE, COUNT, SALT); -#if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) - // wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_COALESCE, SIZE, COUNT, - // SALT); - wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, - COUNT, SALT); -// wbench(MDB_WRITEMAP | MDB_MAPASYNC | MDBX_LIFORECLAIM, SIZE, COUNT, -// SALT); -#endif - - printf("\nr/w-map + no-sync..."); - wbench(MDB_NOSYNC | MDB_WRITEMAP, SIZE, COUNT, SALT); -#if defined(MDBX_COALESCE) && defined(MDBX_LIFORECLAIM) - // wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_COALESCE, SIZE, COUNT, SALT); - wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_COALESCE | MDBX_LIFORECLAIM, SIZE, - COUNT, SALT); -// wbench(MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, SIZE, COUNT, -// SALT); -#endif - - return 0; -} diff --git a/test/test_yota1.c b/test/test_yota1.c deleted file mode 100644 index 7d036f0f..00000000 --- a/test/test_yota1.c +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Copyright 2016-2017 Leonid Yuriev . - * Copyright 2015 Vladimir Romanov - * , Yota Lab. - * - * This file is part of libmdbx. - * - * libmdbx is free software; you can redistribute it and/or modify it under - * the terms of the GNU Affero General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * libmdbx is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. 
- * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#include -#include - -#include "../mdbx.h" -#include -#include -#include -#include -#include -#include -#include - -#define IP_PRINTF_ARG_HOST(addr) \ - (int)((addr) >> 24), (int)((addr) >> 16 & 0xff), (int)((addr) >> 8 & 0xff), \ - (int)((addr)&0xff) - -char opt_db_path[PATH_MAX] = "/dev/shm/x_bench1"; -static MDB_env *env; -#define REC_COUNT 1000000 -int64_t ids[REC_COUNT + REC_COUNT / 10]; -int32_t ids_count = 0; - -int64_t x_add = 0; -int64_t x_del = 0; -int64_t obj_id = 0; - -static void add_id_to_pool(int64_t id) { - ids[ids_count] = id; - ids_count++; -} - -static inline int64_t getTimeMicroseconds(void) { - struct timeval val; - gettimeofday(&val, NULL); - return val.tv_sec * ((int64_t)1000000) + val.tv_usec; -} - -static int64_t get_id_from_pool() { - if (ids_count == 0) { - return -1; - } - int32_t index = rand() % ids_count; - int64_t id = ids[index]; - ids[index] = ids[ids_count - 1]; - ids_count--; - return id; -} - -#define LMDB_CHECK(x) \ - do { \ - const int rc = (x); \ - if (rc != MDB_SUCCESS) { \ - printf("Error [%d] %s in %s at %s:%d\n", rc, mdbx_strerror(rc), #x, \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -static void db_connect() { - LMDB_CHECK(mdbx_env_create(&env)); - LMDB_CHECK(mdbx_env_set_mapsize(env, 3L * 1024L * 1024L * 1024L)); - LMDB_CHECK(mdbx_env_set_maxdbs(env, 30)); -#if defined(MDBX_LIFORECLAIM) - LMDB_CHECK(mdbx_env_open( - env, opt_db_path, - MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, 0664)); -#else - LMDB_CHECK(mdbx_env_open(env, opt_db_path, - MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664)); -#endif - printf("Connection open\n"); -} - -typedef struct { - char session_id1[100]; - char session_id2[100]; - char ip[20]; - uint8_t fill[100]; -} session_data_t; - -typedef struct { - int64_t obj_id; - int8_t event_type; -} __attribute__((__packed__)) 
event_data_t; - -static void create_record(int64_t record_id) { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; - event_data_t event; - MDB_txn *txn; - session_data_t data; - // transaction init - snprintf(data.session_id1, sizeof(data.session_id1), - "mskugw%02ld_%02ld.gx.yota.ru;3800464060;4152;%ld", - record_id % 3 + 1, record_id % 9 + 1, record_id); - snprintf(data.session_id2, sizeof(data.session_id2), - "gx_service;%ld;%ld;node@spb-jsm1", record_id, - record_id % 1000000000 + 99999); - snprintf(data.ip, sizeof(data.ip), "%d.%d.%d.%d", - IP_PRINTF_ARG_HOST(record_id & 0xFFFFFFFF)); - event.obj_id = record_id; - event.event_type = 1; - - MDB_val _session_id1_rec = {data.session_id1, strlen(data.session_id1)}; - MDB_val _session_id2_rec = {data.session_id2, strlen(data.session_id2)}; - MDB_val _ip_rec = {data.ip, strlen(data.ip)}; - MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; - MDB_val _data_rec = {&data, offsetof(session_data_t, fill) + - (rand() % sizeof(data.fill))}; - MDB_val _event_rec = {&event, sizeof(event)}; - - LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); - LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - LMDB_CHECK(mdbx_put(txn, dbi_session, &_obj_id_rec, &_data_rec, - MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdbx_put(txn, dbi_session_id, &_session_id1_rec, &_obj_id_rec, - MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdbx_put(txn, dbi_session_id, &_session_id2_rec, &_obj_id_rec, - MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdbx_put(txn, dbi_ip, &_ip_rec, &_obj_id_rec, 0)); - LMDB_CHECK(mdbx_put(txn, dbi_event, &_event_rec, &_obj_id_rec, 0)); - - // transaction commit - LMDB_CHECK(mdbx_txn_commit(txn)); - x_add++; -} - -static void delete_record(int64_t 
record_id) { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; - event_data_t event; - MDB_txn *txn; - - // transaction init - LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); - // open database in read-write mode - LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - // put data - MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; - MDB_val v_rec; - // get data - LMDB_CHECK(mdbx_get(txn, dbi_session, &_obj_id_rec, &v_rec)); - session_data_t *data = (session_data_t *)v_rec.mv_data; - - MDB_val _session_id1_rec = {data->session_id1, strlen(data->session_id1)}; - MDB_val _session_id2_rec = {data->session_id2, strlen(data->session_id2)}; - MDB_val _ip_rec = {data->ip, strlen(data->ip)}; - LMDB_CHECK(mdbx_del(txn, dbi_session_id, &_session_id1_rec, NULL)); - LMDB_CHECK(mdbx_del(txn, dbi_session_id, &_session_id2_rec, NULL)); - LMDB_CHECK(mdbx_del(txn, dbi_ip, &_ip_rec, NULL)); - event.obj_id = record_id; - event.event_type = 1; - MDB_val _event_rec = {&event, sizeof(event)}; - LMDB_CHECK(mdbx_del(txn, dbi_event, &_event_rec, NULL)); - LMDB_CHECK(mdbx_del(txn, dbi_session, &_obj_id_rec, NULL)); - - // transaction commit - LMDB_CHECK(mdbx_txn_commit(txn)); - x_del++; -} - -static void db_disconnect() { - mdbx_env_close(env); - printf("Connection closed\n"); -} - -static void get_db_stat(const char *db, int64_t *ms_branch_pages, - int64_t *ms_leaf_pages) { - MDB_txn *txn; - MDBX_stat stat; - MDB_dbi dbi; - - LMDB_CHECK(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - LMDB_CHECK(mdbx_dbi_open(txn, db, MDB_CREATE, &dbi)); - LMDB_CHECK(mdbx_dbi_stat(txn, dbi, &stat, sizeof(stat))); - mdbx_txn_abort(txn); - printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", db, - stat.ms_branch_pages, stat.ms_depth, 
stat.ms_entries, - stat.ms_leaf_pages, stat.ms_overflow_pages); - (*ms_branch_pages) += stat.ms_branch_pages; - (*ms_leaf_pages) += stat.ms_leaf_pages; -} - -static void periodic_stat(void) { - int64_t ms_branch_pages = 0; - int64_t ms_leaf_pages = 0; - printf(" Name | ms_branch_pages | depth | entries | " - "leaf_pages | overf_pages |\n"); - get_db_stat("session", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("session_id", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("event", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("ip", &ms_branch_pages, &ms_leaf_pages); - printf("%15s | %15ld | %5s | %10s | %10ld | %11s |\n", "", ms_branch_pages, - "", "", ms_leaf_pages, ""); - static int64_t prev_add; - static int64_t prev_del; - static int64_t t = -1; - if (t > 0) { - int64_t delta = getTimeMicroseconds() - t; - printf("CPS: add %ld, delete %ld, items processed - %ld\n", - (x_add - prev_add) * 1000000 / delta, - (x_del - prev_del) * 1000000 / delta, obj_id); - } - t = getTimeMicroseconds(); - prev_add = x_add; - prev_del = x_del; -} - -static void periodic_add_rec() { - int i; - for (i = 0; i < 10000; i++) { - if (ids_count <= REC_COUNT) { - int64_t id = obj_id++; - create_record(id); - add_id_to_pool(id); - } - if (ids_count > REC_COUNT) { - int64_t id = get_id_from_pool(); - delete_record(id); - } - } - periodic_stat(); -} - -int main(int argc, char **argv) { - (void)argc; - (void)argv; - - char filename[PATH_MAX]; - mkdir(opt_db_path, 0775); - - strcpy(filename, opt_db_path); - strcat(filename, "/data.mdb"); - remove(filename); - - strcpy(filename, opt_db_path); - strcat(filename, "/lock.mdb"); - remove(filename); - - db_connect(); - while (1) { - periodic_add_rec(); - } - db_disconnect(); - return 0; -} diff --git a/test/test_yota2.c b/test/test_yota2.c deleted file mode 100644 index 79c72880..00000000 --- a/test/test_yota2.c +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright 2016-2017 Leonid Yuriev . - * Copyright 2015 Vladimir Romanov - * , Yota Lab. 
- * - * This file is part of libmdbx. - * - * libmdbx is free software; you can redistribute it and/or modify it under - * the terms of the GNU Affero General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * libmdbx is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ - -#include -#include - -#include "../mdbx.h" -#include -#include -#include -#include -#include -#include -#include - -#define IP_PRINTF_ARG_HOST(addr) \ - (int)((addr) >> 24), (int)((addr) >> 16 & 0xff), (int)((addr) >> 8 & 0xff), \ - (int)((addr)&0xff) - -char opt_db_path[PATH_MAX] = "/dev/shm/x_bench2"; -static MDB_env *env; -#define REC_COUNT 1024000 -int64_t ids[REC_COUNT * 10]; -int32_t ids_count = 0; - -int64_t x_add = 0; -int64_t x_del = 0; -int64_t obj_id = 0; -int64_t x_data_size = 0; -int64_t x_key_size = 0; - -static void add_id_to_pool(int64_t id) { - ids[ids_count] = id; - ids_count++; -} - -static inline int64_t getTimeMicroseconds(void) { - struct timeval val; - gettimeofday(&val, NULL); - return val.tv_sec * ((int64_t)1000000) + val.tv_usec; -} - -static int64_t get_id_from_pool() { - if (ids_count == 0) { - return -1; - } - int32_t index = rand() % ids_count; - int64_t id = ids[index]; - ids[index] = ids[ids_count - 1]; - ids_count--; - return id; -} - -#define LMDB_CHECK(x) \ - do { \ - const int rc = (x); \ - if (rc != MDB_SUCCESS) { \ - printf("Error [%d] %s in %s at %s:%d\n", rc, mdbx_strerror(rc), #x, \ - __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -static void db_connect() { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - 
MDB_dbi dbi_ip; - - LMDB_CHECK(mdbx_env_create(&env)); - LMDB_CHECK(mdbx_env_set_mapsize(env, 300000L * 4096L)); - LMDB_CHECK(mdbx_env_set_maxdbs(env, 30)); -#if defined(MDBX_LIFORECLAIM) - LMDB_CHECK(mdbx_env_open( - env, opt_db_path, - MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, 0664)); -#else - LMDB_CHECK(mdbx_env_open(env, opt_db_path, - MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664)); -#endif - MDB_txn *txn; - - // transaction init - LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); - // open database in read-write mode - LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - // transaction commit - LMDB_CHECK(mdbx_txn_commit(txn)); - printf("Connection open\n"); -} - -typedef struct { - char session_id1[100]; - char session_id2[100]; - char ip[20]; - uint8_t fill[100]; -} session_data_t; - -typedef struct { - int64_t obj_id; - int8_t event_type; -} __attribute__((__packed__)) event_data_t; - -static void create_record(int64_t record_id) { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; - event_data_t event; - MDB_txn *txn; - session_data_t data; - // transaction init - snprintf(data.session_id1, sizeof(data.session_id1), - "mskugw%02ld_%02ld.gx.yota.ru;3800464060;4152;%ld", - record_id % 3 + 1, record_id % 9 + 1, record_id); - snprintf(data.session_id2, sizeof(data.session_id2), - "gx_service;%ld;%ld;node@spb-jsm1", record_id, - record_id % 1000000000 + 99999); - snprintf(data.ip, sizeof(data.ip), "%d.%d.%d.%d", - IP_PRINTF_ARG_HOST(record_id & 0xFFFFFFFF)); - event.obj_id = record_id; - event.event_type = 1; - - MDB_val _session_id1_rec = {data.session_id1, strlen(data.session_id1)}; - MDB_val _session_id2_rec = {data.session_id2, strlen(data.session_id2)}; - MDB_val _ip_rec = {data.ip, 
strlen(data.ip)}; - MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; - MDB_val _data_rec = {&data, offsetof(session_data_t, fill) + - (rand() % sizeof(data.fill))}; - MDB_val _event_rec = {&event, sizeof(event)}; - - LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); - LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - LMDB_CHECK(mdbx_put(txn, dbi_session, &_obj_id_rec, &_data_rec, - MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdbx_put(txn, dbi_session_id, &_session_id1_rec, &_obj_id_rec, - MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdbx_put(txn, dbi_session_id, &_session_id2_rec, &_obj_id_rec, - MDB_NOOVERWRITE | MDB_NODUPDATA)); - LMDB_CHECK(mdbx_put(txn, dbi_ip, &_ip_rec, &_obj_id_rec, 0)); - LMDB_CHECK(mdbx_put(txn, dbi_event, &_event_rec, &_obj_id_rec, 0)); - x_data_size += (_data_rec.mv_size + _obj_id_rec.mv_size * 4); - x_key_size += - (_obj_id_rec.mv_size + _session_id1_rec.mv_size + - _session_id2_rec.mv_size + _ip_rec.mv_size + _event_rec.mv_size); - - // transaction commit - LMDB_CHECK(mdbx_txn_commit(txn)); - x_add++; -} - -static void delete_record(int64_t record_id) { - MDB_dbi dbi_session; - MDB_dbi dbi_session_id; - MDB_dbi dbi_event; - MDB_dbi dbi_ip; - event_data_t event; - MDB_txn *txn; - - // transaction init - LMDB_CHECK(mdbx_txn_begin(env, NULL, 0, &txn)); - // open database in read-write mode - LMDB_CHECK(mdbx_dbi_open(txn, "session", MDB_CREATE, &dbi_session)); - LMDB_CHECK(mdbx_dbi_open(txn, "session_id", MDB_CREATE, &dbi_session_id)); - LMDB_CHECK(mdbx_dbi_open(txn, "event", MDB_CREATE, &dbi_event)); - LMDB_CHECK(mdbx_dbi_open(txn, "ip", MDB_CREATE, &dbi_ip)); - // put data - MDB_val _obj_id_rec = {&record_id, sizeof(record_id)}; - MDB_val _data_rec; - // get data - LMDB_CHECK(mdbx_get(txn, dbi_session, 
&_obj_id_rec, &_data_rec)); - session_data_t *data = (session_data_t *)_data_rec.mv_data; - - MDB_val _session_id1_rec = {data->session_id1, strlen(data->session_id1)}; - MDB_val _session_id2_rec = {data->session_id2, strlen(data->session_id2)}; - MDB_val _ip_rec = {data->ip, strlen(data->ip)}; - LMDB_CHECK(mdbx_del(txn, dbi_session_id, &_session_id1_rec, NULL)); - LMDB_CHECK(mdbx_del(txn, dbi_session_id, &_session_id2_rec, NULL)); - LMDB_CHECK(mdbx_del(txn, dbi_ip, &_ip_rec, NULL)); - event.obj_id = record_id; - event.event_type = 1; - MDB_val _event_rec = {&event, sizeof(event)}; - LMDB_CHECK(mdbx_del(txn, dbi_event, &_event_rec, NULL)); - LMDB_CHECK(mdbx_del(txn, dbi_session, &_obj_id_rec, NULL)); - - x_data_size -= (_data_rec.mv_size + _obj_id_rec.mv_size * 4); - x_key_size -= - (_obj_id_rec.mv_size + _session_id1_rec.mv_size + - _session_id2_rec.mv_size + _ip_rec.mv_size + _event_rec.mv_size); - - // transaction commit - LMDB_CHECK(mdbx_txn_commit(txn)); - x_del++; -} - -static void db_disconnect() { - mdbx_env_close(env); - printf("Connection closed\n"); -} - -static void get_db_stat(const char *db, int64_t *ms_branch_pages, - int64_t *ms_leaf_pages) { - MDB_txn *txn; - MDBX_stat stat; - MDB_dbi dbi; - - LMDB_CHECK(mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn)); - LMDB_CHECK(mdbx_dbi_open(txn, db, MDB_CREATE, &dbi)); - LMDB_CHECK(mdbx_dbi_stat(txn, dbi, &stat, sizeof(stat))); - mdbx_txn_abort(txn); - printf("%15s | %15ld | %5u | %10ld | %10ld | %11ld |\n", db, - stat.ms_branch_pages, stat.ms_depth, stat.ms_entries, - stat.ms_leaf_pages, stat.ms_overflow_pages); - (*ms_branch_pages) += stat.ms_branch_pages; - (*ms_leaf_pages) += stat.ms_leaf_pages; -} - -static void periodic_stat(void) { - int64_t ms_branch_pages = 0; - int64_t ms_leaf_pages = 0; - printf(" Name | ms_branch_pages | depth | entries | " - "leaf_pages | overf_pages |\n"); - get_db_stat("session", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("session_id", &ms_branch_pages, &ms_leaf_pages); - 
get_db_stat("event", &ms_branch_pages, &ms_leaf_pages); - get_db_stat("ip", &ms_branch_pages, &ms_leaf_pages); - printf("%15s | %15ld | %5s | %10s | %10ld | %11s |\n", "", ms_branch_pages, - "", "", ms_leaf_pages, ""); - static int64_t prev_add; - static int64_t prev_del; - static int64_t t = -1; - if (t > 0) { - int64_t delta = getTimeMicroseconds() - t; - printf("CPS: add %ld, delete %ld, items processed - %ldK data=%ldK " - "key=%ldK\n", - (x_add - prev_add) * 1000000 / delta, - (x_del - prev_del) * 1000000 / delta, obj_id / 1024, - x_data_size / 1024, x_key_size / 1024); - printf("usage data=%ld%%\n", - ((x_data_size + x_key_size) * 100) / - ((ms_leaf_pages + ms_branch_pages) * 4096)); - } - t = getTimeMicroseconds(); - prev_add = x_add; - prev_del = x_del; -} - -// static void periodic_add_rec() { -// for (int i = 0; i < 10240; i++) { -// if (ids_count <= REC_COUNT) { -// int64_t id = obj_id++; -// create_record(id); -// add_id_to_pool(id); -// } -// if (ids_count > REC_COUNT) { -// int64_t id = get_id_from_pool(); -// delete_record(id); -// } -// } -// periodic_stat(); -//} - -int main(int argc, char **argv) { - (void)argc; - (void)argv; - - char filename[PATH_MAX]; - int i; - int64_t t; - - mkdir(opt_db_path, 0775); - - strcpy(filename, opt_db_path); - strcat(filename, "/data.mdb"); - remove(filename); - - strcpy(filename, opt_db_path); - strcat(filename, "/lock.mdb"); - remove(filename); - - db_connect(); - periodic_stat(); - for (i = 0; i < 1024000; i++) { - int64_t id = obj_id++; - create_record(id); - add_id_to_pool(id); - } - periodic_stat(); - t = getTimeMicroseconds(); - while (1) { - int i; - int64_t now; - for (i = 0; i < 100; i++) { - int64_t id = obj_id++; - create_record(id); - add_id_to_pool(id); - id = get_id_from_pool(); - delete_record(id); - } - // int64_t id = obj_id++; - // create_record(id); - // add_id_to_pool(id); - now = getTimeMicroseconds(); - if ((now - t) > 100000) { - periodic_stat(); - t = now; - } - } - db_disconnect(); - return 
0; -} diff --git a/test/utils.cc b/test/utils.cc new file mode 100644 index 00000000..c3be0ec0 --- /dev/null +++ b/test/utils.cc @@ -0,0 +1,90 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "test.h" + +std::string format(const char *fmt, ...) { + va_list ap, ones; + va_start(ap, fmt); + va_copy(ones, ap); +#ifdef _MSC_VER + int needed = _vscprintf(fmt, ap); +#else + int needed = vsnprintf(nullptr, 0, fmt, ap); +#endif + assert(needed >= 0); + va_end(ap); + std::string result; + result.reserve((size_t)needed + 1); + result.resize((size_t)needed, '\0'); + int actual = vsnprintf((char *)result.data(), result.capacity(), fmt, ones); + assert(actual == needed); + (void)actual; + va_end(ones); + return result; +} + +std::string data2hex(const void *ptr, size_t bytes, simple_checksum &checksum) { + std::string result; + if (bytes > 0) { + const uint8_t *data = (const uint8_t *)ptr; + checksum.push(data, bytes); + result.reserve(bytes * 2); + const uint8_t *const end = data + bytes; + do { + char h = *data >> 4; + char l = *data & 15; + result.push_back((l < 10) ? l + '0' : l - 10 + 'a'); + result.push_back((h < 10) ? 
h + '0' : h - 10 + 'a'); + } while (++data < end); + } + assert(result.size() == bytes * 2); + return result; +} + +bool hex2data(const char *hex_begin, const char *hex_end, void *ptr, + size_t bytes, simple_checksum &checksum) { + if (bytes * 2 != (size_t)(hex_end - hex_begin)) + return false; + + uint8_t *data = (uint8_t *)ptr; + for (const char *hex = hex_begin; hex != hex_end; hex += 2, ++data) { + unsigned l = hex[0], h = hex[1]; + + if (l >= '0' && l <= '9') + l = l - '0'; + else if (l >= 'A' && l <= 'F') + l = l - 'A' + 10; + else if (l >= 'a' && l <= 'f') + l = l - 'a' + 10; + else + return false; + + if (h >= '0' && h <= '9') + h = h - '0'; + else if (h >= 'A' && h <= 'F') + h = h - 'A' + 10; + else if (h >= 'a' && h <= 'f') + h = h - 'a' + 10; + else + return false; + + uint32_t c = l + (h << 4); + checksum.push(c); + *data = c; + } + return true; +} + +//----------------------------------------------------------------------------- diff --git a/test/utils.h b/test/utils.h new file mode 100644 index 00000000..55d8f6fd --- /dev/null +++ b/test/utils.h @@ -0,0 +1,312 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#pragma once +#include "base.h" + +#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ + !defined(__ORDER_BIG_ENDIAN__) +#ifndef _MSC_VER +#include /* for endianness */ +#endif +#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) +#define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN +#define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN +#define __BYTE_ORDER__ __BYTE_ORDER +#else +#define __ORDER_LITTLE_ENDIAN__ 1234 +#define __ORDER_BIG_ENDIAN__ 4321 +#if defined(__LITTLE_ENDIAN__) || defined(_LITTLE_ENDIAN) || \ + defined(__ARMEL__) || defined(__THUMBEL__) || defined(__AARCH64EL__) || \ + defined(__MIPSEL__) || defined(_MIPSEL) || defined(__MIPSEL) || \ + defined(__i386) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) || defined(i386) || defined(_X86_) || defined(__i386__) || \ + defined(_X86_64_) || defined(_M_ARM) || defined(_M_ARM64) || \ + defined(__e2k__) +#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +#elif defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN) || defined(__ARMEB__) || \ + defined(__THUMBEB__) || defined(__AARCH64EB__) || defined(__MIPSEB__) || \ + defined(_MIPSEB) || defined(__MIPSEB) || defined(_M_IA64) +#define __BYTE_ORDER__ __ORDER_BIG_ENDIAN__ +#else +#error __BYTE_ORDER__ should be defined. +#endif +#endif +#endif + +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ && \ + __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ +#error Unsupported byte order. 
+#endif + +#if __GNUC_PREREQ(4, 4) || defined(__clang__) +#if __GNUC_PREREQ(4, 5) || defined(__clang__) +#define unreachable() __builtin_unreachable() +#endif +#define bswap64(v) __builtin_bswap64(v) +#define bswap32(v) __builtin_bswap32(v) +#if __GNUC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) +#define bswap16(v) __builtin_bswap16(v) +#endif + +#elif defined(_MSC_VER) + +#if _MSC_FULL_VER < 190024215 +#pragma message( \ + "It is recommended to use Visual Studio 2015 (MSC 19.0) or newer.") +#endif + +#define unreachable() __assume(0) +#define bswap64(v) _byteswap_uint64(v) +#define bswap32(v) _byteswap_ulong(v) +#define bswap16(v) _byteswap_ushort(v) +#define rot64(v, s) _rotr64(v, s) +#define rot32(v, s) _rotr(v, s) + +#if defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64) +#pragma intrinsic(_umul128) +#define mul_64x64_128(a, b, ph) _umul128(a, b, ph) +#pragma intrinsic(__umulh) +#define mul_64x64_high(a, b) __umulh(a, b) +#endif + +#if defined(_M_IX86) +#pragma intrinsic(__emulu) +#define mul_32x32_64(a, b) __emulu(a, b) +#elif defined(_M_ARM) +#define mul_32x32_64(a, b) _arm_umull(a, b) +#endif + +#endif /* compiler */ + +#ifndef unreachable +#define unreachable() \ + do { \ + } while (1) +#endif + +#ifndef bswap64 +#ifdef __bswap_64 +#define bswap64(v) __bswap_64(v) +#else +static __inline uint64_t bswap64(uint64_t v) { + return v << 56 | v >> 56 | ((v << 40) & 0x00ff000000000000ull) | + ((v << 24) & 0x0000ff0000000000ull) | + ((v << 8) & 0x000000ff00000000ull) | + ((v >> 8) & 0x00000000ff000000ull) | + ((v >> 24) & 0x0000000000ff0000ull) | + ((v >> 40) & 0x000000000000ff00ull); +} +#endif +#endif /* bswap64 */ + +#ifndef bswap32 +#ifdef __bswap_32 +#define bswap32(v) __bswap_32(v) +#else +static __inline uint32_t bswap32(uint32_t v) { + return v << 24 | v >> 24 | ((v << 8) & 0x00ff0000) | ((v >> 8) & 0x0000ff00); +} +#endif +#endif /* bswap32 */ + +#ifndef bswap16 +#ifdef __bswap_16 +#define bswap16(v) __bswap_16(v) +#else +static __inline 
uint16_t bswap16(uint16_t v) { return v << 8 | v >> 8; } +#endif +#endif /* bswap16 */ + +#define is_byteorder_le() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define is_byteorder_be() (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + +#ifndef htole16 +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define htobe16(v) bswap16(v) +#define htole16(v) (v) +#define be16toh(v) bswap16(v) +#define le16toh(v) (v) +#else +#define htobe16(v) (v) +#define htole16(v) bswap16(v) +#define be16toh(v) (v) +#define le16toh(v) bswap16(v) +#endif +#endif /* htole16 */ + +#ifndef htole32 +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define htobe32(v) bswap32(v) +#define htole32(v) (v) +#define be32toh(v) bswap32(v) +#define le32toh(v) (v) +#else +#define htobe32(v) (v) +#define htole32(v) bswap32(v) +#define be32toh(v) (v) +#define le32toh(v) bswap32(v) +#endif +#endif /* htole32 */ + +#ifndef htole64 +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define htobe64(v) bswap64(v) +#define htole64(v) (v) +#define be64toh(v) bswap64(v) +#define le64toh(v) (v) +#else /* big-endian host: little-endian values need a swap via bswap64() */ +#define htobe64(v) (v) +#define htole64(v) bswap64(v) +#define be64toh(v) (v) +#define le64toh(v) bswap64(v) +#endif +#endif /* htole64 */ + +namespace unaligned { + +template <typename T> static __inline T load(const void *ptr) { +#if defined(_MSC_VER) && \ + (defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64)) + return *(const T __unaligned *)ptr; +#elif UNALIGNED_OK + return *(const T *)ptr; +#else + T local; +#if defined(__GNUC__) || defined(__clang__) + __builtin_memcpy(&local, (const T *)ptr, sizeof(T)); +#else + memcpy(&local, (const T *)ptr, sizeof(T)); +#endif /* __GNUC__ || __clang__ */ + return local; +#endif /* UNALIGNED_OK */ +} + +template <typename T> static __inline void store(void *ptr, const T &value) { +#if defined(_MSC_VER) && \ + (defined(_M_ARM64) || defined(_M_X64) || defined(_M_IA64)) + *((T __unaligned *)ptr) = value; +#elif UNALIGNED_OK + *(volatile T *)ptr = value; +#else +#if defined(__GNUC__) || defined(__clang__) +
__builtin_memcpy(ptr, &value, sizeof(T)); +#else + memcpy(ptr, &value, sizeof(T)); +#endif /* __GNUC__ || __clang__ */ +#endif /* UNALIGNED_OK */ +} + +} /* namespace unaligned */ + +//----------------------------------------------------------------------------- + +#ifndef rot64 +static __inline uint64_t rot64(uint64_t v, unsigned s) { + return (v >> s) | (v << (64 - s)); +} +#endif /* rot64 */ + +#ifndef mul_32x32_64 +static __inline uint64_t mul_32x32_64(uint32_t a, uint32_t b) { + return a * (uint64_t)b; +} +#endif /* mul_32x32_64 */ + +#ifndef mul_64x64_128 + +static __inline unsigned add_with_carry(uint64_t *sum, uint64_t addend) { + *sum += addend; + return *sum < addend; +} + +static __inline uint64_t mul_64x64_128(uint64_t a, uint64_t b, uint64_t *h) { +#if defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + __uint128_t r = (__uint128_t)a * (__uint128_t)b; + /* modern GCC could nicely optimize this */ + *h = r >> 64; + return r; +#elif defined(mul_64x64_high) + *h = mul_64x64_high(a, b); + return a * b; +#else + /* performs 64x64 to 128 bit multiplication */ + uint64_t ll = mul_32x32_64((uint32_t)a, (uint32_t)b); + uint64_t lh = mul_32x32_64(a >> 32, (uint32_t)b); + uint64_t hl = mul_32x32_64((uint32_t)a, b >> 32); + *h = mul_32x32_64(a >> 32, b >> 32) + (lh >> 32) + (hl >> 32) + + add_with_carry(&ll, lh << 32) + add_with_carry(&ll, hl << 32); + return ll; +#endif +} + +#endif /* mul_64x64_128() */ + +#ifndef mul_64x64_high +static __inline uint64_t mul_64x64_high(uint64_t a, uint64_t b) { + uint64_t h; + mul_64x64_128(a, b, &h); + return h; +} +#endif /* mul_64x64_high */ + +static __inline bool is_power2(size_t x) { return (x & (x - 1)) == 0; } + +static __inline size_t roundup2(size_t value, size_t granularity) { + assert(is_power2(granularity)); + return (value + granularity - 1) & ~(granularity - 1); +} + +//----------------------------------------------------------------------------- + +struct simple_checksum 
{ + uint64_t value; + + simple_checksum() : value(0) {} + + void push(uint32_t data) { + value += data * UINT64_C(9386433910765580089) + 1; + value ^= value >> 41; + } + + void push(uint64_t data) { + push((uint32_t)data); + push((uint32_t)(data >> 32)); + } + + void push(bool data) { push(data ? UINT32_C(0x780E) : UINT32_C(0xFA18E)); } + + void push(const void *ptr, size_t bytes) { + const uint8_t *data = (const uint8_t *)ptr; + for (size_t i = 0; i < bytes; ++i) + push((uint32_t)data[i]); + } + + void push(const double &data) { push(&data, sizeof(double)); } + + void push(const char *cstr) { push(cstr, strlen(cstr)); } + + void push(const std::string &str) { push(str.data(), str.size()); } + +#if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) + void push(const HANDLE &handle) { push(&handle, sizeof(handle)); } +#endif /* _WINDOWS */ +}; + +std::string data2hex(const void *ptr, size_t bytes, simple_checksum &checksum); +bool hex2data(const char *hex_begin, const char *hex_end, void *ptr, + size_t bytes, simple_checksum &checksum); + +std::string format(const char *fmt, ...); From 2f058bf82b2ba9b4d0383e0cdca5732a2bd613b4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 10 Apr 2017 23:34:59 +0300 Subject: [PATCH 039/303] mdbx: fix MSVC dirs (minor). 
--- .appveyor.yml | 10 +++++++--- dll.vcxproj | 6 +++++- mdbx.sln | 20 ++++++++++---------- test/test.vcxproj | 4 ++++ 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index a164b1c8..84a7d43f 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -11,8 +11,8 @@ environment: - Toolset: v100 platform: - - x86 - - x64 + - Win32 + - x64 configuration: - Release @@ -24,4 +24,8 @@ build: test_script: - ps: | - & "C:\projects\mdbx\$env:PLATFORM\$env:CONFIGURATION\test\test.exe" --pathname=tmp.db --basic --dont-cleanup-after + if (($env:PLATFORM -eq "x86") -and (Test-Path "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" -PathType Leaf)) { + & "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" --pathname=tmp.db --basic --dont-cleanup-after + } else { + & "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\test.exe" --pathname=tmp.db --basic --dont-cleanup-after + } diff --git a/dll.vcxproj b/dll.vcxproj index ccdd2cb1..42658ccb 100644 --- a/dll.vcxproj +++ b/dll.vcxproj @@ -64,9 +64,13 @@ true + $(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\ false + $(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\ false @@ -154,4 +158,4 @@ - \ No newline at end of file + diff --git a/mdbx.sln b/mdbx.sln index aa2025d8..29c42c0f 100644 --- a/mdbx.sln +++ b/mdbx.sln @@ -10,27 +10,27 @@ EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 - Debug|x86 = Debug|x86 + Debug|Win32 = Debug|Win32 Release|x64 = Release|x64 - Release|x86 = Release|x86 + Release|Win32 = Release|Win32 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x64.ActiveCfg = Debug|x64 {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x64.Build.0 = Debug|x64 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x86.ActiveCfg = Debug|Win32 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x86.Build.0 = Debug|Win32 + 
{6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|Win32.ActiveCfg = Debug|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|Win32.Build.0 = Debug|Win32 {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x64.ActiveCfg = Release|x64 {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x64.Build.0 = Release|x64 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x86.ActiveCfg = Release|Win32 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x86.Build.0 = Release|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|Win32.ActiveCfg = Release|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|Win32.Build.0 = Release|Win32 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x64.ActiveCfg = Debug|x64 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x64.Build.0 = Debug|x64 - {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x86.ActiveCfg = Debug|Win32 - {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x86.Build.0 = Debug|Win32 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|Win32.ActiveCfg = Debug|Win32 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|Win32.Build.0 = Debug|Win32 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x64.ActiveCfg = Release|x64 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x64.Build.0 = Release|x64 - {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x86.ActiveCfg = Release|Win32 - {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x86.Build.0 = Release|Win32 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|Win32.ActiveCfg = Release|Win32 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|Win32.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/test/test.vcxproj b/test/test.vcxproj index 331963a8..e2a123f6 100644 --- a/test/test.vcxproj +++ b/test/test.vcxproj @@ -76,12 +76,16 @@ true + $(SolutionDir)$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ true false + $(SolutionDir)$(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ false From 4c9799760255bc4b401a79b66470606cc40e53c9 Mon Sep 17 
00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Apr 2017 00:21:53 +0300 Subject: [PATCH 040/303] test: fix typo in test's skeleton. --- test/main.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/main.cc b/test/main.cc index 929f2094..741d86f9 100644 --- a/test/main.cc +++ b/test/main.cc @@ -91,9 +91,9 @@ std::string thunk_param(const actor_config &config) { } void cleanup() { - log_trace(">> osal_setup"); + log_trace(">> cleanup"); /* TODO: remove each database */ - log_trace("<< osal_setup"); + log_trace("<< cleanup"); } int main(int argc, char *const argv[]) { From 8167a08431e570e2bedf02a52683f928a7d6cf3f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Apr 2017 00:26:08 +0300 Subject: [PATCH 041/303] test: use stderr for error only. --- test/log.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/log.cc b/test/log.cc index d149393e..75993d98 100644 --- a/test/log.cc +++ b/test/log.cc @@ -68,12 +68,13 @@ const char *level2str(const loglevel level) { void output(loglevel priority, const char *format, va_list ap) { if (priority >= level) { - fprintf(stderr, "[ %u %-10s %6s ] " /* TODO */, osal_getpid(), - prefix.c_str(), level2str(priority)); - vfprintf(stderr, format, ap); + FILE *out = (priority >= error) ? stderr : stdout; + fprintf(out, "[ %u %-10s %6s ] " /* TODO */, osal_getpid(), prefix.c_str(), + level2str(priority)); + vfprintf(out, format, ap); size_t len = strlen(format); if (len && format[len - 1] != '\n') - putc('\n', stderr); + putc('\n', out); } } From 38a67813219d5a4e63189f8d022144c22f6d8ea4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Apr 2017 02:20:35 +0300 Subject: [PATCH 042/303] test: setup debug-loger for libmdbx. 
Change-Id: I7235bd9457773cce2d20a371dcca47a5a4a61838 --- test/test.cc | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/test/test.cc b/test/test.cc index 104fe93f..e6b2ada2 100644 --- a/test/test.cc +++ b/test/test.cc @@ -51,11 +51,30 @@ const char *status2str(actor_status status) { //----------------------------------------------------------------------------- +static void mdbx_debug_logger(int type, const char *function, int line, + const char *msg, va_list args) { + loggging::loglevel level = loggging::trace; + if (type & MDBX_DBG_PRINT) + level = loggging::info; + if (type & MDBX_DBG_ASSERT) { + log_error("libmdbx assertion failure: %s, %d", + function ? function : "unknown", line); + level = loggging::failure; + } + + output(level, msg, args); + if (type & MDBX_DBG_ASSERT) + abort(); +} + void testcase::mdbx_prepare() { log_trace(">> mdbx_prepare"); + int rc = mdbx_setup_debug(MDBX_DBG_DNT, mdbx_debug_logger, MDBX_DBG_DNT); + log_info("libmdbx debug-flags: 0x%02x", rc); + MDB_env *env = nullptr; - int rc = mdbx_env_create(&env); + rc = mdbx_env_create(&env); if (rc != MDB_SUCCESS) failure_perror("mdbx_env_create()", rc); From cbb2abe5a83d1cdeec58b60c4e6ddcb09c25aaa9 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Apr 2017 02:35:10 +0300 Subject: [PATCH 043/303] mdbx: fix MCVS x86 platform. 
Change-Id: Ic9fb26eee0f7ff50973092d87e791a320f7dd231 --- .appveyor.yml | 2 +- mdbx.sln | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 84a7d43f..7e8e1351 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -11,7 +11,7 @@ environment: - Toolset: v100 platform: - - Win32 + - x86 - x64 configuration: diff --git a/mdbx.sln b/mdbx.sln index 29c42c0f..aa2025d8 100644 --- a/mdbx.sln +++ b/mdbx.sln @@ -10,27 +10,27 @@ EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 - Debug|Win32 = Debug|Win32 + Debug|x86 = Debug|x86 Release|x64 = Release|x64 - Release|Win32 = Release|Win32 + Release|x86 = Release|x86 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x64.ActiveCfg = Debug|x64 {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x64.Build.0 = Debug|x64 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|Win32.ActiveCfg = Debug|Win32 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|Win32.Build.0 = Debug|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x86.ActiveCfg = Debug|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x86.Build.0 = Debug|Win32 {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x64.ActiveCfg = Release|x64 {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x64.Build.0 = Release|x64 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|Win32.ActiveCfg = Release|Win32 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|Win32.Build.0 = Release|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x86.ActiveCfg = Release|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x86.Build.0 = Release|Win32 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x64.ActiveCfg = Debug|x64 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x64.Build.0 = Debug|x64 - {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|Win32.ActiveCfg = Debug|Win32 - {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|Win32.Build.0 = 
Debug|Win32 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x86.ActiveCfg = Debug|Win32 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x86.Build.0 = Debug|Win32 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x64.ActiveCfg = Release|x64 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x64.Build.0 = Release|x64 - {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|Win32.ActiveCfg = Release|Win32 - {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|Win32.Build.0 = Release|Win32 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x86.ActiveCfg = Release|Win32 + {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE From 270b367a4faba7e80d2aa2e8fdfa626fad70503f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Apr 2017 02:42:09 +0300 Subject: [PATCH 044/303] mdbx: cleanup travis.yml Change-Id: Ia38b527f27727bc06f7182f7a7d7c5e626780b99 --- .travis.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5b6d5ee5..cc56c3a5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,15 +1,12 @@ language: c sudo: false dist: trusty -cache: bundler -notifications: - email: false compiler: - - gcc - - clang +- gcc +- clang os: - - linux +- linux script: if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then make all check; fi From 2a80ad67fbe004dd36f73c086d465ee4ea4cf997 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Apr 2017 12:55:16 +0300 Subject: [PATCH 045/303] test: refine logging. 
--- test/config.cc | 100 ++++++++++++++++++++++--------------------- test/config.h | 2 +- test/log.cc | 114 ++++++++++++++++++++++++++++++++++++++----------- test/log.h | 39 +++++++++++++---- test/main.cc | 10 ++--- test/test.cc | 13 +++--- test/test.h | 4 +- 7 files changed, 187 insertions(+), 95 deletions(-) diff --git a/test/config.cc b/test/config.cc index fd9bc732..b1f73368 100644 --- a/test/config.cc +++ b/test/config.cc @@ -231,94 +231,98 @@ const struct option_verb table_bits[] = { {"data.dups", MDB_DUPSORT}, {nullptr, 0}}; -static void dump_verbs(FILE *out, const char *caption, size_t bits, +static void dump_verbs(const char *caption, size_t bits, const struct option_verb *verbs) { - fprintf(out, "%s: (%" PRIu64 ")", caption, (uint64_t)bits); + log_info("%s: 0x%" PRIx64 " = ", caption, (uint64_t)bits); + const char *comma = ""; while (verbs->mask && bits) { if ((bits & verbs->mask) == verbs->mask) { - fprintf(out, ", %s", verbs->verb); + logging::feed("%s%s", comma, verbs->verb); bits -= verbs->mask; + comma = ", "; } ++verbs; } - fprintf(out, "\n"); + logging::feed("\n"); } -static void dump_duration(FILE *out, const char *caption, unsigned duration) { - fprintf(out, "%s: ", caption); +static void dump_duration(const char *caption, unsigned duration) { + log_info("%s: ", caption); if (duration) { if (duration > 24 * 3600) - fprintf(out, "%u_", duration / (24 * 3600)); + logging::feed("%u_", duration / (24 * 3600)); if (duration > 3600) - fprintf(out, "%02u:", (duration % (24 * 3600)) / 3600); - fprintf(out, "%02u:%02u", (duration % 3600) / 60, duration % 60); - } else - fprintf(out, "INFINITE"); - fprintf(out, "\n"); + logging::feed("%02u:", (duration % (24 * 3600)) / 3600); + logging::feed("%02u:%02u", (duration % 3600) / 60, duration % 60); + } else { + logging::feed("INFINITE"); + } + logging::feed("\n"); } -void dump(FILE *out) { +void dump(const char *title) { + logging::local_suffix indent(title); + for (auto i = global::actors.begin(); i != 
global::actors.end(); ++i) { - fprintf(out, "testcase %s\n", testcase2str(i->testcase)); - if (i->id) - fprintf(out, "\tid/table %u\n", i->id); + log_info("#%u, testcase %s, id/table %u\n", i->order, + testcase2str(i->testcase), i->id); + indent.push(); if (i->params.loglevel) { - fprintf(out, "\tlog: level %u, %s\n", i->params.loglevel, - i->params.pathname_log.empty() ? "console" - : i->params.pathname_log.c_str()); + log_info("log: level %u, %s\n", i->params.loglevel, + i->params.pathname_log.empty() ? "console" + : i->params.pathname_log.c_str()); } - fprintf(out, "\tdatabase: %s, size %" PRIu64 "\n", - i->params.pathname_db.c_str(), i->params.size); + log_info("database: %s, size %" PRIu64 "\n", i->params.pathname_db.c_str(), + i->params.size); - dump_verbs(out, "\tmode", i->params.mode_flags, mode_bits); - dump_verbs(out, "\ttable", i->params.table_flags, table_bits); + dump_verbs("mode", i->params.mode_flags, mode_bits); + dump_verbs("table", i->params.table_flags, table_bits); - fprintf(out, "\tseed %u\n", i->params.seed); + log_info("seed %u\n", i->params.seed); if (i->params.test_nrecords) - fprintf(out, "\trecords %u\n", i->params.test_nrecords); + log_info("records %u\n", i->params.test_nrecords); else - dump_duration(out, "\tduration", i->params.test_duration); + dump_duration("duration", i->params.test_duration); if (i->params.nrepeat) - fprintf(out, "\trepeat %u\n", i->params.nrepeat); + log_info("repeat %u\n", i->params.nrepeat); else - fprintf(out, "\trepeat ETERNALLY\n"); + log_info("repeat ETERNALLY\n"); - fprintf(out, "\tthreads %u\n", i->params.nthreads); + log_info("threads %u\n", i->params.nthreads); - fprintf(out, "\tkey: minlen %u, maxlen %u\n", i->params.keylen_min, - i->params.keylen_max); - fprintf(out, "\tdata: minlen %u, maxlen %u\n", i->params.datalen_min, - i->params.datalen_max); + log_info("key: minlen %u, maxlen %u\n", i->params.keylen_min, + i->params.keylen_max); + log_info("data: minlen %u, maxlen %u\n", i->params.datalen_min, 
+ i->params.datalen_max); - fprintf(out, "\tbatch: read %u, write %u\n", i->params.batch_read, - i->params.batch_write); + log_info("batch: read %u, write %u\n", i->params.batch_read, + i->params.batch_write); if (i->params.waitfor_nops) - fprintf(out, "\twait: actor %u for %u ops\n", i->wait4id, - i->params.waitfor_nops); + log_info("wait: actor %u for %u ops\n", i->wait4id, + i->params.waitfor_nops); else if (i->params.delaystart) - dump_duration(out, "\tdelay", i->params.delaystart); + dump_duration("delay", i->params.delaystart); else - fprintf(out, "\tno-delay\n"); + log_info("no-delay\n"); - fprintf(out, "\tlimits: readers %u, tables %u\n", i->params.max_readers, - i->params.max_tables); + log_info("limits: readers %u, tables %u\n", i->params.max_readers, + i->params.max_tables); - fprintf(out, "\tdrop table: %s\n", i->params.drop_table ? "Yes" : "No"); - - fprintf(out, "\t#---\n"); + log_info("drop table: %s\n", i->params.drop_table ? "Yes" : "No"); + indent.pop(); } - dump_duration(out, "timeout", global::config::timeout); - fprintf(out, "cleanup: before %s, after %s\n", - global::config::dont_cleanup_before ? "No" : "Yes", - global::config::dont_cleanup_after ? "No" : "Yes"); + dump_duration("timeout", global::config::timeout); + log_info("cleanup: before %s, after %s\n", + global::config::dont_cleanup_before ? "No" : "Yes", + global::config::dont_cleanup_after ? 
"No" : "Yes"); } } /* namespace config */ diff --git a/test/config.h b/test/config.h index c16eca9a..38bc0b22 100644 --- a/test/config.h +++ b/test/config.h @@ -108,7 +108,7 @@ struct actor_config_pod { extern const struct option_verb mode_bits[]; extern const struct option_verb table_bits[]; -void dump(FILE *out); +void dump(const char *title = "config-dump: "); } /* namespace config */ diff --git a/test/log.cc b/test/log.cc index 75993d98..7def0b84 100644 --- a/test/log.cc +++ b/test/log.cc @@ -14,13 +14,15 @@ #include "test.h" +static void fflushall() { fflush(nullptr); } + void failure(const char *fmt, ...) { va_list ap; fflush(NULL); va_start(ap, fmt); - loggging::output(loggging::failure, fmt, ap); + logging::output(logging::failure, fmt, ap); va_end(ap); - fflush(NULL); + fflushall(); exit(EXIT_FAILURE); } @@ -35,10 +37,12 @@ void __noreturn failure_perror(const char *what, int errnum) { //----------------------------------------------------------------------------- -namespace loggging { +namespace logging { static std::string prefix; +static std::string suffix; static loglevel level; +static FILE *last; void setup(loglevel _level, const std::string &_prefix) { level = (_level > error) ? failure : _level; @@ -68,61 +72,123 @@ const char *level2str(const loglevel level) { void output(loglevel priority, const char *format, va_list ap) { if (priority >= level) { - FILE *out = (priority >= error) ? stderr : stdout; - fprintf(out, "[ %u %-10s %6s ] " /* TODO */, osal_getpid(), prefix.c_str(), - level2str(priority)); - vfprintf(out, format, ap); + last = (priority >= error) ? stderr : stdout; + fprintf(last, "[ %u %-10s %6s ] %s" /* TODO */, osal_getpid(), + prefix.c_str(), level2str(priority), suffix.c_str()); + vfprintf(last, format, ap); + size_t len = strlen(format); - if (len && format[len - 1] != '\n') - putc('\n', out); + char end = len ? 
format[len - 1] : '\0'; + switch (end) { + default: + putc('\n', last); + case '\n': + if (priority > info) + fflushall(); + break; + case ' ': + case '_': + case ':': + case '|': + case ',': + case '\t': + case '\b': + case '\r': + case '\0': + return; + } + } + last = nullptr; +} + +void feed(const char *format, ...) { + if (last) { + va_list ap; + va_start(ap, format); + vfprintf(last, format, ap); + va_end(ap); + + size_t len = strlen(format); + if (len && format[len - 1] == '\n') + last = nullptr; } } +local_suffix::local_suffix(const char *c_str) + : trim_pos(suffix.size()), indent(0) { + suffix.append(c_str); +} + +local_suffix::local_suffix(const std::string &str) + : trim_pos(suffix.size()), indent(0) { + suffix.append(str); +} + +void local_suffix::push() { + indent += 1; + suffix.push_back('\t'); +} + +void local_suffix::pop() { + assert(indent > 0); + if (indent > 0) { + indent -= 1; + suffix.pop_back(); + } +} + +local_suffix::~local_suffix() { suffix.erase(trim_pos); } + } /* namespace log */ void log_trace(const char *msg, ...) { - if (loggging::trace >= loggging::level) { + if (logging::trace >= logging::level) { va_list ap; va_start(ap, msg); - loggging::output(loggging::trace, msg, ap); + logging::output(logging::trace, msg, ap); va_end(ap); - } + } else + logging::last = nullptr; } void log_info(const char *msg, ...) { - if (loggging::info >= loggging::level) { + if (logging::info >= logging::level) { va_list ap; va_start(ap, msg); - loggging::output(loggging::info, msg, ap); + logging::output(logging::info, msg, ap); va_end(ap); - } + } else + logging::last = nullptr; } void log_notice(const char *msg, ...) { - if (loggging::notice >= loggging::level) { + if (logging::notice >= logging::level) { va_list ap; va_start(ap, msg); - loggging::output(loggging::notice, msg, ap); + logging::output(logging::notice, msg, ap); va_end(ap); - } + } else + logging::last = nullptr; } void log_warning(const char *msg, ...) 
{ - if (loggging::warning >= loggging::level) { + if (logging::warning >= logging::level) { va_list ap; va_start(ap, msg); - loggging::output(loggging::warning, msg, ap); + logging::output(logging::warning, msg, ap); va_end(ap); - } + } else + logging::last = nullptr; } void log_error(const char *msg, ...) { - if (loggging::error >= loggging::level) { + if (logging::error >= logging::level) { va_list ap; va_start(ap, msg); - loggging::output(loggging::error, msg, ap); + logging::output(logging::error, msg, ap); va_end(ap); - } + } else + logging::last = nullptr; } void log_touble(const char *where, const char *what, int errnum) { diff --git a/test/log.h b/test/log.h index 627a11a0..868b4ee4 100644 --- a/test/log.h +++ b/test/log.h @@ -18,16 +18,19 @@ void __noreturn usage(void); -void __noreturn #ifdef __GNUC__ - __attribute__((format(printf, 1, 2))) +#define __printf_args(format_index, first_arg) \ + __attribute__((format(printf, format_index, first_arg))) +#else +#define __printf_args(format_index, first_arg) #endif - failure(const char *fmt, ...); + +void __noreturn __printf_args(1, 2) failure(const char *fmt, ...); void __noreturn failure_perror(const char *what, int errnum); const char *test_strerror(int errnum); -namespace loggging { +namespace logging { enum loglevel { trace, @@ -43,14 +46,32 @@ void setup(loglevel level, const std::string &prefix); void setup(const std::string &prefix); void output(loglevel priority, const char *format, va_list ap); +void __printf_args(1, 2) feed(const char *format, ...); + +class local_suffix { +protected: + size_t trim_pos; + int indent; + +public: + local_suffix(const local_suffix &) = delete; + local_suffix(const local_suffix &&) = delete; + const local_suffix &operator=(const local_suffix &) = delete; + + local_suffix(const char *c_str); + local_suffix(const std::string &str); + void push(); + void pop(); + ~local_suffix(); +}; } /* namespace log */ -void log_trace(const char *msg, ...); -void log_info(const char *msg, 
...); -void log_notice(const char *msg, ...); -void log_warning(const char *msg, ...); -void log_error(const char *msg, ...); +void __printf_args(1, 2) log_trace(const char *msg, ...); +void __printf_args(1, 2) log_info(const char *msg, ...); +void __printf_args(1, 2) log_notice(const char *msg, ...); +void __printf_args(1, 2) log_warning(const char *msg, ...); +void __printf_args(1, 2) log_error(const char *msg, ...); void log_touble(const char *where, const char *what, int errnum); diff --git a/test/main.cc b/test/main.cc index 741d86f9..8853ea2f 100644 --- a/test/main.cc +++ b/test/main.cc @@ -26,9 +26,9 @@ void actor_params::set_defaults(void) { pathname_log = ""; loglevel = #ifdef NDEBUG - loggging::notice; + logging::notice; #else - loggging::trace; + logging::trace; #endif pathname_db = @@ -118,7 +118,7 @@ int main(int argc, char *const argv[]) { actor_params params; params.set_defaults(); global::config::dump_config = true; - loggging::setup((loggging::loglevel)params.loglevel, "main"); + logging::setup((logging::loglevel)params.loglevel, "main"); unsigned lastid = 0; if (argc == 2 && strncmp(argv[1], "--case=", 7) == 0) { @@ -228,11 +228,11 @@ int main(int argc, char *const argv[]) { } if (global::config::dump_config) - config::dump(stdout); + config::dump(); bool failed = false; if (global::actors.size()) { - loggging::setup("overlord"); + logging::setup("overlord"); if (!global::config::dont_cleanup_before) cleanup(); diff --git a/test/test.cc b/test/test.cc index e6b2ada2..45e59ceb 100644 --- a/test/test.cc +++ b/test/test.cc @@ -53,13 +53,13 @@ const char *status2str(actor_status status) { static void mdbx_debug_logger(int type, const char *function, int line, const char *msg, va_list args) { - loggging::loglevel level = loggging::trace; + logging::loglevel level = logging::trace; if (type & MDBX_DBG_PRINT) - level = loggging::info; + level = logging::info; if (type & MDBX_DBG_ASSERT) { log_error("libmdbx assertion failure: %s, %d", function ? 
function : "unknown", line); - level = loggging::failure; + level = logging::failure; } output(level, msg, args); @@ -122,7 +122,8 @@ bool testcase::wait4start() { log_trace(">> wait4start(%u)", config.wait4id); int rc = osal_waitfor(config.wait4id); if (rc) { - log_trace("<< wait4start(%u), failed %s", test_strerror(rc)); + log_trace("<< wait4start(%u), failed %s", config.wait4id, + test_strerror(rc)); return false; } return true; @@ -167,8 +168,8 @@ bool testcase::teardown() { bool test_execute(const actor_config &config) { const mdbx_pid_t pid = osal_getpid(); - loggging::setup((loggging::loglevel)config.params.loglevel, - format("child_%u.%u", config.order, config.id)); + logging::setup((logging::loglevel)config.params.loglevel, + format("child_%u.%u", config.order, config.id)); log_trace(">> wait4barrier"); osal_wait4barrier(); diff --git a/test/test.h b/test/test.h index b1ce82af..a119c1e6 100644 --- a/test/test.h +++ b/test/test.h @@ -90,8 +90,8 @@ protected: public: testcase(const actor_config &config, const mdbx_pid_t pid) : config(config), pid(pid) { - loggging::setup(format("%s_%u.%u", testcase2str(config.testcase), - config.order, config.id)); + logging::setup(format("%s_%u.%u", testcase2str(config.testcase), + config.order, config.id)); } virtual bool setup(); From 8b42b8bfd4df052e79a5b3eb8f8467e10d03a3fc Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Apr 2017 19:00:26 +0300 Subject: [PATCH 046/303] test: add chrono. 
--- libmdbx.files | 2 ++ test/base.h | 6 ++++ test/chrono.cc | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++ test/chrono.h | 75 +++++++++++++++++++++++++++++++++++++++++ test/test.h | 1 + 5 files changed, 174 insertions(+) create mode 100644 test/chrono.cc create mode 100644 test/chrono.h diff --git a/libmdbx.files b/libmdbx.files index c0b049fa..8ab54c92 100644 --- a/libmdbx.files +++ b/libmdbx.files @@ -22,6 +22,8 @@ src/tools/mdbx_stat.1 src/tools/mdbx_stat.c test/actor.cc test/base.h +test/chrono.cc +test/chrono.h test/config.h test/dead.cc test/hill.cc diff --git a/test/base.h b/test/base.h index 155cd98f..b4ba95c8 100644 --- a/test/base.h +++ b/test/base.h @@ -34,6 +34,12 @@ #include #include +#if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) +#else +#include +#include +#endif + #ifdef _BSD_SOURCE #include #endif diff --git a/test/chrono.cc b/test/chrono.cc new file mode 100644 index 00000000..8c9dcebe --- /dev/null +++ b/test/chrono.cc @@ -0,0 +1,90 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "test.h" + +namespace chrono { + +#define NSEC_PER_SEC 1000000000u +uint32_t ns2fractional(uint32_t ns) { + assert(ns < NSEC_PER_SEC); + /* LY: здесь и далее используется "длинное деление", которое + * для ясности кода оставлено как есть (без ручной оптимизации). Так как + * GCC, Clang и даже MSVC сами давно умеют конвертировать деление на + * константу в быструю reciprocal-форму. 
*/ + return ((uint64_t)ns << 32) / NSEC_PER_SEC; +} + +uint32_t fractional2ns(uint32_t fractional) { + return (fractional * (uint64_t)NSEC_PER_SEC) >> 32; +} + +#define USEC_PER_SEC 1000000u +uint32_t us2fractional(uint32_t us) { + assert(us < USEC_PER_SEC); + return ((uint64_t)us << 32) / USEC_PER_SEC; +} + +uint32_t fractional2us(uint32_t fractional) { + return (fractional * (uint64_t)USEC_PER_SEC) >> 32; +} + +#define MSEC_PER_SEC 1000u +uint32_t ms2fractional(uint32_t ms) { + assert(ms < MSEC_PER_SEC); + return ((uint64_t)ms << 32) / MSEC_PER_SEC; +} + +uint32_t fractional2ms(uint32_t fractional) { + return (fractional * (uint64_t)MSEC_PER_SEC) >> 32; +} + +time from_ns(uint64_t ns) { + time result; + result.fixedpoint = ((ns / NSEC_PER_SEC) << 32) | + ns2fractional((uint32_t)(ns % NSEC_PER_SEC)); + return result; +} + +time from_us(uint64_t us) { + time result; + result.fixedpoint = ((us / USEC_PER_SEC) << 32) | + us2fractional((uint32_t)(us % USEC_PER_SEC)); + return result; +} + +time from_ms(uint64_t ms) { + time result; + result.fixedpoint = ((ms / MSEC_PER_SEC) << 32) | + ms2fractional((uint32_t)(ms % MSEC_PER_SEC)); + return result; +} + +time now() { +#if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) + FILETIME filetime; + GetSystemTimeAsFileTime(&filetime); + uint64_t ns = + (uint64_t)filetime.dwHighDateTime << 32 | filetime.dwLowDateTime; + return from_ns(ns); +#else + struct timespec ts; + if (unlikely(clock_gettime(CLOCK_REALTIME, &ts))) + failure_perror("clock_gettime(CLOCK_REALTIME", errno); + + return from_timespec(ts); +#endif +} + +} /* namespace chrono */ diff --git a/test/chrono.h b/test/chrono.h new file mode 100644 index 00000000..bb90e8e5 --- /dev/null +++ b/test/chrono.h @@ -0,0 +1,75 @@ +/* + * Copyright 2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#pragma once + +#include "base.h" +#include "log.h" +#include "utils.h" + +namespace chrono { + +typedef union time { + uint64_t fixedpoint; + struct __packed { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + uint32_t fractional; + uint32_t utc; +#else + uint32_t utc; + uint32_t fractional; +#endif + }; +} time; + +uint32_t ns2fractional(uint32_t); +uint32_t fractional2ns(uint32_t); +uint32_t us2fractional(uint32_t); +uint32_t fractional2us(uint32_t); +uint32_t ms2fractional(uint32_t); +uint32_t fractional2ms(uint32_t); + +time from_ns(uint64_t us); +time from_us(uint64_t ns); +time from_ms(uint64_t ms); + +inline time from_utc(time_t utc) { + assert(utc < UINT32_MAX); + time result; + result.fixedpoint = ((uint64_t)utc) << 32; + return result; +} + +#if defined(HAVE_TIMESPEC_TV_NSEC) || defined(__timespec_defined) || \ + defined(CLOCK_REALTIME) +inline time from_timespec(const struct timespec &ts) { + time result; + result.fixedpoint = + ((uint64_t)ts.tv_sec << 32) | ns2fractional((uint32_t)ts.tv_nsec); + return result; +} +#endif /* HAVE_TIMESPEC_TV_NSEC */ + +#if defined(HAVE_TIMEVAL_TV_USEC) || defined(_STRUCT_TIMEVAL) +inline time from_timeval(const struct timeval &tv) { + time result; + result.fixedpoint = + ((uint64_t)tv.tv_sec << 32) | us2fractional((uint32_t)tv.tv_usec); + return result; +} +#endif /* HAVE_TIMEVAL_TV_USEC */ + +time now(); + +} /* namespace chrono */ diff --git a/test/test.h b/test/test.h index a119c1e6..093b9251 100644 --- a/test/test.h +++ b/test/test.h @@ -15,6 +15,7 @@ #pragma once #include "base.h" +#include "chrono.h" #include "config.h" #include "keygen.h" #include "log.h" From 4fb2bd3a078574809fa6eac3fb622dc144bc611a Mon 
Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Apr 2017 14:37:25 +0300 Subject: [PATCH 047/303] mdbx: refine Makefile. --- Makefile | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index e88007b1..730d855a 100644 --- a/Makefile +++ b/Makefile @@ -24,16 +24,13 @@ suffix ?= CC ?= gcc CXX ?= g++ -XCFLAGS ?= -DNDEBUG=1 -DMDB_DEBUG=0 -DMDBX_EXPORTS=1 +XCFLAGS ?= -DNDEBUG=1 -DMDB_DEBUG=0 -DLIBMDBX_EXPORTS=1 CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden CFLAGS += -D_GNU_SOURCE=1 -std=gnu99 -pthread $(XCFLAGS) -# COVER ?= -coverage -fprofile-arcs -ftest-coverage -Og - CXXFLAGS = -std=c++11 $(filter-out -std=gnu99,$(CFLAGS)) -# LY: for ability to built with modern glibc, -# but then run with the old -LDOPS ?= -Wl,--no-as-needed,-lrt +# LY: '--no-as-needed,-lrt' for ability to built with modern glibc, but then run with the old +LDFLAGS ?= -Wl,--gc-sections,-z,relro,-O,--no-as-needed,-lrt # LY: just for benchmarking IOARENA ?= ../ioarena.git/@BUILD/src/ioarena @@ -84,13 +81,16 @@ libmdbx.a: mdbx.o osal.o lck-posix.o $(AR) rs $@ $? 
libmdbx.so: mdbx.o osal.o lck-posix.o - $(CC) $(CFLAGS) $(LDFLAGS) -save-temps -pthread -shared $(LDOPS) -o $@ $^ + $(CC) $(CFLAGS) -save-temps $^ -pthread -shared $(LDFLAGS) -o $@ mdbx_%: src/tools/mdbx_%.c libmdbx.a - $(CC) $(CFLAGS) $(LDFLAGS) $(LDOPS) -o $@ $^ + $(CC) $(CFLAGS) $^ $(LDFLAGS) -o $@ -test/test: $(wildcard test/*.h) $(filter-out test/osal-windows.cc, $(wildcard test/*.cc)) libmdbx.a - $(CXX) $(CXXFLAGS) $(LDFLAGS) -Isrc -o $@ $(filter-out %.h, $^) +test/%.o: test/%.cc $(wildcard test/*.h) Makefile + $(CXX) $(CXXFLAGS) -Isrc -c $(filter %.cc, $^) -o $@ + +test/test: $(patsubst %.cc,%.o,$(filter-out test/osal-windows.cc, $(wildcard test/*.cc))) libmdbx.a + $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ ifneq ($(wildcard $(IOARENA)),) From 49cccf47919e8c3862dde6e2b8085f31a15a5919 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Tue, 11 Apr 2017 22:24:31 +0300 Subject: [PATCH 048/303] mdbx: Update README.md Change-Id: I9787d54df1efdef19ccc29c24168015e225ff097 --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1cf2ea8b..611067c0 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,8 @@ _libmdbx_ наследует все ключевые возможности и [MVCC](https://ru.wikipedia.org/wiki/MVCC) и [COW](https://ru.wikipedia.org/wiki/%D0%9A%D0%BE%D0%BF%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%BF%D1%80%D0%B8_%D0%B7%D0%B0%D0%BF%D0%B8%D1%81%D0%B8). Изменения строго последовательны и не блокируются чтением, - конфликты между транзакциями не возможны. +   конфликты между транзакциями не возможны. + При этом гарантируется чтение только зафиксированных данных, см [relaxing serializability](https://en.wikipedia.org/wiki/Serializability). 4. 
Чтение и поиск [без блокировок](https://ru.wikipedia.org/wiki/%D0%9D%D0%B5%D0%B1%D0%BB%D0%BE%D0%BA%D0%B8%D1%80%D1%83%D1%8E%D1%89%D0%B0%D1%8F_%D1%81%D0%B8%D0%BD%D1%85%D1%80%D0%BE%D0%BD%D0%B8%D0%B7%D0%B0%D1%86%D0%B8%D1%8F), без [атомарных операций](https://ru.wikipedia.org/wiki/%D0%90%D1%82%D0%BE%D0%BC%D0%B0%D1%80%D0%BD%D0%B0%D1%8F_%D0%BE%D0%BF%D0%B5%D1%80%D0%B0%D1%86%D0%B8%D1%8F). From a4a542c3b59e926d170c803f51a7e0a509d1c8b2 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 15 Apr 2017 22:44:48 +0300 Subject: [PATCH 049/303] mdbx: fix nasty typo in Windows's mdbx_pwritev(). --- src/osal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osal.c b/src/osal.c index 12cf7782..429c6b1f 100644 --- a/src/osal.c +++ b/src/osal.c @@ -383,7 +383,7 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, off_t offset, size_t expected_written) { #if defined(_WIN32) || defined(_WIN64) size_t written = 0; - for (int i = 0; i > iovcnt; ++i) { + for (int i = 0; i < iovcnt; ++i) { int rc = mdbx_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); if (unlikely(rc != MDB_SUCCESS)) return rc; From dfc3020426265b343804319f8bcb380920a702b4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 17 Apr 2017 19:35:36 +0300 Subject: [PATCH 050/303] mdbx: fix mdbx_canary_put(). 
--- mdbx.h | 2 +- src/mdbx.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/mdbx.h b/mdbx.h index 75688438..2af8b66f 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1725,7 +1725,7 @@ typedef int MDBX_pgvisitor_func(size_t pgno, unsigned pgnumber, void *ctx, LIBMDBX_API int mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, void *ctx); -typedef struct mdbx_canary { size_t x, y, z, v; } mdbx_canary; +typedef struct mdbx_canary { uint64_t x, y, z, v; } mdbx_canary; LIBMDBX_API int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary); LIBMDBX_API size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary); diff --git a/src/mdbx.c b/src/mdbx.c index 19424aba..eb29d3ab 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9841,15 +9841,22 @@ int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) return EACCES; if (likely(canary)) { + if (txn->mt_canary.x == canary->x && txn->mt_canary.y == canary->y && + txn->mt_canary.z == canary->z && txn->mt_canary.v == canary->v) + return MDB_SUCCESS; txn->mt_canary.x = canary->x; txn->mt_canary.y = canary->y; txn->mt_canary.z = canary->z; } txn->mt_canary.v = txn->mt_txnid; + txn->mt_flags |= MDB_TXN_DIRTY; return MDB_SUCCESS; } From 1b490fda243bbdbef539504220153855c7d04768 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 17 Apr 2017 21:19:48 +0300 Subject: [PATCH 051/303] mdbx: add mdbx_dbi_sequence(). 
--- mdbx.h | 3 +++ src/bits.h | 1 + src/mdbx.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/mdbx.h b/mdbx.h index 2af8b66f..ae9e5ca5 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1759,6 +1759,9 @@ LIBMDBX_API int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi, MDB_cmp_func *keycmp, MDB_cmp_func *datacmp); +LIBMDBX_API int mdbx_dbi_sequence(MDB_txn *txn, MDB_dbi dbi, uint64_t *result, + uint64_t increment); + #ifdef __cplusplus } #endif diff --git a/src/bits.h b/src/bits.h index 70cfaa48..c57fa8c3 100644 --- a/src/bits.h +++ b/src/bits.h @@ -246,6 +246,7 @@ typedef struct MDB_db { uint32_t md_xsize; /**< also ksize for LEAF2 pages */ uint16_t md_flags; /**< @ref mdbx_dbi_open */ uint16_t md_depth; /**< depth of this tree */ + uint64_t md_seq; /* table sequence counter */ pgno_t md_branch_pages; /**< number of internal pages */ pgno_t md_leaf_pages; /**< number of leaf pages */ pgno_t md_overflow_pages; /**< number of overflow pages */ diff --git a/src/mdbx.c b/src/mdbx.c index eb29d3ab..d48994be 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9073,6 +9073,7 @@ int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del) { txn->mt_dbs[dbi].md_overflow_pages = 0; txn->mt_dbs[dbi].md_entries = 0; txn->mt_dbs[dbi].md_root = P_INVALID; + txn->mt_dbs[dbi].md_seq = 0; txn->mt_flags |= MDB_TXN_DIRTY; } @@ -10241,3 +10242,41 @@ int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, } return rc; } + +int mdbx_dbi_sequence(MDB_txn *txn, MDB_dbi dbi, uint64_t *result, + uint64_t increment) { + if (unlikely(!txn)) + return EINVAL; + + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_EBADSIGN; + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; + + if (unlikely(TXN_DBI_CHANGED(txn, dbi))) + return MDB_BAD_DBI; + + MDB_db *dbs = &txn->mt_dbs[dbi]; + if (likely(result)) + *result = dbs->md_seq; + + if (likely(increment > 0)) { + if (unlikely(txn->mt_flags & 
MDB_TXN_BLOCKED)) + return MDB_BAD_TXN; + + if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) + return EACCES; + + uint64_t new = dbs->md_seq + increment; + if (unlikely(new < increment)) + return MDBX_RESULT_TRUE; + + assert(new > dbs->md_seq); + dbs->md_seq = new; + txn->mt_flags |= MDB_TXN_DIRTY; + txn->mt_dbflags[dbi] |= DB_DIRTY; + } + + return MDB_SUCCESS; +} From c2087f186ef5ae021bb1b9f6f8111e7b7eccffc1 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 18:26:32 +0300 Subject: [PATCH 052/303] mdbx: fix/rework mdbx_reader_check(). --- src/lck-posix.c | 8 +++- src/lck-windows.c | 6 +++ src/mdbx.c | 108 +++++++++++++++++++++++++--------------------- src/osal.h | 7 +++ 4 files changed, 79 insertions(+), 50 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index 82b23934..426fcb62 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -171,6 +171,12 @@ int mdbx_rpid_clear(MDB_env *env) { return mdbx_lck_op(env->me_lfd, F_SETLKW, F_UNLCK, env->me_pid); } +/* Checks reader by pid. + * + * Returns: + * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) + * MDBX_RESULT_FALSE, if pid is dead (lock acquired) + * or otherwise the errcode. */ int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid) { int rc = mdbx_lck_op(env->me_lfd, F_GETLK, F_WRLCK, pid); if (rc == 0) @@ -205,7 +211,7 @@ static int __cold mdbx_mutex_failed(MDB_env *env, mdbx_mutex_t *mutex, int rc) { int rlocked, rc2; /* We own the mutex. Clean up after dead previous owner. */ - rc = MDB_SUCCESS; + rc = MDBX_RESULT_TRUE; rlocked = (mutex == &env->me_txns->mti_rmutex); if (!rlocked) { /* Keep mtb.mti_txnid updated, otherwise next writer can diff --git a/src/lck-windows.c b/src/lck-windows.c index a037dbda..2b3d6aa9 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -314,6 +314,12 @@ int mdbx_rpid_clear(MDB_env *env) { return MDB_SUCCESS; } +/* Checks reader by pid. 
+ * + * Returns: + * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) + * MDBX_RESULT_FALSE, if pid is dead (lock acquired) + * or otherwise the errcode. */ int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid) { (void)env; HANDLE hProcess = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid); diff --git a/src/mdbx.c b/src/mdbx.c index d48994be..56862dfd 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2197,11 +2197,14 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { env->me_live_reader = pid; } + retry: nr = env->me_txns->mti_numreaders; for (i = 0; i < nr; i++) if (env->me_txns->mti_readers[i].mr_pid == 0) break; if (unlikely(i == env->me_maxreaders)) { + if (mdbx_reader_check0(env, 1, NULL)) + goto retry; mdbx_rdt_unlock(env); return MDB_READERS_FULL; } @@ -9195,11 +9198,8 @@ int __cold mdbx_reader_check(MDB_env *env, int *dead) { return mdbx_reader_check0(env, 0, dead); } -int __cold mdbx_reader_check0(MDB_env *env, int rlocked, int *dead) { - assert(rlocked >= 0); - unsigned i, j; - mdbx_pid_t *pids, pid; - int rc = MDB_SUCCESS, count = 0; +int __cold mdbx_reader_check0(MDB_env *env, int rdt_locked, int *dead) { + assert(rdt_locked >= 0); if (unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDB_FATAL_ERROR; @@ -9207,59 +9207,69 @@ int __cold mdbx_reader_check0(MDB_env *env, int rlocked, int *dead) { } unsigned snap_nreaders = env->me_txns->mti_numreaders; - pids = malloc((snap_nreaders + 1) * sizeof(mdbx_pid_t)); - if (!pids) - return ENOMEM; - + mdbx_pid_t *pids = alloca((snap_nreaders + 1) * sizeof(mdbx_pid_t)); pids[0] = 0; + + unsigned i; + int rc = MDBX_RESULT_FALSE, count = 0; MDB_reader *mr = env->me_txns->mti_readers; + for (i = 0; i < snap_nreaders; i++) { - pid = mr[i].mr_pid; - if (pid && pid != env->me_pid) { - if (mdbx_pid_insert(pids, pid) == 0) { + const mdbx_pid_t pid = mr[i].mr_pid; + if (pid == 0) + continue; + if (pid != env->me_pid) + continue; + if (mdbx_pid_insert(pids, pid) != 0) + continue; + + rc = 
mdbx_rpid_check(env, pid); + if (rc == MDBX_RESULT_TRUE) + continue; /* reader is live */ + + if (rc != MDBX_RESULT_FALSE) + break; /* mdbx_rpid_check() failed */ + + /* stale reader found */ + if (!rdt_locked) { + rdt_locked = -1; + rc = mdbx_rdt_lock(env); + if (rc != MDB_SUCCESS) { + if (rc != MDBX_RESULT_TRUE) + break; /* lock failed */ + /* recovered after mutex owner died */ + snap_nreaders = 0; /* the above checked all readers */ + } else { + /* a other process may have clean and reused slot, recheck */ + if (mr[i].mr_pid != pid) + continue; rc = mdbx_rpid_check(env, pid); - if (rc == MDBX_RESULT_FALSE) { - /* stale reader found */ - j = i; - if (!rlocked) { - rlocked = -1; - rc = mdbx_rdt_lock(env); - if (rc != MDB_SUCCESS) { - if (rc != MDBX_RESULT_TRUE) { - break; /* lock failed */ - } else { - /* recovered after mutex owner died */ - snap_nreaders = 0; /* the above checked all readers */ - } - } else { - /* a other process may have clean and reused slot, recheck */ - rc = mdbx_rpid_check(env, pid); - if (rc != MDBX_RESULT_FALSE) { - if (rc != MDBX_RESULT_TRUE) - break; /* mdbx_rpid_check() failed */ - /* the race with other process, slot reused */ - rc = MDB_SUCCESS; - continue; - } - } - } - for (; j < snap_nreaders; j++) { - if (mr[j].mr_pid == pid) { - mdbx_debug("clear stale reader pid %u txn %zd", (unsigned)pid, - mr[j].mr_txnid); - mr[j].mr_pid = 0; - count++; - } - } - } else if (rc != MDBX_RESULT_TRUE) - break; /* mdbx_rpid_check() failed */ + if (rc != MDBX_RESULT_FALSE) { + if (rc != MDBX_RESULT_TRUE) + break; /* mdbx_rpid_check() failed */ + /* the race with other process, slot reused */ + rc = MDBX_RESULT_FALSE; + continue; + } + } + } + + assert(mr[i].mr_pid == pid); + + /* clean it */ + unsigned j; + for (j = i; j < snap_nreaders; j++) { + if (mr[j].mr_pid == pid) { + mdbx_debug("clear stale reader pid %u txn %zd", (unsigned)pid, + mr[j].mr_txnid); + mr[j].mr_pid = 0; + count++; } } } - if (rlocked < 0) + if (rdt_locked < 0) 
mdbx_rdt_unlock(env); - free(pids); if (dead) *dead = count; diff --git a/src/osal.h b/src/osal.h index e0dc9244..06789ae7 100644 --- a/src/osal.h +++ b/src/osal.h @@ -421,6 +421,13 @@ void mdbx_txn_unlock(MDB_env *env); int mdbx_rpid_set(MDB_env *env); int mdbx_rpid_clear(MDB_env *env); + +/* Checks reader by pid. + * + * Returns: + * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) + * MDBX_RESULT_FALSE, if pid is dead (lock acquired) + * or otherwise the errcode. */ int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid); /*----------------------------------------------------------------------------*/ From 99dd373215e213889c52375596a52aaf7c7d3d2b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 18:27:40 +0300 Subject: [PATCH 053/303] mdbx: fix/refine mdbx_txn_end() and mdbx_txn_renew0(). --- src/mdbx.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 56862dfd..f7e49adb 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2156,7 +2156,7 @@ static void mdbx_cursors_eot(MDB_txn *txn, unsigned merge) { static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { MDB_env *env = txn->mt_env; unsigned i, nr; - int rc, new_notls = 0; + int rc; if (unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDB_FATAL_ERROR; @@ -2231,11 +2231,8 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { #endif mdbx_rdt_unlock(env); - new_notls = MDB_END_SLOT /* == MDB_NOTLS */; - if (likely(env->me_flags & MDB_ENV_TXKEY)) { + if (likely(env->me_flags & MDB_ENV_TXKEY)) mdbx_thread_rthc_set(env->me_txkey, r); - new_notls = 0; - } } while ((env->me_flags & MDB_FATAL_ERROR) == 0) { @@ -2322,7 +2319,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { } else { return MDB_SUCCESS; } - mdbx_txn_end(txn, new_notls /*0 or MDB_END_SLOT*/ | MDB_END_FAIL_BEGIN); + mdbx_txn_end(txn, MDB_END_SLOT | MDB_END_FAIL_BEGIN); return rc; } @@ -2527,12 +2524,11 @@ static int mdbx_txn_end(MDB_txn *txn, 
unsigned mode) { mdbx_mutex_lock(&tsan_mutex); #endif txn->mt_u.reader->mr_txnid = ~(txnid_t)0; - if (!(env->me_flags & MDB_NOTLS)) { - txn->mt_u.reader = NULL; /* txn does not own reader */ - } else if (mode & MDB_END_SLOT) { - txn->mt_u.reader->mr_pid = 0; + if (mode & MDB_END_SLOT) { + if ((env->me_flags & MDB_ENV_TXKEY) == 0) + txn->mt_u.reader->mr_pid = 0; txn->mt_u.reader = NULL; - } /* else txn owns the slot until it does MDB_END_SLOT */ + } #ifdef __SANITIZE_THREAD__ mdbx_mutex_unlock(&tsan_mutex); #endif From 2523170806ec25f622229a066f9033cc0e4769ef Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 18:33:35 +0300 Subject: [PATCH 054/303] test: add rnd and delay tools. --- test/chrono.cc | 33 +++++++- test/chrono.h | 33 ++++++-- test/osal-unix.cc | 47 +++++++++++ test/osal-windows.cc | 32 ++++++++ test/osal.h | 6 +- test/utils.cc | 192 +++++++++++++++++++++++++++++++++++++++++++ test/utils.h | 54 ++++++++++++ 7 files changed, 389 insertions(+), 8 deletions(-) diff --git a/test/chrono.cc b/test/chrono.cc index 8c9dcebe..3481ad74 100644 --- a/test/chrono.cc +++ b/test/chrono.cc @@ -71,7 +71,7 @@ time from_ms(uint64_t ms) { return result; } -time now() { +time now_realtime() { #if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) FILETIME filetime; GetSystemTimeAsFileTime(&filetime); @@ -87,4 +87,35 @@ time now() { #endif } +time now_motonic() { +#if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) + static uint32_t reciprocal; + static LARGE_INTEGER Frequency; + if (reciprocal == 0) { + if (!QueryPerformanceFrequency(&Frequency)) + failure_perror("QueryPerformanceFrequency()", GetLastError()); + reciprocal = (UINT64_C(1) << 32) / Frequency.QuadPart; + assert(reciprocal); + } + + LARGE_INTEGER Counter; + if (!QueryPerformanceCounter(&Counter)) + failure_perror("QueryPerformanceCounter()", GetLastError()); + + time result; + result.integer = Counter.QuadPart / Frequency.QuadPart; + uint64_t mod = Counter.QuadPart % 
Frequency.QuadPart; + assert(mod < UINT32_MAX); + result.fractional = UInt32x32To64((uint32_t)mod, reciprocal); + assert(result.fractional == (mod << 32) / Frequency.QuadPart); + return result; +#else + struct timespec ts; + if (unlikely(clock_gettime(CLOCK_MONOTONIC, &ts))) + failure_perror("clock_gettime(CLOCK_MONOTONIC)", errno); + + return from_timespec(ts); +#endif +} + } /* namespace chrono */ diff --git a/test/chrono.h b/test/chrono.h index bb90e8e5..b417f1e1 100644 --- a/test/chrono.h +++ b/test/chrono.h @@ -25,12 +25,21 @@ typedef union time { struct __packed { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ uint32_t fractional; - uint32_t utc; + union { + uint32_t utc; + uint32_t integer; + }; #else - uint32_t utc; + union { + uint32_t utc; + uint32_t integer; + }; uint32_t fractional; #endif }; + + void reset() { fixedpoint = 0; } + uint32_t seconds() const { return utc; } } time; uint32_t ns2fractional(uint32_t); @@ -44,10 +53,21 @@ time from_ns(uint64_t us); time from_us(uint64_t ns); time from_ms(uint64_t ms); -inline time from_utc(time_t utc) { - assert(utc < UINT32_MAX); +inline time from_seconds(uint64_t seconds) { + assert(seconds < UINT32_MAX); time result; - result.fixedpoint = ((uint64_t)utc) << 32; + result.fixedpoint = seconds << 32; + return result; +} + +inline time from_utc(time_t utc) { + assert(utc >= 0); + return from_seconds(utc); +} + +inline time infinite() { + time result; + result.fixedpoint = UINT64_MAX; return result; } @@ -70,6 +90,7 @@ inline time from_timeval(const struct timeval &tv) { } #endif /* HAVE_TIMEVAL_TV_USEC */ -time now(); +time now_realtime(); +time now_motonic(); } /* namespace chrono */ diff --git a/test/osal-unix.cc b/test/osal-unix.cc index 5131181c..5e475705 100644 --- a/test/osal-unix.cc +++ b/test/osal-unix.cc @@ -157,6 +157,8 @@ static void handler_SIGCHLD(int unused) { (void)unused; } mdbx_pid_t osal_getpid(void) { return getpid(); } +int osal_delay(unsigned seconds) { return sleep(seconds) ? 
errno : 0; } + int osal_actor_start(const actor_config &config, mdbx_pid_t &pid) { if (childs.empty()) signal(SIGCHLD, handler_SIGCHLD); @@ -228,3 +230,48 @@ retry: return errno; } } + +void osal_yield(void) { + if (sched_yield()) + failure_perror("sched_yield()", errno); +} + +void osal_udelay(unsigned us) { + chrono::time until, now = chrono::now_motonic(); + until.fixedpoint = now.fixedpoint + chrono::from_us(us).fixedpoint; + struct timespec ts; + + static unsigned threshold_us; + if (threshold_us == 0) { + if (clock_getres(CLOCK_PROCESS_CPUTIME_ID, &ts)) { + int rc = errno; + failure_perror("clock_getres(CLOCK_PROCESS_CPUTIME_ID)", rc); + } + chrono::time threshold = chrono::from_timespec(ts); + assert(threshold.seconds() == 0); + + threshold_us = chrono::fractional2us(threshold.fractional); + if (threshold_us < 1000) + threshold_us = 1000; + } + + ts.tv_sec = ts.tv_nsec = 0; + if (us > threshold_us) { + ts.tv_sec = us / 1000000u; + ts.tv_nsec = (us % 1000000u) * 1000u; + } + + do { + if (us > threshold_us) { + if (nanosleep(&ts, &ts)) { + int rc = errno; + /* if (rc == EINTR) { ... } ? 
*/ + failure_perror("usleep()", rc); + } + us = ts.tv_sec * 1000000u + ts.tv_nsec / 1000u; + } + cpu_relax(); + + now = chrono::now_motonic(); + } while (until.fixedpoint > now.fixedpoint); +} diff --git a/test/osal-windows.cc b/test/osal-windows.cc index d4083c81..fc3445cc 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -97,6 +97,11 @@ int osal_waitfor(unsigned id) { mdbx_pid_t osal_getpid(void) { return GetCurrentProcessId(); } +int osal_delay(unsigned seconds) { + Sleep(seconds * 1000u); + return 0; +} + //----------------------------------------------------------------------------- const std::string @@ -260,3 +265,30 @@ int osal_actor_poll(mdbx_pid_t &pid, unsigned timeout) { return waitstatus2errcode(rc); } + +void osal_yield(void) { SwitchToThread(); } + +void osal_udelay(unsigned us) { + chrono::time until, now = chrono::now_motonic(); + until.fixedpoint = now.fixedpoint + chrono::from_us(us).fixedpoint; + + static unsigned threshold_us; + if (threshold_us == 0) { + ULONGLONG InterruptTimePrecise_100ns; + QueryInterruptTimePrecise(&InterruptTimePrecise_100ns); + threshold_us = InterruptTimePrecise_100ns / 5; + assert(threshold_us > 0); + } + + do { + if (us > threshold_us && us > 1000) { + DWORD rc = SleepEx(us / 1000, TRUE); + if (rc) + failure_perror("SleepEx()", waitstatus2errcode(rc)); + us = 0; + } + + YieldProcessor(); + now = chrono::now_motonic(); + } while (now.fixedpoint < until.fixedpoint); +} diff --git a/test/osal.h b/test/osal.h index 1e5de123..7eac2ad8 100644 --- a/test/osal.h +++ b/test/osal.h @@ -20,9 +20,13 @@ void osal_setup(const std::vector &actors); void osal_broadcast(unsigned id); int osal_waitfor(unsigned id); -mdbx_pid_t osal_getpid(void); int osal_actor_start(const actor_config &config, mdbx_pid_t &pid); actor_status osal_actor_info(const mdbx_pid_t pid); void osal_killall_actors(void); int osal_actor_poll(mdbx_pid_t &pid, unsigned timeout); void osal_wait4barrier(void); + +mdbx_pid_t osal_getpid(void); +int 
osal_delay(unsigned seconds); +void osal_udelay(unsigned us); +void osal_yield(void); diff --git a/test/utils.cc b/test/utils.cc index c3be0ec0..ae6797f8 100644 --- a/test/utils.cc +++ b/test/utils.cc @@ -13,6 +13,8 @@ */ #include "test.h" +#include +#include std::string format(const char *fmt, ...) { va_list ap, ones; @@ -88,3 +90,193 @@ bool hex2data(const char *hex_begin, const char *hex_end, void *ptr, } //----------------------------------------------------------------------------- + +#ifdef __mips__ +static uint64_t *mips_tsc_addr; + +__cold static void mips_rdtsc_init() { + int mem_fd = open("/dev/mem", O_RDONLY | O_SYNC, 0); + HIPPEUS_ENSURE(mem_fd >= 0); + + mips_tsc_addr = mmap(nullptr, pagesize, PROT_READ, MAP_SHARED, mem_fd, + 0x10030000 /* MIPS_ZBUS_TIMER */); + close(mem_fd); +} +#endif /* __mips__ */ + +uint64_t entropy_ticks(void) { +#if defined(__GNUC__) || defined(__clang__) +#if defined(__ia64__) + uint64_t ticks; + __asm("mov %0=ar.itc" : "=r"(ticks)); + return ticks; +#elif defined(__hppa__) + uint64_t ticks; + __asm("mfctl 16, %0" : "=r"(ticks)); + return ticks; +#elif defined(__s390__) + uint64_t ticks; + __asm("stck 0(%0)" : : "a"(&(ticks)) : "memory", "cc"); + return ticks; +#elif defined(__alpha__) + uint64_t ticks; + __asm("rpcc %0" : "=r"(ticks)); + return ticks; +#elif defined(__sparc_v9__) + uint64_t ticks; + __asm("rd %%tick, %0" : "=r"(ticks)); + return ticks; +#elif defined(__powerpc64__) || defined(__ppc64__) + uint64_t ticks; + __asm("mfspr %0, 268" : "=r"(ticks)); + return ticks; +#elif defined(__ppc__) || defined(__powerpc__) + unsigned tbl, tbu; + + /* LY: Here not a problem if a high-part (tbu) + * would been updated during reading. 
*/ + __asm("mftb %0" : "=r"(tbl)); + __asm("mftbu %0" : "=r"(tbu)); + + return (((uin64_t)tbu0) << 32) | tbl; +#elif defined(__mips__) + if (mips_tsc_addr != MAP_FAILED) { + if (unlikely(!mips_tsc_addr)) { + static pthread_once_t is_initialized = PTHREAD_ONCE_INIT; + int rc = pthread_once(&is_initialized, mips_rdtsc_init); + if (unlikely(rc)) + failure_perror("pthread_once()", rc); + } + if (mips_tsc_addr != MAP_FAILED) + return *mips_tsc_addr; + } +#elif defined(__x86_64__) || defined(__i386__) + unsigned lo, hi; + + /* LY: Using the "a" and "d" constraints is important for correct code. */ + __asm("rdtsc" : "=a"(lo), "=d"(hi)); + + return (((uint64_t)hi) << 32) + lo; +#endif /* arch selector */ + +#elif defined(_M_IX86) || defined(_M_X64) + return __rdtsc(); +#endif /* __GNUC__ || __clang__ */ + +#if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) + LARGE_INTEGER PerformanceCount; + if (QueryPerformanceCounter(&PerformanceCount)) + return PerformanceCount.QuadPart; + return GetTickCount64(); +#else + struct timespec ts; +#if defined(CLOCK_MONOTONIC_COARSE) + clockid_t clock = CLOCK_MONOTONIC_COARSE; +#elif defined(CLOCK_MONOTONIC_RAW) + clockid_t clock = CLOCK_MONOTONIC_RAW; +#else + clockid_t clock = CLOCK_MONOTONIC; +#endif + int rc = clock_gettime(clock, &ts); + if (unlikely(rc)) + failure_perror("clock_gettime()", rc); + + return (((uint64_t)ts.tv_sec) << 32) + ts.tv_nsec; +#endif +} + +//----------------------------------------------------------------------------- + +static __inline uint64_t bleach64(uint64_t dirty) { + dirty = mul_64x64_high(bswap64(dirty), UINT64_C(17048867929148541611)); + return dirty; +} + +static __inline uint32_t bleach32(uint32_t dirty) { + return (uint32_t)( + (bswap32(dirty) * UINT64_C(/*3080105489, 4267077937 */ 2175734609)) >> + 32); +} + +uint64_t prng64_careless(uint64_t &state) { + state = state * UINT64_C(6364136223846793005) + 1; + return state; +} + +uint64_t prng64_white(uint64_t &state) { + state = state * 
UINT64_C(6364136223846793005) + UINT64_C(1442695040888963407); + return bleach64(state); +} + +uint32_t prng32(uint64_t &state) { + return (uint32_t)(prng64_careless(state) >> 32); +} + +uint64_t entropy_white() { return bleach64(entropy_ticks()); } + +double double_from_lower(uint64_t salt) { +#ifdef IEEE754_DOUBLE_BIAS + ieee754_double r; + r.ieee.negative = 0; + r.ieee.exponent = IEEE754_DOUBLE_BIAS; + r.ieee.mantissa0 = (unsigned)(salt >> 32); + r.ieee.mantissa1 = (unsigned)salt; + return r.d; +#else + const uint64_t top = (UINT64_C(1) << DBL_MANT_DIG) - 1; + const double scale = 1.0 / (double)top; + return (salt & top) * scale; +#endif +} + +double double_from_upper(uint64_t salt) { +#ifdef IEEE754_DOUBLE_BIAS + ieee754_double r; + r.ieee.negative = 0; + r.ieee.exponent = IEEE754_DOUBLE_BIAS; + salt >>= 64 - DBL_MANT_DIG; + r.ieee.mantissa0 = (unsigned)(salt >> 32); + r.ieee.mantissa1 = (unsigned)salt; + return r.d; +#else + const uint64_t top = (UINT64_C(1) << DBL_MANT_DIG) - 1; + const double scale = 1.0 / (double)top; + return (salt >> (64 - DBL_MANT_DIG)) * scale; +#endif +} + +bool flipcoin() { return bleach32((uint32_t)entropy_ticks()) & 1; } + +bool jitter(unsigned probability_percent) { + const uint32_t top = UINT32_MAX - UINT32_MAX % 100; + uint32_t dice, edge = (top) / 100 * probability_percent; + do + dice = bleach32((uint32_t)entropy_ticks()); + while (dice >= top); + return dice < edge; +} + +void jitter_delay(bool extra) { + unsigned dice = entropy_white() & 3; + if (dice == 0) { + log_trace("== jitter.no-delay"); + } else { + log_trace(">> jitter.delay: dice %u", dice); + do { + cpu_relax(); + memory_barrier(); + cpu_relax(); + if (dice > 1) { + osal_yield(); + cpu_relax(); + if (dice > 2) { + unsigned us = entropy_white() & + (extra ? 
0xfffff /* 1.05 s */ : 0x3fff /* 16 ms */); + log_trace("== jitter.delay: %0.6f", us / 1000000.0); + osal_udelay(us); + } + } + } while (flipcoin()); + log_trace("<< jitter.delay: dice %u", dice); + } +} diff --git a/test/utils.h b/test/utils.h index 55d8f6fd..b4c88834 100644 --- a/test/utils.h +++ b/test/utils.h @@ -271,6 +271,50 @@ static __inline size_t roundup2(size_t value, size_t granularity) { //----------------------------------------------------------------------------- +static __inline void memory_barrier(void) { +#if __has_extension(c_atomic) || __has_extension(cxx_atomic) + __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); +#elif defined(__ATOMIC_SEQ_CST) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#elif defined(__clang__) || defined(__GNUC__) + __sync_synchronize(); +#elif defined(_MSC_VER) + MemoryBarrier(); +#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */ +#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64) + __mf(); +#elif defined(__i386__) || defined(__x86_64__) + _mm_mfence(); +#else +#error "Unknown target for Intel Compiler, please report to us." +#endif +#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) + __machine_rw_barrier(); +#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ + (defined(HP_IA64) || defined(__ia64)) + _Asm_mf(); +#elif defined(_AIX) || defined(__ppc__) || defined(__powerpc__) || \ + defined(__ppc64__) || defined(__powerpc64__) + __lwsync(); +#else +#error "Could not guess the kind of compiler, please report to us." 
+#endif +} + +static __inline void cpu_relax() { +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) + _mm_pause(); +#elif defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) || \ + defined(YieldProcessor) + YieldProcessor(); +#else +/* nope */ +#endif +} + +//----------------------------------------------------------------------------- + struct simple_checksum { uint64_t value; @@ -310,3 +354,13 @@ bool hex2data(const char *hex_begin, const char *hex_end, void *ptr, size_t bytes, simple_checksum &checksum); std::string format(const char *fmt, ...); + +uint64_t entropy_ticks(void); +uint64_t entropy_white(void); +uint64_t prng64_careless(uint64_t &state); +uint64_t prng64_white(uint64_t &state); +uint32_t prng32(uint64_t &state); + +bool flipcoin(); +bool jitter(unsigned probability_percent); +void jitter_delay(bool extra = false); From 101e015d2c4aace3c4feaa6b6eb8bd3c088b8c0d Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 18:34:43 +0300 Subject: [PATCH 055/303] test: more for logging. --- test/log.cc | 114 +++++++++++++++++++++++++++++++++++----------------- test/log.h | 9 ++++- 2 files changed, 84 insertions(+), 39 deletions(-) diff --git a/test/log.cc b/test/log.cc index 7def0b84..0123cb6a 100644 --- a/test/log.cc +++ b/test/log.cc @@ -55,8 +55,12 @@ const char *level2str(const loglevel level) { switch (level) { default: return "invalid/unknown"; + case extra: + return "extra"; case trace: return "trace"; + case verbose: + return "verbose"; case info: return "info"; case notice: @@ -70,48 +74,74 @@ const char *level2str(const loglevel level) { } } -void output(loglevel priority, const char *format, va_list ap) { - if (priority >= level) { - last = (priority >= error) ? stderr : stdout; - fprintf(last, "[ %u %-10s %6s ] %s" /* TODO */, osal_getpid(), - prefix.c_str(), level2str(priority), suffix.c_str()); - vfprintf(last, format, ap); +bool output(loglevel priority, const char *format, ...) 
{ + if (priority < level) + return false; - size_t len = strlen(format); - char end = len ? format[len - 1] : '\0'; - switch (end) { - default: - putc('\n', last); - case '\n': - if (priority > info) - fflushall(); - break; - case ' ': - case '_': - case ':': - case '|': - case ',': - case '\t': - case '\b': - case '\r': - case '\0': - return; - } - } - last = nullptr; + va_list ap; + va_start(ap, format); + output(priority, format, ap); + va_end(ap); + return true; } -void feed(const char *format, ...) { +bool output(loglevel priority, const char *format, va_list ap) { if (last) { - va_list ap; - va_start(ap, format); - vfprintf(last, format, ap); - va_end(ap); - - size_t len = strlen(format); - if (len && format[len - 1] == '\n') - last = nullptr; + putc('\n', last); + last = nullptr; } + + if (priority < level) + return false; + + last = (priority >= error) ? stderr : stdout; + fprintf(last, "[ %u%10s %.4s ] %s" /* TODO */, osal_getpid(), prefix.c_str(), + level2str(priority), suffix.c_str()); + vfprintf(last, format, ap); + + size_t len = strlen(format); + char end = len ? format[len - 1] : '\0'; + switch (end) { + default: + putc('\n', last); + case '\n': + if (priority > info) + fflushall(); + last = nullptr; + case ' ': + case '_': + case ':': + case '|': + case ',': + case '\t': + case '\b': + case '\r': + case '\0': + break; + } + return true; +} + +bool feed(const char *format, va_list ap) { + if (!last) + return false; + + vfprintf(last, format, ap); + size_t len = strlen(format); + if (len && format[len - 1] == '\n') + last = nullptr; + return true; +} + +bool feed(const char *format, ...) { + if (!last) + return false; + + va_list ap; + va_start(ap, format); + feed(format, ap); + va_end(ap); + return true; } local_suffix::local_suffix(const char *c_str) @@ -151,6 +181,16 @@ void log_trace(const char *msg, ...) { logging::last = nullptr; } +void log_verbose(const char *msg, ...) 
{ + if (logging::verbose >= logging::level) { + va_list ap; + va_start(ap, msg); + logging::output(logging::verbose, msg, ap); + va_end(ap); + } else + logging::last = nullptr; +} + void log_info(const char *msg, ...) { if (logging::info >= logging::level) { va_list ap; diff --git a/test/log.h b/test/log.h index 868b4ee4..67ad5e1c 100644 --- a/test/log.h +++ b/test/log.h @@ -33,7 +33,9 @@ const char *test_strerror(int errnum); namespace logging { enum loglevel { + extra, trace, + verbose, info, notice, warning, @@ -45,8 +47,10 @@ const char *level2str(const loglevel level); void setup(loglevel level, const std::string &prefix); void setup(const std::string &prefix); -void output(loglevel priority, const char *format, va_list ap); -void __printf_args(1, 2) feed(const char *format, ...); +bool output(loglevel priority, const char *format, va_list ap); +bool __printf_args(2, 3) output(loglevel priority, const char *format, ...); +bool feed(const char *format, va_list ap); +bool __printf_args(1, 2) feed(const char *format, ...); class local_suffix { protected: @@ -68,6 +72,7 @@ public: } /* namespace log */ void __printf_args(1, 2) log_trace(const char *msg, ...); +void __printf_args(1, 2) log_verbose(const char *msg, ...); void __printf_args(1, 2) log_info(const char *msg, ...); void __printf_args(1, 2) log_notice(const char *msg, ...); void __printf_args(1, 2) log_warning(const char *msg, ...); From f3e31a74ee36f0d7fc9faad4c8d94bee7894ffd8 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 18:41:11 +0300 Subject: [PATCH 056/303] test: jitter testcase (squashed major refine). 
--- Makefile | 2 +- test/cases.cc | 23 +-- test/config.cc | 42 +++--- test/config.h | 10 +- test/dead.cc | 10 +- test/hill.cc | 2 +- test/jitter.cc | 40 ++++- test/main.cc | 342 ++++++++++++++++++++++++------------------- test/osal-unix.cc | 38 ++--- test/osal-windows.cc | 24 +-- test/test.cc | 156 ++++++++++++++++---- test/test.h | 33 +++-- 12 files changed, 454 insertions(+), 268 deletions(-) diff --git a/Makefile b/Makefile index 730d855a..9d31aba8 100644 --- a/Makefile +++ b/Makefile @@ -66,7 +66,7 @@ clean: rm -rf $(TOOLS) test/test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err check: test/test - test/test --pathname=tmp.db --basic --dont-cleanup-after && ./mdbx_chk -vn tmp.db + test/test --pathname=tmp.db --dont-cleanup-after basic && ./mdbx_chk -vn tmp.db mdbx.o: $(MDBX_SRC) Makefile $(CC) $(CFLAGS) -c src/mdbx.c -o $@ diff --git a/test/cases.cc b/test/cases.cc index a24838c6..09da2103 100644 --- a/test/cases.cc +++ b/test/cases.cc @@ -23,8 +23,8 @@ void configure_actor(unsigned &lastid, const actor_testcase testcase, if (i->is_waitable(params.waitfor_nops)) { if (i->signal_nops && i->signal_nops != params.waitfor_nops) failure("Previous waitable actor (id=%u) already linked on %u-ops\n", - i->id, i->signal_nops); - wait4id = i->id; + i->actor_id, i->signal_nops); + wait4id = i->actor_id; i->signal_nops = params.waitfor_nops; break; } @@ -33,7 +33,7 @@ void configure_actor(unsigned &lastid, const actor_testcase testcase, failure("No previous waitable actor for %u-ops\n", params.waitfor_nops); } - unsigned long id = 0; + unsigned id = 0; if (!id_cstr || strcmp(id_cstr, "auto") == 0) id = lastid + 1; else { @@ -47,23 +47,26 @@ void configure_actor(unsigned &lastid, const actor_testcase testcase, } if (id < 1 || id > ACTOR_ID_MAX) - failure("Invalid actor-id %lu\n", id); + failure("Invalid actor-id %u\n", id); lastid = id; + log_trace("configure_actor: %u for %s", id, testcase2str(testcase)); global::actors.emplace_back(actor_config(testcase, params, id, 
wait4id)); global::databases.insert(params.pathname_db); } -bool testcase_setup(const char *casename, const actor_params &params, +void testcase_setup(const char *casename, actor_params &params, unsigned &lastid) { - log_notice("testcase_setup(%s): TODO", casename); - if (strcmp(casename, "basic") == 0) { + log_notice(">>> testcase_setup(%s)", casename); configure_actor(lastid, ac_hill, nullptr, params); - return true; + configure_actor(lastid, ac_jitter, nullptr, params); + configure_actor(lastid, ac_jitter, nullptr, params); + configure_actor(lastid, ac_jitter, nullptr, params); + log_notice("<<< testcase_setup(%s): done", casename); + } else { + failure("unknown testcase `%s`", casename); } - - return false; } /* TODO */ diff --git a/test/config.cc b/test/config.cc index b1f73368..92e078b2 100644 --- a/test/config.cc +++ b/test/config.cc @@ -32,26 +32,23 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, if (!value) { if (current[optlen + 2] == '=') failure("Option '--%s' doen't accept any value\n", option); - narg += 1; return true; } *value = nullptr; if (current[optlen + 2] == '=') { *value = &current[optlen + 3]; - narg += 1; return true; } - if (narg + 1 < argc && strncmp("--", argv[narg + 1], 2)) { + if (narg + 1 < argc && strncmp("--", argv[narg + 1], 2) != 0) { *value = argv[narg + 1]; - narg += 2; + ++narg; return true; } if (default_value) { *value = default_value; - narg += 1; return true; } @@ -184,11 +181,16 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, const char *value_cstr = NULL; if (!parse_option(argc, argv, narg, option, &value_cstr, "yes")) { const char *current = argv[narg]; - if (strncmp(current, "--no-", 5) || strcmp(current + 5, option)) - return false; - value = false; - narg += 1; - return true; + if (strncmp(current, "--no-", 5) == 0 && strcmp(current + 5, option) == 0) { + value = false; + return true; + } + if (strncmp(current, "--dont-", 7) == 0 && + strcmp(current + 7, option) ==
0) { + value = false; + return true; + } + return false; } if (!value_cstr) { @@ -266,8 +268,8 @@ void dump(const char *title) { logging::local_suffix indent(title); for (auto i = global::actors.begin(); i != global::actors.end(); ++i) { - log_info("#%u, testcase %s, id/table %u\n", i->order, - testcase2str(i->testcase), i->id); + log_info("#%u, testcase %s, space_id/table %u\n", i->actor_id, + testcase2str(i->testcase), i->space_id); indent.push(); if (i->params.loglevel) { @@ -284,8 +286,8 @@ void dump(const char *title) { log_info("seed %u\n", i->params.seed); - if (i->params.test_nrecords) - log_info("records %u\n", i->params.test_nrecords); + if (i->params.test_nops) + log_info("iterations/records %u\n", i->params.test_nops); else dump_duration("duration", i->params.test_duration); @@ -319,10 +321,10 @@ void dump(const char *title) { indent.pop(); } - dump_duration("timeout", global::config::timeout); + dump_duration("timeout", global::config::timeout_duration_seconds); log_info("cleanup: before %s, after %s\n", - global::config::dont_cleanup_before ? "No" : "Yes", - global::config::dont_cleanup_after ? "No" : "Yes"); + global::config::cleanup_before ? "Yes" : "No", + global::config::cleanup_after ?
"Yes" : "No"); } } /* namespace config */ @@ -332,10 +334,10 @@ void dump(const char *title) { using namespace config; actor_config::actor_config(actor_testcase testcase, const actor_params &params, - unsigned id, unsigned wait4id) + unsigned space_id, unsigned wait4id) : params(params) { - this->id = id; - this->order = (unsigned)global::actors.size(); + this->space_id = space_id; + this->actor_id = 1 + (unsigned)global::actors.size(); this->testcase = testcase; this->wait4id = wait4id; signal_nops = 0; diff --git a/test/config.h b/test/config.h index 38bc0b22..c0a04f93 100644 --- a/test/config.h +++ b/test/config.h @@ -78,7 +78,7 @@ struct actor_params_pod { unsigned seed; unsigned test_duration; - unsigned test_nrecords; + unsigned test_nops; unsigned nrepeat; unsigned nthreads; @@ -98,7 +98,7 @@ struct actor_params_pod { }; struct actor_config_pod { - unsigned id, order; + unsigned actor_id, space_id; actor_testcase testcase; unsigned wait4id; unsigned signal_nops; @@ -123,8 +123,8 @@ struct actor_config : public config::actor_config_pod { bool wanna_event4signalling() const { return true /* TODO ?
*/; } - actor_config(actor_testcase testcase, const actor_params &params, unsigned id, - unsigned wait4id); + actor_config(actor_testcase testcase, const actor_params &params, + unsigned space_id, unsigned wait4id); actor_config(const char *str) { if (!deserialize(str, *this)) @@ -140,7 +140,7 @@ struct actor_config : public config::actor_config_pod { bool is_waitable(size_t nops) const { switch (testcase) { case ac_hill: - if (!params.test_nrecords || params.test_nrecords >= nops) + if (!params.test_nops || params.test_nops >= nops) return true; default: return false; diff --git a/test/dead.cc b/test/dead.cc index 7afa042d..f713b8b3 100644 --- a/test/dead.cc +++ b/test/dead.cc @@ -24,7 +24,8 @@ bool testcase_deadread::setup() { } bool testcase_deadread::run() { - /* TODO */ + db_open(); + txn_begin(true); return true; } @@ -33,7 +34,7 @@ bool testcase_deadread::teardown() { cursor_guard.release(); txn_guard.release(); db_guard.release(); - return true; + return inherited::teardown(); } //----------------------------------------------------------------------------- @@ -48,7 +49,8 @@ bool testcase_deadwrite::setup() { } bool testcase_deadwrite::run() { - /* TODO */ + db_open(); + txn_begin(false); return true; } @@ -57,5 +59,5 @@ bool testcase_deadwrite::teardown() { cursor_guard.release(); txn_guard.release(); db_guard.release(); - return true; + return inherited::teardown(); } diff --git a/test/hill.cc b/test/hill.cc index 98a7b82c..0a7d2fd7 100644 --- a/test/hill.cc +++ b/test/hill.cc @@ -26,7 +26,7 @@ bool testcase_hill::setup() { } bool testcase_hill::run() { - mdbx_open(); + db_open(); /* TODO */ return true; } diff --git a/test/jitter.cc b/test/jitter.cc index 00385362..4129de75 100644 --- a/test/jitter.cc +++ b/test/jitter.cc @@ -19,13 +19,47 @@ bool testcase_jitter::setup() { if (!inherited::setup()) return false; - /* TODO */ - log_trace("<< setup"); return true; } -bool testcase_jitter::run() { return true; } +bool testcase_jitter::run() { + while
(should_continue()) { + jitter_delay(); + db_open(); + + if (flipcoin()) { + jitter_delay(); + txn_begin(true); + jitter_delay(); + txn_end(false); + } + + jitter_delay(); + txn_begin(mode_readonly()); + jitter_delay(); + if (!mode_readonly()) { + /* TODO: + * - db_sequence() + * - db_setsize() + * ... + */ + } + txn_end(false); + + if (flipcoin()) { + jitter_delay(); + txn_begin(true); + jitter_delay(); + txn_end(false); + } + + jitter_delay(); + db_close(); + report(1); + } + return true; +} bool testcase_jitter::teardown() { log_trace(">> teardown"); diff --git a/test/main.cc b/test/main.cc index 8853ea2f..385b5050 100644 --- a/test/main.cc +++ b/test/main.cc @@ -44,7 +44,7 @@ void actor_params::set_defaults(void) { seed = 1; test_duration = 0; - test_nrecords = 1000; + test_nops = 1000; nrepeat = 1; nthreads = 1; @@ -63,6 +63,11 @@ void actor_params::set_defaults(void) { max_readers = 42; max_tables = 42; + + global::config::timeout_duration_seconds = 0 /* infinite */; + global::config::dump_config = true; + global::config::cleanup_before = true; + global::config::cleanup_after = true; } namespace global { @@ -72,12 +77,15 @@ std::unordered_map events; std::unordered_map pid2actor; std::set databases; unsigned nactors; +chrono::time start_motonic; +chrono::time deadline_motonic; +bool singlemode; namespace config { -unsigned timeout; +unsigned timeout_duration_seconds; bool dump_config; -bool dont_cleanup_before; -bool dont_cleanup_after; +bool cleanup_before; +bool cleanup_after; } /* namespace config */ } /* namespace global */ @@ -121,122 +129,154 @@ int main(int argc, char *const argv[]) { logging::setup((logging::loglevel)params.loglevel, "main"); unsigned lastid = 0; - if (argc == 2 && strncmp(argv[1], "--case=", 7) == 0) { - const char *casename = argv[1] + 7; - if (!testcase_setup(casename, params, lastid)) - failure("unknown testcase `%s`", casename); - } else { - for (int i = 1; i < argc;) { - const char *value = nullptr; - if 
(config::parse_option(argc, argv, i, "basic", nullptr)) { - bool ok = testcase_setup("basic", params, lastid); - assert(ok); - (void)ok; - } else if (config::parse_option(argc, argv, i, "race", nullptr)) { - bool ok = testcase_setup("race", params, lastid); - assert(ok); - (void)ok; - } else if (config::parse_option(argc, argv, i, "bench", nullptr)) { - bool ok = testcase_setup("bench", params, lastid); - assert(ok); - (void)ok; - } else if (config::parse_option(argc, argv, i, "pathname", - params.pathname_db) || - config::parse_option(argc, argv, i, "mode", params.mode_flags, - config::mode_bits) || - config::parse_option(argc, argv, i, "table", - params.table_flags, config::table_bits) || - config::parse_option(argc, argv, i, "size", params.size, - config::binary, 4096 * 4) || - config::parse_option(argc, argv, i, "seed", params.seed, - config::no_scale) || - config::parse_option(argc, argv, i, "repeat", params.nrepeat, - config::no_scale) || - config::parse_option(argc, argv, i, "threads", params.nthreads, - config::no_scale, 1, 64) || - config::parse_option(argc, argv, i, "timeout", - global::config::timeout, config::duration, - 1) || - config::parse_option(argc, argv, i, "keylen.min", - params.keylen_min, config::no_scale, 0, - params.keylen_max) || - config::parse_option(argc, argv, i, "keylen.max", - params.keylen_max, config::no_scale, - params.keylen_min, - mdbx_get_maxkeysize(0)) || - config::parse_option(argc, argv, i, "datalen.min", - params.datalen_min, config::no_scale, 0, - params.datalen_max) || - config::parse_option(argc, argv, i, "datalen.max", - params.datalen_max, config::no_scale, - params.datalen_min, MDBX_MAXDATASIZE) || - config::parse_option(argc, argv, i, "batch.read", - params.batch_read, config::no_scale, 1) || - config::parse_option(argc, argv, i, "batch.write", - params.batch_write, config::no_scale, - 1) || - config::parse_option(argc, argv, i, "delay", params.delaystart, - config::duration) || - config::parse_option(argc, argv, i, 
"wait4ops", - params.waitfor_nops, config::decimal) || - config::parse_option(argc, argv, i, "drop", - params.drop_table) || - config::parse_option(argc, argv, i, "dump-config", - global::config::dump_config) || - config::parse_option(argc, argv, i, "dont-cleanup-before", - global::config::dont_cleanup_before) || - config::parse_option(argc, argv, i, "dont-cleanup-after", - global::config::dont_cleanup_after) || - config::parse_option(argc, argv, i, "max-readers", - params.max_readers, config::no_scale, 1, - 255) || - config::parse_option(argc, argv, i, "max-tables", - params.max_tables, config::no_scale, 1, - INT16_MAX) || - false) { - continue; - } else if (config::parse_option(argc, argv, i, "no-delay", nullptr)) { - params.delaystart = 0; - } else if (config::parse_option(argc, argv, i, "no-wait", nullptr)) { - params.waitfor_nops = 0; - } else if (config::parse_option(argc, argv, i, "duration", - params.test_duration, config::duration, - 1)) { - params.test_nrecords = 0; - continue; - } else if (config::parse_option(argc, argv, i, "records", - params.test_nrecords, config::decimal, - 1)) { - params.test_duration = 0; - continue; - } else if (config::parse_option(argc, argv, i, "hill", &value)) { - configure_actor(lastid, ac_hill, value, params); - continue; - } else if (config::parse_option(argc, argv, i, "jitter", nullptr)) { - configure_actor(lastid, ac_jitter, value, params); - continue; - } else if (config::parse_option(argc, argv, i, "dead.reader", nullptr)) { - configure_actor(lastid, ac_deadread, value, params); - continue; - } else if (config::parse_option(argc, argv, i, "dead.writer", nullptr)) { - configure_actor(lastid, ac_deadwrite, value, params); - continue; - } else { - failure("Unknown option '%s'\n", argv[i]); - } + for (int narg = 1; narg < argc; ++narg) { + const char *value = nullptr; + + if (config::parse_option(argc, argv, narg, "case", &value)) { + testcase_setup(value, params, lastid); + continue; } + if (config::parse_option(argc, 
argv, narg, "pathname", params.pathname_db)) + continue; + if (config::parse_option(argc, argv, narg, "mode", params.mode_flags, + config::mode_bits)) + continue; + if (config::parse_option(argc, argv, narg, "table", params.table_flags, + config::table_bits)) + continue; + if (config::parse_option(argc, argv, narg, "size", params.size, + config::binary, 4096 * 4)) + continue; + if (config::parse_option(argc, argv, narg, "seed", params.seed, + config::no_scale)) + continue; + if (config::parse_option(argc, argv, narg, "repeat", params.nrepeat, + config::no_scale)) + continue; + if (config::parse_option(argc, argv, narg, "threads", params.nthreads, + config::no_scale, 1, 64)) + continue; + if (config::parse_option(argc, argv, narg, "timeout", + global::config::timeout_duration_seconds, + config::duration, 1)) + continue; + if (config::parse_option(argc, argv, narg, "keylen.min", params.keylen_min, + config::no_scale, 0, params.keylen_max)) + continue; + if (config::parse_option(argc, argv, narg, "keylen.max", params.keylen_max, + config::no_scale, params.keylen_min, + mdbx_get_maxkeysize(0))) + continue; + if (config::parse_option(argc, argv, narg, "datalen.min", + params.datalen_min, config::no_scale, 0, + params.datalen_max)) + continue; + if (config::parse_option(argc, argv, narg, "datalen.max", + params.datalen_max, config::no_scale, + params.datalen_min, MDBX_MAXDATASIZE)) + continue; + if (config::parse_option(argc, argv, narg, "batch.read", params.batch_read, + config::no_scale, 1)) + continue; + if (config::parse_option(argc, argv, narg, "batch.write", + params.batch_write, config::no_scale, 1)) + continue; + if (config::parse_option(argc, argv, narg, "delay", params.delaystart, + config::duration)) + continue; + if (config::parse_option(argc, argv, narg, "wait4ops", params.waitfor_nops, + config::decimal)) + continue; + if (config::parse_option(argc, argv, narg, "drop", params.drop_table)) + continue; + if (config::parse_option(argc, argv, narg, 
"dump-config", + global::config::dump_config)) + continue; + if (config::parse_option(argc, argv, narg, "cleanup-before", + global::config::cleanup_before)) + continue; + if (config::parse_option(argc, argv, narg, "cleanup-after", + global::config::cleanup_after)) + continue; + if (config::parse_option(argc, argv, narg, "max-readers", + params.max_readers, config::no_scale, 1, 255)) + continue; + if (config::parse_option(argc, argv, narg, "max-tables", params.max_tables, + config::no_scale, 1, INT16_MAX)) + continue; + + if (config::parse_option(argc, argv, narg, "no-delay", nullptr)) { + params.delaystart = 0; + continue; + } + if (config::parse_option(argc, argv, narg, "no-wait", nullptr)) { + params.waitfor_nops = 0; + continue; + } + if (config::parse_option(argc, argv, narg, "duration", params.test_duration, + config::duration, 1)) { + params.test_nops = 0; + continue; + } + if (config::parse_option(argc, argv, narg, "nops", params.test_nops, + config::decimal, 1)) { + params.test_duration = 0; + continue; + } + if (config::parse_option(argc, argv, narg, "hill", &value)) { + configure_actor(lastid, ac_hill, value, params); + continue; + } + if (config::parse_option(argc, argv, narg, "jitter", nullptr)) { + configure_actor(lastid, ac_jitter, value, params); + continue; + } + if (config::parse_option(argc, argv, narg, "dead.reader", nullptr)) { + configure_actor(lastid, ac_deadread, value, params); + continue; + } + if (config::parse_option(argc, argv, narg, "dead.writer", nullptr)) { + configure_actor(lastid, ac_deadwrite, value, params); + continue; + } + + if (*argv[narg] != '-') + testcase_setup(argv[narg], params, lastid); + else + failure("Unknown option '%s'\n", argv[narg]); } if (global::config::dump_config) config::dump(); + //-------------------------------------------------------------------------- + + if (global::actors.empty()) { + log_notice("no testcase(s) configured, exiting"); + return EXIT_SUCCESS; + } + bool failed = false; - if 
(global::actors.size()) { + global::start_motonic = chrono::now_motonic(); + global::deadline_motonic.fixedpoint = + (global::config::timeout_duration_seconds == 0) + ? chrono::infinite().fixedpoint + : global::start_motonic.fixedpoint + + chrono::from_seconds(global::config::timeout_duration_seconds) + .fixedpoint; + + if (global::config::cleanup_before) + cleanup(); + + if (global::actors.size() == 1) { + logging::setup("main"); + global::singlemode = true; + if (!test_execute(global::actors.front())) + failed = true; + } else { logging::setup("overlord"); - if (!global::config::dont_cleanup_before) - cleanup(); - + log_trace("=== preparing..."); log_trace(">> osal_setup"); osal_setup(global::actors); log_trace("<< osal_setup"); @@ -250,58 +290,60 @@ int main(int argc, char *const argv[]) { log_trace(">> killall_actors"); osal_killall_actors(); log_trace("<< killall_actors"); - failure("Failed to start actor #%u (%s)\n", a.order, test_strerror(rc)); + failure("Failed to start actor #%u (%s)\n", a.actor_id, + test_strerror(rc)); } global::pid2actor[pid] = &a; } + log_trace("=== ready to start..."); atexit(osal_killall_actors); log_trace(">> wait4barrier"); osal_wait4barrier(); log_trace("<< wait4barrier"); - } - time_t timestamp_start = time(nullptr); - size_t left = global::actors.size(); - - while (left > 0) { - unsigned timeout = INT_MAX; - if (global::config::timeout) { - time_t timestamp_now = time(nullptr); - if (timestamp_now - timestamp_start > global::config::timeout) - timeout = 0; - else - timeout = global::config::timeout - - (unsigned)(timestamp_now - timestamp_start); - } - - mdbx_pid_t pid; - int rc = osal_actor_poll(pid, timeout); - if (rc) - failure("Poll error: %s (%d)\n", test_strerror(rc), rc); - - if (pid) { - actor_status status = osal_actor_info(pid); - actor_config *actor = global::pid2actor.at(pid); - if (!actor) - continue; - - log_info("actor #%u, id %d, pid %u: %s\n", actor->order, actor->id, pid, - status2str(status)); - if (status > 
as_running) { - left -= 1; - if (status != as_successful) - failed = true; + size_t left = global::actors.size(); + log_trace("=== polling..."); + while (left > 0) { + unsigned timeout_seconds_left = INT_MAX; + chrono::time now_motonic = chrono::now_motonic(); + if (now_motonic.fixedpoint >= global::deadline_motonic.fixedpoint) + timeout_seconds_left = 0; + else { + chrono::time left_motonic; + left_motonic.fixedpoint = + global::deadline_motonic.fixedpoint - now_motonic.fixedpoint; + timeout_seconds_left = left_motonic.seconds(); + } + + mdbx_pid_t pid; + int rc = osal_actor_poll(pid, timeout_seconds_left); + if (rc) + failure("Poll error: %s (%d)\n", test_strerror(rc), rc); + + if (pid) { + actor_status status = osal_actor_info(pid); + actor_config *actor = global::pid2actor.at(pid); + if (!actor) + continue; + + log_info("actor #%u, id %d, pid %u: %s\n", actor->actor_id, + actor->space_id, pid, status2str(status)); + if (status > as_running) { + left -= 1; + if (status != as_successful) + failed = true; + } + } else { + if (timeout_seconds_left == 0) + failure("Timeout\n"); } - } else { - if (global::config::timeout && - time(nullptr) - timestamp_start > global::config::timeout) - failure("Timeout\n"); } + log_trace("=== done..."); } - log_notice("OVERALL: %s\n", failed ? "Failed" : "Successful"); - if (!global::config::dont_cleanup_before) { + log_notice("RESULT: %s\n", failed ?
"Failed" : "Successful"); + if (global::config::cleanup_after) { if (failed) log_info("skip cleanup"); else diff --git a/test/osal-unix.cc b/test/osal-unix.cc index 5e475705..2ab3a7aa 100644 --- a/test/osal-unix.cc +++ b/test/osal-unix.cc @@ -24,11 +24,11 @@ struct shared_t { pthread_barrier_t barrier; pthread_mutex_t mutex; - pthread_cond_t conds[0]; + size_t conds_size; + pthread_cond_t conds[1]; }; static shared_t *shared; -static std::unordered_map events; void osal_wait4barrier(void) { assert(shared != nullptr && shared != MAP_FAILED); @@ -65,13 +65,8 @@ void osal_setup(const std::vector &actors) { if (rc) failure_perror("pthread_condattr_setpshared()", rc); - size_t n = 0; - for (const auto &a : actors) - if (a.wanna_event4signalling()) - ++n; - shared = (shared_t *)mmap( - nullptr, sizeof(shared_t) + n * sizeof(pthread_cond_t), + nullptr, sizeof(shared_t) + actors.size() * sizeof(pthread_cond_t), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (MAP_FAILED == (void *)shared) failure_perror("mmap(shared_conds)", errno); @@ -84,24 +79,15 @@ void osal_setup(const std::vector &actors) { if (rc) failure_perror("pthread_barrier_init(shared)", rc); - auto a = actors.begin(); + const size_t n = actors.size() + 1; for (size_t i = 0; i < n; ++i) { pthread_cond_t *event = &shared->conds[i]; rc = pthread_cond_init(event, &condattr); if (rc) failure_perror("pthread_cond_init(shared)", rc); - - unsigned id = 0; - while (a != actors.end()) { - if (a->wanna_event4signalling()) { - id = a->id; - break; - } - ++a; - } - assert(id != 0); - events[id] = event; + log_trace("osal_setup: event(shared pthread_cond) %zu -> %p", i, event); } + shared->conds_size = actors.size() + 1; pthread_barrierattr_destroy(&barrierattr); pthread_condattr_destroy(&condattr); @@ -110,7 +96,10 @@ void osal_setup(const std::vector &actors) { void osal_broadcast(unsigned id) { assert(shared != nullptr && shared != MAP_FAILED); - int rc = pthread_cond_broadcast(events.at(id)); +
log_trace("osal_broadcast: event %u", id); + if (id >= shared->conds_size) + failure("osal_broadcast: id > limit"); + int rc = pthread_cond_broadcast(shared->conds + id); if (rc) failure_perror("sem_post(shared)", rc); } @@ -118,11 +107,15 @@ void osal_broadcast(unsigned id) { int osal_waitfor(unsigned id) { assert(shared != nullptr && shared != MAP_FAILED); + log_trace("osal_waitfor: event %u", id); + if (id >= shared->conds_size) + failure("osal_waitfor: id > limit"); + int rc = pthread_mutex_lock(&shared->mutex); if (rc != 0) failure_perror("pthread_mutex_lock(shared)", rc); - rc = pthread_cond_wait(events.at(id), &shared->mutex); + rc = pthread_cond_wait(shared->conds + id, &shared->mutex); if (rc && rc != EINTR) failure_perror("pthread_cond_wait(shared)", rc); @@ -173,6 +166,7 @@ int osal_actor_start(const actor_config &config, mdbx_pid_t &pid) { if (pid < 0) return errno; + log_trace("osal_actor_start: fork pid %i for %u", pid, config.actor_id); childs[pid] = as_running; return 0; } diff --git a/test/osal-windows.cc b/test/osal-windows.cc index fc3445cc..8a93b247 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -14,7 +14,7 @@ #include "test.h" -static std::unordered_map events; +static std::vector events; static HANDLE hBarrierSemaphore, hBarrierEvent; static int waitstatus2errcode(DWORD result) { @@ -63,15 +63,17 @@ static HANDLE make_inharitable(HANDLE hHandle) { } void osal_setup(const std::vector &actors) { - size_t n = 0; - for (const auto &a : actors) { - if (a.wanna_event4signalling()) { - HANDLE hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); - if (!hEvent) - failure_perror("CreateEvent()", GetLastError()); - hEvent = make_inharitable(hEvent); - events[a.id] = hEvent; - } + assert(events.empty()); + const size_t n = actors.size() + 1; + events.reserve(n); + + for (size_t i = 0; i < n; ++i) { + HANDLE hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + if (!hEvent) + failure_perror("CreateEvent()", GetLastError()); + hEvent = 
make_inharitable(hEvent); + log_trace("osal_setup: event %zu -> %p", i, hEvent); + events.push_back(hEvent); } hBarrierSemaphore = CreateSemaphore(NULL, 0, (LONG)actors.size(), NULL); @@ -86,11 +88,13 @@ void osal_setup(const std::vector &actors) { } void osal_broadcast(unsigned id) { + log_trace("osal_broadcast: event %u", id); if (!SetEvent(events.at(id))) failure_perror("SetEvent()", GetLastError()); } int osal_waitfor(unsigned id) { + log_trace("osal_waitfor: event %u", id); DWORD rc = WaitForSingleObject(events.at(id), INFINITE); return waitstatus2errcode(rc); } diff --git a/test/test.cc b/test/test.cc index 45e59ceb..28290c46 100644 --- a/test/test.cc +++ b/test/test.cc @@ -53,25 +53,37 @@ const char *status2str(actor_status status) { static void mdbx_debug_logger(int type, const char *function, int line, const char *msg, va_list args) { - logging::loglevel level = logging::trace; + logging::loglevel level = logging::info; + if (type & MDBX_DBG_EXTRA) + level = logging::extra; + if (type & MDBX_DBG_TRACE) + level = logging::trace; if (type & MDBX_DBG_PRINT) - level = logging::info; + level = logging::verbose; + if (type & MDBX_DBG_ASSERT) { - log_error("libmdbx assertion failure: %s, %d", + log_error("mdbx: assertion failure: %s, %d", function ? 
function : "unknown", line); level = logging::failure; } - output(level, msg, args); + if (logging::output(level, "mdbx: ")) + logging::feed(msg, args); if (type & MDBX_DBG_ASSERT) abort(); } -void testcase::mdbx_prepare() { - log_trace(">> mdbx_prepare"); +void testcase::db_prepare() { + log_trace(">> db_prepare"); + assert(!db_guard); - int rc = mdbx_setup_debug(MDBX_DBG_DNT, mdbx_debug_logger, MDBX_DBG_DNT); - log_info("libmdbx debug-flags: 0x%02x", rc); + int mdbx_dbg_opts = MDBX_DBG_ASSERT; + if (config.params.loglevel <= logging::trace) + mdbx_dbg_opts |= MDBX_DBG_TRACE; + if (config.params.loglevel <= logging::verbose) + mdbx_dbg_opts |= MDBX_DBG_PRINT; + int rc = mdbx_setup_debug(mdbx_dbg_opts, mdbx_debug_logger, MDBX_DBG_DNT); + log_info("set mdbx debug-opts: 0x%02x", rc); MDB_env *env = nullptr; rc = mdbx_env_create(&env); @@ -97,83 +109,163 @@ void testcase::mdbx_prepare() { if (rc != MDB_SUCCESS) failure_perror("mdbx_env_set_mapsize()", rc); - log_trace("<< mdbx_prepare"); + log_trace("<< db_prepare"); } -void testcase::mdbx_open() { - log_trace(">> mdbx_open"); +void testcase::db_open() { + log_trace(">> db_open"); + + if (!db_guard) + db_prepare(); int rc = mdbx_env_open(db_guard.get(), config.params.pathname_db.c_str(), (unsigned)config.params.mode_flags, 0640); if (rc != MDB_SUCCESS) failure_perror("mdbx_env_open()", rc); - log_trace("<< mdbx_open"); + + log_trace("<< db_open"); } -void testcase::mdbx_close() { - log_trace(">> mdbx_close"); +void testcase::db_close() { + log_trace(">> db_close"); cursor_guard.reset(); txn_guard.reset(); db_guard.reset(); - log_trace("<< mdbx_close"); + log_trace("<< db_close"); +} + +void testcase::txn_begin(bool readonly) { + log_trace(">> txn_begin(%s)", readonly ? "read-only" : "read-write"); + assert(!txn_guard); + + MDB_txn *txn = nullptr; + int rc = + mdbx_txn_begin(db_guard.get(), nullptr, readonly ? 
MDB_RDONLY : 0, &txn); + if (rc != MDB_SUCCESS) + failure_perror("mdbx_txn_begin()", rc); + txn_guard.reset(txn); + + log_trace("<< txn_begin(%s)", readonly ? "read-only" : "read-write"); +} + +void testcase::txn_end(bool abort) { + log_trace(">> txn_end(%s)", abort ? "abort" : "commit"); + assert(txn_guard); + + MDB_txn *txn = txn_guard.release(); + if (abort) { + int rc = mdbx_txn_abort(txn); + if (rc != MDB_SUCCESS) + failure_perror("mdbx_txn_abort()", rc); + } else { + int rc = mdbx_txn_commit(txn); + if (rc != MDB_SUCCESS) + failure_perror("mdbx_txn_commit()", rc); + } + + log_trace("<< txn_end(%s)", abort ? "abort" : "commit"); } bool testcase::wait4start() { if (config.wait4id) { log_trace(">> wait4start(%u)", config.wait4id); + assert(!global::singlemode); int rc = osal_waitfor(config.wait4id); if (rc) { log_trace("<< wait4start(%u), failed %s", config.wait4id, test_strerror(rc)); return false; } - return true; } else { - log_trace("== wait4start(not needed)"); - return true; + log_trace("== skip wait4start: not needed"); } + + if (config.params.delaystart) { + int rc = osal_delay(config.params.delaystart); + if (rc) { + log_trace("<< delay(%u), failed %s", config.params.delaystart, + test_strerror(rc)); + return false; + } + } else { + log_trace("== skip delay: not needed"); + } + + return true; } void testcase::report(size_t nops_done) { - if (config.signal_nops && !signalled && config.signal_nops <= nops_done) { - log_trace(">> signal(n-ops %zu)", nops_done); - osal_broadcast(config.id); + nops_completed += nops_done; + log_verbose("== complete +%zu iteration, total %zu done", nops_done, + nops_completed); + + if (config.signal_nops && !signalled && + config.signal_nops <= nops_completed) { + log_trace(">> signal(n-ops %zu)", nops_completed); + if (!global::singlemode) + osal_broadcast(config.actor_id); signalled = true; - log_trace("<< signal(n-ops %zu)", nops_done); + log_trace("<< signal(n-ops %zu)", nops_completed); } } void testcase::signal() { if 
(!signalled) { log_trace(">> signal(forced)"); - osal_broadcast(config.id); + if (!global::singlemode) + osal_broadcast(config.actor_id); signalled = true; log_trace("<< signal(forced)"); } } bool testcase::setup() { - mdbx_prepare(); - return wait4start(); + db_prepare(); + if (!wait4start()) + return false; + + start_timestamp = chrono::now_motonic(); + return true; } bool testcase::teardown() { log_trace(">> testcase::teardown"); signal(); - mdbx_close(); + db_close(); log_trace("<< testcase::teardown"); return true; } +bool testcase::should_continue() const { + bool result = true; + + if (config.params.test_duration) { + chrono::time since; + since.fixedpoint = + chrono::now_motonic().fixedpoint - start_timestamp.fixedpoint; + if (since.seconds() >= config.params.test_duration) + result = false; + } + + if (config.params.test_nops && nops_completed >= config.params.test_nops) + result = false; + + return result; +} + //----------------------------------------------------------------------------- bool test_execute(const actor_config &config) { const mdbx_pid_t pid = osal_getpid(); - logging::setup((logging::loglevel)config.params.loglevel, - format("child_%u.%u", config.order, config.id)); - log_trace(">> wait4barrier"); - osal_wait4barrier(); - log_trace("<< wait4barrier"); + if (global::singlemode) { + logging::setup(format("single_%s", testcase2str(config.testcase))); + } else { + logging::setup((logging::loglevel)config.params.loglevel, + format("child_%u.%u", config.actor_id, config.space_id)); + log_trace(">> wait4barrier"); + osal_wait4barrier(); + log_trace("<< wait4barrier"); + } try { std::unique_ptr test; @@ -206,7 +298,7 @@ bool test_execute(const actor_config &config) { return true; } } catch (const std::exception &pipets) { - failure("Exception: %s", pipets.what()); + failure("***** Exception: %s *****", pipets.what()); } return false; } diff --git a/test/test.h b/test/test.h index 093b9251..bb7764ba 100644 --- a/test/test.h +++ b/test/test.h @@ 
-24,7 +24,7 @@ bool test_execute(const actor_config &config); std::string thunk_param(const actor_config &config); -bool testcase_setup(const char *casename, const actor_params ¶ms, +void testcase_setup(const char *casename, actor_params ¶ms, unsigned &lastid); void configure_actor(unsigned &lastid, const actor_testcase testcase, const char *id_cstr, const actor_params ¶ms); @@ -36,12 +36,16 @@ extern std::vector actors; extern std::unordered_map events; extern std::unordered_map pid2actor; extern std::set databases; +extern unsigned nactors; +extern chrono::time start_motonic; +extern chrono::time deadline_motonic; +extern bool singlemode; namespace config { -extern unsigned timeout; +extern unsigned timeout_duration_seconds; extern bool dump_config; -extern bool dont_cleanup_before; -extern bool dont_cleanup_after; +extern bool cleanup_before; +extern bool cleanup_after; } /* namespace config */ } /* namespace global */ @@ -80,19 +84,28 @@ protected: scoped_cursor_guard cursor_guard; bool signalled; - void mdbx_prepare(); - void mdbx_open(); - void mdbx_close(); + size_t nops_completed; + chrono::time start_timestamp; + + void db_prepare(); + void db_open(); + void db_close(); + void txn_begin(bool readonly); + void txn_end(bool abort); bool wait4start(); void report(size_t nops_done); void signal(); + bool should_continue() const; + + bool mode_readonly() const { + return (config.params.mode_flags & MDB_RDONLY) ? true : false; + } public: testcase(const actor_config &config, const mdbx_pid_t pid) - : config(config), pid(pid) { - logging::setup(format("%s_%u.%u", testcase2str(config.testcase), - config.order, config.id)); + : config(config), pid(pid), nops_completed(0) { + start_timestamp.reset(); } virtual bool setup(); From 585496339a185d3c07c6fd8645351e7fcfb51fa2 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 18:57:34 +0300 Subject: [PATCH 057/303] mdbx: kill/remove mm_txnid. 
--- src/bits.h | 18 ++-------- src/lck-posix.c | 9 ----- src/mdbx.c | 93 ++++++++++++------------------------------------- 3 files changed, 24 insertions(+), 96 deletions(-) diff --git a/src/bits.h b/src/bits.h index c57fa8c3..7b89604d 100644 --- a/src/bits.h +++ b/src/bits.h @@ -361,10 +361,6 @@ typedef struct MDBX_lockinfo { /* Format of this lock file. Must be set to MDB_LOCK_FORMAT. */ uint64_t mti_format; - /* The ID of the last transaction committed to the database. - * This is recorded here only for convenience; the value can always - * be determined by reading the main database meta pages. */ - volatile txnid_t mti_txnid; #ifdef MDBX_OSAL_LOCK MDBX_OSAL_LOCK mti_wmutex; #endif @@ -758,21 +754,11 @@ int mdbx_reader_check0(MDB_env *env, int rlocked, int *dead); #define METAPAGE_2(env) \ (&((MDB_metabuf *)((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) -static __inline MDB_meta *mdbx_meta_head_w(MDB_env *env) { +static __inline MDB_meta *mdbx_meta_head(MDB_env *env) { MDB_meta *a = METAPAGE_1(env); MDB_meta *b = METAPAGE_2(env); - txnid_t head_txnid = env->me_txns->mti_txnid; - mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); - if (a->mm_txnid == head_txnid) - return a; - if (likely(b->mm_txnid == head_txnid)) - return b; - - mdbx_debug("me_txns->mti_txnid not match meta-pages"); - mdbx_assert(env, head_txnid == a->mm_txnid || head_txnid == b->mm_txnid); - env->me_flags |= MDB_FATAL_ERROR; - return a; + return (a->mm_txnid > b->mm_txnid) ? a : b; } void mdbx_rthc_dtor(void *rthc); diff --git a/src/lck-posix.c b/src/lck-posix.c index 426fcb62..004d700b 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -214,15 +214,6 @@ static int __cold mdbx_mutex_failed(MDB_env *env, mdbx_mutex_t *mutex, int rc) { rc = MDBX_RESULT_TRUE; rlocked = (mutex == &env->me_txns->mti_rmutex); if (!rlocked) { - /* Keep mtb.mti_txnid updated, otherwise next writer can - * overwrite data which latest meta page refers to. 
- * - * LY: Hm, how this can happen, if the mtb.mti_txnid - * is updating only at the finish of a successful commit ? - */ - MDB_meta *meta = mdbx_meta_head_w(env); - assert(env->me_txns->mti_txnid == meta->mm_txnid); - (void)meta; /* env is hosed if the dead thread was ours */ if (env->me_txn) { env->me_flags |= MDB_FATAL_ERROR; diff --git a/src/mdbx.c b/src/mdbx.c index f7e49adb..1ec343df 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1387,52 +1387,12 @@ static __inline uint64_t mdbx_meta_sign(MDB_meta *meta) { return (sign > MDB_DATASIGN_WEAK) ? sign : ~sign; } -static MDB_meta *mdbx_meta_head_r(MDB_env *env) { - MDB_meta *a = METAPAGE_1(env); - MDB_meta *b = METAPAGE_2(env), *h; - -#ifdef __SANITIZE_THREAD__ - mdbx_mutex_lock(&tsan_mutex); -#endif - - txnid_t head_txnid = env->me_txns->mti_txnid; - mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); - if (likely(a->mm_txnid == head_txnid)) { - h = a; - } else if (likely(b->mm_txnid == head_txnid)) { - h = b; - } else { - /* LY: seems got a collision with mdbx_env_sync0() */ - mdbx_coherent_barrier(); - head_txnid = env->me_txns->mti_txnid; - mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); - - if (likely(a->mm_txnid == head_txnid)) { - h = a; - } else if (likely(b->mm_txnid == head_txnid)) { - h = b; - } else { - /* LY: got a race again, or DB is corrupted */ - int rc = mdbx_txn_lock(env); - h = mdbx_meta_head_w(env); - if (rc == MDB_SUCCESS) - mdbx_txn_unlock(env); - } - } - -#ifdef __SANITIZE_THREAD__ - mdbx_mutex_unlock(&tsan_mutex); -#endif - - return h; -} - static __inline MDB_meta *mdbx_env_meta_flipflop(const MDB_env *env, MDB_meta *meta) { return (meta == METAPAGE_1(env)) ? METAPAGE_2(env) : METAPAGE_1(env); } -static __inline int mdbx_meta_lt(MDB_meta *a, MDB_meta *b) { +static __inline int mdbx_meta_lt(const MDB_meta *a, const MDB_meta *b) { return (META_IS_STEADY(a) == META_IS_STEADY(b)) ? 
a->mm_txnid < b->mm_txnid : META_IS_STEADY(b); } @@ -1442,17 +1402,12 @@ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { #ifdef __SANITIZE_THREAD__ mdbx_mutex_lock(&tsan_mutex); #endif + const MDB_meta *const a = METAPAGE_1(env); + const MDB_meta *const b = METAPAGE_2(env); + txnid_t oldest = mdbx_meta_lt(a, b) ? b->mm_txnid : a->mm_txnid; + int i, reader; - MDB_reader *r = env->me_txns->mti_readers; - txnid_t oldest = env->me_txns->mti_txnid; - - MDB_meta *a = METAPAGE_1(env); - MDB_meta *b = METAPAGE_2(env); - if (META_IS_WEAK(a) && oldest > b->mm_txnid) - oldest = b->mm_txnid; - if (META_IS_WEAK(b) && oldest > a->mm_txnid) - oldest = a->mm_txnid; - + const MDB_reader *const r = env->me_txns->mti_readers; for (reader = -1, i = env->me_txns->mti_numreaders; --i >= 0;) { if (r[i].mr_pid) { txnid_t snap = r[i].mr_txnid; @@ -1738,7 +1693,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { if ((flags & MDBX_ALLOC_GC) && ((flags & MDBX_ALLOC_KICK) || rc == MDB_MAP_FULL)) { - MDB_meta *head = mdbx_meta_head_w(env); + MDB_meta *head = mdbx_meta_head(env); MDB_meta *tail = mdbx_env_meta_flipflop(env, head); if (oldest == tail->mm_txnid && META_IS_WEAK(head) && @@ -1754,10 +1709,9 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { * don't make a steady-sync, but only a legacy-mode checkpoint, * just for resume reclaiming only, not for data consistency. */ - mdbx_debug("kick-gc: head %zu/%c, tail %zu/%c, oldest %zu, txnid %zu", + mdbx_debug("kick-gc: head %zu/%c, tail %zu/%c, oldest %zu", head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N', - tail->mm_txnid, META_IS_WEAK(tail) ? 'W' : 'N', oldest, - env->me_txns->mti_txnid); + tail->mm_txnid, META_IS_WEAK(tail) ? 
'W' : 'N', oldest); int me_flags = env->me_flags & MDB_WRITEMAP; if ((env->me_flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC) @@ -2025,7 +1979,7 @@ int mdbx_env_sync(MDB_env *env, int force) { if (unlikely(flags & (MDB_RDONLY | MDB_FATAL_ERROR))) return EACCES; - head = mdbx_meta_head_r(env); + head = mdbx_meta_head(env); if (!META_IS_WEAK(head) && env->me_sync_pending == 0 && env->me_mapsize == head->mm_mapsize) /* LY: nothing to do */ @@ -2054,7 +2008,7 @@ int mdbx_env_sync(MDB_env *env, int force) { return rc; /* LY: head may be changed while the mutex has been acquired. */ - head = mdbx_meta_head_w(env); + head = mdbx_meta_head(env); rc = MDB_SUCCESS; if (META_IS_WEAK(head) || env->me_sync_pending != 0 || env->me_mapsize != head->mm_mapsize) { @@ -2236,12 +2190,12 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { } while ((env->me_flags & MDB_FATAL_ERROR) == 0) { - MDB_meta *meta = mdbx_meta_head_r(txn->mt_env); + MDB_meta *const meta = mdbx_meta_head(txn->mt_env); txnid_t lead = meta->mm_txnid; r->mr_txnid = lead; mdbx_coherent_barrier(); - txnid_t snap = txn->mt_env->me_txns->mti_txnid; + txnid_t snap = mdbx_meta_head(txn->mt_env)->mm_txnid; /* LY: Retry on a race, ITS#7970. 
*/ if (likely(lead == snap)) { txn->mt_txnid = lead; @@ -2264,7 +2218,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { #ifdef __SANITIZE_THREAD__ mdbx_mutex_lock(&tsan_mutex); #endif - MDB_meta *meta = mdbx_meta_head_w(env); + MDB_meta *meta = mdbx_meta_head(env); txn->mt_canary = meta->mm_canary; txn->mt_txnid = meta->mm_txnid + 1; txn->mt_flags = flags; @@ -3429,7 +3383,7 @@ static int __cold mdbx_env_init_meta(MDB_env *env, MDB_meta *meta) { static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { int rc; - MDB_meta *head = mdbx_meta_head_w(env); + MDB_meta *head = mdbx_meta_head(env); size_t prev_mapsize = head->mm_mapsize; size_t used_size = env->me_psize * (pending->mm_last_pg + 1); @@ -3557,7 +3511,6 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { * readers will get consistent data regardless of how fresh or * how stale their view of these values is. */ - env->me_txns->mti_txnid = pending->mm_txnid; #ifdef __SANITIZE_THREAD__ mdbx_mutex_unlock(&tsan_mutex); #endif @@ -3742,7 +3695,7 @@ int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { return EINVAL; /* FIXME: lock/unlock */ - meta = mdbx_meta_head_w(env); + meta = mdbx_meta_head(env); if (!size) size = meta->mm_mapsize; /* Silently round up to minimum if the size is too small */ @@ -3953,7 +3906,6 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_format = MDB_LOCK_FORMAT; - env->me_txns->mti_txnid = ~(txnid_t)0; } else { if (env->me_txns->mti_magic != MDB_MAGIC) { mdbx_debug("lock region has invalid magic"); @@ -4079,7 +4031,6 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, if (rc == MDB_SUCCESS) { mdbx_debug("opened dbenv %p", (void *)env); if (excl > 0) { - env->me_txns->mti_txnid = meta.mm_txnid; if (exclusive == NULL || *exclusive < 2) { /* LY: downgrade lock only if exclusive access not requested. 
* in case exclusive==1, just leave value as is. */ @@ -4116,7 +4067,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, #if MDB_DEBUG if (rc == MDB_SUCCESS) { - MDB_meta *meta = mdbx_meta_head_r(env); + MDB_meta *meta = mdbx_meta_head(env); MDB_db *db = &meta->mm_dbs[MAIN_DBI]; int toggle = ((char *)meta == PAGEDATA(env->me_map)) ? 0 : 1; @@ -8673,7 +8624,7 @@ int __cold mdbx_env_stat(MDB_env *env, MDBX_stat *arg, size_t bytes) { if (unlikely(bytes != sizeof(MDBX_stat))) return EINVAL; - meta = mdbx_meta_head_r(env); + meta = mdbx_meta_head(env); return mdbx_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); } @@ -8694,14 +8645,14 @@ int __cold mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) { m2 = METAPAGE_2(env); do { - meta = mdbx_meta_head_r(env); + meta = mdbx_meta_head(env); arg->me_last_txnid = meta->mm_txnid; arg->me_last_pgno = meta->mm_last_pg; arg->me_meta1_txnid = m1->mm_txnid; arg->me_meta1_sign = m1->mm_datasync_sign; arg->me_meta2_txnid = m2->mm_txnid; arg->me_meta2_sign = m2->mm_datasync_sign; - } while (unlikely(arg->me_last_txnid != env->me_txns->mti_txnid || + } while (unlikely(arg->me_last_txnid != mdbx_meta_head(env)->mm_txnid || arg->me_meta1_sign != m1->mm_datasync_sign || arg->me_meta2_sign != m2->mm_datasync_sign)); @@ -9606,7 +9557,7 @@ static txnid_t __cold mdbx_oomkick(MDB_env *env, txnid_t oldest) { continue; rc = env->me_oom_func(env, pid, tid, oldest, - mdbx_meta_head_w(env)->mm_txnid - oldest, retry); + mdbx_meta_head(env)->mm_txnid - oldest, retry); if (rc < 0) break; @@ -9669,7 +9620,7 @@ int mdbx_txn_straggler(MDB_txn *txn, int *percent) return -1; env = txn->mt_env; - meta = mdbx_meta_head_r(env); + meta = mdbx_meta_head(env); if (percent) { size_t maxpg = env->me_maxpg; size_t last = meta->mm_last_pg + 1; From ebb404e5bc247a619fb1b9a818ff627c27090883 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 18:59:32 +0300 Subject: [PATCH 058/303] mdbx: remove tsan_mutex. 
--- src/mdbx.c | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 1ec343df..655c0534 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -694,10 +694,6 @@ static MDB_cmp_func mdbx_cmp_memn, mdbx_cmp_memnr, mdbx_cmp_int_ai, mdbx_cmp_int_a2, mdbx_cmp_int_ua; /** @endcond */ -#ifdef __SANITIZE_THREAD__ -static mdbx_mutex_t tsan_mutex = mdbx_mutex_initIALIZER; -#endif - /** Return the library version info. */ const char *mdbx_version(int *major, int *minor, int *patch) { if (major) @@ -1399,9 +1395,6 @@ static __inline int mdbx_meta_lt(const MDB_meta *a, const MDB_meta *b) { /** Find oldest txnid still referenced. */ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { -#ifdef __SANITIZE_THREAD__ - mdbx_mutex_lock(&tsan_mutex); -#endif const MDB_meta *const a = METAPAGE_1(env); const MDB_meta *const b = METAPAGE_2(env); txnid_t oldest = mdbx_meta_lt(a, b) ? b->mm_txnid : a->mm_txnid; @@ -1417,9 +1410,6 @@ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { } } } -#ifdef __SANITIZE_THREAD__ - mdbx_mutex_unlock(&tsan_mutex); -#endif if (laggard) *laggard = reader; @@ -2172,17 +2162,11 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { r->mr_txnid = ~(txnid_t)0; r->mr_tid = tid; mdbx_coherent_barrier(); -#ifdef __SANITIZE_THREAD__ - mdbx_mutex_lock(&tsan_mutex); -#endif if (i == nr) env->me_txns->mti_numreaders = ++nr; if (env->me_close_readers < nr) env->me_close_readers = nr; r->mr_pid = pid; -#ifdef __SANITIZE_THREAD__ - mdbx_mutex_unlock(&tsan_mutex); -#endif mdbx_rdt_unlock(env); if (likely(env->me_flags & MDB_ENV_TXKEY)) @@ -2215,16 +2199,10 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { if (unlikely(rc)) return rc; -#ifdef __SANITIZE_THREAD__ - mdbx_mutex_lock(&tsan_mutex); -#endif MDB_meta *meta = mdbx_meta_head(env); txn->mt_canary = meta->mm_canary; txn->mt_txnid = meta->mm_txnid + 1; txn->mt_flags = flags; -#ifdef __SANITIZE_THREAD__ - 
mdbx_mutex_unlock(&tsan_mutex); -#endif #if MDB_DEBUG if (unlikely(txn->mt_txnid == mdbx_debug_edge)) { @@ -2474,18 +2452,12 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { if (txn->mt_u.reader) { -#ifdef __SANITIZE_THREAD__ - mdbx_mutex_lock(&tsan_mutex); -#endif txn->mt_u.reader->mr_txnid = ~(txnid_t)0; if (mode & MDB_END_SLOT) { if ((env->me_flags & MDB_ENV_TXKEY) == 0) txn->mt_u.reader->mr_pid = 0; txn->mt_u.reader = NULL; } -#ifdef __SANITIZE_THREAD__ - mdbx_mutex_unlock(&tsan_mutex); -#endif } mdbx_coherent_barrier(); txn->mt_numdbs = 0; /* prevent further DBI activity */ @@ -3471,9 +3443,6 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { : "Legacy"); if (env->me_flags & MDB_WRITEMAP) { -#ifdef __SANITIZE_THREAD__ - mdbx_mutex_lock(&tsan_mutex); -#endif /* LY: 'invalidate' the meta, * but mdbx_meta_head_r() will be confused/retired in collision case. */ target->mm_datasync_sign = MDB_DATASIGN_WEAK; @@ -3500,9 +3469,6 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { goto fail; } mdbx_invalidate_cache(env->me_map + offset, sizeof(MDB_meta)); -#ifdef __SANITIZE_THREAD__ - mdbx_mutex_lock(&tsan_mutex); -#endif } /* Memory ordering issues are irrelevant; since the entire writer @@ -3511,9 +3477,6 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { * readers will get consistent data regardless of how fresh or * how stale their view of these values is. */ -#ifdef __SANITIZE_THREAD__ - mdbx_mutex_unlock(&tsan_mutex); -#endif /* LY: step#3 - sync meta-pages. */ if ((flags & (MDB_NOSYNC | MDB_NOMETASYNC)) == 0) { From 36f1491ba80a7991e935510b9b143ce66db37065 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 19:00:07 +0300 Subject: [PATCH 059/303] mdbx: cleanup mdbx_mutex_failed(). 
--- src/lck-posix.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index 004d700b..b5368c2d 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -208,11 +208,9 @@ static int mdbx_lck_op(mdbx_filehandle_t fd, int op, int lck, off_t offset) { static int __cold mdbx_mutex_failed(MDB_env *env, mdbx_mutex_t *mutex, int rc) { #if MDB_USE_ROBUST if (unlikely(rc == EOWNERDEAD)) { - int rlocked, rc2; - /* We own the mutex. Clean up after dead previous owner. */ - rc = MDBX_RESULT_TRUE; - rlocked = (mutex == &env->me_txns->mti_rmutex); + rc = MDB_SUCCESS; + int rlocked = (mutex == &env->me_txns->mti_rmutex); if (!rlocked) { /* env is hosed if the dead thread was ours */ if (env->me_txn) { @@ -223,7 +221,7 @@ static int __cold mdbx_mutex_failed(MDB_env *env, mdbx_mutex_t *mutex, int rc) { } mdbx_debug("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), (rc ? "this process' env is hosed" : "recovering")); - rc2 = mdbx_reader_check0(env, rlocked, NULL); + int rc2 = mdbx_reader_check0(env, rlocked, NULL); if (rc2 == 0) #if __GLIBC_PREREQ(2, 12) rc2 = pthread_mutex_consistent(mutex); From c1ad75d8107d9ff4e5e89df8058c65161d43a188 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 19:00:33 +0300 Subject: [PATCH 060/303] mdbx: check env-flags while opening live db. --- src/bits.h | 2 ++ src/mdbx.c | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/bits.h b/src/bits.h index 7b89604d..ca6dc06c 100644 --- a/src/bits.h +++ b/src/bits.h @@ -360,6 +360,8 @@ typedef struct MDBX_lockinfo { uint64_t mti_magic; /* Format of this lock file. Must be set to MDB_LOCK_FORMAT. */ uint64_t mti_format; + /* Flags which environment was opened. 
*/ + uint64_t mti_envmode; #ifdef MDBX_OSAL_LOCK MDBX_OSAL_LOCK mti_wmutex; diff --git a/src/mdbx.c b/src/mdbx.c index 655c0534..fc7db92b 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3994,6 +3994,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, if (rc == MDB_SUCCESS) { mdbx_debug("opened dbenv %p", (void *)env); if (excl > 0) { + env->me_txns->mti_envmode = env->me_flags; if (exclusive == NULL || *exclusive < 2) { /* LY: downgrade lock only if exclusive access not requested. * in case exclusive==1, just leave value as is. */ @@ -4002,9 +4003,17 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, goto bailout; excl = 0; } - } else if (exclusive) { - /* LY: just indicate that is not an exclusive access. */ - *exclusive = 0; + } else { + if (exclusive) { + /* LY: just indicate that is not an exclusive access. */ + *exclusive = 0; + } + if ((env->me_txns->mti_envmode ^ env->me_flags) & + (MDB_WRITEMAP | MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC)) { + /* LY: Current mode/flags incompatible with requested. */ + rc = MDB_INCOMPATIBLE; + goto bailout; + } } if (!(flags & MDB_RDONLY)) { MDB_txn *txn; From 41c51fdac245ea6d6f9dce60434b4878b946886f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 16:00:43 +0300 Subject: [PATCH 061/303] mdbx: refine debug macros/levels (initial). --- src/bits.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/src/bits.h b/src/bits.h index ca6dc06c..4df1280b 100644 --- a/src/bits.h +++ b/src/bits.h @@ -700,6 +700,59 @@ void mdbx_panic(const char *fmt, ...) #define mdbx_print(fmt, ...) \ mdbx_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) +/*****************************************/ + +#define mdbx_trace(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ + mdbx_debug_log(MDBX_DBG_TRACE, __FUNCTION__, __LINE__, fmt "\n", \ + ##__VA_ARGS__); \ + } while (0) + +#define mdbx_verbose(fmt, ...) 
\ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ + mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ + fmt "\n", ##__VA_ARGS__); \ + } while (0) + +#define mdbx_info(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ + mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ + fmt "\n", ##__VA_ARGS__); \ + } while (0) + +#define mdbx_notice(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ + mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ + fmt "\n", ##__VA_ARGS__); \ + } while (0) + +#define mdbx_warning(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ + mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ + fmt "\n", ##__VA_ARGS__); \ + } while (0) + +#define mdbx_error(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ + mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ + fmt "\n", ##__VA_ARGS__); \ + } while (0) + +#define mdbx_fatal(fmt, ...) \ + do { \ + if (mdbx_debug_enabled(MDBX_DBG_TRACE /* FIXME */)) \ + mdbx_debug_log(MDBX_DBG_TRACE /* FIXME */, __FUNCTION__, __LINE__, \ + fmt "\n", ##__VA_ARGS__); \ + } while (0) + +/*****************************************/ + #define mdbx_debug(fmt, ...) \ do { \ if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ From 55226499a80c219f27eacd4ff8ffea1a95e481ee Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 16:02:27 +0300 Subject: [PATCH 062/303] mdbx: rework reader_check0() and mutex recovery. 
--- src/bits.h | 3 +++ src/lck-posix.c | 55 +++++++++++++++++++++++++++---------------------- src/mdbx.c | 49 +++++++++++++++++++++---------------------- 3 files changed, 57 insertions(+), 50 deletions(-) diff --git a/src/bits.h b/src/bits.h index 4df1280b..41245e53 100644 --- a/src/bits.h +++ b/src/bits.h @@ -829,3 +829,6 @@ static __inline size_t roundup2(size_t value, size_t granularity) { assert(is_power2(granularity)); return (value + granularity - 1) & ~(granularity - 1); } + +#define MDBX_IS_ERROR(rc) \ + ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) diff --git a/src/lck-posix.c b/src/lck-posix.c index b5368c2d..3c2c4f61 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -121,17 +121,18 @@ int mdbx_rdt_lock(MDB_env *env) { void mdbx_rdt_unlock(MDB_env *env) { int rc = mdbx_robust_unlock(env, &env->me_txns->mti_rmutex); - if (unlikely(rc != 0)) + if (unlikely(MDBX_IS_ERROR(rc))) mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); } int mdbx_txn_lock(MDB_env *env) { - return mdbx_robust_lock(env, &env->me_txns->mti_wmutex); + int rc = mdbx_robust_lock(env, &env->me_txns->mti_wmutex); + return MDBX_IS_ERROR(rc) ? rc : MDB_SUCCESS; } void mdbx_txn_unlock(MDB_env *env) { int rc = mdbx_robust_unlock(env, &env->me_txns->mti_wmutex); - if (unlikely(rc != 0)) + if (unlikely(MDBX_IS_ERROR(rc))) mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); } @@ -205,42 +206,46 @@ static int mdbx_lck_op(mdbx_filehandle_t fd, int op, int lck, off_t offset) { } } +#if !__GLIBC_PREREQ(2, 12) && !defined(pthread_mutex_consistent) +#define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) +#endif + static int __cold mdbx_mutex_failed(MDB_env *env, mdbx_mutex_t *mutex, int rc) { #if MDB_USE_ROBUST - if (unlikely(rc == EOWNERDEAD)) { + if (rc == EOWNERDEAD) { /* We own the mutex. Clean up after dead previous owner. 
*/ - rc = MDB_SUCCESS; + int rlocked = (mutex == &env->me_txns->mti_rmutex); + rc = MDB_SUCCESS; if (!rlocked) { - /* env is hosed if the dead thread was ours */ - if (env->me_txn) { + if (unlikely(env->me_txn)) { + /* env is hosed if the dead thread was ours */ env->me_flags |= MDB_FATAL_ERROR; env->me_txn = NULL; rc = MDB_PANIC; } } - mdbx_debug("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), - (rc ? "this process' env is hosed" : "recovering")); - int rc2 = mdbx_reader_check0(env, rlocked, NULL); - if (rc2 == 0) -#if __GLIBC_PREREQ(2, 12) - rc2 = pthread_mutex_consistent(mutex); -#else - rc2 = pthread_mutex_consistent_np(mutex); -#endif - if (rc || (rc = rc2)) { - mdbx_debug("mutex recovery failed, %s", mdbx_strerror(rc)); + mdbx_notice("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? "this process' env is hosed" : "recovering")); + + int check_rc = mdbx_reader_check0(env, rlocked, NULL); + int mreco_rc = pthread_mutex_consistent(mutex); + check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; + + if (unlikely(mreco_rc)) + mdbx_error("mutex recovery failed, %s", mdbx_strerror(mreco_rc)); + + rc = (rc == MDB_SUCCESS) ? 
check_rc : rc; + if (MDBX_IS_ERROR(rc)) pthread_mutex_unlock(mutex); - } + return rc; } #endif /* MDB_USE_ROBUST */ - if (unlikely(rc)) { - mdbx_debug("lock mutex failed, %s", mdbx_strerror(rc)); - if (rc != EDEADLK) { - env->me_flags |= MDB_FATAL_ERROR; - rc = MDB_PANIC; - } + mdbx_error("lock mutex failed, %s", mdbx_strerror(rc)); + if (rc != EDEADLK) { + env->me_flags |= MDB_FATAL_ERROR; + rc = MDB_PANIC; } return rc; } diff --git a/src/mdbx.c b/src/mdbx.c index fc7db92b..a1202237 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2129,8 +2129,9 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { mdbx_tid_t tid = mdbx_thread_self(); rc = mdbx_rdt_lock(env); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(MDBX_IS_ERROR(rc))) return rc; + rc = MDB_SUCCESS; if (unlikely(env->me_live_reader != pid)) { rc = mdbx_rpid_set(env); @@ -9129,11 +9130,10 @@ int __cold mdbx_reader_check0(MDB_env *env, int rdt_locked, int *dead) { mdbx_pid_t *pids = alloca((snap_nreaders + 1) * sizeof(mdbx_pid_t)); pids[0] = 0; - unsigned i; int rc = MDBX_RESULT_FALSE, count = 0; MDB_reader *mr = env->me_txns->mti_readers; - for (i = 0; i < snap_nreaders; i++) { + for (unsigned i = 0; i < snap_nreaders; i++) { const mdbx_pid_t pid = mr[i].mr_pid; if (pid == 0) continue; @@ -9151,33 +9151,32 @@ int __cold mdbx_reader_check0(MDB_env *env, int rdt_locked, int *dead) { /* stale reader found */ if (!rdt_locked) { - rdt_locked = -1; rc = mdbx_rdt_lock(env); - if (rc != MDB_SUCCESS) { - if (rc != MDBX_RESULT_TRUE) - break; /* lock failed */ - /* recovered after mutex owner died */ - snap_nreaders = 0; /* the above checked all readers */ - } else { - /* a other process may have clean and reused slot, recheck */ - if (mr[i].mr_pid != pid) - continue; - rc = mdbx_rpid_check(env, pid); - if (rc != MDBX_RESULT_FALSE) { - if (rc != MDBX_RESULT_TRUE) - break; /* mdbx_rpid_check() failed */ - /* the race with other process, slot reused */ - rc = MDBX_RESULT_FALSE; - continue; - } + if 
(MDBX_IS_ERROR(rc)) + break; + + rdt_locked = -1; + if (rc == MDBX_RESULT_TRUE) + /* the above checked all readers */ + break; + + /* a other process may have clean and reused slot, recheck */ + if (mr[i].mr_pid != pid) + continue; + + rc = mdbx_rpid_check(env, pid); + if (MDBX_IS_ERROR(rc)) + break; + + if (rc != MDBX_RESULT_FALSE) { + /* the race with other process, slot reused */ + rc = MDBX_RESULT_FALSE; + continue; } } - assert(mr[i].mr_pid == pid); - /* clean it */ - unsigned j; - for (j = i; j < snap_nreaders; j++) { + for (unsigned j = i; j < snap_nreaders; j++) { if (mr[j].mr_pid == pid) { mdbx_debug("clear stale reader pid %u txn %zd", (unsigned)pid, mr[j].mr_txnid); From 8bed6a5c8978ff85a13f76040ee9c6ec8189ae8f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 18:19:21 +0300 Subject: [PATCH 063/303] mdbx: fix mdbx_rthc_remove(). --- src/lck-posix.c | 4 ++++ src/mdbx.c | 25 ++++++++++++++++--------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index 3c2c4f61..c2a36c54 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -42,6 +42,10 @@ void mdbx_rthc_unlock(void) { mdbx_ensure(NULL, pthread_mutex_unlock(&mdbx_rthc_mutex) == 0); } +void __attribute__((destructor)) mdbx_global_destructor(void) { + mdbx_rthc_cleanup(); +} + /*----------------------------------------------------------------------------*/ /* lck */ diff --git a/src/mdbx.c b/src/mdbx.c index a1202237..d5af402b 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -61,12 +61,12 @@ static rthc_entry_t *rthc_table = rthc_table_static; __cold void mdbx_rthc_dtor(void *ptr) { MDB_reader *rthc = (MDB_reader *)ptr; - unsigned i; mdbx_rthc_lock(); - for (i = 0; i < rthc_count; ++i) { + const mdbx_pid_t self_pid = mdbx_getpid(); + for (unsigned i = 0; i < rthc_count; ++i) { if (rthc >= rthc_table[i].begin && rthc < rthc_table[i].end) { - if (rthc->mr_pid && rthc->mr_pid == mdbx_getpid()) { + if (rthc->mr_pid == self_pid) { rthc->mr_pid = 0; 
mdbx_coherent_barrier(); } @@ -77,15 +77,14 @@ __cold void mdbx_rthc_dtor(void *ptr) { } __cold void mdbx_rthc_cleanup(void) { - unsigned i; - mdbx_rthc_lock(); - for (i = 0; i < rthc_count; ++i) { + const mdbx_pid_t self_pid = mdbx_getpid(); + for (unsigned i = 0; i < rthc_count; ++i) { mdbx_thread_key_t key = rthc_table[i].key; MDB_reader *rthc = mdbx_thread_rthc_get(key); if (rthc) { mdbx_thread_rthc_set(key, NULL); - if (rthc->mr_pid && rthc->mr_pid == mdbx_getpid()) { + if (rthc->mr_pid == self_pid) { rthc->mr_pid = 0; mdbx_coherent_barrier(); } @@ -96,6 +95,9 @@ __cold void mdbx_rthc_cleanup(void) { __cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDB_reader *begin, MDB_reader *end) { +#ifndef NDEBUG + *key = (mdbx_thread_key_t)0xBADBADBAD; +#endif /* NDEBUG */ int rc = mdbx_thread_key_create(key); if (rc != MDB_SUCCESS) return rc; @@ -132,9 +134,14 @@ __cold void mdbx_rthc_remove(mdbx_thread_key_t key) { mdbx_rthc_lock(); mdbx_thread_key_delete(key); - unsigned i; - for (i = 0; i < rthc_count; ++i) { + for (unsigned i = 0; i < rthc_count; ++i) { if (key == rthc_table[i].key) { + const mdbx_pid_t self_pid = mdbx_getpid(); + for (MDB_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; + ++rthc) + if (rthc->mr_pid == self_pid) + rthc->mr_pid = 0; + mdbx_coherent_barrier(); if (--rthc_count > 0) rthc_table[i] = rthc_table[rthc_count]; else if (rthc_table != rthc_table_static) { From cd37b81cc51923eadb8e6c7d062808ca85559f08 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Apr 2017 18:19:56 +0300 Subject: [PATCH 064/303] mdbx: refine mdbx_txn_renew0(). 
--- src/mdbx.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index d5af402b..6ba16361 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -635,11 +635,11 @@ enum { MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD }; -#define MDB_END_OPMASK 0x0F /**< mask for #mdbx_txn_end() operation number */ -#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ -#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ -#define MDB_END_EOTDONE 0x40 /**< txn's cursors already closed */ -#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ +#define MDB_END_OPMASK 0x0F /**< mask for #mdbx_txn_end() operation number */ +#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ +#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ +#define MDB_END_EOTDONE 0x40 /**< txn's cursors already closed */ +#define MDB_END_SLOT 0x80 /**< release any reader slot if #MDB_NOTLS */ static int mdbx_txn_end(MDB_txn *txn, unsigned mode); static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); @@ -2100,10 +2100,7 @@ static void mdbx_cursors_eot(MDB_txn *txn, unsigned merge) { } } -/** Common code for #mdbx_txn_begin() and #mdbx_txn_renew(). - * @param[in] txn the transaction handle to initialize - * @return 0 on success, non-zero on failure. - */ +/* Common code for #mdbx_txn_begin() and #mdbx_txn_renew(). 
*/ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { MDB_env *env = txn->mt_env; unsigned i, nr; @@ -2149,17 +2146,22 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { env->me_live_reader = pid; } - retry: - nr = env->me_txns->mti_numreaders; - for (i = 0; i < nr; i++) - if (env->me_txns->mti_readers[i].mr_pid == 0) + for (;;) { + nr = env->me_txns->mti_numreaders; + for (i = 0; i < nr; i++) + if (env->me_txns->mti_readers[i].mr_pid == 0) + break; + + if (likely(i < env->me_maxreaders)) break; - if (unlikely(i == env->me_maxreaders)) { - if (mdbx_reader_check0(env, 1, NULL)) - goto retry; - mdbx_rdt_unlock(env); - return MDB_READERS_FULL; + + rc = mdbx_reader_check0(env, 1, NULL); + if (rc != MDBX_RESULT_TRUE) { + mdbx_rdt_unlock(env); + return (rc == MDB_SUCCESS) ? MDB_READERS_FULL : rc; + } } + r = &env->me_txns->mti_readers[i]; /* Claim the reader slot, carefully since other code * uses the reader table un-mutexed: First reset the @@ -2259,6 +2261,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { } else { return MDB_SUCCESS; } + assert(rc != MDB_SUCCESS); mdbx_txn_end(txn, MDB_END_SLOT | MDB_END_FAIL_BEGIN); return rc; } @@ -2383,6 +2386,7 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, renew: rc = mdbx_txn_renew0(txn, flags); } + if (unlikely(rc)) { if (txn != env->me_txn0) free(txn); @@ -4077,15 +4081,13 @@ int __cold mdbx_env_open(MDB_env *env, const char *path, unsigned flags, /** Destroy resources from mdbx_env_open(), clear our readers & DBIs */ static void __cold mdbx_env_close0(MDB_env *env) { - int i; - if (!(env->me_flags & MDB_ENV_ACTIVE)) return; env->me_flags &= ~MDB_ENV_ACTIVE; /* Doing this here since me_dbxs may not exist during mdbx_env_close */ if (env->me_dbxs) { - for (i = env->me_maxdbs; --i >= CORE_DBS;) + for (unsigned i = env->me_maxdbs; --i >= CORE_DBS;) free(env->me_dbxs[i].md_name.mv_data); free(env->me_dbxs); } From c4846c81415424777b4fb22b862520cb594e3f8b Mon Sep 17 00:00:00 
2001 From: Leo Yuriev Date: Sun, 23 Apr 2017 12:54:37 +0300 Subject: [PATCH 065/303] test: failfast option. Change-Id: I42d1bdb9c20c9b96cfa41304bd025b09fab20518 --- test/config.cc | 2 ++ test/main.cc | 12 ++++++++++-- test/test.h | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/test/config.cc b/test/config.cc index 92e078b2..743e022a 100644 --- a/test/config.cc +++ b/test/config.cc @@ -325,6 +325,8 @@ void dump(const char *title) { log_info("cleanup: before %s, after %s\n", global::config::cleanup_before ? "Yes" : "No", global::config::cleanup_after ? "Yes" : "No"); + + log_info("failfast: %s\n", global::config::failfast ? "Yes" : "No"); } } /* namespace config */ diff --git a/test/main.cc b/test/main.cc index 385b5050..5ad2cb20 100644 --- a/test/main.cc +++ b/test/main.cc @@ -68,6 +68,7 @@ void actor_params::set_defaults(void) { global::config::dump_config = true; global::config::cleanup_before = true; global::config::cleanup_after = true; + global::config::failfast = true; } namespace global { @@ -86,6 +87,7 @@ unsigned timeout_duration_seconds; bool dump_config; bool cleanup_before; bool cleanup_after; +bool failfast; } /* namespace config */ } /* namespace global */ @@ -287,7 +289,7 @@ int main(int argc, char *const argv[]) { int rc = osal_actor_start(a, pid); log_trace("<< actor_start"); if (rc) { - log_trace(">> killall_actors"); + log_trace(">> killall_actors: (%s)", "start failed"); osal_killall_actors(); log_trace("<< killall_actors"); failure("Failed to start actor #%u (%s)\n", a.actor_id, @@ -331,8 +333,14 @@ int main(int argc, char *const argv[]) { actor->space_id, pid, status2str(status)); if (status > as_running) { left -= 1; - if (status != as_successful) + if (status != as_successful) { + if (global::config::failfast && !failed) { + log_trace(">> killall_actors: (%s)", "failfast"); + osal_killall_actors(); + log_trace("<< killall_actors"); + } failed = true; + } } } else { if (timeout_seconds_left == 0) diff --git a/test/test.h 
b/test/test.h index bb7764ba..c12e08a6 100644 --- a/test/test.h +++ b/test/test.h @@ -46,6 +46,7 @@ extern unsigned timeout_duration_seconds; extern bool dump_config; extern bool cleanup_before; extern bool cleanup_after; +extern bool failfast; } /* namespace config */ } /* namespace global */ From a0f1d61a4a8e4aca5a44c680bd209eb117c72bff Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 23 Apr 2017 12:55:45 +0300 Subject: [PATCH 066/303] test: fixup for Windows. Change-Id: I13468caf53988d9599235d1423603146abf9eb46 --- TODO.md | 1 + dll.vcxproj | 4 ++-- test/chrono.cc | 4 ++-- test/osal-windows.cc | 17 ++++++++++++----- test/test.vcxproj | 6 ++++++ test/utils.cc | 2 ++ 6 files changed, 25 insertions(+), 9 deletions(-) diff --git a/TODO.md b/TODO.md index 9fb41a2b..601d2fe0 100644 --- a/TODO.md +++ b/TODO.md @@ -1,3 +1,4 @@ +- [ ] разделение errno и GetLastError() - [x] CI посредством AppVeyor - [ ] uint32/uint64 в структурах - [ ] правки API (много...) diff --git a/dll.vcxproj b/dll.vcxproj index 42658ccb..13cee8f4 100644 --- a/dll.vcxproj +++ b/dll.vcxproj @@ -77,7 +77,7 @@ - WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions) + WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions);MDB_DEBUG=1 MultiThreadedDebugDLL Level3 ProgramDatabase @@ -121,7 +121,7 @@ Level3 - WIN64;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions) + WIN64;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions);MDB_DEBUG=1 MultiThreadedDebugDLL true diff --git a/test/chrono.cc b/test/chrono.cc index 3481ad74..20eb7c36 100644 --- a/test/chrono.cc +++ b/test/chrono.cc @@ -94,7 +94,8 @@ time now_motonic() { if (reciprocal == 0) { if (!QueryPerformanceFrequency(&Frequency)) failure_perror("QueryPerformanceFrequency()", GetLastError()); - reciprocal = (UINT64_C(1) << 32) / Frequency.QuadPart; + reciprocal = + ((UINT64_C(1) << 32) + Frequency.QuadPart / 2) / Frequency.QuadPart; assert(reciprocal); } @@ -107,7 +108,6 @@ time 
now_motonic() { uint64_t mod = Counter.QuadPart % Frequency.QuadPart; assert(mod < UINT32_MAX); result.fractional = UInt32x32To64((uint32_t)mod, reciprocal); - assert(result.fractional == (mod << 32) / Frequency.QuadPart); return result; #else struct timespec ts; diff --git a/test/osal-windows.cc b/test/osal-windows.cc index 8a93b247..7ed4522d 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -14,7 +14,7 @@ #include "test.h" -static std::vector events; +static std::unordered_map events; static HANDLE hBarrierSemaphore, hBarrierEvent; static int waitstatus2errcode(DWORD result) { @@ -67,13 +67,13 @@ void osal_setup(const std::vector &actors) { const size_t n = actors.size() + 1; events.reserve(n); - for (size_t i = 0; i < n; ++i) { + for (unsigned i = 0; i < n; ++i) { HANDLE hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); if (!hEvent) failure_perror("CreateEvent()", GetLastError()); hEvent = make_inharitable(hEvent); log_trace("osal_setup: event %zu -> %p", i, hEvent); - events.push_back(hEvent); + events[i] = hEvent; } hBarrierSemaphore = CreateSemaphore(NULL, 0, (LONG)actors.size(), NULL); @@ -121,7 +121,7 @@ actor_config::osal_serialize(simple_checksum &checksum) const { HANDLE hSignal = INVALID_HANDLE_VALUE; if (wanna_event4signalling()) { - hSignal = events.at(id); + hSignal = events.at(actor_id); checksum.push(hSignal); } @@ -156,7 +156,7 @@ bool actor_config::osal_deserialize(const char *str, const char *end, if (wanna_event4signalling()) { checksum.push(hSignal); - events[id] = hSignal; + events[actor_id] = hSignal; } TRACE("<< osal_deserialize: OK\n"); @@ -278,9 +278,16 @@ void osal_udelay(unsigned us) { static unsigned threshold_us; if (threshold_us == 0) { +#if 1 + unsigned timeslice_ms = 1; + while (timeBeginPeriod(timeslice_ms) == TIMERR_NOCANDO) + ++timeslice_ms; + threshold_us = timeslice_ms * 1500u; +#else ULONGLONG InterruptTimePrecise_100ns; QueryInterruptTimePrecise(&InterruptTimePrecise_100ns); threshold_us = 
InterruptTimePrecise_100ns / 5; +#endif assert(threshold_us > 0); } diff --git a/test/test.vcxproj b/test/test.vcxproj index e2a123f6..047e6ae3 100644 --- a/test/test.vcxproj +++ b/test/test.vcxproj @@ -102,6 +102,7 @@ Console true + winmm.lib;%(AdditionalDependencies) @@ -116,6 +117,7 @@ Console true + winmm.lib;%(AdditionalDependencies) @@ -134,6 +136,7 @@ true true true + winmm.lib;%(AdditionalDependencies) @@ -152,10 +155,12 @@ true true true + winmm.lib;%(AdditionalDependencies) + @@ -164,6 +169,7 @@ + diff --git a/test/utils.cc b/test/utils.cc index ae6797f8..07020be2 100644 --- a/test/utils.cc +++ b/test/utils.cc @@ -14,7 +14,9 @@ #include "test.h" #include +#ifndef _MSC_VER #include +#endif std::string format(const char *fmt, ...) { va_list ap, ones; From 1596bceff2c0b58f32c47df3e8e1a0ee4194afec Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 23 Apr 2017 12:56:34 +0300 Subject: [PATCH 067/303] mdbx: rework mdbx_env_setup_locks()'s tasks. Change-Id: I79953554a0b2cce8235ec5dbc83f183cb3fd56a0 --- src/mdbx.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 6ba16361..003ef5ed 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3263,8 +3263,7 @@ fail: * mapping it into memory. * @param[in] env the environment handle * @param[out] meta address of where to store the meta information - * @return 0 on success, non-zero on failure. - */ + * @return 0 on success, non-zero on failure. 
*/ static int __cold mdbx_env_read_header(MDB_env *env, MDB_meta *meta) { MDB_metabuf pbuf; MDB_page *p; @@ -3799,14 +3798,17 @@ static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) { off_t size; + assert(env->me_fd != INVALID_HANDLE_VALUE); + assert(env->me_lfd == INVALID_HANDLE_VALUE); int rc = mdbx_openfile(lpath, O_RDWR | O_CREAT, mode, &env->me_lfd); if (rc != MDB_SUCCESS) { if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { env->me_lfd = INVALID_HANDLE_VALUE; rc = MDB_SUCCESS; + } else { + return rc; } - return rc; } /* Try to get exclusive lock. If we succeed, then @@ -3979,13 +3981,6 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, } env->me_dbxs[FREE_DBI].md_cmp = mdbx_cmp_int_ai; /* aligned MDB_INTEGERKEY */ - /* For RDONLY, get lockfile after we know datafile exists */ - if (!(flags & MDB_RDONLY)) { - rc = mdbx_env_setup_locks(env, lpath, mode, &excl); - if (rc) - goto bailout; - } - if (F_ISSET(flags, MDB_RDONLY)) oflags = O_RDONLY; else @@ -3995,11 +3990,9 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, if (rc != MDB_SUCCESS) goto bailout; - if (flags & MDB_RDONLY) { - rc = mdbx_env_setup_locks(env, lpath, mode, &excl); - if (rc) - goto bailout; - } + rc = mdbx_env_setup_locks(env, lpath, mode, &excl); + if (rc) + goto bailout; MDB_meta meta; rc = mdbx_env_open2(env, &meta); From 6b4f92b22ea62eadcd6ba1dadbdbf7822493d805 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 23 Apr 2017 12:57:27 +0300 Subject: [PATCH 068/303] mdbx: rework windows-clk. 
Change-Id: Ie0f8422a39879c9d5e6b4697c7e90fe67076b021 --- src/lck-windows.c | 158 +++++++++++++++++++++++++++++++++------------- 1 file changed, 115 insertions(+), 43 deletions(-) diff --git a/src/lck-windows.c b/src/lck-windows.c index 2b3d6aa9..789163bc 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -142,14 +142,23 @@ static BOOL funlock(mdbx_filehandle_t fd, off_t offset, size_t bytes) { /* global `write` lock for write-txt processing, * exclusive locking both meta-pages) */ +#define LCK_MAXLEN (1u + (size_t)(MAXSSIZE_T)) +#define LCK_META_OFFSET 0 +#define LCK_META_LEN 0x10000u +#define LCK_BODY_OFFSET LCK_META_LEN +#define LCK_BODY_LEN (LCK_MAXLEN - LCK_BODY_OFFSET + 1u) +#define LCK_META LCK_META_OFFSET, LCK_META_LEN +#define LCK_BODY LCK_BODY_OFFSET, LCK_BODY_LEN +#define LCK_WHOLE 0, LCK_MAXLEN + int mdbx_txn_lock(MDB_env *env) { - if (flock(env->me_fd, LCK_EXCLUSIVE | LCK_WAITFOR, 0, env->me_psize * 2)) + if (flock(env->me_fd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_BODY)) return MDB_SUCCESS; return GetLastError(); } void mdbx_txn_unlock(MDB_env *env) { - if (!funlock(env->me_fd, 0, env->me_psize * 2)) + if (!funlock(env->me_fd, LCK_BODY)) mdbx_panic("%s failed: errcode %u", mdbx_func_, GetLastError()); } @@ -186,7 +195,7 @@ void mdbx_rdt_unlock(MDB_env *env) { /* global `initial` lock for lockfile initialization, * exclusive/shared locking first cacheline */ -/* FIXME: locking scheme/algo descritpion. +/* FIXME: locking schema/algo descritpion. ?-? = free S-? = used E-? @@ -204,21 +213,27 @@ int mdbx_lck_init(MDB_env *env) { } /* Seize state as exclusive (E-E and returns MDBX_RESULT_TRUE) - * or used (S-? and returns MDBX_RESULT_FALSE), otherwise returns an error */ -int mdbx_lck_seize(MDB_env *env) { - /* 1) now on ?-? (free), get ?-E (middle) */ - if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) - return GetLastError() /* 2) something went wrong, give up */; +* or used (S-? 
and returns MDBX_RESULT_FALSE), otherwise returns an error */ +static int internal_seize_lck(HANDLE lfd) { + int rc; + assert(lfd != INVALID_HANDLE_VALUE); + /* 1) now on ?-? (free), get ?-E (middle) */ + if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { + rc = GetLastError() /* 2) something went wrong, give up */; + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, + "?-?(free) >> ?-E(middle)", rc); + return rc; + } /* 3) now on ?-E (middle), try E-E (exclusive) */ - if (flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) + if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive), done */ /* 5) still on ?-E (middle) */ - int rc = GetLastError(); + rc = GetLastError(); if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, give up */ - if (!funlock(env->me_lfd, LCK_UPPER)) { + if (!funlock(lfd, LCK_UPPER)) { rc = GetLastError(); mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, "?-E(middle) >> ?-?(free)", rc); @@ -227,13 +242,16 @@ int mdbx_lck_seize(MDB_env *env) { } /* 7) still on ?-E (middle), try S-E (locked) */ - rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER) - ? MDBX_RESULT_FALSE - : GetLastError(); + rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE + : GetLastError(); + + if (rc != MDBX_RESULT_FALSE) + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, + "?-E(middle) >> S-E(locked)", rc); /* 8) now on S-E (locked) or still on ?-E (middle), - * transite to S-? (used) or ?-? (free) */ - if (!funlock(env->me_lfd, LCK_UPPER)) { + * transite to S-? (used) or ?-? 
(free) */ + if (!funlock(lfd, LCK_UPPER)) { rc = GetLastError(); mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, "X-E(locked/middle) >> X-?(used/free)", rc); @@ -243,28 +261,68 @@ int mdbx_lck_seize(MDB_env *env) { return rc; } +int mdbx_lck_seize(MDB_env *env) { + int rc; + + assert(env->me_fd != INVALID_HANDLE_VALUE); + if (env->me_lfd == INVALID_HANDLE_VALUE) { + /* LY: without-lck mode (e.g. on read-only filesystem) */ + if (!flock(env->me_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { + rc = GetLastError(); + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); + return rc; + } + return MDBX_RESULT_FALSE; + } + + rc = internal_seize_lck(env->me_lfd); + if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDB_RDONLY) == 0) { + /* Check that another process don't operates in without-lck mode. + * Doing such check by exclusive locking the body-part of db. Should be + * noted: + * - we need an exclusive lock for do so; + * - we can't lock meta-pages, otherwise other process could get an error + * while opening db in valid (non-conflict) mode. */ + if (!flock(env->me_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) { + rc = GetLastError(); + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, + "lock-against-without-lck", rc); + mdbx_lck_destroy(env); + } else if (!funlock(env->me_fd, LCK_BODY)) { + rc = GetLastError(); + mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + "unlock-against-without-lck", rc); + } + } + + return rc; +} + /* Transite from exclusive state (E-E) to used (S-?) 
*/ int mdbx_lck_downgrade(MDB_env *env) { int rc; + assert(env->me_fd != INVALID_HANDLE_VALUE); - /* 1) now at E-E (exclusive), continue transition to ?_E (middle) */ - if (!funlock(env->me_lfd, LCK_LOWER)) { - rc = GetLastError(); - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "E-E(exclusive) >> ?-E(middle)", rc); - } + if (env->me_lfd != INVALID_HANDLE_VALUE) { + /* 1) must be at E-E (exclusive), transite to ?_E (middle) */ + if (!funlock(env->me_lfd, LCK_LOWER)) { + rc = GetLastError(); + mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + "E-E(exclusive) >> ?-E(middle)", rc); + } - /* 2) now at ?-E (middle), transite to S-E (locked) */ - if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { - rc = GetLastError() /* 3) something went wrong, give up */; - return rc; - } + /* 2) now at ?-E (middle), transite to S-E (locked) */ + if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { + rc = GetLastError() /* 3) something went wrong, give up */; + return rc; + } - /* 4) got S-E (locked), continue transition to S-? (used) */ - if (!funlock(env->me_lfd, LCK_UPPER)) { - rc = GetLastError(); - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "S-E(locked) >> S-?(used)", rc); + /* 4) got S-E (locked), continue transition to S-? (used) */ + if (!funlock(env->me_lfd, LCK_UPPER)) { + rc = GetLastError(); + mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + "S-E(locked) >> S-?(used)", rc); + } } return MDB_SUCCESS /* 5) now at S-? 
(used), done */; } @@ -272,17 +330,6 @@ int mdbx_lck_downgrade(MDB_env *env) { void mdbx_lck_destroy(MDB_env *env) { int rc; - if (env->me_fd != INVALID_HANDLE_VALUE) { - /* explicitly unlock to avoid latency for other processes (windows kernel - * releases such locks via deferred queues) */ - while (funlock(env->me_fd, 0, env->me_psize * 2)) - ; - rc = GetLastError(); - assert(rc == ERROR_NOT_LOCKED); - (void)rc; - SetLastError(ERROR_SUCCESS); - } - if (env->me_lfd != INVALID_HANDLE_VALUE) { /* double `unlock` for robustly remove overlapped shared/exclusive locks */ while (funlock(env->me_lfd, LCK_LOWER)) @@ -299,6 +346,31 @@ void mdbx_lck_destroy(MDB_env *env) { (void)rc; SetLastError(ERROR_SUCCESS); } + + if (env->me_fd != INVALID_HANDLE_VALUE) { + /* explicitly unlock to avoid latency for other processes (windows kernel + * releases such locks via deferred queues) */ + while (funlock(env->me_fd, LCK_BODY)) + ; + rc = GetLastError(); + assert(rc == ERROR_NOT_LOCKED); + (void)rc; + SetLastError(ERROR_SUCCESS); + + while (funlock(env->me_fd, LCK_META)) + ; + rc = GetLastError(); + assert(rc == ERROR_NOT_LOCKED); + (void)rc; + SetLastError(ERROR_SUCCESS); + + while (funlock(env->me_fd, LCK_WHOLE)) + ; + rc = GetLastError(); + assert(rc == ERROR_NOT_LOCKED); + (void)rc; + SetLastError(ERROR_SUCCESS); + } } /*----------------------------------------------------------------------------*/ From 11ac6a02ff4001888566ebc63abca3b8020892ab Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 23 Apr 2017 13:52:53 +0300 Subject: [PATCH 069/303] mdbx: rework posix-lck. 
Change-Id: Icdd61731424b0d591fd28da0468b9623d079c94a --- src/lck-posix.c | 94 +++++++++++++++++++++++++++++++++---------------- test/base.h | 4 +++ 2 files changed, 68 insertions(+), 30 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index c2a36c54..0253ec39 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -49,7 +49,31 @@ void __attribute__((destructor)) mdbx_global_destructor(void) { /*----------------------------------------------------------------------------*/ /* lck */ -static int mdbx_lck_op(mdbx_filehandle_t fd, int op, int lck, off_t offset); +#ifndef OFF_T_MAX +#define OFF_T_MAX (sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) +#endif +#define LCK_WHOLE OFF_T_MAX + +static int mdbx_lck_op(mdbx_filehandle_t fd, int op, int lck, off_t offset, + off_t len) { + for (;;) { + int rc; + struct flock lock_op; + memset(&lock_op, 0, sizeof(lock_op)); + lock_op.l_type = lck; + lock_op.l_whence = SEEK_SET; + lock_op.l_start = offset; + lock_op.l_len = len; + if ((rc = fcntl(fd, op, &lock_op)) == 0) { + if (op == F_GETLK && lock_op.l_type != F_UNLCK) + rc = -lock_op.l_pid; + } else if ((rc = errno) == EINTR) { + continue; + } + return rc; + } +} + static int mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, int rc); int mdbx_lck_init(MDB_env *env) { @@ -93,7 +117,8 @@ bailout: void mdbx_lck_destroy(MDB_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* try get exclusive access */ - if (env->me_txns && mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, 0) == 0) { + if (env->me_txns && + mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, 0, LCK_WHOLE) == 0) { /* got exclusive, drown mutexes */ int rc = pthread_mutex_destroy(&env->me_txns->mti_rmutex); if (rc == 0) @@ -140,18 +165,20 @@ void mdbx_txn_unlock(MDB_env *env) { mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); } -int mdbx_lck_seize(MDB_env *env) { +static int internal_seize_lck(int lfd) { + assert(lfd != INVALID_HANDLE_VALUE); + /* try exclusive access */ - int rc = mdbx_lck_op(env->me_lfd, 
F_SETLK, F_WRLCK, 0); + int rc = mdbx_lck_op(lfd, F_SETLK, F_WRLCK, 0, 1); if (rc == 0) /* got exclusive */ return MDBX_RESULT_TRUE; if (rc == EAGAIN || rc == EACCES || rc == EBUSY) { /* get shared access */ - rc = mdbx_lck_op(env->me_lfd, F_SETLKW, F_RDLCK, 0); + rc = mdbx_lck_op(lfd, F_SETLKW, F_RDLCK, 0, 1); if (rc == 0) { /* got shared, try exclusive again */ - rc = mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, 0); + rc = mdbx_lck_op(lfd, F_SETLK, F_WRLCK, 0, 1); if (rc == 0) /* now got exclusive */ return MDBX_RESULT_TRUE; @@ -160,20 +187,46 @@ int mdbx_lck_seize(MDB_env *env) { return MDBX_RESULT_FALSE; } } - assert(rc != MDBX_RESULT_FALSE && rc != MDBX_RESULT_TRUE); + assert(MDBX_IS_ERROR(rc)); return rc; } +int mdbx_lck_seize(MDB_env *env) { + assert(env->me_fd != INVALID_HANDLE_VALUE); + + if (env->me_lfd == INVALID_HANDLE_VALUE) { + /* LY: without-lck mode (e.g. on read-only filesystem) */ + int rc = mdbx_lck_op(env->me_fd, F_SETLK, F_RDLCK, 0, LCK_WHOLE); + if (rc != 0) { + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); + return rc; + } + return MDBX_RESULT_FALSE; + } + + if ((env->me_flags & MDB_RDONLY) == 0) { + /* Check that another process don't operates in without-lck mode. 
*/ + int rc = mdbx_lck_op(env->me_fd, F_SETLK, F_WRLCK, env->me_pid, 1); + if (rc != 0) { + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, + "lock-against-without-lck", rc); + return rc; + } + } + + return internal_seize_lck(env->me_lfd); +} + int mdbx_lck_downgrade(MDB_env *env) { - return mdbx_lck_op(env->me_lfd, F_SETLK, F_RDLCK, 0); + return mdbx_lck_op(env->me_lfd, F_SETLK, F_RDLCK, 0, 1); } int mdbx_rpid_set(MDB_env *env) { - return mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, env->me_pid); + return mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, env->me_pid, 1); } int mdbx_rpid_clear(MDB_env *env) { - return mdbx_lck_op(env->me_lfd, F_SETLKW, F_UNLCK, env->me_pid); + return mdbx_lck_op(env->me_lfd, F_SETLKW, F_UNLCK, env->me_pid, 1); } /* Checks reader by pid. @@ -183,7 +236,7 @@ int mdbx_rpid_clear(MDB_env *env) { * MDBX_RESULT_FALSE, if pid is dead (lock acquired) * or otherwise the errcode. */ int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid) { - int rc = mdbx_lck_op(env->me_lfd, F_GETLK, F_WRLCK, pid); + int rc = mdbx_lck_op(env->me_lfd, F_GETLK, F_WRLCK, pid, 1); if (rc == 0) return MDBX_RESULT_FALSE; if (rc < 0 && -rc == pid) @@ -191,25 +244,6 @@ int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid) { return rc; } -static int mdbx_lck_op(mdbx_filehandle_t fd, int op, int lck, off_t offset) { - for (;;) { - int rc; - struct flock lock_op; - memset(&lock_op, 0, sizeof(lock_op)); - lock_op.l_type = lck; - lock_op.l_whence = SEEK_SET; - lock_op.l_start = offset; - lock_op.l_len = 1; - if ((rc = fcntl(fd, op, &lock_op)) == 0) { - if (op == F_GETLK && lock_op.l_type != F_UNLCK) - rc = -lock_op.l_pid; - } else if ((rc = errno) == EINTR) { - continue; - } - return rc; - } -} - #if !__GLIBC_PREREQ(2, 12) && !defined(pthread_mutex_consistent) #define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) #endif diff --git a/test/base.h b/test/base.h index b4ba95c8..ad804813 100644 --- a/test/base.h +++ b/test/base.h @@ -63,5 +63,9 @@ #include #endif +#if 
defined(__i386__) || defined(__x86_64__) +#include +#endif + #include "../mdbx.h" #include "../src/defs.h" From 1bddc9dcbc15cd9fc2526bc63c92fcf486127dfa Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 23 Apr 2017 18:58:38 +0300 Subject: [PATCH 070/303] mdbx: change mdbx_canary_get() API. Change-Id: Ie8831d8acb7113916b8a44b870596a9bf1ccc8b2 --- mdbx.h | 2 +- src/mdbx.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mdbx.h b/mdbx.h index ae9e5ca5..1ecfcef3 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1728,7 +1728,7 @@ LIBMDBX_API int mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, typedef struct mdbx_canary { uint64_t x, y, z, v; } mdbx_canary; LIBMDBX_API int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary); -LIBMDBX_API size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary); +LIBMDBX_API int mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary); /* Returns: * - MDBX_RESULT_TRUE when no more data available diff --git a/src/mdbx.c b/src/mdbx.c index 003ef5ed..3ed4c46d 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9792,14 +9792,14 @@ int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary) { return MDB_SUCCESS; } -size_t mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary) { - if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) - return 0; +int mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary) { + if (unlikely(txn == NULL || canary == NULL)) + return EINVAL; + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_EBADSIGN; - if (likely(canary)) - *canary = txn->mt_canary; - - return txn->mt_txnid; + *canary = txn->mt_canary; + return MDB_SUCCESS; } int mdbx_cursor_on_first(MDB_cursor *mc) { From ce9c4acea6d5c58eb6aa1a8efef4f8256e8f072e Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 23 Apr 2017 18:59:16 +0300 Subject: [PATCH 071/303] mdbx: fix mdbg_canary_put(). 
Change-Id: I36917a686b28405ed7ecd07dbeb1b4d517720410 --- src/mdbx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 3ed4c46d..cfca3b5a 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9787,7 +9787,12 @@ int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary) { txn->mt_canary.z = canary->z; } txn->mt_canary.v = txn->mt_txnid; - txn->mt_flags |= MDB_TXN_DIRTY; + + if ((txn->mt_flags & MDB_TXN_DIRTY) == 0) { + MDB_env *env = txn->mt_env; + txn->mt_flags |= MDB_TXN_DIRTY; + env->me_sync_pending += env->me_psize; + } return MDB_SUCCESS; } From 32937ac63b9de9e35c1f779155fbe6f0947e3d00 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 23 Apr 2017 19:00:28 +0300 Subject: [PATCH 072/303] test: canary fetch/update for jitter testcase. Change-Id: I8402328f880addb1170e8e778b64aa4f12d18718 --- test/jitter.cc | 10 ++++++---- test/test.cc | 37 +++++++++++++++++++++++++++++++++++++ test/test.h | 7 +++++++ 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/test/jitter.cc b/test/jitter.cc index 4129de75..92d272e1 100644 --- a/test/jitter.cc +++ b/test/jitter.cc @@ -31,27 +31,29 @@ bool testcase_jitter::run() { if (flipcoin()) { jitter_delay(); txn_begin(true); + fetch_canary(); jitter_delay(); - txn_end(false); + txn_end(flipcoin()); } jitter_delay(); txn_begin(mode_readonly()); jitter_delay(); if (!mode_readonly()) { + fetch_canary(); + update_canary(1); /* TODO: - * - db_sequence() * - db_setsize() * ... 
*/ } - txn_end(false); + txn_end(flipcoin()); if (flipcoin()) { jitter_delay(); txn_begin(true); jitter_delay(); - txn_end(false); + txn_end(flipcoin()); } jitter_delay(); diff --git a/test/test.cc b/test/test.cc index 28290c46..346d2e8d 100644 --- a/test/test.cc +++ b/test/test.cc @@ -252,6 +252,43 @@ bool testcase::should_continue() const { return result; } +void testcase::fetch_canary() { + mdbx_canary canary_now; + log_trace(">> fetch_canary"); + + int rc = mdbx_canary_get(txn_guard.get(), &canary_now); + if (rc != MDB_SUCCESS) + failure_perror("mdbx_canary_get()", rc); + + if (canary_now.v < last.canary.v) + failure("fetch_canary: %" PRIu64 " canary-now.v) < %" PRIu64 + "(canary-last.v)", + canary_now.v, last.canary.v); + if (canary_now.y < last.canary.y) + failure("fetch_canary: %" PRIu64 "(canary-now.y) < %" PRIu64 + "(canary-last.y)", + canary_now.y, last.canary.y); + + last.canary = canary_now; + log_trace("<< fetch_canary: db-sequence %" PRIu64 + ", db-sequence.txnid %" PRIu64, + last.canary.y, last.canary.v); +} + +void testcase::update_canary(uint64_t increment) { + mdbx_canary canary_now = last.canary; + + log_trace(">> update_canary: sequence %" PRIu64 " += %" PRIu64, canary_now.y, + increment); + canary_now.y += increment; + + int rc = mdbx_canary_put(txn_guard.get(), &canary_now); + if (rc != MDB_SUCCESS) + failure_perror("mdbx_canary_put()", rc); + + log_trace(">> update_canary: sequence = %" PRIu64, canary_now.y); +} + //----------------------------------------------------------------------------- bool test_execute(const actor_config &config) { diff --git a/test/test.h b/test/test.h index c12e08a6..79c26479 100644 --- a/test/test.h +++ b/test/test.h @@ -88,11 +88,17 @@ protected: size_t nops_completed; chrono::time start_timestamp; + struct { + mdbx_canary canary; + } last; + void db_prepare(); void db_open(); void db_close(); void txn_begin(bool readonly); void txn_end(bool abort); + void fetch_canary(); + void update_canary(uint64_t increment); 
bool wait4start(); void report(size_t nops_done); @@ -107,6 +113,7 @@ public: testcase(const actor_config &config, const mdbx_pid_t pid) : config(config), pid(pid), nops_completed(0) { start_timestamp.reset(); + memset(&last, 0, sizeof(last)); } virtual bool setup(); From 522d0aa498103cffafe3334d32277ff26c3c2b08 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 23 Apr 2017 21:39:10 +0300 Subject: [PATCH 073/303] ci: fix appveyor.yml test params. Change-Id: I68bb64bd75468b0afd600893de2fe6bb16cd34ca --- .appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 7e8e1351..19db6535 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -25,7 +25,7 @@ build: test_script: - ps: | if (($env:PLATFORM -eq "x86") -and (Test-Path "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" -PathType Leaf)) { - & "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" --pathname=tmp.db --basic --dont-cleanup-after + & "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" --pathname=tmp.db --dont-cleanup-after basic } else { - & "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\test.exe" --pathname=tmp.db --basic --dont-cleanup-after + & "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\test.exe" --pathname=tmp.db --dont-cleanup-after basic } From 297190e3086bafba56ab2d2d662e5aa473d839d4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 23 Apr 2017 21:44:27 +0300 Subject: [PATCH 074/303] ci: add circle.yml Change-Id: I233899fd5ee83fd0a088f2752015ccfb61ec86c0 --- circle.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 circle.yml diff --git a/circle.yml b/circle.yml new file mode 100644 index 00000000..c10629e1 --- /dev/null +++ b/circle.yml @@ -0,0 +1,14 @@ +machine: + timezone: + Europe/Moscow + +database: + override: + +compile: + override: + - make all + +test: + override: + - make check From f7507110ef8efa0184a30acb872a7e2ac9c23782 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: 
Mon, 24 Apr 2017 14:13:52 +0300 Subject: [PATCH 075/303] mdbx: alter mdbx_txn_id() API. --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index cfca3b5a..0a5d0d24 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2409,7 +2409,7 @@ MDB_env *mdbx_txn_env(MDB_txn *txn) { size_t mdbx_txn_id(MDB_txn *txn) { if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) - return 0; + return ~(txnid_t)0; return txn->mt_txnid; } From 2e54210e79774235fcb1a31adb21c8405376e7f7 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 24 Apr 2017 15:45:20 +0300 Subject: [PATCH 076/303] mdbx: add mdbx_osal.h with platform-like err codes. --- Makefile | 2 +- libmdbx.files | 1 + mdbx.h | 89 +--------------- mdbx_osal.h | 120 ++++++++++++++++++++++ src/mdbx.c | 280 +++++++++++++++++++++++++------------------------- src/osal.c | 4 +- src/osal.h | 1 - 7 files changed, 265 insertions(+), 232 deletions(-) create mode 100644 mdbx_osal.h diff --git a/Makefile b/Makefile index 9d31aba8..b9c36c74 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ LIBRARIES := libmdbx.a libmdbx.so TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 -MDBX_SRC := mdbx.h $(addprefix src/, mdbx.c osal.c lck-posix.c defs.h bits.h osal.h midl.h) +MDBX_SRC := mdbx.h mdbx_osal.h $(addprefix src/, mdbx.c osal.c lck-posix.c defs.h bits.h osal.h midl.h) .PHONY: mdbx all install clean check coverage diff --git a/libmdbx.files b/libmdbx.files index 8ab54c92..2c209b29 100644 --- a/libmdbx.files +++ b/libmdbx.files @@ -3,6 +3,7 @@ LICENSE Makefile README.md mdbx.h +mdbx_osal.h src/bits.h src/defs.h src/lck-posix.c diff --git a/mdbx.h b/mdbx.h index 1ecfcef3..a152fe40 100644 --- a/mdbx.h +++ b/mdbx.h @@ -52,95 +52,8 @@ #ifndef _MDBX_H_ #define _MDBX_H_ -#define MDBX_MODE_ENABLED 1 -#ifndef __has_attribute -# define __has_attribute(x) (0) -#endif - -#ifndef __dll_export -# if defined(_WIN32) || 
defined(__CYGWIN__) -# if defined(__GNUC__) || __has_attribute(dllexport) -# define __dll_export __attribute__((dllexport)) -# elif defined(_MSC_VER) -# define __dll_export __declspec(dllexport) -# else -# define __dll_export -# endif -# elif defined(__GNUC__) || __has_attribute(visibility) -# define __dll_export __attribute__((visibility("default"))) -# else -# define __dll_export -# endif -#endif /* __dll_export */ - -#ifndef __dll_import -# if defined(_WIN32) || defined(__CYGWIN__) -# if defined(__GNUC__) || __has_attribute(dllimport) -# define __dll_import __attribute__((dllimport)) -# elif defined(_MSC_VER) -# define __dll_import __declspec(dllimport) -# else -# define __dll_import -# endif -# else -# define __dll_import -# endif -#endif /* __dll_import */ - -#if defined(LIBMDBX_EXPORTS) -# define LIBMDBX_API __dll_export -#elif defined(LIBMDBX_IMPORTS) -# define LIBMDBX_API __dll_import -#else -# define LIBMDBX_API -#endif /* LIBMDBX_API */ - -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4514) /* 'xyz': unreferenced inline function \ - has been removed */ -#pragma warning(disable : 4710) /* 'xyz': function not inlined */ -#pragma warning(disable : 4711) /* function 'xyz' selected for \ - automatic inline expansion */ -#pragma warning(disable : 4061) /* enumerator 'abc' in switch of enum \ - 'xyz' is not explicitly handled by a case \ - label */ -#pragma warning(disable : 4201) /* nonstandard extension used : \ - nameless struct / union */ -#pragma warning(disable : 4127) /* conditional expression is constant \ - */ - -#pragma warning(push, 1) -#pragma warning(disable : 4530) /* C++ exception handler used, but \ - unwind semantics are not enabled. Specify \ - /EHsc */ -#pragma warning(disable : 4577) /* 'noexcept' used with no exception \ - handling mode specified; termination on \ - exception is not guaranteed. 
Specify /EHsc \ - */ -#endif /* _MSC_VER (warnings) */ - -#include -#include -#include - -#if defined(_WIN32) || defined(_WIN64) -# include -# include - typedef unsigned mode_t; - typedef HANDLE mdbx_filehandle_t; - typedef DWORD mdbx_pid_t; - typedef DWORD mdbx_tid_t; -#else -# include /* for pthread_t */ -# include /* for truct iovec */ -# include /* for pid_t */ -# define HAVE_STRUCT_IOVEC 1 - typedef int mdbx_filehandle_t; - typedef pid_t mdbx_pid_t; - typedef pthread_t mdbx_tid_t; -#endif +#include "mdbx_osal.h" #ifdef _MSC_VER #pragma warning(pop) diff --git a/mdbx_osal.h b/mdbx_osal.h new file mode 100644 index 00000000..316d24a9 --- /dev/null +++ b/mdbx_osal.h @@ -0,0 +1,120 @@ +/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ + +/* + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. + * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . 
+ */ + +#pragma once + +#ifndef __has_attribute +#define __has_attribute(x) (0) +#endif + +#ifndef __dll_export +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(__GNUC__) || __has_attribute(dllexport) +#define __dll_export __attribute__((dllexport)) +#elif defined(_MSC_VER) +#define __dll_export __declspec(dllexport) +#else +#define __dll_export +#endif +#elif defined(__GNUC__) || __has_attribute(visibility) +#define __dll_export __attribute__((visibility("default"))) +#else +#define __dll_export +#endif +#endif /* __dll_export */ + +#ifndef __dll_import +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(__GNUC__) || __has_attribute(dllimport) +#define __dll_import __attribute__((dllimport)) +#elif defined(_MSC_VER) +#define __dll_import __declspec(dllimport) +#else +#define __dll_import +#endif +#else +#define __dll_import +#endif +#endif /* __dll_import */ + +#if defined(LIBMDBX_EXPORTS) +#define LIBMDBX_API __dll_export +#elif defined(LIBMDBX_IMPORTS) +#define LIBMDBX_API __dll_import +#else +#define LIBMDBX_API +#endif /* LIBMDBX_API */ + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4514) /* 'xyz': unreferenced inline function \ + has been removed */ +#pragma warning(disable : 4710) /* 'xyz': function not inlined */ +#pragma warning(disable : 4711) /* function 'xyz' selected for \ + automatic inline expansion */ +#pragma warning(disable : 4061) /* enumerator 'abc' in switch of enum \ + 'xyz' is not explicitly handled by a case \ + label */ +#pragma warning(disable : 4201) /* nonstandard extension used : \ + nameless struct / union */ +#pragma warning(disable : 4127) /* conditional expression is constant \ + */ + +#pragma warning(push, 1) +#pragma warning(disable : 4530) /* C++ exception handler used, but \ + unwind semantics are not enabled. Specify \ + /EHsc */ +#pragma warning(disable : 4577) /* 'noexcept' used with no exception \ + handling mode specified; termination on \ + exception is not guaranteed. 
Specify /EHsc \ + */ +#endif /* _MSC_VER (warnings) */ + +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#include +typedef unsigned mode_t; +typedef HANDLE mdbx_filehandle_t; +typedef DWORD mdbx_pid_t; +typedef DWORD mdbx_tid_t; + +#define MDBX_ENODATA ERROR_HANDLE_EOF +#define MDBX_EINVAL ERROR_INVALID_PARAMETER +#define MDBX_EACCESS ERROR_ACCESS_DENIED +#define MDBX_ENOMEM ERROR_OUTOFMEMORY + +#else +#include /* for error codes */ +#include /* for pthread_t */ +#include /* for pid_t */ +#include /* for truct iovec */ +#define HAVE_STRUCT_IOVEC 1 +typedef int mdbx_filehandle_t; +typedef pid_t mdbx_pid_t; +typedef pthread_t mdbx_tid_t; + +#define MDBX_ENODATA ENODATA +#define MDBX_EINVAL EINVAL +#define MDBX_EACCESS EACCES +#define MDBX_ENOMEM ENOMEM + +#endif + +/*--------------------------------------------------------------------------*/ diff --git a/src/mdbx.c b/src/mdbx.c index 0a5d0d24..319804e5 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -108,7 +108,7 @@ __cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDB_reader *begin, realloc((rthc_table == rthc_table_static) ? NULL : rthc_table, sizeof(rthc_entry_t) * rthc_limit * 2); if (new_table == NULL) { - rc = ENOMEM; + rc = MDBX_ENOMEM; goto bailout; } if (rthc_table == rthc_table_static) @@ -185,21 +185,21 @@ static void mdbx_midl_shrink(MDB_IDL *idp); /** Make room for num additional elements in an IDL. * @param[in,out] idp Address of the IDL. * @param[in] num Number of elements to make room for. - * @return 0 on success, ENOMEM on failure. + * @return 0 on success, MDBX_ENOMEM on failure. */ static int mdbx_midl_need(MDB_IDL *idp, unsigned num); /** Append an ID onto an IDL. * @param[in,out] idp Address of the IDL to append to. * @param[in] id The ID to append. - * @return 0 on success, ENOMEM if the IDL is too large. + * @return 0 on success, MDBX_ENOMEM if the IDL is too large. */ static int mdbx_midl_append(MDB_IDL *idp, MDB_ID id); /** Append an IDL onto an IDL. 
* @param[in,out] idp Address of the IDL to append to. * @param[in] app The IDL to append. - * @return 0 on success, ENOMEM if the IDL is too large. + * @return 0 on success, MDBX_ENOMEM if the IDL is too large. */ static int mdbx_midl_append_list(MDB_IDL *idp, MDB_IDL app); @@ -207,7 +207,7 @@ static int mdbx_midl_append_list(MDB_IDL *idp, MDB_IDL app); * @param[in,out] idp Address of the IDL to append to. * @param[in] id The lowest ID to append. * @param[in] n Number of IDs to append. - * @return 0 on success, ENOMEM if the IDL is too large. + * @return 0 on success, MDBX_ENOMEM if the IDL is too large. */ static int mdbx_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n); @@ -1305,7 +1305,7 @@ static int mdbx_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) { if (!txn->mt_spill_pgs) { txn->mt_spill_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX); if (unlikely(!txn->mt_spill_pgs)) - return ENOMEM; + return MDBX_ENOMEM; } else { /* purge deleted slots */ MDB_IDL sl = txn->mt_spill_pgs; @@ -1615,7 +1615,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { if ((flags & MDBX_LIFORECLAIM) && !txn->mt_lifo_reclaimed) { txn->mt_lifo_reclaimed = mdbx_midl_alloc(env->me_maxfree_1pg); if (unlikely(!txn->mt_lifo_reclaimed)) { - rc = ENOMEM; + rc = MDBX_ENOMEM; goto fail; } } @@ -1626,7 +1626,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { i = idl[0]; if (!mop) { if (unlikely(!(env->me_pghead = mop = mdbx_midl_alloc(i)))) { - rc = ENOMEM; + rc = MDBX_ENOMEM; goto fail; } } else { @@ -1750,7 +1750,7 @@ done: ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize * num); } else { if (unlikely(!(np = mdbx_page_malloc(txn, num)))) { - rc = ENOMEM; + rc = MDBX_ENOMEM; goto fail; } } @@ -1830,7 +1830,7 @@ static int mdbx_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) { } else { np = mdbx_page_malloc(txn, num); if (unlikely(!np)) - return ENOMEM; + return MDBX_ENOMEM; if (num > 1) memcpy(np, mp, num * 
env->me_psize); else @@ -1913,7 +1913,7 @@ static int mdbx_page_touch(MDB_cursor *mc) { /* No - copy it */ np = mdbx_page_malloc(txn, 1); if (unlikely(!np)) - return ENOMEM; + return MDBX_ENOMEM; mid.mid = pgno; mid.mptr = np; rc = mdbx_mid2l_insert(dl, &mid); @@ -1964,7 +1964,7 @@ int mdbx_env_sync(MDB_env *env, int force) { unsigned flags; if (unlikely(!env)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; @@ -1974,7 +1974,7 @@ int mdbx_env_sync(MDB_env *env, int force) { flags = env->me_flags & ~MDB_NOMETASYNC; if (unlikely(flags & (MDB_RDONLY | MDB_FATAL_ERROR))) - return EACCES; + return MDBX_EACCESS; head = mdbx_meta_head(env); if (!META_IS_WEAK(head) && env->me_sync_pending == 0 && @@ -2032,7 +2032,7 @@ static int mdbx_cursor_shadow(MDB_txn *src, MDB_txn *dst) { for (; mc; mc = bk->mc_next) { bk = malloc(size); if (unlikely(!bk)) - return ENOMEM; + return MDBX_ENOMEM; *bk = *mc; mc->mc_backup = bk; mc->mc_db = &dst->mt_dbs[i]; @@ -2270,13 +2270,13 @@ int mdbx_txn_renew(MDB_txn *txn) { int rc; if (unlikely(!txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY | MDB_TXN_FINISHED))) - return EINVAL; + return MDBX_EINVAL; rc = mdbx_txn_renew0(txn, MDB_TXN_RDONLY); if (rc == MDB_SUCCESS) { @@ -2294,7 +2294,7 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, int rc, size, tsize; if (unlikely(!env || !ret)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; @@ -2309,16 +2309,16 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, if (unlikely(env->me_flags & MDB_RDONLY & ~flags)) /* write txn in RDONLY env */ - return EACCES; + return MDBX_EACCESS; if (parent) { if (unlikely(parent->mt_signature != MDBX_MT_SIGNATURE)) - return EINVAL; + return MDBX_EINVAL; /* Nested transactions: Max 
1 child, write txns only, no writemap */ flags |= parent->mt_flags; if (unlikely(flags & (MDB_RDONLY | MDB_WRITEMAP | MDB_TXN_BLOCKED))) { - return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN; + return (parent->mt_flags & MDB_TXN_RDONLY) ? MDBX_EINVAL : MDB_BAD_TXN; } /* Child txns save MDB_pgstate and use own copy of cursors */ size = env->me_maxdbs * (sizeof(MDB_db) + sizeof(MDB_cursor *) + 1); @@ -2334,7 +2334,7 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, } if (unlikely((txn = calloc(1, size)) == NULL)) { mdbx_debug("calloc: %s", "failed"); - return ENOMEM; + return MDBX_ENOMEM; } txn->mt_dbxs = env->me_dbxs; /* static */ txn->mt_dbs = (MDB_db *)((char *)txn + tsize); @@ -2351,7 +2351,7 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, !(txn->mt_free_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX))) { free(txn->mt_u.dirty_list); free(txn); - return ENOMEM; + return MDBX_ENOMEM; } txn->mt_txnid = parent->mt_txnid; txn->mt_dirty_room = parent->mt_dirty_room; @@ -2375,7 +2375,7 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, if (likely(env->me_pghead)) memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); else - rc = ENOMEM; + rc = MDBX_ENOMEM; } if (likely(!rc)) rc = mdbx_cursor_shadow(parent, txn); @@ -2528,14 +2528,14 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { int mdbx_txn_reset(MDB_txn *txn) { if (unlikely(!txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; /* This call is only valid for read-only txns */ if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY))) - return EINVAL; + return MDBX_EINVAL; /* LY: don't close DBI-handles in MDBX mode */ return mdbx_txn_end(txn, MDB_END_RESET | MDB_END_UPDATE); @@ -2543,7 +2543,7 @@ int mdbx_txn_reset(MDB_txn *txn) { int mdbx_txn_abort(MDB_txn *txn) { if (unlikely(!txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return 
MDBX_EBADSIGN; @@ -2751,7 +2751,7 @@ again: if (unlikely(!txn->mt_lifo_reclaimed)) { txn->mt_lifo_reclaimed = mdbx_midl_alloc(env->me_maxfree_1pg); if (unlikely(!txn->mt_lifo_reclaimed)) { - rc = ENOMEM; + rc = MDBX_ENOMEM; goto bailout; } } @@ -3010,7 +3010,7 @@ int mdbx_txn_commit(MDB_txn *txn) { int rc; if (unlikely(txn == NULL)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; @@ -3188,7 +3188,7 @@ int mdbx_txn_commit(MDB_txn *txn) { if (unlikely(txn != env->me_txn)) { mdbx_debug("attempt to commit unknown transaction"); - rc = EINVAL; + rc = MDBX_EINVAL; goto fail; } @@ -3348,7 +3348,7 @@ static int __cold mdbx_env_init_meta(MDB_env *env, MDB_meta *meta) { p = calloc(NUM_METAS, psize); if (!p) - return ENOMEM; + return MDBX_ENOMEM; p->mp_pgno = 0; p->mp_flags = P_META; *(MDB_meta *)PAGEDATA(p) = *meta; @@ -3526,7 +3526,7 @@ fail: int __cold mdbx_env_get_maxkeysize(MDB_env *env) { if (!env || env->me_signature != MDBX_ME_SIGNATURE || !env->me_maxkey_limit) - return EINVAL; + return MDBX_EINVAL; return env->me_maxkey_limit; } @@ -3547,10 +3547,10 @@ int mdbx_get_maxkeysize(size_t pagesize) { ssize_t nodemax = mdbx_calc_nodemax(pagesize); if (nodemax < 0) - return -EINVAL; + return -MDBX_EINVAL; ssize_t maxkey = mdbx_calc_maxkey(nodemax); - return (maxkey > 0 && maxkey < INT_MAX) ? (int)maxkey : -EINVAL; + return (maxkey > 0 && maxkey < INT_MAX) ? 
(int)maxkey : -MDBX_EINVAL; } static void __cold mdbx_env_setup_limits(MDB_env *env, size_t pagesize) { @@ -3567,7 +3567,7 @@ int __cold mdbx_env_create(MDB_env **env) { e = calloc(1, sizeof(MDB_env)); if (!e) - return ENOMEM; + return MDBX_ENOMEM; e->me_maxreaders = DEFAULT_READERS; e->me_maxdbs = e->me_numdbs = CORE_DBS; @@ -3651,13 +3651,13 @@ static int __cold mdbx_env_map(MDB_env *env, void *addr, size_t usedsize) { int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { if (unlikely(!env)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(size < env->me_psize * 8)) - return EINVAL; + return MDBX_EINVAL; /* If env is already open, caller is responsible for making * sure there are no active txns. @@ -3666,7 +3666,7 @@ int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { int rc; MDB_meta *meta; if (env->me_txn) - return EINVAL; + return MDBX_EINVAL; /* FIXME: lock/unlock */ meta = mdbx_meta_head(env); @@ -3701,13 +3701,13 @@ int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { int __cold mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { if (unlikely(!env)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(env->me_map)) - return EINVAL; + return MDBX_EINVAL; env->me_maxdbs = dbs + CORE_DBS; return MDB_SUCCESS; @@ -3715,13 +3715,13 @@ int __cold mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { int __cold mdbx_env_set_maxreaders(MDB_env *env, unsigned readers) { if (unlikely(!env || readers < 1)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(env->me_map || readers > INT16_MAX)) - return EINVAL; + return MDBX_EINVAL; env->me_maxreaders = readers; return MDB_SUCCESS; @@ -3729,7 +3729,7 @@ int __cold mdbx_env_set_maxreaders(MDB_env *env, unsigned readers) { int __cold mdbx_env_get_maxreaders(MDB_env *env, unsigned 
*readers) { if (!env || !readers) - return EINVAL; + return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; @@ -3926,14 +3926,14 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, char *lpath, *dpath; if (unlikely(!env || !path)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; if (env->me_fd != INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE | CHANGELESS))) - return EINVAL; + return MDBX_EINVAL; len = strlen(path); if (flags & MDB_NOSUBDIR) { @@ -3965,7 +3965,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, } else { if (!((env->me_free_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX)) && (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) - rc = ENOMEM; + rc = MDBX_ENOMEM; } env->me_flags = flags |= MDB_ENV_ACTIVE; if (rc) @@ -3976,7 +3976,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned)); if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { - rc = ENOMEM; + rc = MDBX_ENOMEM; goto bailout; } env->me_dbxs[FREE_DBI].md_cmp = mdbx_cmp_int_ai; /* aligned MDB_INTEGERKEY */ @@ -4130,7 +4130,7 @@ int __cold mdbx_env_close_ex(MDB_env *env, int dont_sync) { int rc = MDB_SUCCESS; if (unlikely(!env)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; @@ -4798,13 +4798,13 @@ int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); if (unlikely(!key || !data || !txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_flags & 
MDB_TXN_BLOCKED)) return MDB_BAD_TXN; @@ -5329,7 +5329,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, int (*mfunc)(MDB_cursor * mc, MDB_val * key, MDB_val * data); if (unlikely(mc == NULL)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; @@ -5340,7 +5340,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, switch (op) { case MDB_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return EINVAL; + return MDBX_EINVAL; MDB_page *mp = mc->mc_pg[mc->mc_top]; unsigned nkeys = NUMKEYS(mp); if (mc->mc_ki[mc->mc_top] >= nkeys) { @@ -5378,7 +5378,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, case MDB_GET_BOTH: case MDB_GET_BOTH_RANGE: if (unlikely(data == NULL)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(mc->mc_xcursor == NULL)) return MDB_INCOMPATIBLE; /* FALLTHRU */ @@ -5386,13 +5386,13 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, case MDB_SET_KEY: case MDB_SET_RANGE: if (unlikely(key == NULL)) - return EINVAL; + return MDBX_EINVAL; rc = mdbx_cursor_set(mc, key, data, op, op == MDB_SET_RANGE ? 
NULL : &exact); break; case MDB_GET_MULTIPLE: if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) return MDB_INCOMPATIBLE; rc = MDB_SUCCESS; @@ -5402,7 +5402,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, goto fetchm; case MDB_NEXT_MULTIPLE: if (unlikely(data == NULL)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) return MDB_INCOMPATIBLE; rc = mdbx_cursor_next(mc, key, data, MDB_NEXT_DUP); @@ -5421,7 +5421,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, break; case MDB_PREV_MULTIPLE: if (data == NULL) - return EINVAL; + return MDBX_EINVAL; if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) return MDB_INCOMPATIBLE; rc = MDB_SUCCESS; @@ -5455,7 +5455,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, mfunc = mdbx_cursor_first; mmove: if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(mc->mc_xcursor == NULL)) return MDB_INCOMPATIBLE; { @@ -5467,7 +5467,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) - return EINVAL; + return MDBX_EINVAL; rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); break; case MDB_LAST: @@ -5478,7 +5478,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, goto mmove; default: mdbx_debug("unhandled/unimplemented cursor operation %u", op); - return EINVAL; + return MDBX_EINVAL; } mc->mc_flags &= ~C_DEL; @@ -5533,7 +5533,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, DKBUF; if (unlikely(mc == NULL || key == NULL)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; @@ -5559,7 +5559,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, flags &= ~MDB_NOSPILL; if 
(unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? MDBX_EACCESS : MDB_BAD_TXN; if (unlikely(key->mv_size > env->me_maxkey_limit)) return MDB_BAD_VALSIZE; @@ -5589,7 +5589,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, int dupdata_flag = 0; if (flags & MDB_CURRENT) { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return EINVAL; + return MDBX_EINVAL; if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5890,7 +5890,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_page *np = mdbx_page_malloc(mc->mc_txn, ovpages); MDB_ID2 id2; if (unlikely(!np)) - return ENOMEM; + return MDBX_ENOMEM; id2.mid = pg; id2.mptr = np; /* Note - this page is already counted in parent's dirty_room */ @@ -6078,16 +6078,16 @@ int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { int rc; if (unlikely(!mc)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? MDBX_EACCESS : MDB_BAD_TXN; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))) return MDB_NOTFOUND; @@ -6280,7 +6280,7 @@ static __inline size_t mdbx_branch_size(MDB_env *env, MDB_val *key) { * @param[in] flags Flags for the node. * @return 0 on success, non-zero on failure. Possible errors are: *
    - *
  • ENOMEM - failed to allocate overflow pages for the node. + *
  • MDBX_ENOMEM - failed to allocate overflow pages for the node. *
  • MDB_PAGE_FULL - there is insufficient room in the page. This error * should never happen since all callers already calculate the * page's free space before calling this function. @@ -6643,19 +6643,19 @@ int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { size_t size = sizeof(MDB_cursor); if (unlikely(!ret || !txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) return MDB_BAD_TXN; if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) - return EINVAL; + return MDBX_EINVAL; if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) size += sizeof(MDB_xcursor); @@ -6668,7 +6668,7 @@ int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { mc->mc_flags |= C_UNTRACK; } } else { - return ENOMEM; + return MDBX_ENOMEM; } *ret = mc; @@ -6678,20 +6678,20 @@ int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { if (unlikely(!mc || !txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE && mc->mc_signature != MDBX_MC_READY4CLOSE)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(!TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(mc->mc_backup)) - return EINVAL; + return MDBX_EINVAL; if (unlikely((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)) { MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; @@ -6712,7 +6712,7 @@ int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { /* Return the count of duplicate data items for the current key */ int mdbx_cursor_count(MDB_cursor *mc, size_t *countp) { if (unlikely(mc == NULL || countp == NULL)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(mc->mc_signature != 
MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; @@ -6721,7 +6721,7 @@ int mdbx_cursor_count(MDB_cursor *mc, size_t *countp) { return MDB_BAD_TXN; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return EINVAL; + return MDBX_EINVAL; if (!mc->mc_snum) { *countp = 0; @@ -7542,16 +7542,16 @@ static int mdbx_cursor_del0(MDB_cursor *mc) { int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { if (unlikely(!key || !txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + return (txn->mt_flags & MDB_TXN_RDONLY) ? MDBX_EACCESS : MDB_BAD_TXN; return mdbx_del0(txn, dbi, key, data, 0); } @@ -7737,7 +7737,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, /* grab a page to hold a temporary copy */ copy = mdbx_page_malloc(mc->mc_txn, 1); if (unlikely(copy == NULL)) { - rc = ENOMEM; + rc = MDBX_ENOMEM; goto done; } copy->mp_pgno = mp->mp_pgno; @@ -8028,21 +8028,21 @@ int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, MDB_xcursor mx; if (unlikely(!key || !data || !txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(flags & ~(MDB_NOOVERWRITE | MDB_NODUPDATA | MDB_RESERVE | MDB_APPEND | MDB_APPENDDUP | MDB_CURRENT))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + return (txn->mt_flags & MDB_TXN_RDONLY) ? 
MDBX_EACCESS : MDB_BAD_TXN; mdbx_cursor_init(&mc, txn, dbi, &mx); mc.mc_next = txn->mt_cursors[dbi]; @@ -8184,7 +8184,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { /* Make cursor pages writable */ buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); if (buf == NULL) - return ENOMEM; + return MDBX_ENOMEM; for (i = 0; i < mc.mc_top; i++) { mdbx_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize); @@ -8513,7 +8513,7 @@ int __cold mdbx_env_copy(MDB_env *env, const char *path) { int __cold mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff) { if (unlikely(flags & ~CHANGEABLE)) - return EINVAL; + return MDBX_EINVAL; int rc = mdbx_txn_lock(env); if (unlikely(rc)) @@ -8530,7 +8530,7 @@ int __cold mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff) { int __cold mdbx_env_get_flags(MDB_env *env, unsigned *arg) { if (unlikely(!env || !arg)) - return EINVAL; + return MDBX_EINVAL; *arg = env->me_flags & (CHANGEABLE | CHANGELESS); return MDB_SUCCESS; @@ -8538,7 +8538,7 @@ int __cold mdbx_env_get_flags(MDB_env *env, unsigned *arg) { int __cold mdbx_env_set_userctx(MDB_env *env, void *ctx) { if (unlikely(!env)) - return EINVAL; + return MDBX_EINVAL; env->me_userctx = ctx; return MDB_SUCCESS; } @@ -8549,7 +8549,7 @@ void *__cold mdbx_env_get_userctx(MDB_env *env) { int __cold mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func) { if (unlikely(!env)) - return EINVAL; + return MDBX_EINVAL; #if MDB_DEBUG env->me_assert_func = func; return MDB_SUCCESS; @@ -8561,7 +8561,7 @@ int __cold mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func) { int __cold mdbx_env_get_path(MDB_env *env, const char **arg) { if (unlikely(!env || !arg)) - return EINVAL; + return MDBX_EINVAL; *arg = env->me_path; return MDB_SUCCESS; @@ -8569,7 +8569,7 @@ int __cold mdbx_env_get_path(MDB_env *env, const char **arg) { int __cold mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *arg) { if (unlikely(!env || !arg)) - return EINVAL; + return 
MDBX_EINVAL; *arg = env->me_fd; return MDB_SUCCESS; @@ -8595,9 +8595,9 @@ int __cold mdbx_env_stat(MDB_env *env, MDBX_stat *arg, size_t bytes) { MDB_meta *meta; if (unlikely(env == NULL || arg == NULL)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(bytes != sizeof(MDBX_stat))) - return EINVAL; + return MDBX_EINVAL; meta = mdbx_meta_head(env); return mdbx_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); @@ -8607,10 +8607,10 @@ int __cold mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) { MDB_meta *meta; if (unlikely(env == NULL || arg == NULL)) - return EINVAL; + return MDBX_EINVAL; if (bytes != sizeof(MDBX_envinfo)) - return EINVAL; + return MDBX_EINVAL; MDB_meta *m1, *m2; MDB_reader *r; @@ -8688,13 +8688,13 @@ int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, size_t len; if (unlikely(!txn || !dbi)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(flags & ~VALID_FLAGS)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) return MDB_BAD_TXN; @@ -8761,7 +8761,7 @@ int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, /* Done here so we cannot fail after creating a new DB */ if (unlikely((namedup = mdbx_strdup(name)) == NULL)) - return ENOMEM; + return MDBX_ENOMEM; if (unlikely(rc)) { MDB_db db_dummy; @@ -8803,16 +8803,16 @@ int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, int __cold mdbx_dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *arg, size_t bytes) { if (unlikely(!arg || !txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(bytes != sizeof(MDBX_stat))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) return MDB_BAD_TXN; @@ -8843,13 +8843,13 @@ void mdbx_dbi_close(MDB_env *env, MDB_dbi dbi) { 
int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags) { if (unlikely(!txn || !flags)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) - return EINVAL; + return MDBX_EINVAL; *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; return MDB_SUCCESS; @@ -8955,19 +8955,19 @@ int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del) { int rc; if (unlikely(1 < (unsigned)del || !txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(TXN_DBI_CHANGED(txn, dbi))) return MDB_BAD_DBI; if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) - return EACCES; + return MDBX_EACCESS; rc = mdbx_cursor_open(txn, dbi, &mc); if (unlikely(rc)) @@ -9009,13 +9009,13 @@ leave: int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { if (unlikely(!txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + return MDBX_EINVAL; txn->mt_dbxs[dbi].md_cmp = cmp; return MDB_SUCCESS; @@ -9023,13 +9023,13 @@ int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { if (unlikely(!txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + return MDBX_EINVAL; txn->mt_dbxs[dbi].md_dcmp = cmp; return MDB_SUCCESS; @@ -9042,7 +9042,7 @@ int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { int rc = 0, first = 1; if (unlikely(!env || !func)) - return -EINVAL; + return -MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return 
MDBX_EBADSIGN; @@ -9114,7 +9114,7 @@ static int __cold mdbx_pid_insert(mdbx_pid_t *ids, mdbx_pid_t pid) { int __cold mdbx_reader_check(MDB_env *env, int *dead) { if (unlikely(!env || env->me_signature != MDBX_ME_SIGNATURE)) - return EINVAL; + return MDBX_EINVAL; if (dead) *dead = 0; return mdbx_reader_check0(env, 0, dead); @@ -9258,7 +9258,7 @@ static int mdbx_midl_grow(MDB_IDL *idp, int num) { /* grow it */ idn = realloc(idn, (*idn + num + 2) * sizeof(MDB_ID)); if (!idn) - return ENOMEM; + return MDBX_ENOMEM; *idn++ += num; *idp = idn; return 0; @@ -9270,7 +9270,7 @@ static int mdbx_midl_need(MDB_IDL *idp, unsigned num) { if (num > ids[-1]) { num = (num + num / 4 + (256 + 2)) & -256; if (!(ids = realloc(ids - 1, num * sizeof(MDB_ID)))) - return ENOMEM; + return MDBX_ENOMEM; *ids++ = num - 2; *idp = ids; } @@ -9282,7 +9282,7 @@ static int mdbx_midl_append(MDB_IDL *idp, MDB_ID id) { /* Too big? */ if (ids[0] >= ids[-1]) { if (mdbx_midl_grow(idp, MDB_IDL_UM_MAX)) - return ENOMEM; + return MDBX_ENOMEM; ids = *idp; } ids[0]++; @@ -9295,7 +9295,7 @@ static int mdbx_midl_append_list(MDB_IDL *idp, MDB_IDL app) { /* Too big? */ if (ids[0] + app[0] >= ids[-1]) { if (mdbx_midl_grow(idp, app[0])) - return ENOMEM; + return MDBX_ENOMEM; ids = *idp; } memcpy(&ids[ids[0] + 1], &app[1], app[0] * sizeof(MDB_ID)); @@ -9308,7 +9308,7 @@ static int mdbx_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n) { /* Too big? 
*/ if (len + n > ids[-1]) { if (mdbx_midl_grow(idp, n | MDB_IDL_UM_MAX)) - return ENOMEM; + return MDBX_ENOMEM; ids = *idp; } ids[0] = len + n; @@ -9553,7 +9553,7 @@ static txnid_t __cold mdbx_oomkick(MDB_env *env, txnid_t oldest) { int __cold mdbx_env_set_syncbytes(MDB_env *env, size_t bytes) { if (unlikely(!env)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; @@ -9584,7 +9584,7 @@ int mdbx_txn_straggler(MDB_txn *txn, int *percent) txnid_t lag; if (unlikely(!txn)) - return -EINVAL; + return -MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; @@ -9767,7 +9767,7 @@ int __cold mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary) { if (unlikely(!txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; @@ -9776,7 +9776,7 @@ int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary) { return MDB_BAD_TXN; if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) - return EACCES; + return MDBX_EACCESS; if (likely(canary)) { if (txn->mt_canary.x == canary->x && txn->mt_canary.y == canary->y && @@ -9799,7 +9799,7 @@ int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary) { int mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary) { if (unlikely(txn == NULL || canary == NULL)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; @@ -9809,7 +9809,7 @@ int mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary) { int mdbx_cursor_on_first(MDB_cursor *mc) { if (unlikely(mc == NULL)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; @@ -9828,7 +9828,7 @@ int mdbx_cursor_on_first(MDB_cursor *mc) { int mdbx_cursor_on_last(MDB_cursor *mc) { if (unlikely(mc == NULL)) - return EINVAL; + return MDBX_EINVAL; if 
(unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; @@ -9848,7 +9848,7 @@ int mdbx_cursor_on_last(MDB_cursor *mc) { int mdbx_cursor_eof(MDB_cursor *mc) { if (unlikely(mc == NULL)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; @@ -9900,27 +9900,27 @@ int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, MDB_xcursor mx; if (unlikely(!key || !old_data || !txn || old_data == new_data)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(old_data->iov_base == NULL && old_data->iov_len)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(new_data == NULL && !(flags & MDB_CURRENT))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(flags & ~(MDB_NOOVERWRITE | MDB_NODUPDATA | MDB_RESERVE | MDB_APPEND | MDB_APPENDDUP | MDB_CURRENT))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; + return (txn->mt_flags & MDB_TXN_RDONLY) ? 
MDBX_EACCESS : MDB_BAD_TXN; mdbx_cursor_init(&mc, txn, dbi, &mx); mc.mc_next = txn->mt_cursors[dbi]; @@ -9931,7 +9931,7 @@ int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, if (F_ISSET(flags, MDB_CURRENT | MDB_NOOVERWRITE)) { /* в old_data значение для выбора конкретного дубликата */ if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDB_DUPSORT))) { - rc = EINVAL; + rc = MDBX_EINVAL; goto bailout; } @@ -9971,7 +9971,7 @@ int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, } else { /* в old_data буфер для сохранения предыдущего значения */ if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) - return EINVAL; + return MDBX_EINVAL; MDB_val present_data; rc = mdbx_cursor_get(&mc, &present_key, &present_data, MDB_SET_KEY); if (unlikely(rc != MDB_SUCCESS)) { @@ -10053,13 +10053,13 @@ int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); if (unlikely(!key || !data || !txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) return MDB_BAD_TXN; @@ -10114,7 +10114,7 @@ int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, * P_OVERFLOW страниц с длинными данными. 
*/ int mdbx_is_dirty(const MDB_txn *txn, const void *ptr) { if (unlikely(!txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; @@ -10181,13 +10181,13 @@ int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, int mdbx_dbi_sequence(MDB_txn *txn, MDB_dbi dbi, uint64_t *result, uint64_t increment) { if (unlikely(!txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(TXN_DBI_CHANGED(txn, dbi))) return MDB_BAD_DBI; @@ -10201,7 +10201,7 @@ int mdbx_dbi_sequence(MDB_txn *txn, MDB_dbi dbi, uint64_t *result, return MDB_BAD_TXN; if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) - return EACCES; + return MDBX_EACCESS; uint64_t new = dbs->md_seq + increment; if (unlikely(new < increment)) diff --git a/src/osal.c b/src/osal.c index 429c6b1f..2502b097 100644 --- a/src/osal.c +++ b/src/osal.c @@ -337,7 +337,7 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) { if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) { int rc = GetLastError(); if (rc == ERROR_HANDLE_EOF && read == 0 && offset == 0) - return ENOENT; + return MDBX_ENODATA; return rc; } return (read == bytes) ? MDB_SUCCESS : ERROR_READ_FAULT; @@ -347,7 +347,7 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) { return MDB_SUCCESS; if (read < 0) return errno; - return (read == 0 && offset == 0) ? ENOENT : EIO; + return (read == 0 && offset == 0) ? 
MDBX_ENODATA : EIO; #endif } diff --git a/src/osal.h b/src/osal.h index 06789ae7..fcb6fce6 100644 --- a/src/osal.h +++ b/src/osal.h @@ -33,7 +33,6 @@ #include #include -#include #include #include #include From 19d877635c9a9a25e1d3c32728c393a396621c32 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 24 Apr 2017 15:50:43 +0300 Subject: [PATCH 077/303] mdbx: alter db-file extensions. --- src/mdbx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 319804e5..8052b351 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3900,11 +3900,11 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, } /** The name of the lock file in the DB environment */ -#define LOCKNAME "/lock.mdb" +#define LOCKNAME "/mdbx.lck" /** The name of the data file in the DB environment */ -#define DATANAME "/data.mdb" +#define DATANAME "/mdbx.dat" /** The suffix of the lock file when no subdir is used */ -#define LOCKSUFF "-lock" +#define LOCKSUFF "-lck" /** Only a subset of the @ref mdbx_env flags can be changed * at runtime. Changing other flags requires closing the * environment and re-opening it with the new flags. From 0d59cd4fe2642328cc62ca24154d313f032f0bae Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 24 Apr 2017 15:51:21 +0300 Subject: [PATCH 078/303] mdbx: rework lck/body setup. 
--- src/mdbx.c | 248 ++++++++++++++++++++++++++--------------------------- src/osal.c | 8 +- 2 files changed, 127 insertions(+), 129 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 8052b351..a0f29642 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -655,7 +655,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, unsigned nflags); -static int mdbx_env_read_header(MDB_env *env, MDB_meta *meta); +static int mdbx_read_header(MDB_env *env, MDB_meta *meta); static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending); static void mdbx_env_close0(MDB_env *env); @@ -3264,7 +3264,7 @@ fail: * @param[in] env the environment handle * @param[out] meta address of where to store the meta information * @return 0 on success, non-zero on failure. */ -static int __cold mdbx_env_read_header(MDB_env *env, MDB_meta *meta) { +static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { MDB_metabuf pbuf; MDB_page *p; MDB_meta *m; @@ -3612,16 +3612,14 @@ static int __cold mdbx_env_map(MDB_env *env, void *addr, size_t usedsize) { #endif #ifdef MADV_DONTDUMP - if (!(flags & MDBX_PAGEPERTURB)) { + if (!(flags & MDBX_PAGEPERTURB)) (void)madvise(env->me_map, env->me_mapsize, MADV_DONTDUMP); - } #endif #ifdef MADV_REMOVE - if (flags & MDB_WRITEMAP) { + if (flags & MDB_WRITEMAP) (void)madvise(env->me_map + usedsize, env->me_mapsize - usedsize, MADV_REMOVE); - } #else (void)usedsize; #endif @@ -3739,14 +3737,17 @@ int __cold mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers) { } /* Further setup required for opening an LMDB environment */ -static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { - int newenv = 0; - int rc = mdbx_env_read_header(env, meta); - if (unlikely(rc != MDB_SUCCESS)) { - if (rc != ENOENT) - return rc; - mdbx_debug("new mdbenv"); - newenv = 1; +static int __cold mdbx_setup_body(MDB_env *env, MDB_meta *meta, int lck_rc) { + int rc 
= MDBX_RESULT_FALSE; + int err = mdbx_read_header(env, meta); + if (unlikely(err != MDB_SUCCESS)) { + if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || + (env->me_flags & MDB_RDONLY)) + return err; + + mdbx_debug("create new database"); + rc = /* new database */ MDBX_RESULT_TRUE; + env->me_psize = env->me_os_psize; if (env->me_psize > MAX_PAGESIZE) env->me_psize = MAX_PAGESIZE; @@ -3771,84 +3772,77 @@ static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { meta->mm_mapsize = env->me_mapsize; } - if (newenv) { + if (rc == MDBX_RESULT_TRUE) { /* mdbx_env_map() may grow the datafile. Write the metapages * first, so the file will be valid if initialization fails. */ - rc = mdbx_env_init_meta(env, meta); - if (unlikely(rc != MDB_SUCCESS)) - return rc; + err = mdbx_env_init_meta(env, meta); + if (unlikely(err != MDB_SUCCESS)) + return err; - rc = mdbx_ftruncate(env->me_fd, env->me_mapsize); - if (unlikely(rc != MDB_SUCCESS)) - return rc; + err = mdbx_ftruncate(env->me_fd, env->me_mapsize); + if (unlikely(err != MDB_SUCCESS)) + return err; } const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; - rc = mdbx_env_map(env, NULL, usedsize); - if (rc) - return rc; + err = mdbx_env_map(env, NULL, usedsize); + if (err) + return err; mdbx_env_setup_limits(env, env->me_psize); - return MDB_SUCCESS; + return rc; } /****************************************************************************/ /* Open and/or initialize the lock region for the environment. 
*/ -static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, - int *excl) { +static int __cold mdbx_setup_locks(MDB_env *env, char *lck_pathname, int mode) { off_t size; assert(env->me_fd != INVALID_HANDLE_VALUE); assert(env->me_lfd == INVALID_HANDLE_VALUE); - int rc = mdbx_openfile(lpath, O_RDWR | O_CREAT, mode, &env->me_lfd); - if (rc != MDB_SUCCESS) { - if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { - env->me_lfd = INVALID_HANDLE_VALUE; - rc = MDB_SUCCESS; - } else { - return rc; - } + int err = mdbx_openfile(lck_pathname, O_RDWR | O_CREAT, mode, &env->me_lfd); + if (err != MDB_SUCCESS) { + if (err != EROFS || (env->me_flags & MDB_RDONLY) == 0) + return err; + /* LY: without-lck mode (e.g. on read-only filesystem) */ + env->me_lfd = INVALID_HANDLE_VALUE; } /* Try to get exclusive lock. If we succeed, then * nobody is using the lock region and we should initialize it. */ - rc = mdbx_lck_seize(env); - if (rc == MDBX_RESULT_TRUE) - *excl = true; - else if (rc == MDBX_RESULT_FALSE) - *excl = false; - else + const int rc = mdbx_lck_seize(env); + if (MDBX_IS_ERROR(rc)) return rc; - rc = mdbx_filesize(env->me_lfd, &size); - if (unlikely(rc != MDB_SUCCESS)) - return rc; + err = mdbx_filesize(env->me_lfd, &size); + if (unlikely(err != MDB_SUCCESS)) + return err; - if (*excl > 0) { + if (rc == MDBX_RESULT_TRUE) { off_t wanna = roundup2((env->me_maxreaders - 1) * sizeof(MDB_reader) + sizeof(MDBX_lockinfo), env->me_os_psize); if (size != wanna) { - rc = mdbx_ftruncate(env->me_lfd, wanna); - if (unlikely(rc != MDB_SUCCESS)) - return rc; + err = mdbx_ftruncate(env->me_lfd, wanna); + if (unlikely(err != MDB_SUCCESS)) + return err; size = wanna; } } env->me_maxreaders = (size - sizeof(MDBX_lockinfo)) / sizeof(MDB_reader) + 1; void *addr = NULL; - rc = mdbx_mmap(&addr, size, true, env->me_lfd); - if (unlikely(rc != MDB_SUCCESS)) - return rc; + err = mdbx_mmap(&addr, size, true, env->me_lfd); + if (unlikely(err != MDB_SUCCESS)) + return err; env->me_txns = 
addr; if (!(env->me_flags & MDB_NOTLS)) { - rc = mdbx_rthc_alloc(&env->me_txkey, &env->me_txns->mti_readers[0], - &env->me_txns->mti_readers[env->me_maxreaders]); - if (unlikely(rc != MDB_SUCCESS)) - return rc; + err = mdbx_rthc_alloc(&env->me_txkey, &env->me_txns->mti_readers[0], + &env->me_txns->mti_readers[env->me_maxreaders]); + if (unlikely(err != MDB_SUCCESS)) + return err; env->me_flags |= MDB_ENV_TXKEY; } @@ -3875,11 +3869,12 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, return errno; #endif - if (*excl > 0) { + if (rc == MDBX_RESULT_TRUE) { + /* LY: exlcusive mode, init lck */ memset(env->me_txns, 0, sizeof(MDBX_lockinfo)); - rc = mdbx_lck_init(env); - if (rc) - return rc; + err = mdbx_lck_init(env); + if (err) + return err; env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_format = MDB_LOCK_FORMAT; @@ -3896,7 +3891,7 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, } } - return MDB_SUCCESS; + return rc; } /** The name of the lock file in the DB environment */ @@ -3922,8 +3917,8 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, mode_t mode, int *exclusive) { - int oflags, rc, len, excl = -1; - char *lpath, *dpath; + int oflags, rc, len; + char *lck_pathname, *dxb_pathname; if (unlikely(!env || !path)) return MDBX_EINVAL; @@ -3941,18 +3936,18 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, } else { rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); } - lpath = malloc(rc); - if (!lpath) - return ENOMEM; + lck_pathname = malloc(rc); + if (!lck_pathname) + return MDBX_ENOMEM; if (flags & MDB_NOSUBDIR) { - dpath = lpath + len + sizeof(LOCKSUFF); - sprintf(lpath, "%s" LOCKSUFF, path); - strcpy(dpath, path); + dxb_pathname = lck_pathname + len + sizeof(LOCKSUFF); + sprintf(lck_pathname, "%s" LOCKSUFF, path); + strcpy(dxb_pathname, path); } else { - dpath = 
lpath + len + sizeof(LOCKNAME); - sprintf(lpath, "%s" LOCKNAME, path); - sprintf(dpath, "%s" DATANAME, path); + dxb_pathname = lck_pathname + len + sizeof(LOCKNAME); + sprintf(lck_pathname, "%s" LOCKNAME, path); + sprintf(dxb_pathname, "%s" DATANAME, path); } rc = MDB_SUCCESS; @@ -3986,59 +3981,62 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, else oflags = O_RDWR | O_CREAT; - rc = mdbx_openfile(dpath, oflags, mode, &env->me_fd); + rc = mdbx_openfile(dxb_pathname, oflags, mode, &env->me_fd); if (rc != MDB_SUCCESS) goto bailout; - rc = mdbx_env_setup_locks(env, lpath, mode, &excl); - if (rc) + const int lck_rc = mdbx_setup_locks(env, lck_pathname, mode); + if (MDBX_IS_ERROR(lck_rc)) { + rc = lck_rc; goto bailout; + } MDB_meta meta; - rc = mdbx_env_open2(env, &meta); - if (rc == MDB_SUCCESS) { - mdbx_debug("opened dbenv %p", (void *)env); - if (excl > 0) { - env->me_txns->mti_envmode = env->me_flags; - if (exclusive == NULL || *exclusive < 2) { - /* LY: downgrade lock only if exclusive access not requested. - * in case exclusive==1, just leave value as is. */ - rc = mdbx_lck_downgrade(env); - if (rc != MDB_SUCCESS) - goto bailout; - excl = 0; - } - } else { - if (exclusive) { - /* LY: just indicate that is not an exclusive access. */ - *exclusive = 0; - } - if ((env->me_txns->mti_envmode ^ env->me_flags) & - (MDB_WRITEMAP | MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC)) { - /* LY: Current mode/flags incompatible with requested. */ - rc = MDB_INCOMPATIBLE; + const int dxb_rc = mdbx_setup_body(env, &meta, lck_rc); + if (MDBX_IS_ERROR(dxb_rc)) { + rc = dxb_rc; + goto bailout; + } + + mdbx_debug("opened dbenv %p", (void *)env); + if (lck_rc == MDBX_RESULT_TRUE) { + env->me_txns->mti_envmode = env->me_flags; + if (exclusive == NULL || *exclusive < 2) { + /* LY: downgrade lock only if exclusive access not requested. + * in case exclusive==1, just leave value as is. 
*/ + rc = mdbx_lck_downgrade(env); + if (rc != MDB_SUCCESS) goto bailout; - } } - if (!(flags & MDB_RDONLY)) { - MDB_txn *txn; - int tsize = sizeof(MDB_txn), - size = tsize + - env->me_maxdbs * (sizeof(MDB_db) + sizeof(MDB_cursor *) + - sizeof(unsigned) + 1); - if ((env->me_pbuf = calloc(1, env->me_psize)) && - (txn = calloc(1, size))) { - txn->mt_dbs = (MDB_db *)((char *)txn + tsize); - txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); - txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); - txn->mt_env = env; - txn->mt_dbxs = env->me_dbxs; - txn->mt_flags = MDB_TXN_FINISHED; - env->me_txn0 = txn; - } else { - rc = ENOMEM; - } + } else { + if (exclusive) { + /* LY: just indicate that is not an exclusive access. */ + *exclusive = 0; + } + if ((env->me_txns->mti_envmode ^ env->me_flags) & + (MDB_WRITEMAP | MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC)) { + /* LY: Current mode/flags incompatible with requested. 
*/ + rc = MDB_INCOMPATIBLE; + goto bailout; + } + } + if (!(flags & MDB_RDONLY)) { + MDB_txn *txn; + int tsize = sizeof(MDB_txn), + size = tsize + + env->me_maxdbs * (sizeof(MDB_db) + sizeof(MDB_cursor *) + + sizeof(unsigned) + 1); + if ((env->me_pbuf = calloc(1, env->me_psize)) && (txn = calloc(1, size))) { + txn->mt_dbs = (MDB_db *)((char *)txn + tsize); + txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); + txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); + txn->mt_env = env; + txn->mt_dbxs = env->me_dbxs; + txn->mt_flags = MDB_TXN_FINISHED; + env->me_txn0 = txn; + } else { + rc = MDBX_ENOMEM; } } @@ -4063,7 +4061,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, bailout: if (rc) mdbx_env_close0(env); - free(lpath); + free(lck_pathname); return rc; } @@ -8464,24 +8462,24 @@ int __cold mdbx_env_copyfd(MDB_env *env, mdbx_filehandle_t fd) { int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) { int rc, len; - char *lpath; + char *lck_pathname; mdbx_filehandle_t newfd = INVALID_HANDLE_VALUE; if (env->me_flags & MDB_NOSUBDIR) { - lpath = (char *)path; + lck_pathname = (char *)path; } else { len = strlen(path); len += sizeof(DATANAME); - lpath = malloc(len); - if (!lpath) - return ENOMEM; - sprintf(lpath, "%s" DATANAME, path); + lck_pathname = malloc(len); + if (!lck_pathname) + return MDBX_ENOMEM; + sprintf(lck_pathname, "%s" DATANAME, path); } /* The destination path must exist, but the destination file must not. * We don't want the OS to cache the writes, since the source data is * already in the OS cache. 
*/ - rc = mdbx_openfile(lpath, O_WRONLY | O_CREAT | O_EXCL, 0666, &newfd); + rc = mdbx_openfile(lck_pathname, O_WRONLY | O_CREAT | O_EXCL, 0666, &newfd); if (rc == MDB_SUCCESS) { if (env->me_psize >= env->me_os_psize) { #ifdef F_NOCACHE /* __APPLE__ */ @@ -8496,7 +8494,7 @@ int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) { } if (!(env->me_flags & MDB_NOSUBDIR)) - free(lpath); + free(lck_pathname); if (newfd != INVALID_HANDLE_VALUE) { int err = mdbx_closefile(newfd); diff --git a/src/osal.c b/src/osal.c index 2502b097..c2d7ef78 100644 --- a/src/osal.c +++ b/src/osal.c @@ -333,12 +333,12 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) { ov.Offset = (DWORD)offset; ov.OffsetHigh = HIGH_DWORD(offset); - DWORD read; + DWORD read = 0; if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) { int rc = GetLastError(); - if (rc == ERROR_HANDLE_EOF && read == 0 && offset == 0) - return MDBX_ENODATA; - return rc; + if (rc == ERROR_HANDLE_EOF) + return (read == 0 && offset == 0) ? MDBX_ENODATA : ERROR_READ_FAULT; + return (rc == MDB_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc; } return (read == bytes) ? MDB_SUCCESS : ERROR_READ_FAULT; #else From 1b26de1f44733afa3d0e0426f9acedede52131fa Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 24 Apr 2017 16:35:02 +0300 Subject: [PATCH 079/303] ci: rename appveyor.yml --- .appveyor.yml => appveyor.yml | 2 -- 1 file changed, 2 deletions(-) rename .appveyor.yml => appveyor.yml (98%) diff --git a/.appveyor.yml b/appveyor.yml similarity index 98% rename from .appveyor.yml rename to appveyor.yml index 19db6535..850e61fb 100644 --- a/.appveyor.yml +++ b/appveyor.yml @@ -1,5 +1,3 @@ -max_jobs: 1 - image: Visual Studio 2015 environment: From e98a1e5319a00bd76302a8b6802411dcfaf9dbc0 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 24 Apr 2017 17:52:56 +0300 Subject: [PATCH 080/303] mdbx: move MDBX_LOCK_SUFFIX to API defs. 
--- mdbx.h | 7 +++++++ src/mdbx.c | 24 +++++++++--------------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/mdbx.h b/mdbx.h index a152fe40..d08f1822 100644 --- a/mdbx.h +++ b/mdbx.h @@ -95,6 +95,13 @@ extern "C" { MDBX_VERFOO(MDBX_VERSION_MAJOR, MDBX_VERSION_MINOR, MDBX_VERSION_PATCH, \ MDBX_VERSION_DATE) +/* The name of the lock file in the DB environment */ +#define MDBX_LOCKNAME "/mdbx.lck" +/* The name of the data file in the DB environment */ +#define MDBX_DATANAME "/mdbx.dat" +/* The suffix of the lock file when no subdir is used */ +#define MDBX_LOCK_SUFFIX "-lck" + /* Opaque structure for a database environment. * * A DB environment supports multiple databases, all residing in the same diff --git a/src/mdbx.c b/src/mdbx.c index a0f29642..2bea310f 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3894,12 +3894,6 @@ static int __cold mdbx_setup_locks(MDB_env *env, char *lck_pathname, int mode) { return rc; } -/** The name of the lock file in the DB environment */ -#define LOCKNAME "/mdbx.lck" -/** The name of the data file in the DB environment */ -#define DATANAME "/mdbx.dat" -/** The suffix of the lock file when no subdir is used */ -#define LOCKSUFF "-lck" /** Only a subset of the @ref mdbx_env flags can be changed * at runtime. Changing other flags requires closing the * environment and re-opening it with the new flags. 
@@ -3932,22 +3926,22 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, len = strlen(path); if (flags & MDB_NOSUBDIR) { - rc = len + sizeof(LOCKSUFF) + len + 1; + rc = len + sizeof(MDBX_LOCK_SUFFIX) + len + 1; } else { - rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); + rc = len + sizeof(MDBX_LOCKNAME) + len + sizeof(MDBX_DATANAME); } lck_pathname = malloc(rc); if (!lck_pathname) return MDBX_ENOMEM; if (flags & MDB_NOSUBDIR) { - dxb_pathname = lck_pathname + len + sizeof(LOCKSUFF); - sprintf(lck_pathname, "%s" LOCKSUFF, path); + dxb_pathname = lck_pathname + len + sizeof(MDBX_LOCK_SUFFIX); + sprintf(lck_pathname, "%s" MDBX_LOCK_SUFFIX, path); strcpy(dxb_pathname, path); } else { - dxb_pathname = lck_pathname + len + sizeof(LOCKNAME); - sprintf(lck_pathname, "%s" LOCKNAME, path); - sprintf(dxb_pathname, "%s" DATANAME, path); + dxb_pathname = lck_pathname + len + sizeof(MDBX_LOCKNAME); + sprintf(lck_pathname, "%s" MDBX_LOCKNAME, path); + sprintf(dxb_pathname, "%s" MDBX_DATANAME, path); } rc = MDB_SUCCESS; @@ -8469,11 +8463,11 @@ int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) { lck_pathname = (char *)path; } else { len = strlen(path); - len += sizeof(DATANAME); + len += sizeof(MDBX_DATANAME); lck_pathname = malloc(len); if (!lck_pathname) return MDBX_ENOMEM; - sprintf(lck_pathname, "%s" DATANAME, path); + sprintf(lck_pathname, "%s" MDBX_DATANAME, path); } /* The destination path must exist, but the destination file must not. 
From 66d842c23bcad252aa3b0c85e93a481490026300 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 24 Apr 2017 19:03:38 +0300 Subject: [PATCH 081/303] ci: provide test.log --- .gitignore | 5 +++-- Makefile | 2 +- appveyor.yml | 8 ++++++-- circle.yml | 2 +- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 84d48914..0222b199 100644 --- a/.gitignore +++ b/.gitignore @@ -23,11 +23,12 @@ mdbx_stat /test/test test/test.vcxproj.user test/tmp.db -test/tmp.db-lock +test/tmp.db-lck tmp.db -tmp.db-lock +tmp.db-lck valgrind.* .vs/ Win32/ x64/ x86/ +test.log diff --git a/Makefile b/Makefile index b9c36c74..cbe96991 100644 --- a/Makefile +++ b/Makefile @@ -66,7 +66,7 @@ clean: rm -rf $(TOOLS) test/test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err check: test/test - test/test --pathname=tmp.db --dont-cleanup-after basic && ./mdbx_chk -vn tmp.db + test/test --pathname=tmp.db --dont-cleanup-after basic | tee test.log | tail -n 42 && ./mdbx_chk -vn tmp.db mdbx.o: $(MDBX_SRC) Makefile $(CC) $(CFLAGS) -c src/mdbx.c -o $@ diff --git a/appveyor.yml b/appveyor.yml index 850e61fb..214c853d 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -23,7 +23,11 @@ build: test_script: - ps: | if (($env:PLATFORM -eq "x86") -and (Test-Path "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" -PathType Leaf)) { - & "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" --pathname=tmp.db --dont-cleanup-after basic + $test = "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" } else { - & "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\test.exe" --pathname=tmp.db --dont-cleanup-after basic + $test = "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\test.exe" } + & "$test" --pathname=tmp.db --dont-cleanup-after basic | Tee-Object -file test.log | Select-Object -last 42 + +artifacts: + - path: test.log diff --git a/circle.yml b/circle.yml index c10629e1..77da30e9 100644 --- a/circle.yml +++ b/circle.yml @@ -11,4 +11,4 @@ compile: test: 
override: - - make check + - make check || mv test.log ${CIRCLE_ARTIFACTS}/ From 993730d2f1af4f311817950231395df66e132e16 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 24 Apr 2017 19:37:01 +0300 Subject: [PATCH 082/303] mdbx: more cleanup for Windows. --- mdbx_osal.h | 8 ++++++-- src/mdbx.c | 27 ++++++++++----------------- src/osal.c | 28 ++++++++++++++++++---------- src/osal.h | 2 +- test/chrono.cc | 6 +++--- 5 files changed, 38 insertions(+), 33 deletions(-) diff --git a/mdbx_osal.h b/mdbx_osal.h index 316d24a9..da0e2ca4 100644 --- a/mdbx_osal.h +++ b/mdbx_osal.h @@ -88,19 +88,22 @@ #include #if defined(_WIN32) || defined(_WIN64) + #include #include typedef unsigned mode_t; typedef HANDLE mdbx_filehandle_t; typedef DWORD mdbx_pid_t; typedef DWORD mdbx_tid_t; - #define MDBX_ENODATA ERROR_HANDLE_EOF #define MDBX_EINVAL ERROR_INVALID_PARAMETER #define MDBX_EACCESS ERROR_ACCESS_DENIED #define MDBX_ENOMEM ERROR_OUTOFMEMORY +#define MDBX_EROFS ERROR_FILE_READ_ONLY +#define MDBX_ENOSYS ERROR_NOT_SUPPORTED #else + #include /* for error codes */ #include /* for pthread_t */ #include /* for pid_t */ @@ -109,11 +112,12 @@ typedef DWORD mdbx_tid_t; typedef int mdbx_filehandle_t; typedef pid_t mdbx_pid_t; typedef pthread_t mdbx_tid_t; - #define MDBX_ENODATA ENODATA #define MDBX_EINVAL EINVAL #define MDBX_EACCESS EACCES #define MDBX_ENOMEM ENOMEM +#define MDBX_EROFS EROFS +#define MDBX_ENOSYS ENOSYS #endif diff --git a/src/mdbx.c b/src/mdbx.c index 2bea310f..85fc918a 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2962,12 +2962,9 @@ static int mdbx_page_flush(MDB_txn *txn, int keep) { /* Write up to MDB_COMMIT_PAGES dirty pages at a time. 
*/ if (pos != next_pos || n == MDB_COMMIT_PAGES || wsize + size > MAX_WRITE) { if (n) { - retry: /* Write previous page(s) */ rc = mdbx_pwritev(env->me_fd, iov, n, wpos, wsize); if (unlikely(rc != MDB_SUCCESS)) { - if (rc == EINTR) - goto retry; mdbx_debug("Write error: %s", strerror(rc)); return rc; } @@ -3395,12 +3392,11 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { if (flags & MDB_WRITEMAP) { rc = mdbx_msync(env->me_map, used_size, flags & MDB_MAPASYNC); if (unlikely(rc != MDB_SUCCESS)) - /* LY: mdbx_msync() should never return EINTR */ goto fail; if ((flags & MDB_MAPASYNC) == 0) env->me_sync_pending = 0; } else { - bool syncmeta = false; + bool fullsync = false; if (unlikely(prev_mapsize != pending->mm_mapsize)) { /* LY: It is no reason to use fdatasync() here, even in case * no such bug in a kernel. Because "no-bug" mean that a kernel @@ -3412,13 +3408,11 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { * * For more info about of a corresponding fdatasync() bug * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ - syncmeta = true; - } - while ( - unlikely((rc = mdbx_filesync(env->me_fd, syncmeta)) != MDB_SUCCESS)) { - if (rc != EINTR) - goto fail; + fullsync = true; } + rc = mdbx_filesync(env->me_fd, fullsync); + if (unlikely(rc != MDB_SUCCESS)) + goto fail; env->me_sync_pending = 0; } } @@ -3498,10 +3492,9 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { if (unlikely(rc != MDB_SUCCESS)) goto fail; } else { - while (unlikely((rc = mdbx_filesync(env->me_fd, false)) != MDB_SUCCESS)) { - if (rc != EINTR) - goto undo; - } + rc = mdbx_filesync(env->me_fd, false); + if (rc != MDB_SUCCESS) + goto undo; } } @@ -3803,7 +3796,7 @@ static int __cold mdbx_setup_locks(MDB_env *env, char *lck_pathname, int mode) { int err = mdbx_openfile(lck_pathname, O_RDWR | O_CREAT, mode, &env->me_lfd); if (err != MDB_SUCCESS) { - if (err != EROFS || (env->me_flags & MDB_RDONLY) == 
0) + if (err != MDBX_EROFS || (env->me_flags & MDB_RDONLY) == 0) return err; /* LY: without-lck mode (e.g. on read-only filesystem) */ env->me_lfd = INVALID_HANDLE_VALUE; @@ -8547,7 +8540,7 @@ int __cold mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func) { return MDB_SUCCESS; #else (void)func; - return ENOSYS; + return MDBX_ENOSYS; #endif } diff --git a/src/osal.c b/src/osal.c index c2d7ef78..efa17ba1 100644 --- a/src/osal.c +++ b/src/osal.c @@ -114,7 +114,8 @@ int mdbx_asprintf(char **strp, const char *fmt, ...) { *strp = malloc(needed + 1); if (unlikely(*strp == NULL)) { va_end(ones); - return -ENOMEM; + SetLastError(MDBX_ENOMEM); + return -1; } #if defined(vsnprintf) || defined(_BSD_SOURCE) || _XOPEN_SOURCE >= 500 || \ @@ -451,19 +452,26 @@ int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { } } -int mdbx_filesync(mdbx_filehandle_t fd, bool syncmeta) { +int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync) { #if defined(_WIN32) || defined(_WIN64) - (void)syncmeta; - return FlushFileBuffers(fd) ? 0 : -1; + (void)fullsync; + return FlushFileBuffers(fd) ? MDB_SUCCESS : GetLastError(); #elif __GLIBC_PREREQ(2, 16) || _BSD_SOURCE || _XOPEN_SOURCE || \ (__GLIBC_PREREQ(2, 8) && _POSIX_C_SOURCE >= 200112L) + for (;;) { #if _POSIX_C_SOURCE >= 199309L || _XOPEN_SOURCE >= 500 || \ defined(_POSIX_SYNCHRONIZED_IO) - if (!syncmeta) - return (fdatasync(fd) == 0) ? MDB_SUCCESS : errno; + if (!fullsync && fdatasync(fd) == 0) + return MDB_SUCCESS; +#else + (void)fullsync; #endif - (void)syncmeta; - return (fsync(fd) == 0) ? MDB_SUCCESS : errno; + if (fsync(fd) == 0) + return MDB_SUCCESS; + int rc = errno; + if (rc != EINTR) + return rc; + } #else #error FIXME #endif @@ -568,7 +576,7 @@ int mdbx_msync(void *addr, size_t length, int async) { #if defined(_WIN32) || defined(_WIN64) if (async) return MDB_SUCCESS; - return FlushViewOfFile(addr, length) ? 0 : GetLastError(); + return FlushViewOfFile(addr, length) ? 
MDB_SUCCESS : GetLastError(); #else return (msync(addr, length, async ? MS_ASYNC : MS_SYNC) == 0) ? MDB_SUCCESS : errno; @@ -580,7 +588,7 @@ int mdbx_mremap_size(void **address, size_t old_size, size_t new_size) { *address = MAP_FAILED; (void)old_size; (void)new_size; - return ERROR_NOT_SUPPORTED; + return ERROR_CALL_NOT_IMPLEMENTED; #else *address = mremap(*address, old_size, new_size, 0, address); return (*address != MAP_FAILED) ? MDB_SUCCESS : errno; diff --git a/src/osal.h b/src/osal.h index fcb6fce6..27d6ee27 100644 --- a/src/osal.h +++ b/src/osal.h @@ -363,7 +363,7 @@ void mdbx_thread_key_delete(mdbx_thread_key_t key); void *mdbx_thread_rthc_get(mdbx_thread_key_t key); void mdbx_thread_rthc_set(mdbx_thread_key_t key, const void *value); -int mdbx_filesync(mdbx_filehandle_t fd, bool syncmeta); +int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync); int mdbx_ftruncate(mdbx_filehandle_t fd, off_t length); int mdbx_filesize(mdbx_filehandle_t fd, off_t *length); int mdbx_openfile(const char *pathname, int flags, mode_t mode, diff --git a/test/chrono.cc b/test/chrono.cc index 20eb7c36..bea5392d 100644 --- a/test/chrono.cc +++ b/test/chrono.cc @@ -94,8 +94,8 @@ time now_motonic() { if (reciprocal == 0) { if (!QueryPerformanceFrequency(&Frequency)) failure_perror("QueryPerformanceFrequency()", GetLastError()); - reciprocal = - ((UINT64_C(1) << 32) + Frequency.QuadPart / 2) / Frequency.QuadPart; + reciprocal = (uint32_t)(((UINT64_C(1) << 32) + Frequency.QuadPart / 2) / + Frequency.QuadPart); assert(reciprocal); } @@ -104,7 +104,7 @@ time now_motonic() { failure_perror("QueryPerformanceCounter()", GetLastError()); time result; - result.integer = Counter.QuadPart / Frequency.QuadPart; + result.integer = (uint32_t)(Counter.QuadPart / Frequency.QuadPart); uint64_t mod = Counter.QuadPart % Frequency.QuadPart; assert(mod < UINT32_MAX); result.fractional = UInt32x32To64((uint32_t)mod, reciprocal); From e7e8e1c59a318225a044f3b463c2d473ceb2ee92 Mon Sep 17 00:00:00 2001 From: 
Leonid Yuriev Date: Tue, 25 Apr 2017 00:26:11 +0300 Subject: [PATCH 083/303] test: refine reciprocal division (chrono). --- test/chrono.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/test/chrono.cc b/test/chrono.cc index bea5392d..444aca66 100644 --- a/test/chrono.cc +++ b/test/chrono.cc @@ -89,13 +89,13 @@ time now_realtime() { time now_motonic() { #if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) - static uint32_t reciprocal; + static uint64_t reciprocal; static LARGE_INTEGER Frequency; if (reciprocal == 0) { if (!QueryPerformanceFrequency(&Frequency)) failure_perror("QueryPerformanceFrequency()", GetLastError()); - reciprocal = (uint32_t)(((UINT64_C(1) << 32) + Frequency.QuadPart / 2) / - Frequency.QuadPart); + reciprocal = (((UINT64_C(1) << 48) + Frequency.QuadPart / 2 + 1) / + Frequency.QuadPart); assert(reciprocal); } @@ -104,10 +104,9 @@ time now_motonic() { failure_perror("QueryPerformanceCounter()", GetLastError()); time result; - result.integer = (uint32_t)(Counter.QuadPart / Frequency.QuadPart); + result.fixedpoint = (Counter.QuadPart / Frequency.QuadPart) << 32; uint64_t mod = Counter.QuadPart % Frequency.QuadPart; - assert(mod < UINT32_MAX); - result.fractional = UInt32x32To64((uint32_t)mod, reciprocal); + result.fixedpoint += (mod * reciprocal) >> 16; return result; #else struct timespec ts; From 925064aa11631615c30ee18aa49357f69beb4b83 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 25 Apr 2017 15:00:36 +0300 Subject: [PATCH 084/303] test: add us-timestamp to logs. --- test/log.cc | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/test/log.cc b/test/log.cc index 0123cb6a..3f6df2ce 100644 --- a/test/log.cc +++ b/test/log.cc @@ -88,15 +88,30 @@ bool output(loglevel priority, const char *format, ...) 
{ bool output(loglevel priority, const char *format, va_list ap) { if (last) { putc('\n', last); + fflush(last); last = nullptr; } if (priority < level) return false; + chrono::time now = chrono::now_realtime(); + struct tm tm; + time_t time = now.utc; +#ifdef _MSC_VER + int rc = _localtime32_s(&tm, (const __time32_t *)&now.utc); +#else + int rc = localtime_r(&time, &tm) ? MDB_SUCCESS : errno; +#endif + if (rc != MDB_SUCCESS) + failure_perror("localtime_r()", rc); + last = (priority >= error) ? stderr : stdout; - fprintf(last, "[ %u%10s %.4s ] %s" /* TODO */, osal_getpid(), prefix.c_str(), - level2str(priority), suffix.c_str()); + fprintf(last, + "[ %02d%02d%02d-%02d:%02d:%02d.%06d_%05u %-10s %.4s ] %s" /* TODO */, + tm.tm_year - 100, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min, + tm.tm_sec, chrono::fractional2us(now.fractional), osal_getpid(), + prefix.c_str(), level2str(priority), suffix.c_str()); vfprintf(last, format, ap); size_t len = strlen(format); @@ -105,9 +120,10 @@ bool output(loglevel priority, const char *format, va_list ap) { default: putc('\n', last); case '\n': + fflush(last); + last = nullptr; if (priority > info) fflushall(); - last = nullptr; case ' ': case '_': case ':': @@ -128,8 +144,10 @@ bool feed(const char *format, va_list ap) { vfprintf(last, format, ap); size_t len = strlen(format); - if (len && format[len - 1] == '\n') + if (len && format[len - 1] == '\n') { + fflush(last); last = nullptr; + } return true; } From 99abf56c6c612680ec886df906aa11fc8cc5ee96 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 25 Apr 2017 16:09:02 +0300 Subject: [PATCH 085/303] test: fix now_realtime() for Windows. 
--- test/chrono.cc | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/test/chrono.cc b/test/chrono.cc index 444aca66..b6245295 100644 --- a/test/chrono.cc +++ b/test/chrono.cc @@ -73,11 +73,20 @@ time from_ms(uint64_t ms) { time now_realtime() { #if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) + static void(WINAPI * query_time)(LPFILETIME); + if (!query_time) { + query_time = (void(WINAPI *)(LPFILETIME))GetProcAddress( + GetModuleHandle(TEXT("kernel32.dll")), + "GetSystemTimePreciseAsFileTime"); + if (!query_time) + query_time = GetSystemTimeAsFileTime; + } + FILETIME filetime; - GetSystemTimeAsFileTime(&filetime); - uint64_t ns = + query_time(&filetime); + uint64_t ns100 = (uint64_t)filetime.dwHighDateTime << 32 | filetime.dwLowDateTime; - return from_ns(ns); + return from_ns((ns100 - UINT64_C(116444736000000000)) * 100u); #else struct timespec ts; if (unlikely(clock_gettime(CLOCK_REALTIME, &ts))) From f91218bda4c0d7022496127d3c11866bf47ebe0a Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 25 Apr 2017 18:30:31 +0300 Subject: [PATCH 086/303] ci: Push-AppveyorArtifact on failure. --- appveyor.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 214c853d..f8b9393c 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -29,5 +29,5 @@ test_script: } & "$test" --pathname=tmp.db --dont-cleanup-after basic | Tee-Object -file test.log | Select-Object -last 42 -artifacts: - - path: test.log +on_failure: + - ps: Push-AppveyorArtifact test.log From 326dea8bc12e653f14df06d3589ff5e85216c75b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 25 Apr 2017 19:17:57 +0300 Subject: [PATCH 087/303] mdbx: use FormatMessage() for Windows. 
--- TODO.md | 2 +- src/mdbx.c | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/TODO.md b/TODO.md index 601d2fe0..ab59b4c3 100644 --- a/TODO.md +++ b/TODO.md @@ -1,4 +1,4 @@ -- [ ] разделение errno и GetLastError() +- [x] разделение errno и GetLastError() - [x] CI посредством AppVeyor - [ ] uint32/uint64 в структурах - [ ] правки API (много...) diff --git a/src/mdbx.c b/src/mdbx.c index 85fc918a..657fe6f7 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -763,10 +763,10 @@ const char *__cold mdbx_strerror_r(int errnum, char *buf, size_t buflen) { if (!buflen) return NULL; #ifdef _MSC_VER - int rc = strerror_s(buf, buflen, errnum); - assert(rc == 0); - (void)rc; - return buf; + size_t size = FormatMessageA( + FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, + errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, buflen, NULL); + return size ? buf : NULL; #elif defined(_GNU_SOURCE) /* GNU-specific */ msg = strerror_r(errnum, buf, buflen); @@ -792,10 +792,12 @@ const char *__cold mdbx_strerror(int errnum) { if (!msg) { #ifdef _MSC_VER static __thread char buffer[1024]; - int rc = strerror_s(buffer, sizeof(buffer), errnum); - assert(rc == 0); - (void)rc; - msg = buffer; + size_t size = FormatMessageA( + FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, + errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buffer, + sizeof(buffer), NULL); + if (size) + msg = buffer; #else msg = strerror(errnum); #endif From aa59522dbe38ef945d079468e58ce0d19ac8917f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 25 Apr 2017 19:51:23 +0300 Subject: [PATCH 088/303] mdbx: minor refine open-path. 
--- src/mdbx.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 657fe6f7..c48601c5 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3732,7 +3732,7 @@ int __cold mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers) { } /* Further setup required for opening an LMDB environment */ -static int __cold mdbx_setup_body(MDB_env *env, MDB_meta *meta, int lck_rc) { +static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { int rc = MDBX_RESULT_FALSE; int err = mdbx_read_header(env, meta); if (unlikely(err != MDB_SUCCESS)) { @@ -3791,7 +3791,7 @@ static int __cold mdbx_setup_body(MDB_env *env, MDB_meta *meta, int lck_rc) { /****************************************************************************/ /* Open and/or initialize the lock region for the environment. */ -static int __cold mdbx_setup_locks(MDB_env *env, char *lck_pathname, int mode) { +static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { off_t size; assert(env->me_fd != INVALID_HANDLE_VALUE); assert(env->me_lfd == INVALID_HANDLE_VALUE); @@ -3833,14 +3833,6 @@ static int __cold mdbx_setup_locks(MDB_env *env, char *lck_pathname, int mode) { return err; env->me_txns = addr; - if (!(env->me_flags & MDB_NOTLS)) { - err = mdbx_rthc_alloc(&env->me_txkey, &env->me_txns->mti_readers[0], - &env->me_txns->mti_readers[env->me_maxreaders]); - if (unlikely(err != MDB_SUCCESS)) - return err; - env->me_flags |= MDB_ENV_TXKEY; - } - #ifdef MADV_NOHUGEPAGE (void)madvise(env->me_txns, size, MADV_NOHUGEPAGE); #endif @@ -3974,22 +3966,24 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, if (rc != MDB_SUCCESS) goto bailout; - const int lck_rc = mdbx_setup_locks(env, lck_pathname, mode); + const int lck_rc = mdbx_setup_lck(env, lck_pathname, mode); if (MDBX_IS_ERROR(lck_rc)) { rc = lck_rc; goto bailout; } MDB_meta meta; - const int dxb_rc = mdbx_setup_body(env, &meta, lck_rc); + 
const int dxb_rc = mdbx_setup_dxb(env, &meta, lck_rc); if (MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; goto bailout; } mdbx_debug("opened dbenv %p", (void *)env); + const unsigned mode_flags = + MDB_WRITEMAP | MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC; if (lck_rc == MDBX_RESULT_TRUE) { - env->me_txns->mti_envmode = env->me_flags; + env->me_txns->mti_envmode = env->me_flags & mode_flags; if (exclusive == NULL || *exclusive < 2) { /* LY: downgrade lock only if exclusive access not requested. * in case exclusive==1, just leave value as is. */ @@ -4002,14 +3996,22 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, /* LY: just indicate that is not an exclusive access. */ *exclusive = 0; } - if ((env->me_txns->mti_envmode ^ env->me_flags) & - (MDB_WRITEMAP | MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC)) { + if ((env->me_txns->mti_envmode ^ env->me_flags) & mode_flags) { /* LY: Current mode/flags incompatible with requested. */ rc = MDB_INCOMPATIBLE; goto bailout; } } - if (!(flags & MDB_RDONLY)) { + + if ((env->me_flags & MDB_NOTLS) == 0) { + rc = mdbx_rthc_alloc(&env->me_txkey, &env->me_txns->mti_readers[0], + &env->me_txns->mti_readers[env->me_maxreaders]); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + env->me_flags |= MDB_ENV_TXKEY; + } + + if ((flags & MDB_RDONLY) == 0) { MDB_txn *txn; int tsize = sizeof(MDB_txn), size = tsize + From 5ed0ccfcbb02df87c6db93474d730be5cd891708 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 25 Apr 2017 19:58:00 +0300 Subject: [PATCH 089/303] mdbx: rework mdbx_pread(). 
--- src/mdbx.c | 2 +- src/osal.c | 13 +++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index c48601c5..c1014675 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3737,7 +3737,7 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { int err = mdbx_read_header(env, meta); if (unlikely(err != MDB_SUCCESS)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || - (env->me_flags & MDB_RDONLY)) + (env->me_flags & MDB_RDONLY) != 0) return err; mdbx_debug("create new database"); diff --git a/src/osal.c b/src/osal.c index efa17ba1..0315056c 100644 --- a/src/osal.c +++ b/src/osal.c @@ -337,19 +337,16 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) { DWORD read = 0; if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) { int rc = GetLastError(); - if (rc == ERROR_HANDLE_EOF) - return (read == 0 && offset == 0) ? MDBX_ENODATA : ERROR_READ_FAULT; return (rc == MDB_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc; } - return (read == bytes) ? MDB_SUCCESS : ERROR_READ_FAULT; #else ssize_t read = pread(fd, buf, bytes, offset); - if (likely(bytes == (size_t)read)) - return MDB_SUCCESS; - if (read < 0) - return errno; - return (read == 0 && offset == 0) ? MDBX_ENODATA : EIO; + if (read < 0) { + int rc = errno; + return (rc == MDB_SUCCESS) ? /* paranoia */ EIO : rc; + } #endif + return (bytes == (size_t)read) ? MDB_SUCCESS : MDBX_ENODATA; } int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, From 9a2806c6a8eb225ed8b72e22e9ff02f8acf8045c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 25 Apr 2017 20:02:28 +0300 Subject: [PATCH 090/303] test: add failfast option. 
--- test/main.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/main.cc b/test/main.cc index 5ad2cb20..14805366 100644 --- a/test/main.cc +++ b/test/main.cc @@ -241,6 +241,9 @@ int main(int argc, char *const argv[]) { configure_actor(lastid, ac_deadwrite, value, params); continue; } + if (config::parse_option(argc, argv, narg, "failfast", + global::config::failfast)) + continue; if (*argv[narg] != '-') testcase_setup(argv[narg], params, lastid); From 99002d016f773c530ed619b58bee5024de40d0cd Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 26 Apr 2017 14:17:52 +0300 Subject: [PATCH 091/303] mdbx: add yielding for lck-testing for Windows if NDEBUG undefined. --- src/lck-windows.c | 69 +++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/src/lck-windows.c b/src/lck-windows.c index 789163bc..250e345e 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -24,6 +24,19 @@ * LY */ +static __inline void jitter4testing(void) { +#ifndef NDEBUG + for (;;) { + unsigned coin = ((unsigned)__rdtsc() * 277u) % 43u; + if (coin < 43 / 3) + break; + SwitchToThread(); + if (coin > 43 * 2 / 3) + Sleep(1); + } +#endif +} + /*----------------------------------------------------------------------------*/ /* rthc */ @@ -98,44 +111,18 @@ void mdbx_rthc_unlock(void) { LeaveCriticalSection(&rthc_critical_section); } #define LCK_WAITFOR 0 #define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY -static BOOL flock(mdbx_filehandle_t fd, DWORD flags, off_t offset, - size_t bytes) { +static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, off_t offset, + size_t bytes) { OVERLAPPED ov; ov.hEvent = 0; ov.Offset = (DWORD)offset; ov.OffsetHigh = HIGH_DWORD(offset); - -#ifdef MDBX_WINDOWS_UnlockFile_CRUTCH - if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) - return true; - - if ((flags & LOCKFILE_FAIL_IMMEDIATELY) == 0) - return false; - - int rc = GetLastError(); - if (rc != ERROR_SHARING_VIOLATION && rc != 
ERROR_LOCK_VIOLATION) - return false; - - /* FIXME: Windows kernel is ugly and mad... */ - SwitchToThread(); - Sleep(42); - SwitchToThread(); -#endif /* MDBX_WINDOWS_UnlockFile_CRUTCH */ return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov); } -static BOOL funlock(mdbx_filehandle_t fd, off_t offset, size_t bytes) { - if (!UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, - HIGH_DWORD(bytes))) - return false; - -#ifdef MDBX_WINDOWS_UnlockFile_CRUTCH - /* FIXME: Windows kernel is ugly and mad... */ - SwitchToThread(); - Sleep(42); - SwitchToThread(); -#endif /* MDBX_WINDOWS_UnlockFile_CRUTCH */ - return true; +static __inline BOOL funlock(mdbx_filehandle_t fd, off_t offset, size_t bytes) { + return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, + HIGH_DWORD(bytes)); } /*----------------------------------------------------------------------------*/ @@ -219,18 +206,22 @@ static int internal_seize_lck(HANDLE lfd) { assert(lfd != INVALID_HANDLE_VALUE); /* 1) now on ?-? (free), get ?-E (middle) */ + jitter4testing(); if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { rc = GetLastError() /* 2) something went wrong, give up */; mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "?-?(free) >> ?-E(middle)", rc); return rc; } + /* 3) now on ?-E (middle), try E-E (exclusive) */ + jitter4testing(); if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive), done */ /* 5) still on ?-E (middle) */ rc = GetLastError(); + jitter4testing(); if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, give up */ if (!funlock(lfd, LCK_UPPER)) { @@ -242,9 +233,11 @@ static int internal_seize_lck(HANDLE lfd) { } /* 7) still on ?-E (middle), try S-E (locked) */ + jitter4testing(); rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? 
MDBX_RESULT_FALSE : GetLastError(); + jitter4testing(); if (rc != MDBX_RESULT_FALSE) mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "?-E(middle) >> S-E(locked)", rc); @@ -267,6 +260,7 @@ int mdbx_lck_seize(MDB_env *env) { assert(env->me_fd != INVALID_HANDLE_VALUE); if (env->me_lfd == INVALID_HANDLE_VALUE) { /* LY: without-lck mode (e.g. on read-only filesystem) */ + jitter4testing(); if (!flock(env->me_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { rc = GetLastError(); mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); @@ -276,6 +270,7 @@ int mdbx_lck_seize(MDB_env *env) { } rc = internal_seize_lck(env->me_lfd); + jitter4testing(); if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDB_RDONLY) == 0) { /* Check that another process don't operates in without-lck mode. * Doing such check by exclusive locking the body-part of db. Should be @@ -287,11 +282,15 @@ int mdbx_lck_seize(MDB_env *env) { rc = GetLastError(); mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lock-against-without-lck", rc); + jitter4testing(); mdbx_lck_destroy(env); - } else if (!funlock(env->me_fd, LCK_BODY)) { - rc = GetLastError(); - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "unlock-against-without-lck", rc); + } else { + jitter4testing(); + if (!funlock(env->me_fd, LCK_BODY)) { + rc = GetLastError(); + mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + "unlock-against-without-lck", rc); + } } } From 953c6962ea0f67335dab97d3fd076490db0cb1f6 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 26 Apr 2017 18:15:09 +0300 Subject: [PATCH 092/303] test: minor typo fix. 
--- test/test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test.cc b/test/test.cc index 346d2e8d..6e5dd884 100644 --- a/test/test.cc +++ b/test/test.cc @@ -261,7 +261,7 @@ void testcase::fetch_canary() { failure_perror("mdbx_canary_get()", rc); if (canary_now.v < last.canary.v) - failure("fetch_canary: %" PRIu64 " canary-now.v) < %" PRIu64 + failure("fetch_canary: %" PRIu64 "(canary-now.v) < %" PRIu64 "(canary-last.v)", canary_now.v, last.canary.v); if (canary_now.y < last.canary.y) From 333e5fada3414b29b3c1d4378d017e7282355ac2 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 26 Apr 2017 18:13:48 +0300 Subject: [PATCH 093/303] mdbx: internal cleanup (rename, etc). --- src/lck-posix.c | 20 +++++++-------- src/mdbx.c | 68 ++++++++++++++++++++++++------------------------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index 0253ec39..aae8020e 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -104,10 +104,10 @@ int mdbx_lck_init(MDB_env *env) { goto bailout; #endif /* PTHREAD_PRIO_INHERIT */ - rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &ma); + rc = pthread_mutex_init(&env->me_lck->mti_rmutex, &ma); if (rc) goto bailout; - rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &ma); + rc = pthread_mutex_init(&env->me_lck->mti_wmutex, &ma); bailout: pthread_mutexattr_destroy(&ma); @@ -117,12 +117,12 @@ bailout: void mdbx_lck_destroy(MDB_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* try get exclusive access */ - if (env->me_txns && + if (env->me_lck && mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, 0, LCK_WHOLE) == 0) { /* got exclusive, drown mutexes */ - int rc = pthread_mutex_destroy(&env->me_txns->mti_rmutex); + int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex); if (rc == 0) - rc = pthread_mutex_destroy(&env->me_txns->mti_wmutex); + rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex); assert(rc == 0); (void)rc; /* lock would be released (by kernel) while the 
me_lfd will be closed */ @@ -145,22 +145,22 @@ static int mdbx_robust_unlock(MDB_env *env, pthread_mutex_t *mutex) { } int mdbx_rdt_lock(MDB_env *env) { - return mdbx_robust_lock(env, &env->me_txns->mti_rmutex); + return mdbx_robust_lock(env, &env->me_lck->mti_rmutex); } void mdbx_rdt_unlock(MDB_env *env) { - int rc = mdbx_robust_unlock(env, &env->me_txns->mti_rmutex); + int rc = mdbx_robust_unlock(env, &env->me_lck->mti_rmutex); if (unlikely(MDBX_IS_ERROR(rc))) mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); } int mdbx_txn_lock(MDB_env *env) { - int rc = mdbx_robust_lock(env, &env->me_txns->mti_wmutex); + int rc = mdbx_robust_lock(env, &env->me_lck->mti_wmutex); return MDBX_IS_ERROR(rc) ? rc : MDB_SUCCESS; } void mdbx_txn_unlock(MDB_env *env) { - int rc = mdbx_robust_unlock(env, &env->me_txns->mti_wmutex); + int rc = mdbx_robust_unlock(env, &env->me_lck->mti_wmutex); if (unlikely(MDBX_IS_ERROR(rc))) mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); } @@ -253,7 +253,7 @@ static int __cold mdbx_mutex_failed(MDB_env *env, mdbx_mutex_t *mutex, int rc) { if (rc == EOWNERDEAD) { /* We own the mutex. Clean up after dead previous owner. */ - int rlocked = (mutex == &env->me_txns->mti_rmutex); + int rlocked = (mutex == &env->me_lck->mti_rmutex); rc = MDB_SUCCESS; if (!rlocked) { if (unlikely(env->me_txn)) { diff --git a/src/mdbx.c b/src/mdbx.c index c1014675..f36bbc37 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1409,8 +1409,8 @@ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { txnid_t oldest = mdbx_meta_lt(a, b) ? 
b->mm_txnid : a->mm_txnid; int i, reader; - const MDB_reader *const r = env->me_txns->mti_readers; - for (reader = -1, i = env->me_txns->mti_numreaders; --i >= 0;) { + const MDB_reader *const r = env->me_lck->mti_readers; + for (reader = -1, i = env->me_lck->mti_numreaders; --i >= 0;) { if (r[i].mr_pid) { txnid_t snap = r[i].mr_txnid; if (oldest > snap) { @@ -1971,7 +1971,7 @@ int mdbx_env_sync(MDB_env *env, int force) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(!env->me_txns)) + if (unlikely(!env->me_lck)) return MDB_PANIC; flags = env->me_flags & ~MDB_NOMETASYNC; @@ -2149,9 +2149,9 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { } for (;;) { - nr = env->me_txns->mti_numreaders; + nr = env->me_lck->mti_numreaders; for (i = 0; i < nr; i++) - if (env->me_txns->mti_readers[i].mr_pid == 0) + if (env->me_lck->mti_readers[i].mr_pid == 0) break; if (likely(i < env->me_maxreaders)) @@ -2164,7 +2164,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { } } - r = &env->me_txns->mti_readers[i]; + r = &env->me_lck->mti_readers[i]; /* Claim the reader slot, carefully since other code * uses the reader table un-mutexed: First reset the * slot, next publish it in mtb.mti_numreaders. 
After @@ -2175,7 +2175,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { r->mr_tid = tid; mdbx_coherent_barrier(); if (i == nr) - env->me_txns->mti_numreaders = ++nr; + env->me_lck->mti_numreaders = ++nr; if (env->me_close_readers < nr) env->me_close_readers = nr; r->mr_pid = pid; @@ -3831,49 +3831,49 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { err = mdbx_mmap(&addr, size, true, env->me_lfd); if (unlikely(err != MDB_SUCCESS)) return err; - env->me_txns = addr; + env->me_lck = addr; #ifdef MADV_NOHUGEPAGE - (void)madvise(env->me_txns, size, MADV_NOHUGEPAGE); + (void)madvise(env->me_lck, size, MADV_NOHUGEPAGE); #endif #ifdef MADV_DODUMP - (void)madvise(env->me_txns, size, MADV_DODUMP); + (void)madvise(env->me_lck, size, MADV_DODUMP); #endif #ifdef MADV_DONTFORK - if (madvise(env->me_txns, size, MADV_DONTFORK) < 0) + if (madvise(env->me_lck, size, MADV_DONTFORK) < 0) return errno; #endif #ifdef MADV_WILLNEED - if (madvise(env->me_txns, size, MADV_WILLNEED) < 0) + if (madvise(env->me_lck, size, MADV_WILLNEED) < 0) return errno; #endif #ifdef MADV_RANDOM - if (madvise(env->me_txns, size, MADV_RANDOM) < 0) + if (madvise(env->me_lck, size, MADV_RANDOM) < 0) return errno; #endif if (rc == MDBX_RESULT_TRUE) { /* LY: exlcusive mode, init lck */ - memset(env->me_txns, 0, sizeof(MDBX_lockinfo)); + memset(env->me_lck, 0, sizeof(MDBX_lockinfo)); err = mdbx_lck_init(env); if (err) return err; - env->me_txns->mti_magic = MDB_MAGIC; - env->me_txns->mti_format = MDB_LOCK_FORMAT; + env->me_lck->mti_magic = MDB_MAGIC; + env->me_lck->mti_format = MDB_LOCK_FORMAT; } else { - if (env->me_txns->mti_magic != MDB_MAGIC) { + if (env->me_lck->mti_magic != MDB_MAGIC) { mdbx_debug("lock region has invalid magic"); return MDB_INVALID; } - if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { + if (env->me_lck->mti_format != MDB_LOCK_FORMAT) { mdbx_debug("lock region has format+version 0x%" PRIx64 ", expected 0x%" PRIx64, - 
env->me_txns->mti_format, MDB_LOCK_FORMAT); + env->me_lck->mti_format, MDB_LOCK_FORMAT); return MDB_VERSION_MISMATCH; } } @@ -3983,7 +3983,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, const unsigned mode_flags = MDB_WRITEMAP | MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC; if (lck_rc == MDBX_RESULT_TRUE) { - env->me_txns->mti_envmode = env->me_flags & mode_flags; + env->me_lck->mti_envmode = env->me_flags & mode_flags; if (exclusive == NULL || *exclusive < 2) { /* LY: downgrade lock only if exclusive access not requested. * in case exclusive==1, just leave value as is. */ @@ -3996,7 +3996,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, /* LY: just indicate that is not an exclusive access. */ *exclusive = 0; } - if ((env->me_txns->mti_envmode ^ env->me_flags) & mode_flags) { + if ((env->me_lck->mti_envmode ^ env->me_flags) & mode_flags) { /* LY: Current mode/flags incompatible with requested. */ rc = MDB_INCOMPATIBLE; goto bailout; @@ -4004,8 +4004,8 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, } if ((env->me_flags & MDB_NOTLS) == 0) { - rc = mdbx_rthc_alloc(&env->me_txkey, &env->me_txns->mti_readers[0], - &env->me_txns->mti_readers[env->me_maxreaders]); + rc = mdbx_rthc_alloc(&env->me_txkey, &env->me_lck->mti_readers[0], + &env->me_lck->mti_readers[env->me_maxreaders]); if (unlikely(rc != MDB_SUCCESS)) return rc; env->me_flags |= MDB_ENV_TXKEY; @@ -4101,10 +4101,10 @@ static void __cold mdbx_env_close0(MDB_env *env) { env->me_fd = INVALID_HANDLE_VALUE; } - mdbx_munmap((void *)env->me_txns, + mdbx_munmap((void *)env->me_lck, (env->me_maxreaders - 1) * sizeof(MDB_reader) + sizeof(MDBX_lockinfo)); - env->me_txns = NULL; + env->me_lck = NULL; env->me_pid = 0; mdbx_lck_destroy(env); @@ -4123,7 +4123,7 @@ int __cold mdbx_env_close_ex(MDB_env *env, int dont_sync) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - if (!dont_sync && env->me_txns) 
+ if (!dont_sync && env->me_lck) rc = mdbx_env_sync(env, 1); VALGRIND_DESTROY_MEMPOOL(env); @@ -8622,10 +8622,10 @@ int __cold mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) { arg->me_mapsize = env->me_mapsize; arg->me_maxreaders = env->me_maxreaders; - arg->me_numreaders = env->me_txns->mti_numreaders; + arg->me_numreaders = env->me_lck->mti_numreaders; arg->me_tail_txnid = 0; - r = env->me_txns->mti_readers; + r = env->me_lck->mti_readers; arg->me_tail_txnid = arg->me_last_txnid; for (i = 0; i < arg->me_numreaders; ++i) { if (r[i].mr_pid) { @@ -9036,8 +9036,8 @@ int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - snap_nreaders = env->me_txns->mti_numreaders; - mr = env->me_txns->mti_readers; + snap_nreaders = env->me_lck->mti_numreaders; + mr = env->me_lck->mti_readers; for (i = 0; i < snap_nreaders; i++) { if (mr[i].mr_pid) { txnid_t txnid = mr[i].mr_txnid; @@ -9117,12 +9117,12 @@ int __cold mdbx_reader_check0(MDB_env *env, int rdt_locked, int *dead) { return MDB_PANIC; } - unsigned snap_nreaders = env->me_txns->mti_numreaders; + unsigned snap_nreaders = env->me_lck->mti_numreaders; mdbx_pid_t *pids = alloca((snap_nreaders + 1) * sizeof(mdbx_pid_t)); pids[0] = 0; int rc = MDBX_RESULT_FALSE, count = 0; - MDB_reader *mr = env->me_txns->mti_readers; + MDB_reader *mr = env->me_lck->mti_readers; for (unsigned i = 0; i < snap_nreaders; i++) { const mdbx_pid_t pid = mr[i].mr_pid; @@ -9512,7 +9512,7 @@ static txnid_t __cold mdbx_oomkick(MDB_env *env, txnid_t oldest) { if (!env->me_oom_func) break; - r = &env->me_txns->mti_readers[reader]; + r = &env->me_lck->mti_readers[reader]; pid = r->mr_pid; tid = r->mr_tid; if (r->mr_txnid != oldest || pid <= 0) @@ -9769,7 +9769,7 @@ int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary) { if (likely(canary)) { if (txn->mt_canary.x == canary->x && txn->mt_canary.y == canary->y && - txn->mt_canary.z == canary->z 
&& txn->mt_canary.v == canary->v) + txn->mt_canary.z == canary->z) return MDB_SUCCESS; txn->mt_canary.x = canary->x; txn->mt_canary.y = canary->y; From 7ec571c9a08ecc354383da3c17cf07ac86c08e6f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 26 Apr 2017 18:12:48 +0300 Subject: [PATCH 094/303] mdbx: partial cleanup/reformat comments. Change-Id: Ia96a5b307dc065b4d1920234c3cce93a0e585876 --- mdbx.h | 162 +++++++++++++++++++++++++---------------------------- src/bits.h | 77 ++++++++++++------------- 2 files changed, 113 insertions(+), 126 deletions(-) diff --git a/mdbx.h b/mdbx.h index d08f1822..d68b7529 100644 --- a/mdbx.h +++ b/mdbx.h @@ -309,12 +309,10 @@ typedef enum MDB_cursor_op { /* Database contents grew beyond environment mapsize */ #define MDB_MAP_RESIZED (-30785) /* Operation and DB incompatible, or DB type changed. This can mean: - * - The operation expects an MDB_DUPSORT / MDB_DUPFIXED database. - * - Opening a named DB when the unnamed DB has - *MDB_DUPSORT/MDB_INTEGERKEY. - * - Accessing a data record as a database, or vice versa. - * - The database was dropped and recreated with different flags. - */ + * - The operation expects an MDB_DUPSORT / MDB_DUPFIXED database. + * - Opening a named DB when the unnamed DB has MDB_DUPSORT/MDB_INTEGERKEY. + * - Accessing a data record as a database, or vice versa. + * - The database was dropped and recreated with different flags. */ #define MDB_INCOMPATIBLE (-30784) /* Invalid reuse of reader locktable slot */ #define MDB_BAD_RSLOT (-30783) @@ -365,12 +363,9 @@ typedef struct MDBX_envinfo { /* Return the LMDB library version information. 
* - * [out] major if non-NULL, the library major version number is copied - * here - * [out] minor if non-NULL, the library minor version number is copied - * here - * [out] patch if non-NULL, the library patch version number is copied - * here + * [out] major if non-NULL, the library major version number is copied here + * [out] minor if non-NULL, the library minor version number is copied here + * [out] patch if non-NULL, the library patch version number is copied here * Returns "version string" The library version as a string */ LIBMDBX_API const char *mdbx_version(int *major, int *minor, int *patch); @@ -381,8 +376,10 @@ LIBMDBX_API const char *mdbx_version(int *major, int *minor, int *patch); * function. If the error code is greater than or equal to 0, then the string * returned by the system function strerror(3) is returned. If the error code * is less than 0, an error string corresponding to the LMDB library error is - * returned. See errors for a list of LMDB-specific error codes. + * returned. See errors for a list of MDBX-specific error codes. + * * [in] err The error code + * * Returns "error message" The description of the error */ LIBMDBX_API const char *mdbx_strerror(int errnum); @@ -395,54 +392,50 @@ LIBMDBX_API const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen); * Before the handle may be used, it must be opened using mdbx_env_open(). * Various other options may also need to be set before opening the handle, * e.g. mdbx_env_set_mapsize(), mdbx_env_set_maxreaders(), - * mdbx_env_set_maxdbs(), - * depending on usage requirements. + * mdbx_env_set_maxdbs(), depending on usage requirements. + * * [out] env The address where the new handle will be stored + * * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_create(MDB_env **env); /* Open an environment handle. * - * If this function fails, mdbx_env_close() must be called to discard the - *MDB_env handle. 
- * [in] env An environment handle returned by mdbx_env_create() - * [in] path The directory in which the database files reside. This - * directory must already exist and be writable. + * If this function fails, mdbx_env_close() must be called to discard + * the MDB_env handle. + * [in] env An environment handle returned by mdbx_env_create() + * [in] path The directory in which the database files reside. + * This directory must already exist and be writable. * [in] flags Special options for this environment. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - * Flags set by mdbx_env_set_flags() are also used. - * - MDB_NOSUBDIR - * By default, LMDB creates its environment in a directory whose - * pathname is given in \b path, and creates its data and lock - *files - * under that directory. With this option, \b path is used as-is - *for - * the database main data file. The database lock file is the \b - *path - * with "-lock" appended. - * - MDB_RDONLY - * Open the environment in read-only mode. No write operations will - *be - * allowed. LMDB will still modify the lock file - except on - *read-only - * filesystems, where LMDB does not use locks. - * - MDB_WRITEMAP - * Use a writeable memory map unless MDB_RDONLY is set. This uses - * fewer mallocs but loses protection from application bugs - * like wild pointer writes and other bad updates into the - *database. - * This may be slightly faster for DBs that fit entirely in RAM, - *but - * is slower for DBs larger than RAM. - * Incompatible with nested transactions. - * Do not mix processes with and without MDB_WRITEMAP on the same - * environment. This can defeat durability (mdbx_env_sync etc). - * - MDB_NOMETASYNC - * Flush system buffers to disk only once per transaction, omit - *the - * metadata flush. Defer that until the system flushes files to + * must be set to 0 or by bitwise OR'ing together one + * or more of the values described here. 
+ * + * Flags set by mdbx_env_set_flags() are also used: + * - MDB_NOSUBDIR + * By default, LMDB creates its environment in a directory whose + * pathname is given in path, and creates its data and lock files + * under that directory. With this option, path is used as-is for + * the database main data file. The database lock file is the path + * with "-lock" appended. + * + * - MDB_RDONLY + * Open the environment in read-only mode. No write operations will + * be allowed. LMDB will still modify the lock file - except on + * read-only filesystems, where MDBX does not use locks. + * + * - MDB_WRITEMAP + * Use a writeable memory map unless MDB_RDONLY is set. This uses fewer + * mallocs but loses protection from application bugs like wild pointer + * writes and other bad updates into the database. + * This may be slightly faster for DBs that fit entirely in RAM, + * but is slower for DBs larger than RAM. + * Incompatible with nested transactions. + * Do not mix processes with and without MDB_WRITEMAP on the same + * environment. This can defeat durability (mdbx_env_sync etc). + * - MDB_NOMETASYNC + * Flush system buffers to disk only once per transaction, omit the + * metadata flush. Defer that until the system flushes files to *disk, * or next non-MDB_RDONLY commit or mdbx_env_sync(). This *optimization @@ -1492,11 +1485,11 @@ LIBMDBX_API int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); * data items MDB_DUPSORT. * [in] cursor A cursor handle returned by mdbx_cursor_open() * [out] countp Address where the count will be stored - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - cursor is not initialized, or an invalid parameter was - *specified. - */ + * + * Returns A non-zero error value on failure and 0 on success. + * Some possible errors are: + * - EINVAL - cursor is not initialized, + * or an invalid parameter was specified. 
*/ LIBMDBX_API int mdbx_cursor_count(MDB_cursor *cursor, size_t *countp); /* Compare two data items according to a particular database. @@ -1507,8 +1500,8 @@ LIBMDBX_API int mdbx_cursor_count(MDB_cursor *cursor, size_t *countp); * [in] dbi A database handle returned by mdbx_dbi_open() * [in] a The first item to compare * [in] b The second item to compare - * Returns < 0 if a < b, 0 if a == b, > 0 if a > b - */ + * + * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ LIBMDBX_API int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); @@ -1520,8 +1513,8 @@ LIBMDBX_API int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, * [in] dbi A database handle returned by mdbx_dbi_open() * [in] a The first item to compare * [in] b The second item to compare - * Returns < 0 if a < b, 0 if a == b, > 0 if a > b - */ + * + * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ LIBMDBX_API int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); @@ -1529,8 +1522,8 @@ LIBMDBX_API int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, * * [in] msg The string to be printed. * [in] ctx An arbitrary context pointer for the callback. - * Returns < 0 on failure, >= 0 on success. - */ + * + * Returns < 0 on failure, >= 0 on success. */ typedef int(MDB_msg_func)(const char *msg, void *ctx); /* Dump the entries in the reader lock table. @@ -1538,16 +1531,16 @@ typedef int(MDB_msg_func)(const char *msg, void *ctx); * [in] env An environment handle returned by mdbx_env_create() * [in] func A MDB_msg_func function * [in] ctx Anything the message function needs - * Returns < 0 on failure, >= 0 on success. - */ + * + * Returns < 0 on failure, >= 0 on success. */ LIBMDBX_API int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); /* Check for stale entries in the reader lock table. 
* * [in] env An environment handle returned by mdbx_env_create() * [out] dead Number of stale slots that were cleared - * Returns 0 on success, non-zero on failure. - */ + * + * Returns 0 on success, non-zero on failure. */ LIBMDBX_API int mdbx_reader_check(MDB_env *env, int *dead); LIBMDBX_API char *mdbx_dkey(MDB_val *key, char *buf, const size_t bufsize); @@ -1568,9 +1561,9 @@ LIBMDBX_API int mdbx_env_close_ex(MDB_env *env, int dont_sync); * * [in] env An environment handle returned by mdbx_env_create() * [in] bytes The size in bytes of summary changes - * when a synchronous flush would be made. - * Returns A non-zero error value on failure and 0 on success. - */ + * when a synchronous flush would be made. + * + * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_set_syncbytes(MDB_env *env, size_t bytes); /* Returns a lag of the reading. @@ -1580,9 +1573,9 @@ LIBMDBX_API int mdbx_env_set_syncbytes(MDB_env *env, size_t bytes); * * [in] txn A transaction handle returned by mdbx_txn_begin() * [out] percent Percentage of page allocation in the database. + * * Returns Number of transactions committed after the given was started for - * read, or -1 on failure. - */ + * read, or -1 on failure. */ LIBMDBX_API int mdbx_txn_straggler(MDB_txn *txn, int *percent); /* A callback function for killing a laggard readers, @@ -1594,11 +1587,11 @@ LIBMDBX_API int mdbx_txn_straggler(MDB_txn *txn, int *percent); * [in] txn Transaction number on which stalled. * [in] gap a lag from the last commited txn. * [in] retry a retry number, less that zero for notify end of OOM-loop. + * * Returns -1 on failure (reader is not killed), - * 0 on a race condition (no such reader), - * 1 on success (reader was killed), - * >1 on success (reader was SURE killed). - */ + * 0 on a race condition (no such reader), + * 1 on success (reader was killed), + * >1 on success (reader was SURE killed). 
*/ typedef int(MDBX_oom_func)(MDB_env *env, int pid, mdbx_tid_t thread_id, size_t txn, unsigned gap, int retry); @@ -1608,8 +1601,7 @@ typedef int(MDBX_oom_func)(MDB_env *env, int pid, mdbx_tid_t thread_id, * a laggard readers to allowing reclaiming of freeDB. * * [in] env An environment handle returned by mdbx_env_create(). - * [in] oomfunc A #MDBX_oom_func function or NULL to disable. - */ + * [in] oomfunc A #MDBX_oom_func function or NULL to disable. */ LIBMDBX_API void mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); /* Get the current oom_func callback. @@ -1618,8 +1610,7 @@ LIBMDBX_API void mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); * a laggard readers to allowing reclaiming of freeDB. * * [in] env An environment handle returned by mdbx_env_create(). - * Returns A #MDBX_oom_func function or NULL if disabled. - */ + * Returns A #MDBX_oom_func function or NULL if disabled. */ LIBMDBX_API MDBX_oom_func *mdbx_env_get_oomfunc(MDB_env *env); #define MDBX_DBG_ASSERT 1 @@ -1651,10 +1642,11 @@ LIBMDBX_API int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary); LIBMDBX_API int mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary); /* Returns: - * - MDBX_RESULT_TRUE when no more data available - * or cursor not positioned; - * - MDBX_RESULT_FALSE when data available; - * - Otherwise the error code. */ + * - MDBX_RESULT_TRUE + * when no more data available or cursor not positioned; + * - MDBX_RESULT_FALSE + * when data available; + * - Otherwise the error code. */ LIBMDBX_API int mdbx_cursor_eof(MDB_cursor *mc); /* Returns: MDBX_RESULT_TRUE, MDBX_RESULT_FALSE or Error code. */ diff --git a/src/bits.h b/src/bits.h index 41245e53..ce5736ae 100644 --- a/src/bits.h +++ b/src/bits.h @@ -368,8 +368,8 @@ typedef struct MDBX_lockinfo { #endif /* The number of slots that have been used in the reader table. - * This always records the maximum count, it is not decremented - * when readers release their slots. 
*/ + * This always records the maximum count, it is not decremented + * when readers release their slots. */ __cache_aligned volatile unsigned mti_numreaders; #ifdef MDBX_OSAL_LOCK /* Mutex protecting access to this table. */ @@ -380,19 +380,17 @@ typedef struct MDBX_lockinfo { #pragma pack(pop) -/** Auxiliary DB info. -* The information here is mostly static/read-only. There is -* only a single copy of this record in the environment. -*/ +/* Auxiliary DB info. + * The information here is mostly static/read-only. There is + * only a single copy of this record in the environment. */ typedef struct MDB_dbx { MDB_val md_name; /**< name of the database */ MDB_cmp_func *md_cmp; /**< function for comparing keys */ MDB_cmp_func *md_dcmp; /**< function for comparing data items */ } MDB_dbx; -/** A database transaction. -* Every operation requires a transaction handle. -*/ +/* A database transaction. + * Every operation requires a transaction handle. */ struct MDB_txn { #define MDBX_MT_SIGNATURE (0x93D53A31) unsigned mt_signature; @@ -400,40 +398,37 @@ struct MDB_txn { /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ MDB_txn *mt_child; pgno_t mt_next_pgno; /**< next unallocated page */ - /** The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. - */ + /* The ID of this transaction. IDs are integers incrementing from 1. + * Only committed write transactions increment the ID. If a transaction + * aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; MDB_env *mt_env; /**< the DB environment */ /** The list of reclaimed txns from freeDB */ MDB_IDL mt_lifo_reclaimed; - /** The list of pages that became unused during this transaction. - */ + /* The list of pages that became unused during this transaction. 
*/ MDB_IDL mt_free_pgs; - /** The list of loose pages that became unused and may be reused - * in this transaction, linked through #NEXT_LOOSE_PAGE(page). - */ + /* The list of loose pages that became unused and may be reused + * in this transaction, linked through #NEXT_LOOSE_PAGE(page). */ MDB_page *mt_loose_pgs; /** Number of loose pages (#mt_loose_pgs) */ int mt_loose_count; - /** The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. - */ + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ MDB_IDL mt_spill_pgs; union { - /** For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ + /* For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ MDB_ID2L dirty_list; - /** For read txns: This thread/txn's reader table slot, or NULL. */ + /* For read txns: This thread/txn's reader table slot, or NULL. */ MDB_reader *reader; } mt_u; - /** Array of records for each DB known in the environment. */ + /* Array of records for each DB known in the environment. 
*/ MDB_dbx *mt_dbxs; - /** Array of MDB_db records for each known DB */ + /* Array of MDB_db records for each known DB */ MDB_db *mt_dbs; - /** Array of sequence numbers for each DB handle */ + /* Array of sequence numbers for each DB handle */ unsigned *mt_dbiseqs; + /** @defgroup mt_dbflag Transaction DB Flags * @ingroup internal * @{ @@ -609,20 +604,20 @@ struct MDB_env { unsigned me_maxreaders; /**< size of the reader table */ /** Max #MDBX_lockinfo.mti_numreaders of interest to #mdbx_env_close() */ unsigned me_close_readers; - MDB_dbi me_numdbs; /**< number of DBs opened */ - MDB_dbi me_maxdbs; /**< size of the DB table */ - mdbx_pid_t me_pid; /**< process ID of this env */ - char *me_path; /**< path to the DB files */ - char *me_map; /**< the memory map of the data file */ - MDBX_lockinfo *me_txns; /**< the memory map of the lock file, never NULL */ - void *me_pbuf; /**< scratch area for DUPSORT put() */ - MDB_txn *me_txn; /**< current write transaction */ - MDB_txn *me_txn0; /**< prealloc'd write transaction */ - size_t me_mapsize; /**< size of the data memory map */ - pgno_t me_maxpg; /**< me_mapsize / me_psize */ - MDB_dbx *me_dbxs; /**< array of static DB info */ - uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ - unsigned *me_dbiseqs; /**< array of dbi sequence numbers */ + MDB_dbi me_numdbs; /**< number of DBs opened */ + MDB_dbi me_maxdbs; /**< size of the DB table */ + mdbx_pid_t me_pid; /**< process ID of this env */ + char *me_path; /**< path to the DB files */ + char *me_map; /**< the memory map of the data file */ + MDBX_lockinfo *me_lck; /**< the memory map of the lock file, never NULL */ + void *me_pbuf; /**< scratch area for DUPSORT put() */ + MDB_txn *me_txn; /**< current write transaction */ + MDB_txn *me_txn0; /**< prealloc'd write transaction */ + size_t me_mapsize; /**< size of the data memory map */ + pgno_t me_maxpg; /**< me_mapsize / me_psize */ + MDB_dbx *me_dbxs; /**< array of static DB info */ + uint16_t 
*me_dbflags; /**< array of flags from MDB_db.md_flags */ + unsigned *me_dbiseqs; /**< array of dbi sequence numbers */ mdbx_thread_key_t me_txkey; /**< thread-key for readers */ txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ From 939285bef2ccfa1ea4816c7470f2ef48f470f74b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 26 Apr 2017 18:14:49 +0300 Subject: [PATCH 095/303] mdbx: add atomics-ops for assertions. --- src/osal.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/src/osal.h b/src/osal.h index 27d6ee27..1ece89b9 100644 --- a/src/osal.h +++ b/src/osal.h @@ -442,3 +442,75 @@ int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid); _vsnprintf_s(buffer, buffer_size, _TRUNCATE, format, args) #endif /* vsnprintf */ #endif /* _MSC_VER */ + +/*----------------------------------------------------------------------------*/ + +#if (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) +#include +#elif defined(__GNUC__) || defined(__clang__) +/* LY: nothing required */ +#elif defined(_MSC_VER) +#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) +#elif defined(__APPLE__) +#include +#else +#error FIXME atomic-ops +#endif + +static __inline size_t mdbx_atomic_add(volatile size_t *p, size_t v) { +#ifdef ATOMIC_VAR_INIT + return atomic_fetch_add(p, v); +#elif defined(__GNUC__) || defined(__clang__) + return __sync_fetch_and_add(p, v); +#else + switch (sizeof(size_t)) { + case 4: +#ifdef _MSC_VER + return _InterlockedExchangeAdd(p, v); +#endif +#ifdef __APPLE__ + return OSAtomicAdd32(v, (volatile int32_t *)p); +#endif + case 8: +#ifdef _MSC_VER + return _InterlockedExchangeAdd64(p, v); +#endif +#ifdef __APPLE__ + return OSAtomicAdd64(v, (volatile int64_t *)p); +#endif + } + while (1) + ; +#endif +} + +#define mdbx_atomic_sub(p, v) 
mdbx_atomic_add(p, -(v)) + +static __inline bool mdbx_atomic_compare_and_swap(volatile size_t *p, size_t c, + size_t v) { +#ifdef ATOMIC_VAR_INIT + return atomic_compare_exchange_strong(p, &c, v); +#elif defined(__GNUC__) || defined(__clang__) + return __sync_bool_compare_and_swap(p, c, v); +#else + switch (sizeof(size_t)) { + case 4: +#ifdef _MSC_VER + return c == _InterlockedCompareExchange(p, v, c); +#endif +#ifdef __APPLE__ + return c == OSAtomicCompareAndSwap32Barrier(c, v, (volatile int32_t *)p); +#endif + case 8: +#ifdef _MSC_VER + return c == _InterlockedCompareExchange64(p, v, c); +#endif +#ifdef __APPLE__ + return c == OSAtomicCompareAndSwap64Barrier(c, v, (volatile int32_t *)p); +#endif + } + while (1) + ; +#endif +} From d5b5434fa6de837565996dd81d3fd88e6df4e1f6 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Thu, 27 Apr 2017 00:20:48 +0300 Subject: [PATCH 096/303] test: reduce short jitter-delay to 1ms. Change-Id: I0fac7518dd23d7345c3344e5d5883661a7cbce19 --- test/utils.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/utils.cc b/test/utils.cc index 07020be2..2c7e6d0b 100644 --- a/test/utils.cc +++ b/test/utils.cc @@ -273,7 +273,7 @@ void jitter_delay(bool extra) { cpu_relax(); if (dice > 2) { unsigned us = entropy_white() & - (extra ? 0xfffff /* 1.05 s */ : 0x3fff /* 16 ms */); + (extra ? 0xfffff /* 1.05 s */ : 0x3ff /* 1 ms */); log_trace("== jitter.delay: %0.6f", us / 1000000.0); osal_udelay(us); } From 9e07b71c2716a175169c60612c0209c1228a6cb5 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 27 Apr 2017 01:26:33 +0300 Subject: [PATCH 097/303] mdbx: fix snap-state bug. 
Change-Id: I8700209ab012bfef21c88fe06c39b80a640291c6 --- src/mdbx.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index f36bbc37..0046d5e2 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2185,22 +2185,21 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { mdbx_thread_rthc_set(env->me_txkey, r); } - while ((env->me_flags & MDB_FATAL_ERROR) == 0) { + while (1) { MDB_meta *const meta = mdbx_meta_head(txn->mt_env); - txnid_t lead = meta->mm_txnid; - r->mr_txnid = lead; + const txnid_t snap = meta->mm_txnid; + r->mr_txnid = snap; mdbx_coherent_barrier(); - txnid_t snap = mdbx_meta_head(txn->mt_env)->mm_txnid; + /* Snap the state from current meta-head */ + txn->mt_txnid = snap; + txn->mt_next_pgno = meta->mm_last_pg + 1; + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); + txn->mt_canary = meta->mm_canary; + /* LY: Retry on a race, ITS#7970. */ - if (likely(lead == snap)) { - txn->mt_txnid = lead; - txn->mt_next_pgno = meta->mm_last_pg + 1; - /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); - txn->mt_canary = meta->mm_canary; + if (likely(meta == mdbx_meta_head(txn->mt_env) && snap == meta->mm_txnid)) break; - } } txn->mt_u.reader = r; From fd078ee163d0639294d706c9b0c6bb5c810fa58f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 27 Apr 2017 01:27:41 +0300 Subject: [PATCH 098/303] mdbx: check for txnid overflow (paranoia). 
Change-Id: Id7cd62abf8605150ff5491fb1129383e3e748603 --- src/mdbx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/mdbx.c b/src/mdbx.c index 0046d5e2..7653415b 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2213,6 +2213,12 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { MDB_meta *meta = mdbx_meta_head(env); txn->mt_canary = meta->mm_canary; txn->mt_txnid = meta->mm_txnid + 1; + if (unlikely(txn->mt_txnid < meta->mm_txnid)) { + mdbx_debug("txnid overflow!"); + rc = MDB_TXN_FULL; + goto bailout; + } + txn->mt_flags = flags; #if MDB_DEBUG @@ -2262,6 +2268,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { } else { return MDB_SUCCESS; } +bailout: assert(rc != MDB_SUCCESS); mdbx_txn_end(txn, MDB_END_SLOT | MDB_END_FAIL_BEGIN); return rc; From 40dee6f05f79d95893710e61e3e4eab15c0970bf Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 27 Apr 2017 15:17:30 +0300 Subject: [PATCH 099/303] mdbx: rework mdbx_read_header(). --- src/bits.h | 6 ++++-- src/mdbx.c | 58 ++++++++++++++++++++++++++++++++---------------------- 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/src/bits.h b/src/bits.h index ce5736ae..3db04875 100644 --- a/src/bits.h +++ b/src/bits.h @@ -278,8 +278,10 @@ typedef struct MDB_meta { #define MDB_DATASIGN_NONE 0u #define MDB_DATASIGN_WEAK 1u volatile uint64_t mm_datasync_sign; -#define META_IS_WEAK(meta) ((meta)->mm_datasync_sign == MDB_DATASIGN_WEAK) -#define META_IS_STEADY(meta) ((meta)->mm_datasync_sign > MDB_DATASIGN_WEAK) +#define SIGN_IS_WEAK(sign) ((sign) == MDB_DATASIGN_WEAK) +#define SIGN_IS_STEADY(sign) ((sign) > MDB_DATASIGN_WEAK) +#define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign) +#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) volatile mdbx_canary mm_canary; } MDB_meta; diff --git a/src/mdbx.c b/src/mdbx.c index 7653415b..d4610ca1 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3264,38 +3264,32 @@ fail: return rc; } -/** Read the environment parameters of a 
DB environment before - * mapping it into memory. - * @param[in] env the environment handle - * @param[out] meta address of where to store the meta information - * @return 0 on success, non-zero on failure. */ +/* Read the environment parameters of a DB environment + * before mapping it into memory. */ static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { - MDB_metabuf pbuf; - MDB_page *p; - MDB_meta *m; - int i, rc, off; assert(offsetof(MDB_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); - - /* We don't know the page size yet, so use a minimum value. - * Read both meta pages so we can use the latest one. */ - meta->mm_datasync_sign = MDB_DATASIGN_WEAK; meta->mm_txnid = 0; - for (i = off = 0; i < NUM_METAS; i++, off += meta->mm_psize) { - rc = mdbx_pread(env->me_fd, &pbuf, sizeof(pbuf), off); + off_t offset = 0; + + /* Read both meta pages so we can use the latest one. */ + for (int loops_left = 2; --loops_left >= 0;) { + MDB_metabuf buf; + + /* We don't know the page size on first time, so use a minimum value. 
*/ + int rc = mdbx_pread(env->me_fd, &buf, sizeof(buf), offset); if (rc != MDB_SUCCESS) { mdbx_debug("read: %s", mdbx_strerror(rc)); return rc; } - p = (MDB_page *)&pbuf; - + MDB_page *p = (MDB_page *)&buf; if (!F_ISSET(p->mp_flags, P_META)) { mdbx_debug("page %zu not a meta page", p->mp_pgno); return MDB_INVALID; } - m = PAGEDATA(p); + MDB_meta *m = PAGEDATA(p); if (m->mm_magic != MDB_MAGIC) { mdbx_debug("meta has invalid magic"); return MDB_INVALID; @@ -3307,22 +3301,38 @@ static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { return MDB_VERSION_MISMATCH; } - if (m->mm_datasync_sign > MDB_DATASIGN_WEAK && - m->mm_datasync_sign != mdbx_meta_sign(m)) + /* LY: check signature as a checksum */ + if (META_IS_STEADY(m) && m->mm_datasync_sign != mdbx_meta_sign(m)) { + mdbx_debug("steady-meta has invalid checksum"); continue; + } - if (mdbx_meta_lt(meta, m)) + if (mdbx_meta_lt(meta, m)) { *meta = *m; + if (META_IS_WEAK(meta)) + loops_left += 1; /* LY: should re-read to avoid race */ + } + + if (offset) + offset = 0; + else { + offset = meta->mm_psize; + if (!offset) + offset = m->mm_psize; + if (!offset) + offset = env->me_os_psize; + } } - if (meta->mm_datasync_sign == MDB_DATASIGN_WEAK) - /* LY: Both meta-pages are weak. */ + if (META_IS_WEAK(meta)) { + mdbx_debug("both meta-pages are weak, database is corrupted"); return MDB_CORRUPTED; + } return MDB_SUCCESS; } -/** Fill in most of the zeroed #MDB_meta for an empty database environment */ +/* Fill in most of the zeroed MDB_meta for an empty database environment */ static void __cold mdbx_env_init_meta0(MDB_env *env, MDB_meta *meta) { meta->mm_magic = MDB_MAGIC; meta->mm_version = MDB_DATA_VERSION; From 7204c4642106ad1e80c6e419698f9dda110ff706 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 27 Apr 2017 15:18:33 +0300 Subject: [PATCH 100/303] mdbx: add mdbx_osal_jitter() and mdbx_jitter4testing(). 
--- src/bits.h | 8 ++++++++ src/lck-windows.c | 13 ------------- src/osal.c | 26 ++++++++++++++++++++++++++ src/osal.h | 2 ++ 4 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/bits.h b/src/bits.h index 3db04875..0d7d98d7 100644 --- a/src/bits.h +++ b/src/bits.h @@ -799,6 +799,14 @@ void mdbx_panic(const char *fmt, ...) /*----------------------------------------------------------------------------*/ +static __inline void mdbx_jitter4testing(bool tiny) { +#ifndef NDEBUG + mdbx_osal_jitter(tiny); +#else + (void)tiny; +#endif +} + int mdbx_reader_check0(MDB_env *env, int rlocked, int *dead); #define METAPAGE_1(env) (&((MDB_metabuf *)(env)->me_map)->mb_metabuf.mm_meta) diff --git a/src/lck-windows.c b/src/lck-windows.c index 250e345e..c9d43151 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -24,19 +24,6 @@ * LY */ -static __inline void jitter4testing(void) { -#ifndef NDEBUG - for (;;) { - unsigned coin = ((unsigned)__rdtsc() * 277u) % 43u; - if (coin < 43 / 3) - break; - SwitchToThread(); - if (coin > 43 * 2 / 3) - Sleep(1); - } -#endif -} - /*----------------------------------------------------------------------------*/ /* rthc */ diff --git a/src/osal.c b/src/osal.c index 0315056c..3ad6e573 100644 --- a/src/osal.c +++ b/src/osal.c @@ -626,3 +626,29 @@ int mdbx_mlock(const void *address, size_t length) { return (mlock(address, length) == 0) ? MDB_SUCCESS : errno; #endif } + +/*----------------------------------------------------------------------------*/ + +__cold void mdbx_osal_jitter(bool tiny) { + for (;;) { +#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__) + const unsigned salt = 277u * (unsigned)__rdtsc(); +#else + const unsigned salt = rand(); +#endif + + const unsigned coin = salt % (tiny ? 
29u : 43u); + if (coin < 43 / 3) + break; +#if defined(_WIN32) || defined(_WIN64) + SwitchToThread(); + if (coin > 43 * 2 / 3) + Sleep(1); +#else + sched_yield(); + if (coin > 43 * 2 / 3) + usleep(coin); +#endif + } +} diff --git a/src/osal.h b/src/osal.h index 1ece89b9..20110ed7 100644 --- a/src/osal.h +++ b/src/osal.h @@ -383,6 +383,8 @@ static __inline mdbx_pid_t mdbx_getpid(void) { #endif } +void mdbx_osal_jitter(bool tiny); + /*----------------------------------------------------------------------------*/ #ifndef mdbx_assert_fail From 4b2cb6645340c6802bc8930a45ca29c27f9b487b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 27 Apr 2017 15:51:58 +0300 Subject: [PATCH 101/303] mdbx: use mdbx_jitter4testing() for race detection. --- src/bits.h | 3 +++ src/lck-windows.c | 18 +++++++++--------- src/mdbx.c | 30 ++++++++++++++++++++++-------- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/src/bits.h b/src/bits.h index 0d7d98d7..7d6fca09 100644 --- a/src/bits.h +++ b/src/bits.h @@ -815,8 +815,11 @@ int mdbx_reader_check0(MDB_env *env, int rlocked, int *dead); (&((MDB_metabuf *)((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) static __inline MDB_meta *mdbx_meta_head(MDB_env *env) { + mdbx_jitter4testing(true); MDB_meta *a = METAPAGE_1(env); + mdbx_jitter4testing(true); MDB_meta *b = METAPAGE_2(env); + mdbx_jitter4testing(true); return (a->mm_txnid > b->mm_txnid) ? a : b; } diff --git a/src/lck-windows.c b/src/lck-windows.c index c9d43151..1de72935 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -193,7 +193,7 @@ static int internal_seize_lck(HANDLE lfd) { assert(lfd != INVALID_HANDLE_VALUE); /* 1) now on ?-? 
(free), get ?-E (middle) */ - jitter4testing(); + mdbx_jitter4testing(false); if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { rc = GetLastError() /* 2) something went wrong, give up */; mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, @@ -202,13 +202,13 @@ static int internal_seize_lck(HANDLE lfd) { } /* 3) now on ?-E (middle), try E-E (exclusive) */ - jitter4testing(); + mdbx_jitter4testing(false); if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive), done */ /* 5) still on ?-E (middle) */ rc = GetLastError(); - jitter4testing(); + mdbx_jitter4testing(false); if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, give up */ if (!funlock(lfd, LCK_UPPER)) { @@ -220,11 +220,11 @@ static int internal_seize_lck(HANDLE lfd) { } /* 7) still on ?-E (middle), try S-E (locked) */ - jitter4testing(); + mdbx_jitter4testing(false); rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE : GetLastError(); - jitter4testing(); + mdbx_jitter4testing(false); if (rc != MDBX_RESULT_FALSE) mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "?-E(middle) >> S-E(locked)", rc); @@ -247,7 +247,7 @@ int mdbx_lck_seize(MDB_env *env) { assert(env->me_fd != INVALID_HANDLE_VALUE); if (env->me_lfd == INVALID_HANDLE_VALUE) { /* LY: without-lck mode (e.g. on read-only filesystem) */ - jitter4testing(); + mdbx_jitter4testing(false); if (!flock(env->me_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { rc = GetLastError(); mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); @@ -257,7 +257,7 @@ int mdbx_lck_seize(MDB_env *env) { } rc = internal_seize_lck(env->me_lfd); - jitter4testing(); + mdbx_jitter4testing(false); if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDB_RDONLY) == 0) { /* Check that another process don't operates in without-lck mode. * Doing such check by exclusive locking the body-part of db. 
Should be @@ -269,10 +269,10 @@ int mdbx_lck_seize(MDB_env *env) { rc = GetLastError(); mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lock-against-without-lck", rc); - jitter4testing(); + mdbx_jitter4testing(false); mdbx_lck_destroy(env); } else { - jitter4testing(); + mdbx_jitter4testing(false); if (!funlock(env->me_fd, LCK_BODY)) { rc = GetLastError(); mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, diff --git a/src/mdbx.c b/src/mdbx.c index d4610ca1..42557d94 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1412,6 +1412,7 @@ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { const MDB_reader *const r = env->me_lck->mti_readers; for (reader = -1, i = env->me_lck->mti_numreaders; --i >= 0;) { if (r[i].mr_pid) { + mdbx_jitter4testing(true); txnid_t snap = r[i].mr_txnid; if (oldest > snap) { oldest = snap; @@ -2187,9 +2188,13 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { while (1) { MDB_meta *const meta = mdbx_meta_head(txn->mt_env); + mdbx_jitter4testing(false); const txnid_t snap = meta->mm_txnid; + mdbx_jitter4testing(false); r->mr_txnid = snap; + mdbx_jitter4testing(false); mdbx_coherent_barrier(); + mdbx_jitter4testing(true); /* Snap the state from current meta-head */ txn->mt_txnid = snap; @@ -2206,21 +2211,16 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ } else { /* Not yet touching txn == env->me_txn0, it may be active */ + mdbx_jitter4testing(false); rc = mdbx_txn_lock(env); if (unlikely(rc)) return rc; + mdbx_jitter4testing(false); MDB_meta *meta = mdbx_meta_head(env); + mdbx_jitter4testing(false); txn->mt_canary = meta->mm_canary; txn->mt_txnid = meta->mm_txnid + 1; - if (unlikely(txn->mt_txnid < meta->mm_txnid)) { - mdbx_debug("txnid overflow!"); - rc = MDB_TXN_FULL; - goto bailout; - } - - txn->mt_flags = flags; - #if MDB_DEBUG if (unlikely(txn->mt_txnid == mdbx_debug_edge)) { if (!mdbx_debug_logger) @@ -2230,6 +2230,13 @@ static int 
mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { "on/off edge (txn %zu)", txn->mt_txnid); } #endif + if (unlikely(txn->mt_txnid < meta->mm_txnid)) { + mdbx_debug("txnid overflow!"); + rc = MDB_TXN_FULL; + goto bailout; + } + + txn->mt_flags = flags; txn->mt_child = NULL; txn->mt_loose_pgs = NULL; txn->mt_loose_count = 0; @@ -3834,6 +3841,13 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { off_t wanna = roundup2((env->me_maxreaders - 1) * sizeof(MDB_reader) + sizeof(MDBX_lockinfo), env->me_os_psize); +#ifndef NDEBUG + err = mdbx_ftruncate(env->me_lfd, size = 0); + if (unlikely(err != MDB_SUCCESS)) + return err; +#endif + mdbx_jitter4testing(false); + if (size != wanna) { err = mdbx_ftruncate(env->me_lfd, wanna); if (unlikely(err != MDB_SUCCESS)) From 150f2c0afc23672265c82c3a9a0175cc1851a99a Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 27 Apr 2017 16:22:57 +0300 Subject: [PATCH 102/303] test: log error into stdout too. --- test/log.cc | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/test/log.cc b/test/log.cc index 3f6df2ce..69a7558c 100644 --- a/test/log.cc +++ b/test/log.cc @@ -106,24 +106,23 @@ bool output(loglevel priority, const char *format, va_list ap) { if (rc != MDB_SUCCESS) failure_perror("localtime_r()", rc); - last = (priority >= error) ? stderr : stdout; + last = stdout; fprintf(last, "[ %02d%02d%02d-%02d:%02d:%02d.%06d_%05u %-10s %.4s ] %s" /* TODO */, - tm.tm_year - 100, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min, + tm.tm_year - 100, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, chrono::fractional2us(now.fractional), osal_getpid(), prefix.c_str(), level2str(priority), suffix.c_str()); vfprintf(last, format, ap); size_t len = strlen(format); char end = len ? 
format[len - 1] : '\0'; + switch (end) { default: putc('\n', last); case '\n': fflush(last); last = nullptr; - if (priority > info) - fflushall(); case ' ': case '_': case ':': @@ -135,6 +134,16 @@ bool output(loglevel priority, const char *format, va_list ap) { case '\0': break; } + + if (priority >= error && last != stderr) { + fprintf(stderr, "[ %05u %-10s %.4s ] %s", osal_getpid(), prefix.c_str(), + level2str(priority), suffix.c_str()); + vfprintf(stderr, format, ap); + if (end != '\n') + putc('\n', stderr); + fflush(stderr); + } + return true; } From f8903ca7c7cec661c80d0274f976e63689b9e708 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 27 Apr 2017 18:06:41 +0300 Subject: [PATCH 103/303] mdbx: minor refine/speedup mdbx_cursor_put(). --- src/mdbx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 42557d94..e2edc31c 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -5828,15 +5828,14 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, /* Too big for a sub-page, convert to sub-DB */ fp_flags &= ~P_SUBP; prep_subDB: + dummy.md_xsize = 0; + dummy.md_flags = 0; if (mc->mc_db->md_flags & MDB_DUPFIXED) { fp_flags |= P_LEAF2; dummy.md_xsize = fp->mp_leaf2_ksize; dummy.md_flags = MDB_DUPFIXED; if (mc->mc_db->md_flags & MDB_INTEGERDUP) dummy.md_flags |= MDB_INTEGERKEY; - } else { - dummy.md_xsize = 0; - dummy.md_flags = 0; } dummy.md_depth = 1; dummy.md_branch_pages = 0; From 678e4f5738068b1681123ed889a135580022b019 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 27 Apr 2017 18:13:39 +0300 Subject: [PATCH 104/303] mdbx: more check/debug around mdbx_pread() and mdbx_read_header(). 
--- src/mdbx.c | 9 +++++---- src/osal.c | 4 ++-- src/osal.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index e2edc31c..c3dcfb30 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3286,19 +3286,20 @@ static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { /* We don't know the page size on first time, so use a minimum value. */ int rc = mdbx_pread(env->me_fd, &buf, sizeof(buf), offset); if (rc != MDB_SUCCESS) { - mdbx_debug("read: %s", mdbx_strerror(rc)); + mdbx_debug("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(buf), rc, + mdbx_strerror(rc)); return rc; } MDB_page *p = (MDB_page *)&buf; if (!F_ISSET(p->mp_flags, P_META)) { - mdbx_debug("page %zu not a meta page", p->mp_pgno); + mdbx_debug("page %zu not a meta-page", p->mp_pgno); return MDB_INVALID; } MDB_meta *m = PAGEDATA(p); if (m->mm_magic != MDB_MAGIC) { - mdbx_debug("meta has invalid magic"); + mdbx_debug("meta[%u] has invalid magic", offset); return MDB_INVALID; } @@ -3310,7 +3311,7 @@ static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { /* LY: check signature as a checksum */ if (META_IS_STEADY(m) && m->mm_datasync_sign != mdbx_meta_sign(m)) { - mdbx_debug("steady-meta has invalid checksum"); + mdbx_debug("steady-meta[%u] has invalid checksum", offset); continue; } diff --git a/src/osal.c b/src/osal.c index 3ad6e573..17bc602f 100644 --- a/src/osal.c +++ b/src/osal.c @@ -325,9 +325,9 @@ int mdbx_closefile(mdbx_filehandle_t fd) { } int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) { -#if defined(_WIN32) || defined(_WIN64) if (bytes > MAX_WRITE) - return ERROR_INVALID_PARAMETER; + return MDBX_EINVAL; +#if defined(_WIN32) || defined(_WIN64) OVERLAPPED ov; ov.hEvent = 0; diff --git a/src/osal.h b/src/osal.h index 20110ed7..dbb8b7a1 100644 --- a/src/osal.h +++ b/src/osal.h @@ -308,7 +308,7 @@ static __inline void mdbx_invalidate_cache(void *addr, size_t nbytes) { 
/*----------------------------------------------------------------------------*/ /* max bytes to write in one call */ -#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) +#define MAX_WRITE UINT32_C(0x3fff0000) /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is From 82f053a68557462311f54baf95fbcc1df5234e30 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 27 Apr 2017 18:14:19 +0300 Subject: [PATCH 105/303] mdbx: more debug for lck-seize. --- src/mdbx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/mdbx.c b/src/mdbx.c index c3dcfb30..48595b13 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3834,6 +3834,9 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { if (MDBX_IS_ERROR(rc)) return rc; + mdbx_debug("lck-setup: %s ", + (rc == MDBX_RESULT_TRUE) ? "exclusive" : "shared"); + err = mdbx_filesize(env->me_lfd, &size); if (unlikely(err != MDB_SUCCESS)) return err; @@ -4019,6 +4022,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, /* LY: downgrade lock only if exclusive access not requested. * in case exclusive==1, just leave value as is. */ rc = mdbx_lck_downgrade(env); + mdbx_debug("lck-downgrade: rc %i ", rc); if (rc != MDB_SUCCESS) goto bailout; } From 4e9b734b5296463ab20f1e07c89ee6239db18916 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 27 Apr 2017 18:06:07 +0300 Subject: [PATCH 106/303] mdbx: fix mdbx_read_header(). --- src/mdbx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 48595b13..1923f0d7 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3275,9 +3275,9 @@ fail: * before mapping it into memory. 
*/ static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { assert(offsetof(MDB_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); + memset(meta, 0, sizeof(MDB_meta)); meta->mm_datasync_sign = MDB_DATASIGN_WEAK; - meta->mm_txnid = 0; - off_t offset = 0; + unsigned offset = 0; /* Read both meta pages so we can use the latest one. */ for (int loops_left = 2; --loops_left >= 0;) { From 8848df34bcc2698e8f144302a488ac2de2b7894b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 27 Apr 2017 18:33:05 +0300 Subject: [PATCH 107/303] mdbx: check gcc/clang version for stdatomic.h --- src/osal.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/osal.h b/src/osal.h index dbb8b7a1..46ad1b3f 100644 --- a/src/osal.h +++ b/src/osal.h @@ -447,7 +447,9 @@ int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid); /*----------------------------------------------------------------------------*/ -#if (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) +#if (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) && \ + (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ + !(defined(__GNUC__) || defined(__clang__))) #include #elif defined(__GNUC__) || defined(__clang__) /* LY: nothing required */ From 29f01cf3f487af22f42997f10973fa10e84503c9 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 10 May 2017 19:16:14 +0300 Subject: [PATCH 108/303] mdbx: use PRIuPTR/PRIiPTR/PRIxPTR instead of %z. 
--- src/mdbx.c | 197 +++++++++++++++++++++++------------------- src/tools/mdbx_chk.c | 169 ++++++++++++++++++++---------------- src/tools/mdbx_dump.c | 3 +- src/tools/mdbx_load.c | 50 ++++++----- src/tools/mdbx_stat.c | 43 ++++----- test/osal-unix.cc | 3 +- test/osal-windows.cc | 2 +- test/test.cc | 8 +- 8 files changed, 260 insertions(+), 215 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 1923f0d7..73248250 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -899,20 +899,20 @@ mdbx_page_list(MDB_page *mp) case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; case P_OVERFLOW: - mdbx_print("Overflow page %zu pages %u%s\n", + mdbx_print("Overflow page %" PRIuPTR " pages %u%s\n", pgno, mp->mp_pages, state); return; case P_META: - mdbx_print("Meta-page %zu txnid %zu\n", + mdbx_print("Meta-page %" PRIuPTR " txnid %" PRIuPTR "\n", pgno, ((MDB_meta *)PAGEDATA(mp))->mm_txnid); return; default: - mdbx_print("Bad page %zu flags 0x%X\n", pgno, mp->mp_flags); + mdbx_print("Bad page %" PRIuPTR " flags 0x%X\n", pgno, mp->mp_flags); return; } nkeys = NUMKEYS(mp); - mdbx_print("%s %zu numkeys %u%s\n", type, pgno, nkeys, state); + mdbx_print("%s %" PRIuPTR " numkeys %u%s\n", type, pgno, nkeys, state); for (i=0; imn_data; nsize = NODESIZE + key.mv_size; if (IS_BRANCH(mp)) { - mdbx_print("key %u: page %zu, %s\n", i, NODEPGNO(node), DKEY(&key)); + mdbx_print("key %u: page %" PRIuPTR ", %s\n", i, NODEPGNO(node), DKEY(&key)); total += nsize; } else { if (F_ISSET(node->mn_flags, F_BIGDATA)) @@ -1162,7 +1162,7 @@ static int mdbx_page_loose(MDB_cursor *mc, MDB_page *mp) { } } if (loose) { - mdbx_debug("loosen db %d page %zu", DDBI(mc), mp->mp_pgno); + mdbx_debug("loosen db %d page %" PRIuPTR "", DDBI(mc), mp->mp_pgno); MDB_page **link = &NEXT_LOOSE_PAGE(mp); if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) { mdbx_kill_page(txn->mt_env, pgno); @@ -1496,7 +1496,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page 
**mp, int flags) { np = txn->mt_loose_pgs; txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); txn->mt_loose_count--; - mdbx_debug("db %d use loose page %zu", DDBI(mc), np->mp_pgno); + mdbx_debug("db %d use loose page %" PRIuPTR "", DDBI(mc), np->mp_pgno); ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize); *mp = np; return MDB_SUCCESS; @@ -1644,10 +1644,11 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { env->me_pglast = last; if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { - mdbx_debug_extra("IDL read txn %zu root %zu num %u, IDL", last, - txn->mt_dbs[FREE_DBI].md_root, i); + mdbx_debug_extra("IDL read txn %" PRIuPTR " root %" PRIuPTR + " num %u, IDL", + last, txn->mt_dbs[FREE_DBI].md_root, i); for (j = i; j; j--) - mdbx_debug_extra_print(" %zu", idl[j]); + mdbx_debug_extra_print(" %" PRIuPTR "", idl[j]); mdbx_debug_extra_print("\n"); } @@ -1709,7 +1710,8 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { * don't make a steady-sync, but only a legacy-mode checkpoint, * just for resume reclaiming only, not for data consistency. */ - mdbx_debug("kick-gc: head %zu/%c, tail %zu/%c, oldest %zu", + mdbx_debug("kick-gc: head %" PRIuPTR "/%c, tail %" PRIuPTR + "/%c, oldest %" PRIuPTR "", head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N', tail->mm_txnid, META_IS_WEAK(tail) ? 
'W' : 'N', oldest); @@ -1885,7 +1887,8 @@ static int mdbx_page_touch(MDB_cursor *mc) { (rc = mdbx_page_alloc(mc, 1, &np, MDBX_ALLOC_ALL)))) goto fail; pgno = np->mp_pgno; - mdbx_debug("touched db %d page %zu -> %zu", DDBI(mc), mp->mp_pgno, pgno); + mdbx_debug("touched db %d page %" PRIuPTR " -> %" PRIuPTR "", DDBI(mc), + mp->mp_pgno, pgno); mdbx_cassert(mc, mp->mp_pgno != pgno); mdbx_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); /* Update the parent page, if any, to point to the new page */ @@ -2227,7 +2230,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { mdbx_runtime_flags |= MDBX_DBG_TRACE | MDBX_DBG_EXTRA | MDBX_DBG_AUDIT | MDBX_DBG_ASSERT; mdbx_debug_log(MDBX_DBG_EDGE, __FUNCTION__, __LINE__, - "on/off edge (txn %zu)", txn->mt_txnid); + "on/off edge (txn %" PRIuPTR ")", txn->mt_txnid); } #endif if (unlikely(txn->mt_txnid < meta->mm_txnid)) { @@ -2295,9 +2298,10 @@ int mdbx_txn_renew(MDB_txn *txn) { rc = mdbx_txn_renew0(txn, MDB_TXN_RDONLY); if (rc == MDB_SUCCESS) { - mdbx_debug("renew txn %zu%c %p on mdbenv %p, root page %zu", txn->mt_txnid, - (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *)txn, - (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); + mdbx_debug("renew txn %" PRIuPTR "%c %p on mdbenv %p, root page %" PRIuPTR + "", + txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); } return rc; } @@ -2408,9 +2412,10 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, } else { txn->mt_signature = MDBX_MT_SIGNATURE; *ret = txn; - mdbx_debug("begin txn %zu%c %p on mdbenv %p, root page %zu", txn->mt_txnid, - (flags & MDB_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, - txn->mt_dbs[MAIN_DBI].md_root); + mdbx_debug("begin txn %" PRIuPTR "%c %p on mdbenv %p, root page %" PRIuPTR + "", + txn->mt_txnid, (flags & MDB_RDONLY) ? 
'r' : 'w', (void *)txn, + (void *)env, txn->mt_dbs[MAIN_DBI].md_root); } return rc; @@ -2472,7 +2477,7 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { /* Export or close DBI handles opened in this txn */ mdbx_dbis_update(txn, mode & MDB_END_UPDATE); - mdbx_debug("%s txn %zu%c %p on mdbenv %p, root page %zu", + mdbx_debug("%s txn %" PRIuPTR "%c %p on mdbenv %p, root page %" PRIuPTR "", names[mode & MDB_END_OPMASK], txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); @@ -2715,10 +2720,11 @@ again: if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { unsigned i = free_pgs[0]; - mdbx_debug_extra("IDL write txn %zu root %zu num %u, IDL", + mdbx_debug_extra("IDL write txn %" PRIuPTR " root %" PRIuPTR + " num %u, IDL", txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); for (; i; i--) - mdbx_debug_extra_print(" %zu", free_pgs[i]); + mdbx_debug_extra_print(" %" PRIuPTR "", free_pgs[i]); mdbx_debug_extra_print("\n"); } continue; @@ -2990,7 +2996,7 @@ static int mdbx_page_flush(MDB_txn *txn, int keep) { wpos = pos; wsize = 0; } - mdbx_debug("committing page %zu", pgno); + mdbx_debug("committing page %" PRIuPTR "", pgno); next_pos = pos + size; iov[n].iov_len = size; iov[n].iov_base = (char *)dp; @@ -3211,8 +3217,9 @@ int mdbx_txn_commit(MDB_txn *txn) { !(txn->mt_flags & (MDB_TXN_DIRTY | MDB_TXN_SPILLS))) goto done; - mdbx_debug("committing txn %zu %p on mdbenv %p, root page %zu", txn->mt_txnid, - (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); + mdbx_debug( + "committing txn %" PRIuPTR " %p on mdbenv %p, root page %" PRIuPTR "", + txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); /* Update DB root pointers */ if (txn->mt_numdbs > CORE_DBS) { @@ -3293,7 +3300,7 @@ static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { MDB_page *p = (MDB_page *)&buf; if (!F_ISSET(p->mp_flags, P_META)) { - mdbx_debug("page %zu not a meta-page", p->mp_pgno); + mdbx_debug("page %" PRIuPTR 
" not a meta-page", p->mp_pgno); return MDB_INVALID; } @@ -3461,8 +3468,9 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { MDB_meta *stay = mdbx_env_meta_flipflop(env, (MDB_meta *)target); mdbx_debug( - "writing meta %d (%s, was %zu/%s, stay %s %zu/%s), root %zu, " - "txn_id %zu, %s", + "writing meta %d (%s, was %" PRIuPTR "/%s, stay %s %" PRIuPTR + "/%s), root %" PRIuPTR ", " + "txn_id %" PRIuPTR ", %s", offset >= (off_t)env->me_psize, target == head ? "head" : "tail", target->mm_txnid, META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" @@ -4074,13 +4082,13 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, mdbx_debug("opened database version %u, pagesize %u", meta->mm_version, env->me_psize); - mdbx_debug("using meta page %d, txn %zu", toggle, meta->mm_txnid); + mdbx_debug("using meta page %d, txn %" PRIuPTR "", toggle, meta->mm_txnid); mdbx_debug("depth: %u", db->md_depth); - mdbx_debug("entries: %zu", db->md_entries); - mdbx_debug("branch pages: %zu", db->md_branch_pages); - mdbx_debug("leaf pages: %zu", db->md_leaf_pages); - mdbx_debug("overflow pages: %zu", db->md_overflow_pages); - mdbx_debug("root: %zu", db->md_root); + mdbx_debug("entries: %" PRIuPTR "", db->md_entries); + mdbx_debug("branch pages: %" PRIuPTR "", db->md_branch_pages); + mdbx_debug("leaf pages: %" PRIuPTR "", db->md_leaf_pages); + mdbx_debug("overflow pages: %" PRIuPTR "", db->md_overflow_pages); + mdbx_debug("root: %" PRIuPTR "", db->md_root); } #endif @@ -4349,7 +4357,7 @@ static MDB_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, nkeys = NUMKEYS(mp); - mdbx_debug("searching %u keys in %s %spage %zu", nkeys, + mdbx_debug("searching %u keys in %s %spage %" PRIuPTR "", nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? 
"sub-" : "", mdbx_dbg_pgno(mp)); @@ -4390,7 +4398,7 @@ static MDB_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, if (IS_LEAF(mp)) mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY(&nodekey), rc); else - mdbx_debug("found branch index %u [%s -> %zu], rc = %i", i, + mdbx_debug("found branch index %u [%s -> %" PRIuPTR "], rc = %i", i, DKEY(&nodekey), NODEPGNO(node), rc); if (rc == 0) break; @@ -4433,7 +4441,7 @@ mdbx_cursor_adjust(MDB_cursor *mc, func) /** Pop a page off the top of the cursor's stack. */ static void mdbx_cursor_pop(MDB_cursor *mc) { if (mc->mc_snum) { - mdbx_debug("popped page %zu off db %d cursor %p", + mdbx_debug("popped page %" PRIuPTR " off db %d cursor %p", mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); mc->mc_snum--; @@ -4449,8 +4457,8 @@ static void mdbx_cursor_pop(MDB_cursor *mc) { * Set #MDB_TXN_ERROR on failure. */ static int mdbx_cursor_push(MDB_cursor *mc, MDB_page *mp) { - mdbx_debug("pushing page %zu on db %d cursor %p", mp->mp_pgno, DDBI(mc), - (void *)mc); + mdbx_debug("pushing page %" PRIuPTR " on db %d cursor %p", mp->mp_pgno, + DDBI(mc), (void *)mc); if (unlikely(mc->mc_snum >= CURSOR_STACK)) { mc->mc_txn->mt_flags |= MDB_TXN_ERROR; @@ -4509,7 +4517,7 @@ static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, } if (unlikely(pgno >= txn->mt_next_pgno)) { - mdbx_debug("page %zu not found", pgno); + mdbx_debug("page %" PRIuPTR " not found", pgno); txn->mt_flags |= MDB_TXN_ERROR; return MDB_PAGE_NOTFOUND; } @@ -4537,13 +4545,14 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { MDB_node *node; indx_t i; - mdbx_debug("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp)); + mdbx_debug("branch page %" PRIuPTR " has %u keys", mp->mp_pgno, + NUMKEYS(mp)); /* Don't assert on branch pages in the FreeDB. We can get here * while in the process of rebalancing a FreeDB branch page; we must * let that proceed. 
ITS#8336 */ mdbx_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); - mdbx_debug("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0))); + mdbx_debug("found index 0 to page %" PRIuPTR "", NODEPGNO(NODEPTR(mp, 0))); if (flags & (MDB_PS_FIRST | MDB_PS_LAST)) { i = 0; @@ -4597,7 +4606,7 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { return MDB_CORRUPTED; } - mdbx_debug("found leaf page %zu for key [%s]", mp->mp_pgno, + mdbx_debug("found leaf page %" PRIuPTR " for key [%s]", mp->mp_pgno, key ? DKEY(key) : "null"); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; @@ -4696,7 +4705,7 @@ static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { mc->mc_snum = 1; mc->mc_top = 0; - mdbx_debug("db %d root page %zu has flags 0x%X", DDBI(mc), root, + mdbx_debug("db %d root page %" PRIuPTR " has flags 0x%X", DDBI(mc), root, mc->mc_pg[0]->mp_flags); if (flags & MDB_PS_MODIFY) { @@ -4719,7 +4728,7 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDB_page *mp) { MDB_ID pn = pg << 1; int rc; - mdbx_debug("free ov page %zu (%u)", pg, ovpages); + mdbx_debug("free ov page %" PRIuPTR " (%u)", pg, ovpages); /* If the page is dirty or on the spill list we just acquired it, * so we should give it back to our current free list, if any. * Otherwise put it onto the list of pages we freed in this txn. 
@@ -4805,7 +4814,7 @@ static __inline int mdbx_node_read(MDB_cursor *mc, MDB_node *leaf, data->mv_size = NODEDSZ(leaf); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); if (unlikely((rc = mdbx_page_get(mc, pgno, &omp, NULL)) != 0)) { - mdbx_debug("read overflow page %zu failed", pgno); + mdbx_debug("read overflow page %" PRIuPTR " failed", pgno); return rc; } data->mv_data = PAGEDATA(omp); @@ -4855,7 +4864,7 @@ static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { } mdbx_cursor_pop(mc); - mdbx_debug("parent page is page %zu, index %u", + mdbx_debug("parent page is page %" PRIuPTR ", index %u", mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); if (move_right @@ -4931,8 +4940,8 @@ static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } - mdbx_debug("cursor_next: top page is %zu in cursor %p", mdbx_dbg_pgno(mp), - (void *)mc); + mdbx_debug("cursor_next: top page is %" PRIuPTR " in cursor %p", + mdbx_dbg_pgno(mp), (void *)mc); if (mc->mc_flags & C_DEL) { mc->mc_flags ^= C_DEL; goto skip; @@ -4945,13 +4954,14 @@ static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, return rc; } mp = mc->mc_pg[mc->mc_top]; - mdbx_debug("next page is %zu, key index %u", mp->mp_pgno, + mdbx_debug("next page is %" PRIuPTR ", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); } else mc->mc_ki[mc->mc_top]++; skip: - mdbx_debug("==> cursor points to page %zu with %u keys, key index %u", + mdbx_debug("==> cursor points to page %" PRIuPTR + " with %u keys, key index %u", mdbx_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mp)) { @@ -5020,8 +5030,8 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } - mdbx_debug("cursor_prev: top page is %zu in cursor %p", mdbx_dbg_pgno(mp), - (void *)mc); + mdbx_debug("cursor_prev: top page is %" PRIuPTR " in cursor %p", + mdbx_dbg_pgno(mp), (void *)mc); mc->mc_flags &= ~(C_EOF | C_DEL); @@ -5032,12 +5042,13 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val 
*key, MDB_val *data, } mp = mc->mc_pg[mc->mc_top]; mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; - mdbx_debug("prev page is %zu, key index %u", mp->mp_pgno, + mdbx_debug("prev page is %" PRIuPTR ", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); } else mc->mc_ki[mc->mc_top]--; - mdbx_debug("==> cursor points to page %zu with %u keys, key index %u", + mdbx_debug("==> cursor points to page %" PRIuPTR + " with %u keys, key index %u", mdbx_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mp)) { @@ -5607,8 +5618,9 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return MDB_BAD_VALSIZE; } - mdbx_debug("==> put db %d key [%s], size %zu, data size %zu", DDBI(mc), - DKEY(key), key ? key->mv_size : 0, data->mv_size); + mdbx_debug("==> put db %d key [%s], size %" PRIuPTR ", data size %" PRIuPTR + "", + DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size); int dupdata_flag = 0; if (flags & MDB_CURRENT) { @@ -6225,7 +6237,7 @@ static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, if (unlikely((rc = mdbx_page_alloc(mc, num, &np, MDBX_ALLOC_ALL)))) return rc; - mdbx_debug("allocated new mpage %zu, page size %u", np->mp_pgno, + mdbx_debug("allocated new mpage %" PRIuPTR ", page size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize); np->mp_flags = flags | P_DIRTY; np->mp_lower = (PAGEHDRSZ - PAGEBASE); @@ -6323,7 +6335,8 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, mdbx_cassert(mc, mp->mp_upper >= mp->mp_lower); - mdbx_debug("add to %s %spage %zu index %i, data size %zu key size %zu [%s]", + mdbx_debug("add to %s %spage %" PRIuPTR " index %i, data size %" PRIuPTR + " key size %" PRIuPTR " [%s]", IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mdbx_dbg_pgno(mp), indx, data ? data->mv_size : 0, key ? key->mv_size : 0, key ? 
DKEY(key) : "null"); @@ -6358,15 +6371,15 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); int rc; /* Put data on overflow page. */ - mdbx_debug( - "data size is %zu, node would be %zu, put data on overflow page", - data->mv_size, node_size + data->mv_size); + mdbx_debug("data size is %" PRIuPTR ", node would be %" PRIuPTR + ", put data on overflow page", + data->mv_size, node_size + data->mv_size); node_size = EVEN(node_size + sizeof(pgno_t)); if ((ssize_t)node_size > room) goto full; if ((rc = mdbx_page_new(mc, P_OVERFLOW, ovpages, &ofp))) return rc; - mdbx_debug("allocated overflow page %zu", ofp->mp_pgno); + mdbx_debug("allocated overflow page %" PRIuPTR "", ofp->mp_pgno); flags |= F_BIGDATA; goto update; } else { @@ -6423,10 +6436,11 @@ update: return MDB_SUCCESS; full: - mdbx_debug("not enough room in page %zu, got %u ptrs", mdbx_dbg_pgno(mp), - NUMKEYS(mp)); - mdbx_debug("upper-lower = %u - %u = %zd", mp->mp_upper, mp->mp_lower, room); - mdbx_debug("node size = %zu", node_size); + mdbx_debug("not enough room in page %" PRIuPTR ", got %u ptrs", + mdbx_dbg_pgno(mp), NUMKEYS(mp)); + mdbx_debug("upper-lower = %u - %u = %" PRIiPTR "", mp->mp_upper, mp->mp_lower, + room); + mdbx_debug("node size = %" PRIuPTR "", node_size); mc->mc_txn->mt_flags |= MDB_TXN_ERROR; return MDB_PAGE_FULL; } @@ -6444,7 +6458,7 @@ static void mdbx_node_del(MDB_cursor *mc, int ksize) { MDB_node *node; char *base; - mdbx_debug("delete node %u on %s page %zu", indx, + mdbx_debug("delete node %u on %s page %" PRIuPTR "", indx, IS_LEAF(mp) ? 
"leaf" : "branch", mdbx_dbg_pgno(mp)); numkeys = NUMKEYS(mp); mdbx_cassert(mc, indx < numkeys); @@ -6595,7 +6609,7 @@ static void mdbx_xcursor_init1(MDB_cursor *mc, MDB_node *node) { mx->mx_db.md_flags |= MDB_INTEGERKEY; } } - mdbx_debug("Sub-db -%u root page %zu", mx->mx_cursor.mc_dbi, + mdbx_debug("Sub-db -%u root page %" PRIuPTR "", mx->mx_cursor.mc_dbi, mx->mx_db.md_root); mx->mx_dbflag = DB_VALID | DB_USRVALID | DB_DUPDATA; /* #if UINT_MAX < SIZE_MAX @@ -6629,7 +6643,7 @@ static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, } mx->mx_db = src_mx->mx_db; mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; - mdbx_debug("Sub-db -%u root page %zu", mx->mx_cursor.mc_dbi, + mdbx_debug("Sub-db -%u root page %" PRIuPTR "", mx->mx_cursor.mc_dbi, mx->mx_db.md_root); } @@ -6830,8 +6844,9 @@ static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { char kbuf2[DKBUF_MAXKEYSIZE * 2 + 1]; k2.mv_data = NODEKEY(node); k2.mv_size = node->mn_ksize; - mdbx_debug("update key %u (ofs %u) [%s] to [%s] on page %zu", indx, ptr, - mdbx_dkey(&k2, kbuf2, sizeof(kbuf2)), DKEY(key), mp->mp_pgno); + mdbx_debug("update key %u (ofs %u) [%s] to [%s] on page %" PRIuPTR "", indx, + ptr, mdbx_dkey(&k2, kbuf2, sizeof(kbuf2)), DKEY(key), + mp->mp_pgno); } /* Sizes must be 2-byte aligned. */ @@ -6973,7 +6988,8 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { return rc; } - mdbx_debug("moving %s node %u [%s] on page %zu to node %u on page %zu", + mdbx_debug("moving %s node %u [%s] on page %" PRIuPTR + " to node %u on page %" PRIuPTR "", IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? 
"leaf" : "branch", csrc->mc_ki[csrc->mc_top], DKEY(&key), csrc->mc_pg[csrc->mc_top]->mp_pgno, cdst->mc_ki[cdst->mc_top], @@ -7056,7 +7072,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - mdbx_debug("update separator for source page %zu to [%s]", + mdbx_debug("update separator for source page %" PRIuPTR " to [%s]", csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)); mdbx_cursor_copy(csrc, &mn); mn.mc_snum--; @@ -7086,7 +7102,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - mdbx_debug("update separator for destination page %zu to [%s]", + mdbx_debug("update separator for destination page %" PRIuPTR " to [%s]", cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)); mdbx_cursor_copy(cdst, &mn); mn.mc_snum--; @@ -7129,7 +7145,8 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; - mdbx_debug("merging page %zu into %zu", psrc->mp_pgno, pdst->mp_pgno); + mdbx_debug("merging page %" PRIuPTR " into %" PRIuPTR "", psrc->mp_pgno, + pdst->mp_pgno); mdbx_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ mdbx_cassert(csrc, cdst->mc_snum > 1); @@ -7187,8 +7204,9 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { } } - mdbx_debug("dst page %zu now has %u keys (%.1f%% filled)", pdst->mp_pgno, - NUMKEYS(pdst), (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10); + mdbx_debug("dst page %" PRIuPTR " now has %u keys (%.1f%% filled)", + pdst->mp_pgno, NUMKEYS(pdst), + (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10); /* Unlink the src page from parent and add to free list. 
*/ @@ -7296,7 +7314,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { minkeys = 1; thresh = FILL_THRESHOLD; } - mdbx_debug("rebalancing %s page %zu (has %u keys, %.1f%% full)", + mdbx_debug("rebalancing %s page %" PRIuPTR " (has %u keys, %.1f%% full)", IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", mdbx_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), @@ -7304,7 +7322,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { - mdbx_debug("no need to rebalance page %zu, above fill threshold", + mdbx_debug("no need to rebalance page %" PRIuPTR ", above fill threshold", mdbx_dbg_pgno(mc->mc_pg[mc->mc_top])); return MDB_SUCCESS; } @@ -7435,7 +7453,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { fromleft = 1; } - mdbx_debug("found neighbor page %zu (%u keys, %.1f%% full)", + mdbx_debug("found neighbor page %" PRIuPTR " (%u keys, %.1f%% full)", mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10); @@ -7649,7 +7667,8 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, newindx = mc->mc_ki[mc->mc_top]; nkeys = NUMKEYS(mp); - mdbx_debug("-----> splitting %s page %zu and adding [%s] at index %i/%i", + mdbx_debug("-----> splitting %s page %" PRIuPTR + " and adding [%s] at index %i/%i", IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys); @@ -7657,7 +7676,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, if ((rc = mdbx_page_new(mc, mp->mp_flags, 1, &rp))) return rc; rp->mp_leaf2_ksize = mp->mp_leaf2_ksize; - mdbx_debug("new right sibling: page %zu", rp->mp_pgno); + mdbx_debug("new right sibling: page %" PRIuPTR "", rp->mp_pgno); /* Usually when splitting the root page, the cursor * height is 1. 
But when called from mdbx_update_key, @@ -7675,7 +7694,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; - mdbx_debug("root split! new root = %zu", pp->mp_pgno); + mdbx_debug("root split! new root = %" PRIuPTR "", pp->mp_pgno); new_root = mc->mc_db->md_depth++; /* Add left (implicit) pointer. */ @@ -7693,7 +7712,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, ptop = 0; } else { ptop = mc->mc_top - 1; - mdbx_debug("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno); + mdbx_debug("parent branch page is %" PRIuPTR "", mc->mc_pg[ptop]->mp_pgno); } mdbx_cursor_copy(mc, &mn); @@ -9076,11 +9095,11 @@ int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { if (mr[i].mr_pid) { txnid_t txnid = mr[i].mr_txnid; if (txnid == ~(txnid_t)0) - snprintf(buf, sizeof(buf), "%10d %zx -\n", (int)mr[i].mr_pid, + snprintf(buf, sizeof(buf), "%10d %" PRIxPTR " -\n", (int)mr[i].mr_pid, (size_t)mr[i].mr_tid); else - snprintf(buf, sizeof(buf), "%10d %zx %zu\n", (int)mr[i].mr_pid, - (size_t)mr[i].mr_tid, txnid); + snprintf(buf, sizeof(buf), "%10d %" PRIxPTR " %" PRIuPTR "\n", + (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); if (first) { first = 0; @@ -9203,7 +9222,7 @@ int __cold mdbx_reader_check0(MDB_env *env, int rdt_locked, int *dead) { /* clean it */ for (unsigned j = i; j < snap_nreaders; j++) { if (mr[j].mr_pid == pid) { - mdbx_debug("clear stale reader pid %u txn %zd", (unsigned)pid, + mdbx_debug("clear stale reader pid %u txn %" PRIiPTR "", (unsigned)pid, mr[j].mr_txnid); mr[j].mr_pid = 0; count++; diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index cc0efdd9..f8effef2 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -177,7 +178,7 @@ static void problem_add(const char *object, size_t entry_number, p->count++; if (verbose > 1) { - 
print(" %s #%zu: %s", object, entry_number, msg); + print(" %s #%" PRIuPTR ": %s", object, entry_number, msg); if (extra) { va_list args; printf(" ("); @@ -209,7 +210,7 @@ static size_t problems_pop(struct problem *list) { for (i = 0; problems_list; ++i) { struct problem *p = problems_list->pr_next; count += problems_list->count; - print("%s%s (%zu)", i ? ", " : "", problems_list->caption, + print("%s%s (%" PRIuPTR ")", i ? ", " : "", problems_list->caption, problems_list->count); free(problems_list); problems_list = p; @@ -236,9 +237,9 @@ static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, if (verbose > 2 && (!only_subdb || strcmp(only_subdb, dbi) == 0)) { if (pgnumber == 1) - print(" %s-page %zu", type, pgno); + print(" %s-page %" PRIuPTR "", type, pgno); else - print(" %s-span %zu[%u]", type, pgno, pgnumber); + print(" %s-span %" PRIuPTR "[%u]", type, pgno, pgnumber); print(" of %s: header %i, payload %i, unused %i\n", dbi, header_bytes, payload_bytes, unused_bytes); } @@ -246,13 +247,15 @@ static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, walk.pgcount += pgnumber; if (unused_bytes < 0 || (size_t)unused_bytes > page_size) - problem_add("page", pgno, "illegal unused-bytes", "%zu < %i < %zu", 0, - unused_bytes, stat.ms_psize); + problem_add("page", pgno, "illegal unused-bytes", + "%" PRIuPTR " < %i < %" PRIuPTR "", 0, unused_bytes, + stat.ms_psize); if (header_bytes < (int)sizeof(long) || (size_t)header_bytes >= stat.ms_psize - sizeof(long)) - problem_add("page", pgno, "illegal header-length", "%zu < %i < %zu", - sizeof(long), header_bytes, stat.ms_psize - sizeof(long)); + problem_add("page", pgno, "illegal header-length", + "%" PRIuPTR " < %i < %" PRIuPTR "", sizeof(long), + header_bytes, stat.ms_psize - sizeof(long)); if (payload_bytes < 1) { if (nentries > 1) { problem_add("page", pgno, "zero size-of-entry", @@ -269,9 +272,9 @@ static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char 
*dbi, } if (page_bytes != page_size) { - problem_add("page", pgno, "misused", "%zu != %zu (%ih + %ip + %iu)", - page_size, page_bytes, header_bytes, payload_bytes, - unused_bytes); + problem_add("page", pgno, "misused", + "%" PRIuPTR " != %" PRIuPTR " (%ih + %ip + %iu)", page_size, + page_bytes, header_bytes, payload_bytes, unused_bytes); if (page_size > page_bytes) walk.dbi_lost_bytes[index] += page_size - page_bytes; } else { @@ -282,8 +285,8 @@ static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, if (pgnumber) { do { if (pgno >= lastpgno) - problem_add("page", pgno, "wrong page-no", "%zu > %zi", pgno, - lastpgno); + problem_add("page", pgno, "wrong page-no", + "%" PRIuPTR " > %" PRIiPTR "", pgno, lastpgno); else if (walk.pagemap[pgno]) problem_add("page", pgno, "already used", "in %s", walk.dbi_names[walk.pagemap[pgno]]); @@ -316,20 +319,23 @@ static int handle_freedb(size_t record_number, MDB_val *key, MDB_val *data) { size_t *iptr = data->mv_data, txnid = *(size_t *)key->mv_data; if (key->mv_size != sizeof(txnid)) - problem_add("entry", record_number, "wrong txn-id size", "key-size %zi", - key->mv_size); + problem_add("entry", record_number, "wrong txn-id size", + "key-size %" PRIiPTR "", key->mv_size); else if (txnid < 1 || txnid > info.me_last_txnid) - problem_add("entry", record_number, "wrong txn-id", "%zu", txnid); + problem_add("entry", record_number, "wrong txn-id", "%" PRIuPTR "", txnid); if (data->mv_size < sizeof(size_t) || data->mv_size % sizeof(size_t)) - problem_add("entry", record_number, "wrong idl size", "%zu", data->mv_size); + problem_add("entry", record_number, "wrong idl size", "%" PRIuPTR "", + data->mv_size); else { number = *iptr++; if (number >= MDB_IDL_UM_MAX) - problem_add("entry", record_number, "wrong idl length", "%zi", number); + problem_add("entry", record_number, "wrong idl length", "%" PRIiPTR "", + number); else if ((number + 1) * sizeof(size_t) != data->mv_size) - problem_add("entry", 
record_number, "mismatch idl length", "%zi != %zu", - number * sizeof(size_t), data->mv_size); + problem_add("entry", record_number, "mismatch idl length", + "%" PRIiPTR " != %" PRIuPTR "", number * sizeof(size_t), + data->mv_size); else { freedb_pages += number; if (info.me_tail_txnid > txnid) @@ -338,11 +344,11 @@ static int handle_freedb(size_t record_number, MDB_val *key, MDB_val *data) { pg = iptr[i]; if (pg < 2 /* META_PAGE */ || pg > info.me_last_pgno) problem_add("entry", record_number, "wrong idl entry", - "2 < %zi < %zi", pg, info.me_last_pgno); + "2 < %" PRIiPTR " < %" PRIiPTR "", pg, info.me_last_pgno); else if (pg <= prev) { bad = " [bad sequence]"; - problem_add("entry", record_number, "bad sequence", "%zi <= %zi", pg, - prev); + problem_add("entry", record_number, "bad sequence", + "%" PRIiPTR " <= %" PRIiPTR "", pg, prev); } prev = pg; pg += span; @@ -350,7 +356,8 @@ static int handle_freedb(size_t record_number, MDB_val *key, MDB_val *data) { ; } if (verbose > 2 && !only_subdb) { - print(" transaction %zu, %zd pages, maxspan %zd%s\n", + print(" transaction %" PRIuPTR ", %" PRIiPTR + " pages, maxspan %" PRIiPTR "%s\n", *(size_t *)key->mv_data, number, span, bad); if (verbose > 3) { int j = number - 1; @@ -359,7 +366,7 @@ static int handle_freedb(size_t record_number, MDB_val *key, MDB_val *data) { for (span = 1; --j >= 0 && iptr[j] == pg + span; span++) ; if (span > 1) - print(" %9zu[%zd]\n", pg, span); + print(" %9zu[%" PRIiPTR "]\n", pg, span); else print(" %9zu\n", pg); } @@ -459,8 +466,10 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { } print(" (0x%02X)\n", flags); if (verbose > 1) { - print(" - page size %u, entries %zu\n", ms.ms_psize, ms.ms_entries); - print(" - b-tree depth %u, pages: branch %zu, leaf %zu, overflow %zu\n", + print(" - page size %u, entries %" PRIuPTR "\n", ms.ms_psize, + ms.ms_entries); + print(" - b-tree depth %u, pages: branch %" PRIuPTR ", leaf %" PRIuPTR + ", overflow %" PRIuPTR "\n", 
ms.ms_depth, ms.ms_branch_pages, ms.ms_leaf_pages, ms.ms_overflow_pages); } @@ -486,23 +495,24 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { if (key.mv_size > maxkeysize) { problem_add("entry", record_count, "key length exceeds max-key-size", - "%zu > %zu", key.mv_size, maxkeysize); + "%" PRIuPTR " > %" PRIuPTR "", key.mv_size, maxkeysize); } else if ((flags & MDB_INTEGERKEY) && key.mv_size != sizeof(size_t) && key.mv_size != sizeof(int)) { - problem_add("entry", record_count, "wrong key length", "%zu != %zu", - key.mv_size, sizeof(size_t)); + problem_add("entry", record_count, "wrong key length", + "%" PRIuPTR " != %" PRIuPTR "", key.mv_size, sizeof(size_t)); } if ((flags & MDB_INTEGERDUP) && data.mv_size != sizeof(size_t) && data.mv_size != sizeof(int)) { - problem_add("entry", record_count, "wrong data length", "%zu != %zu", - data.mv_size, sizeof(size_t)); + problem_add("entry", record_count, "wrong data length", + "%" PRIuPTR " != %" PRIuPTR "", data.mv_size, sizeof(size_t)); } if (prev_key.mv_data) { if ((flags & MDB_DUPFIXED) && prev_data.mv_size != data.mv_size) { problem_add("entry", record_count, "different data length", - "%zu != %zu", prev_data.mv_size, data.mv_size); + "%" PRIuPTR " != %" PRIuPTR "", prev_data.mv_size, + data.mv_size); } int cmp = mdbx_cmp(txn, dbi, &prev_key, &key); @@ -521,9 +531,9 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { } } else if (verbose) { if (flags & MDB_INTEGERKEY) - print(" - fixed key-size %zu\n", key.mv_size); + print(" - fixed key-size %" PRIuPTR "\n", key.mv_size); if (flags & (MDB_INTEGERDUP | MDB_DUPFIXED)) - print(" - fixed data-size %zu\n", data.mv_size); + print(" - fixed data-size %" PRIuPTR "\n", data.mv_size); } if (handler) { @@ -547,12 +557,13 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { if (record_count != ms.ms_entries) problem_add("entry", record_count, "differentent number of entries", - "%zu != 
%zu", record_count, ms.ms_entries); + "%" PRIuPTR " != %" PRIuPTR "", record_count, ms.ms_entries); bailout: problems_count = problems_pop(saved_list); if (!silent && verbose) { - print(" - summary: %u records, %u dups, %zu key's bytes, %zu data's " - "bytes, %zu problems\n", + print(" - summary: %u records, %u dups, %" PRIuPTR " key's bytes, %" PRIuPTR + " data's " + "bytes, %" PRIuPTR " problems\n", record_count, dups, key_bytes, data_bytes, problems_count); fflush(NULL); } @@ -730,34 +741,35 @@ int main(int argc, char *argv[]) { "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ for (i = 0; sf[i + 1] && info.me_mapsize / k > 1000.0; ++i) k *= 1024; - print(" - map size %zu (%.2f %cb)\n", info.me_mapsize, info.me_mapsize / k, - sf[i]); + print(" - map size %" PRIuPTR " (%.2f %cb)\n", info.me_mapsize, + info.me_mapsize / k, sf[i]); if (info.me_mapaddr) print(" - mapaddr %p\n", info.me_mapaddr); - print(" - pagesize %u, max keysize %zu, max readers %u\n", stat.ms_psize, - maxkeysize, info.me_maxreaders); - print(" - transactions: last %zu, bottom %zu, lag reading %zi\n", + print(" - pagesize %u, max keysize %" PRIuPTR ", max readers %u\n", + stat.ms_psize, maxkeysize, info.me_maxreaders); + print(" - transactions: last %" PRIuPTR ", bottom %" PRIuPTR + ", lag reading %" PRIiPTR "\n", info.me_last_txnid, info.me_tail_txnid, info.me_last_txnid - info.me_tail_txnid); - print(" - meta-1: %s %zu, %s", meta_synctype(info.me_meta1_sign), + print(" - meta-1: %s %" PRIuPTR ", %s", meta_synctype(info.me_meta1_sign), info.me_meta1_txnid, meta_lt(info.me_meta1_txnid, info.me_meta1_sign, info.me_meta2_txnid, info.me_meta2_sign) ? 
"tail" : "head"); if (info.me_meta1_txnid > info.me_last_txnid) - print(", rolled-back %zu (%zu >>> %zu)", + print(", rolled-back %" PRIuPTR " (%" PRIuPTR " >>> %" PRIuPTR ")", info.me_meta1_txnid - info.me_last_txnid, info.me_meta1_txnid, info.me_last_txnid); print("\n"); - print(" - meta-2: %s %zu, %s", meta_synctype(info.me_meta2_sign), + print(" - meta-2: %s %" PRIuPTR ", %s", meta_synctype(info.me_meta2_sign), info.me_meta2_txnid, meta_lt(info.me_meta2_txnid, info.me_meta2_sign, info.me_meta1_txnid, info.me_meta1_sign) ? "tail" : "head"); if (info.me_meta2_txnid > info.me_last_txnid) - print(", rolled-back %zu (%zu >>> %zu)", + print(", rolled-back %" PRIuPTR " (%" PRIuPTR " >>> %" PRIuPTR ")", info.me_meta2_txnid - info.me_last_txnid, info.me_meta2_txnid, info.me_last_txnid); print("\n"); @@ -770,7 +782,8 @@ int main(int argc, char *argv[]) { if (!meta_lt(info.me_meta1_txnid, info.me_meta1_sign, info.me_meta2_txnid, info.me_meta2_sign) && info.me_meta1_txnid != info.me_last_txnid) { - print(" - meta-1 txn-id mismatch last-txn-id (%zi != %zi)\n", + print(" - meta-1 txn-id mismatch last-txn-id (%" PRIiPTR " != %" PRIiPTR + ")\n", info.me_meta1_txnid, info.me_last_txnid); ++problems_meta; } @@ -778,7 +791,8 @@ int main(int argc, char *argv[]) { if (!meta_lt(info.me_meta2_txnid, info.me_meta2_sign, info.me_meta1_txnid, info.me_meta1_sign) && info.me_meta2_txnid != info.me_last_txnid) { - print(" - meta-2 txn-id mismatch last-txn-id (%zi != %zi)\n", + print(" - meta-2 txn-id mismatch last-txn-id (%" PRIiPTR " != %" PRIiPTR + ")\n", info.me_meta2_txnid, info.me_last_txnid); ++problems_meta; } @@ -790,8 +804,9 @@ int main(int argc, char *argv[]) { ? 
info.me_meta2_txnid : info.me_meta1_txnid; if (last != info.me_last_txnid) { - print(" - last-meta mismatch last-txn-id (%zi != %zi)\n", last, - info.me_last_txnid); + print(" - last-meta mismatch last-txn-id (%" PRIiPTR " != %" PRIiPTR + ")\n", + last, info.me_last_txnid); ++problems_meta; } } else if (verbose) { @@ -839,14 +854,15 @@ int main(int argc, char *argv[]) { if (verbose) { size_t total_page_bytes = walk.pgcount * stat.ms_psize; - print(" - dbi pages: %zu total", walk.pgcount); + print(" - dbi pages: %" PRIuPTR " total", walk.pgcount); if (verbose > 1) for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) - print(", %s %zu", walk.dbi_names[i], walk.dbi_pages[i]); - print(", %s %zu\n", walk.dbi_names[0], walk.dbi_pages[0]); + print(", %s %" PRIuPTR "", walk.dbi_names[i], walk.dbi_pages[i]); + print(", %s %" PRIuPTR "\n", walk.dbi_names[0], walk.dbi_pages[0]); if (verbose > 1) { - print(" - space info: total %zu bytes, payload %zu (%.1f%%), unused " - "%zu (%.1f%%)\n", + print(" - space info: total %" PRIuPTR " bytes, payload %" PRIuPTR + " (%.1f%%), unused " + "%" PRIuPTR " (%.1f%%)\n", total_page_bytes, walk.total_payload_bytes, walk.total_payload_bytes * 100.0 / total_page_bytes, total_page_bytes - walk.total_payload_bytes, @@ -854,27 +870,28 @@ int main(int argc, char *argv[]) { total_page_bytes); for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { size_t dbi_bytes = walk.dbi_pages[i] * stat.ms_psize; - print(" %s: subtotal %zu bytes (%.1f%%), payload %zu (%.1f%%), " - "unused %zu (%.1f%%)", + print(" %s: subtotal %" PRIuPTR + " bytes (%.1f%%), payload %" PRIuPTR " (%.1f%%), " + "unused %" PRIuPTR " (%.1f%%)", walk.dbi_names[i], dbi_bytes, dbi_bytes * 100.0 / total_page_bytes, walk.dbi_payload_bytes[i], walk.dbi_payload_bytes[i] * 100.0 / dbi_bytes, dbi_bytes - walk.dbi_payload_bytes[i], (dbi_bytes - walk.dbi_payload_bytes[i]) * 100.0 / dbi_bytes); if (walk.dbi_empty_pages[i]) - print(", %zu empty pages", walk.dbi_empty_pages[i]); + print(", %" PRIuPTR " 
empty pages", walk.dbi_empty_pages[i]); if (walk.dbi_lost_bytes[i]) - print(", %zu bytes lost", walk.dbi_lost_bytes[i]); + print(", %" PRIuPTR " bytes lost", walk.dbi_lost_bytes[i]); print("\n"); } } print(" - summary: average fill %.1f%%", walk.total_payload_bytes * 100.0 / total_page_bytes); if (empty_pages) - print(", %zu empty pages", empty_pages); + print(", %" PRIuPTR " empty pages", empty_pages); if (lost_bytes) - print(", %zu bytes lost", lost_bytes); - print(", %zu problems\n", traversal_problems); + print(", %" PRIuPTR " bytes lost", lost_bytes); + print(", %" PRIuPTR " problems\n", traversal_problems); } } else if (verbose) { print("Skipping b-tree walk...\n"); @@ -889,38 +906,38 @@ int main(int argc, char *argv[]) { if (verbose) { size_t value = info.me_mapsize / stat.ms_psize; double percent = value / 100.0; - print(" - pages info: %zu total", value); - print(", allocated %zu (%.1f%%)", lastpgno, lastpgno / percent); + print(" - pages info: %" PRIuPTR " total", value); + print(", allocated %" PRIuPTR " (%.1f%%)", lastpgno, lastpgno / percent); if (verbose > 1) { value = info.me_mapsize / stat.ms_psize - lastpgno; - print(", remained %zu (%.1f%%)", value, value / percent); + print(", remained %" PRIuPTR " (%.1f%%)", value, value / percent); value = lastpgno - freedb_pages; - print(", used %zu (%.1f%%)", value, value / percent); + print(", used %" PRIuPTR " (%.1f%%)", value, value / percent); - print(", gc %zu (%.1f%%)", freedb_pages, freedb_pages / percent); + print(", gc %" PRIuPTR " (%.1f%%)", freedb_pages, freedb_pages / percent); value = freedb_pages - reclaimable_pages; - print(", detained %zu (%.1f%%)", value, value / percent); + print(", detained %" PRIuPTR " (%.1f%%)", value, value / percent); - print(", reclaimable %zu (%.1f%%)", reclaimable_pages, + print(", reclaimable %" PRIuPTR " (%.1f%%)", reclaimable_pages, reclaimable_pages / percent); } value = info.me_mapsize / stat.ms_psize - lastpgno + reclaimable_pages; - print(", available %zu 
(%.1f%%)\n", value, value / percent); + print(", available %" PRIuPTR " (%.1f%%)\n", value, value / percent); } if (problems_maindb == 0 && problems_freedb == 0) { if (!dont_traversal && (exclusive || locktxn)) { if (walk.pgcount != lastpgno - freedb_pages) { - error("used pages mismatch (%zu != %zu)\n", walk.pgcount, - lastpgno - freedb_pages); + error("used pages mismatch (%" PRIuPTR " != %" PRIuPTR ")\n", + walk.pgcount, lastpgno - freedb_pages); } if (walk.dbi_pages[0] != freedb_pages) { - error("gc pages mismatch (%zu != %zu)\n", walk.dbi_pages[0], - freedb_pages); + error("gc pages mismatch (%" PRIuPTR " != %" PRIuPTR ")\n", + walk.dbi_pages[0], freedb_pages); } } else if (verbose) { print(" - skip check used and gc pages (btree-traversal with " @@ -958,7 +975,7 @@ bailout: total_problems += problems_meta; if (total_problems || problems_maindb || problems_freedb) { - print("Total %zu error(s) is detected, elapsed %.3f seconds.\n", + print("Total %" PRIuPTR " error(s) is detected, elapsed %.3f seconds.\n", total_problems, elapsed); if (problems_meta || problems_maindb || problems_freedb) return EXIT_FAILURE_CHECK_MAJOR; diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index c217c503..871dd55e 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -17,6 +17,7 @@ #include "../../mdbx.h" #include #include +#include #include #include #include @@ -109,7 +110,7 @@ static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) { if (name) printf("database=%s\n", name); printf("type=btree\n"); - printf("mapsize=%zu\n", info.me_mapsize); + printf("mapsize=%" PRIuPTR "\n", info.me_mapsize); if (info.me_mapaddr) printf("mapaddr=%p\n", info.me_mapaddr); printf("maxreaders=%u\n", info.me_maxreaders); diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index d9c62d59..5af8a913 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -17,6 +17,7 @@ #include "../../mdbx.h" #include #include +#include #include #include #include @@ -72,8 +73,8 @@ 
static void readhdr(void) { } else if (!strncmp(dbuf.mv_data, "VERSION=", STRLENOF("VERSION="))) { version = atoi((char *)dbuf.mv_data + STRLENOF("VERSION=")); if (version > 3) { - fprintf(stderr, "%s: line %zd: unsupported VERSION %d\n", prog, lineno, - version); + fprintf(stderr, "%s: line %" PRIiPTR ": unsupported VERSION %d\n", prog, + lineno, version); exit(EXIT_FAILURE); } } else if (!strncmp(dbuf.mv_data, "HEADER=END", STRLENOF("HEADER=END"))) { @@ -84,8 +85,8 @@ static void readhdr(void) { mode |= PRINT; else if (strncmp((char *)dbuf.mv_data + STRLENOF("FORMAT="), "bytevalue", STRLENOF("bytevalue"))) { - fprintf(stderr, "%s: line %zd: unsupported FORMAT %s\n", prog, lineno, - (char *)dbuf.mv_data + STRLENOF("FORMAT=")); + fprintf(stderr, "%s: line %" PRIiPTR ": unsupported FORMAT %s\n", prog, + lineno, (char *)dbuf.mv_data + STRLENOF("FORMAT=")); exit(EXIT_FAILURE); } } else if (!strncmp(dbuf.mv_data, "database=", STRLENOF("database="))) { @@ -98,8 +99,8 @@ static void readhdr(void) { } else if (!strncmp(dbuf.mv_data, "type=", STRLENOF("type="))) { if (strncmp((char *)dbuf.mv_data + STRLENOF("type="), "btree", STRLENOF("btree"))) { - fprintf(stderr, "%s: line %zd: unsupported type %s\n", prog, lineno, - (char *)dbuf.mv_data + STRLENOF("type=")); + fprintf(stderr, "%s: line %" PRIiPTR ": unsupported type %s\n", prog, + lineno, (char *)dbuf.mv_data + STRLENOF("type=")); exit(EXIT_FAILURE); } } else if (!strncmp(dbuf.mv_data, "mapaddr=", STRLENOF("mapaddr="))) { @@ -110,8 +111,8 @@ static void readhdr(void) { i = sscanf((char *)dbuf.mv_data + STRLENOF("mapaddr="), "%p", &info.me_mapaddr); if (i != 1) { - fprintf(stderr, "%s: line %zd: invalid mapaddr %s\n", prog, lineno, - (char *)dbuf.mv_data + STRLENOF("mapaddr=")); + fprintf(stderr, "%s: line %" PRIiPTR ": invalid mapaddr %s\n", prog, + lineno, (char *)dbuf.mv_data + STRLENOF("mapaddr=")); exit(EXIT_FAILURE); } } else if (!strncmp(dbuf.mv_data, "mapsize=", STRLENOF("mapsize="))) { @@ -119,11 +120,11 @@ 
static void readhdr(void) { ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data + STRLENOF("mapsize="), "%zu", + i = sscanf((char *)dbuf.mv_data + STRLENOF("mapsize="), "%" PRIuPTR "", &info.me_mapsize); if (i != 1) { - fprintf(stderr, "%s: line %zd: invalid mapsize %s\n", prog, lineno, - (char *)dbuf.mv_data + STRLENOF("mapsize=")); + fprintf(stderr, "%s: line %" PRIiPTR ": invalid mapsize %s\n", prog, + lineno, (char *)dbuf.mv_data + STRLENOF("mapsize=")); exit(EXIT_FAILURE); } } else if (!strncmp(dbuf.mv_data, "maxreaders=", STRLENOF("maxreaders="))) { @@ -134,8 +135,8 @@ static void readhdr(void) { i = sscanf((char *)dbuf.mv_data + STRLENOF("maxreaders="), "%u", &info.me_maxreaders); if (i != 1) { - fprintf(stderr, "%s: line %zd: invalid maxreaders %s\n", prog, lineno, - (char *)dbuf.mv_data + STRLENOF("maxreaders=")); + fprintf(stderr, "%s: line %" PRIiPTR ": invalid maxreaders %s\n", prog, + lineno, (char *)dbuf.mv_data + STRLENOF("maxreaders=")); exit(EXIT_FAILURE); } } else { @@ -151,11 +152,13 @@ static void readhdr(void) { if (!dbflags[i].bit) { ptr = memchr(dbuf.mv_data, '=', dbuf.mv_size); if (!ptr) { - fprintf(stderr, "%s: line %zd: unexpected format\n", prog, lineno); + fprintf(stderr, "%s: line %" PRIiPTR ": unexpected format\n", prog, + lineno); exit(EXIT_FAILURE); } else { *ptr = '\0'; - fprintf(stderr, "%s: line %zd: unrecognized keyword ignored: %s\n", + fprintf(stderr, + "%s: line %" PRIiPTR ": unrecognized keyword ignored: %s\n", prog, lineno, (char *)dbuf.mv_data); } } @@ -164,7 +167,8 @@ static void readhdr(void) { } static void badend(void) { - fprintf(stderr, "%s: line %zd: unexpected end of input\n", prog, lineno); + fprintf(stderr, "%s: line %" PRIiPTR ": unexpected end of input\n", prog, + lineno); } static int unhex(unsigned char *c2) { @@ -219,8 +223,8 @@ static int readline(MDB_val *out, MDB_val *buf) { buf->mv_data = realloc(buf->mv_data, buf->mv_size * 2); if (!buf->mv_data) { Eof 
= 1; - fprintf(stderr, "%s: line %zd: out of memory, line too long\n", prog, - lineno); + fprintf(stderr, "%s: line %" PRIiPTR ": out of memory, line too long\n", + prog, lineno); return EOF; } c1 = buf->mv_data; @@ -410,8 +414,8 @@ int main(int argc, char *argv[]) { rc = readline(&data, &dbuf); if (rc) { - fprintf(stderr, "%s: line %zd: failed to read key value\n", prog, - lineno); + fprintf(stderr, "%s: line %" PRIiPTR ": failed to read key value\n", + prog, lineno); goto txn_abort; } @@ -427,8 +431,8 @@ int main(int argc, char *argv[]) { if (batch == 100) { rc = mdbx_txn_commit(txn); if (rc) { - fprintf(stderr, "%s: line %zd: txn_commit: %s\n", prog, lineno, - mdbx_strerror(rc)); + fprintf(stderr, "%s: line %" PRIiPTR ": txn_commit: %s\n", prog, + lineno, mdbx_strerror(rc)); goto env_close; } rc = mdbx_txn_begin(env, NULL, 0, &txn); @@ -449,7 +453,7 @@ int main(int argc, char *argv[]) { rc = mdbx_txn_commit(txn); txn = NULL; if (rc) { - fprintf(stderr, "%s: line %zd: txn_commit: %s\n", prog, lineno, + fprintf(stderr, "%s: line %" PRIiPTR ": txn_commit: %s\n", prog, lineno, mdbx_strerror(rc)); goto env_close; } diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index 565d58bb..46eeb94d 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -15,6 +15,7 @@ */ #include "../../mdbx.h" +#include #include #include #include @@ -23,10 +24,10 @@ static void prstat(MDBX_stat *ms) { printf(" Page size: %u\n", ms->ms_psize); printf(" Tree depth: %u\n", ms->ms_depth); - printf(" Branch pages: %zu\n", ms->ms_branch_pages); - printf(" Leaf pages: %zu\n", ms->ms_leaf_pages); - printf(" Overflow pages: %zu\n", ms->ms_overflow_pages); - printf(" Entries: %zu\n", ms->ms_entries); + printf(" Branch pages: %" PRIuPTR "\n", ms->ms_branch_pages); + printf(" Leaf pages: %" PRIuPTR "\n", ms->ms_leaf_pages); + printf(" Overflow pages: %" PRIuPTR "\n", ms->ms_overflow_pages); + printf(" Entries: %" PRIuPTR "\n", ms->ms_entries); } static void usage(char *prog) { @@ 
-121,13 +122,13 @@ int main(int argc, char *argv[]) { (void)mdbx_env_info(env, &mei, sizeof(mei)); printf("Environment Info\n"); printf(" Map address: %p\n", mei.me_mapaddr); - printf(" Map size: %zu\n", mei.me_mapsize); + printf(" Map size: %" PRIuPTR "\n", mei.me_mapsize); printf(" Page size: %u\n", mst.ms_psize); - printf(" Max pages: %zu\n", mei.me_mapsize / mst.ms_psize); - printf(" Number of pages used: %zu\n", mei.me_last_pgno + 1); - printf(" Last transaction ID: %zu\n", mei.me_last_txnid); - printf(" Tail transaction ID: %zu (%zi)\n", mei.me_tail_txnid, - mei.me_tail_txnid - mei.me_last_txnid); + printf(" Max pages: %" PRIuPTR "\n", mei.me_mapsize / mst.ms_psize); + printf(" Number of pages used: %" PRIuPTR "\n", mei.me_last_pgno + 1); + printf(" Last transaction ID: %" PRIuPTR "\n", mei.me_last_txnid); + printf(" Tail transaction ID: %" PRIuPTR " (%" PRIiPTR ")\n", + mei.me_tail_txnid, mei.me_tail_txnid - mei.me_last_txnid); printf(" Max readers: %u\n", mei.me_maxreaders); printf(" Number of readers used: %u\n", mei.me_numreaders); } else { @@ -196,7 +197,8 @@ int main(int argc, char *argv[]) { for (; i >= span && iptr[i - span] == pg; span++, pg++) ; } - printf(" Transaction %zu, %zd pages, maxspan %zd%s\n", + printf(" Transaction %" PRIuPTR ", %" PRIiPTR + " pages, maxspan %" PRIiPTR "%s\n", *(size_t *)key.mv_data, j, span, bad); if (freinfo > 2) { for (--j; j >= 0;) { @@ -204,7 +206,7 @@ int main(int argc, char *argv[]) { for (span = 1; --j >= 0 && iptr[j] == pg + span; span++) ; if (span > 1) - printf(" %9zu[%zd]\n", pg, span); + printf(" %9zu[%" PRIiPTR "]\n", pg, span); else printf(" %9zu\n", pg); } @@ -219,28 +221,29 @@ int main(int argc, char *argv[]) { printf(" Max pages: %9zu 100%%\n", value); value = mei.me_last_pgno + 1; - printf(" Number of pages used: %zu %.1f%%\n", value, value / percent); + printf(" Number of pages used: %" PRIuPTR " %.1f%%\n", value, + value / percent); value = mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1); - 
printf(" Remained: %zu %.1f%%\n", value, value / percent); + printf(" Remained: %" PRIuPTR " %.1f%%\n", value, value / percent); value = mei.me_last_pgno + 1 - pages; - printf(" Used now: %zu %.1f%%\n", value, value / percent); + printf(" Used now: %" PRIuPTR " %.1f%%\n", value, value / percent); value = pages; - printf(" Unallocated: %zu %.1f%%\n", value, value / percent); + printf(" Unallocated: %" PRIuPTR " %.1f%%\n", value, value / percent); value = pages - reclaimable; - printf(" Detained: %zu %.1f%%\n", value, value / percent); + printf(" Detained: %" PRIuPTR " %.1f%%\n", value, value / percent); value = reclaimable; - printf(" Reclaimable: %zu %.1f%%\n", value, value / percent); + printf(" Reclaimable: %" PRIuPTR " %.1f%%\n", value, value / percent); value = mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1) + reclaimable; - printf(" Available: %zu %.1f%%\n", value, value / percent); + printf(" Available: %" PRIuPTR " %.1f%%\n", value, value / percent); } else - printf(" Free pages: %zu\n", pages); + printf(" Free pages: %" PRIuPTR "\n", pages); } rc = mdbx_dbi_open(txn, subname, 0, &dbi); diff --git a/test/osal-unix.cc b/test/osal-unix.cc index 2ab3a7aa..c44ade47 100644 --- a/test/osal-unix.cc +++ b/test/osal-unix.cc @@ -85,7 +85,8 @@ void osal_setup(const std::vector &actors) { rc = pthread_cond_init(event, &condattr); if (rc) failure_perror("pthread_cond_init(shared)", rc); - log_trace("osal_setup: event(shared pthread_cond) %zu -> %p", i, event); + log_trace("osal_setup: event(shared pthread_cond) %" PRIuPTR " -> %p", i, + event); } shared->conds_size = actors.size() + 1; diff --git a/test/osal-windows.cc b/test/osal-windows.cc index 7ed4522d..c42513f5 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -72,7 +72,7 @@ void osal_setup(const std::vector &actors) { if (!hEvent) failure_perror("CreateEvent()", GetLastError()); hEvent = make_inharitable(hEvent); - log_trace("osal_setup: event %zu -> %p", i, hEvent); + log_trace("osal_setup: 
event %" PRIuPTR " -> %p", i, hEvent); events[i] = hEvent; } diff --git a/test/test.cc b/test/test.cc index 6e5dd884..a225ee34 100644 --- a/test/test.cc +++ b/test/test.cc @@ -195,16 +195,16 @@ bool testcase::wait4start() { void testcase::report(size_t nops_done) { nops_completed += nops_done; - log_verbose("== complete +%zu iteration, total %zu done", nops_done, - nops_completed); + log_verbose("== complete +%" PRIuPTR " iteration, total %" PRIuPTR " done", + nops_done, nops_completed); if (config.signal_nops && !signalled && config.signal_nops <= nops_completed) { - log_trace(">> signal(n-ops %zu)", nops_completed); + log_trace(">> signal(n-ops %" PRIuPTR ")", nops_completed); if (!global::singlemode) osal_broadcast(config.actor_id); signalled = true; - log_trace("<< signal(n-ops %zu)", nops_completed); + log_trace("<< signal(n-ops %" PRIuPTR ")", nops_completed); } } From e8a430999c3885bcf4e1dd83e3a5881d95a1452a Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 10 May 2017 19:26:56 +0300 Subject: [PATCH 109/303] mdbx: hush MSVC warnings for atomic stubs. 
--- src/osal.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/osal.h b/src/osal.h index 46ad1b3f..414bde14 100644 --- a/src/osal.h +++ b/src/osal.h @@ -454,6 +454,12 @@ int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid); #elif defined(__GNUC__) || defined(__clang__) /* LY: nothing required */ #elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ +#pragma warning(disable : 4133) /* 'function': incompatible types - from \ + 'size_t' to 'LONGLONG' */ +#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ + 'std::size_t', possible loss of data */ #pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) #pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) #elif defined(__APPLE__) @@ -518,3 +524,7 @@ static __inline bool mdbx_atomic_compare_and_swap(volatile size_t *p, size_t c, ; #endif } + +#ifdef _MSC_VER +#pragma warning(pop) +#endif From 533e01cc935748d83490ac54eeee97440754dcc8 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 10 May 2017 20:27:30 +0300 Subject: [PATCH 110/303] mdbx: change mdbx_dbi_close() API. --- mdbx.h | 2 +- src/mdbx.c | 28 ++++++++++++++++++---------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/mdbx.h b/mdbx.h index d68b7529..75c1d177 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1124,7 +1124,7 @@ LIBMDBX_API int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); * [in] env An environment handle returned by mdbx_env_create() * [in] dbi A database handle returned by mdbx_dbi_open() */ -LIBMDBX_API void mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); +LIBMDBX_API int mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); /* Empty or delete+close a database. 
* diff --git a/src/mdbx.c b/src/mdbx.c index 73248250..03c9864e 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -8801,6 +8801,8 @@ int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, return rc; } + /* FIXME: locking to avoid races ? */ + /* Done here so we cannot fail after creating a new DB */ if (unlikely((namedup = mdbx_strdup(name)) == NULL)) return MDBX_ENOMEM; @@ -8868,19 +8870,23 @@ int __cold mdbx_dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *arg, return mdbx_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); } -void mdbx_dbi_close(MDB_env *env, MDB_dbi dbi) { +int mdbx_dbi_close(MDB_env *env, MDB_dbi dbi) { char *ptr; - if (dbi < CORE_DBS || dbi >= env->me_maxdbs) - return; + if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) + return MDBX_EINVAL; + + /* FIXME: locking to avoid races ? */ ptr = env->me_dbxs[dbi].md_name.mv_data; /* If there was no name, this was already closed */ - if (ptr) { - env->me_dbxs[dbi].md_name.mv_data = NULL; - env->me_dbxs[dbi].md_name.mv_size = 0; - env->me_dbflags[dbi] = 0; - env->me_dbiseqs[dbi]++; - free(ptr); - } + if (unlikely(!ptr)) + return MDB_BAD_DBI; + + env->me_dbxs[dbi].md_name.mv_data = NULL; + env->me_dbxs[dbi].md_name.mv_size = 0; + env->me_dbflags[dbi] = 0; + env->me_dbiseqs[dbi]++; + free(ptr); + return MDB_SUCCESS; } int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags) { @@ -9015,6 +9021,8 @@ int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del) { if (unlikely(rc)) return rc; + /* FIXME: locking to avoid races ? */ + rc = mdbx_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); /* Invalidate the dropped DB's cursors */ for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) From 00081298d6f7e0f758f0cbe3e1c7683f9d040fdf Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 15 May 2017 12:08:04 +0300 Subject: [PATCH 111/303] mdbx: add mdbx_get_errno_checked(). 
--- src/lck-windows.c | 61 ++++++++++++++++-------------------- src/osal.c | 49 ++++++++++++++--------------- src/osal.h | 78 +++++++++++++++++++++++++++++++---------------- 3 files changed, 102 insertions(+), 86 deletions(-) diff --git a/src/lck-windows.c b/src/lck-windows.c index 1de72935..7654f635 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -128,7 +128,7 @@ static __inline BOOL funlock(mdbx_filehandle_t fd, off_t offset, size_t bytes) { int mdbx_txn_lock(MDB_env *env) { if (flock(env->me_fd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_BODY)) return MDB_SUCCESS; - return GetLastError(); + return mdbx_get_errno_checked(); } void mdbx_txn_unlock(MDB_env *env) { @@ -154,7 +154,7 @@ int mdbx_rdt_lock(MDB_env *env) { /* transite from S-? (used) to S-E (locked), e.g. exlcusive lock upper-part */ if (flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) return MDB_SUCCESS; - return GetLastError(); + return mdbx_get_errno_checked(); } void mdbx_rdt_unlock(MDB_env *env) { @@ -195,7 +195,7 @@ static int internal_seize_lck(HANDLE lfd) { /* 1) now on ?-? 
(free), get ?-E (middle) */ mdbx_jitter4testing(false); if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { - rc = GetLastError() /* 2) something went wrong, give up */; + rc = mdbx_get_errno_checked() /* 2) something went wrong, give up */; mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "?-?(free) >> ?-E(middle)", rc); return rc; @@ -207,22 +207,21 @@ static int internal_seize_lck(HANDLE lfd) { return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive), done */ /* 5) still on ?-E (middle) */ - rc = GetLastError(); + rc = mdbx_get_errno_checked(); mdbx_jitter4testing(false); if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, give up */ - if (!funlock(lfd, LCK_UPPER)) { - rc = GetLastError(); + if (!funlock(lfd, LCK_UPPER)) mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "?-E(middle) >> ?-?(free)", rc); - } + "?-E(middle) >> ?-?(free)", GetLastError()); return rc; } /* 7) still on ?-E (middle), try S-E (locked) */ mdbx_jitter4testing(false); - rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE - : GetLastError(); + rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) + ? MDBX_RESULT_FALSE + : mdbx_get_errno_checked(); mdbx_jitter4testing(false); if (rc != MDBX_RESULT_FALSE) @@ -231,11 +230,9 @@ static int internal_seize_lck(HANDLE lfd) { /* 8) now on S-E (locked) or still on ?-E (middle), * transite to S-? (used) or ?-? (free) */ - if (!funlock(lfd, LCK_UPPER)) { - rc = GetLastError(); + if (!funlock(lfd, LCK_UPPER)) mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "X-E(locked/middle) >> X-?(used/free)", rc); - } + "X-E(locked/middle) >> X-?(used/free)", GetLastError()); /* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */ return rc; @@ -249,7 +246,7 @@ int mdbx_lck_seize(MDB_env *env) { /* LY: without-lck mode (e.g. 
on read-only filesystem) */ mdbx_jitter4testing(false); if (!flock(env->me_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { - rc = GetLastError(); + rc = mdbx_get_errno_checked(); mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); return rc; } @@ -266,18 +263,16 @@ int mdbx_lck_seize(MDB_env *env) { * - we can't lock meta-pages, otherwise other process could get an error * while opening db in valid (non-conflict) mode. */ if (!flock(env->me_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) { - rc = GetLastError(); + rc = mdbx_get_errno_checked(); mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lock-against-without-lck", rc); mdbx_jitter4testing(false); mdbx_lck_destroy(env); } else { mdbx_jitter4testing(false); - if (!funlock(env->me_fd, LCK_BODY)) { - rc = GetLastError(); + if (!funlock(env->me_fd, LCK_BODY)) mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "unlock-against-without-lck", rc); - } + "unlock-against-without-lck", GetLastError()); } } @@ -291,24 +286,20 @@ int mdbx_lck_downgrade(MDB_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* 1) must be at E-E (exclusive), transite to ?_E (middle) */ - if (!funlock(env->me_lfd, LCK_LOWER)) { - rc = GetLastError(); + if (!funlock(env->me_lfd, LCK_LOWER)) mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "E-E(exclusive) >> ?-E(middle)", rc); - } + "E-E(exclusive) >> ?-E(middle)", GetLastError()); /* 2) now at ?-E (middle), transite to S-E (locked) */ if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { - rc = GetLastError() /* 3) something went wrong, give up */; + rc = mdbx_get_errno_checked() /* 3) something went wrong, give up */; return rc; } /* 4) got S-E (locked), continue transition to S-? 
(used) */ - if (!funlock(env->me_lfd, LCK_UPPER)) { - rc = GetLastError(); + if (!funlock(env->me_lfd, LCK_UPPER)) mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "S-E(locked) >> S-?(used)", rc); - } + "S-E(locked) >> S-?(used)", GetLastError()); } return MDB_SUCCESS /* 5) now at S-? (used), done */; } @@ -320,14 +311,14 @@ void mdbx_lck_destroy(MDB_env *env) { /* double `unlock` for robustly remove overlapped shared/exclusive locks */ while (funlock(env->me_lfd, LCK_LOWER)) ; - rc = GetLastError(); + rc = mdbx_get_errno_checked(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); while (funlock(env->me_lfd, LCK_UPPER)) ; - rc = GetLastError(); + rc = mdbx_get_errno_checked(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); @@ -338,21 +329,21 @@ void mdbx_lck_destroy(MDB_env *env) { * releases such locks via deferred queues) */ while (funlock(env->me_fd, LCK_BODY)) ; - rc = GetLastError(); + rc = mdbx_get_errno_checked(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); while (funlock(env->me_fd, LCK_META)) ; - rc = GetLastError(); + rc = mdbx_get_errno_checked(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); while (funlock(env->me_fd, LCK_WHOLE)) ; - rc = GetLastError(); + rc = mdbx_get_errno_checked(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); @@ -386,7 +377,7 @@ int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid) { rc = WaitForSingleObject(hProcess, 0); CloseHandle(hProcess); } else { - rc = GetLastError(); + rc = mdbx_get_errno_checked(); } switch (rc) { diff --git a/src/osal.c b/src/osal.c index 17bc602f..459b9f49 100644 --- a/src/osal.c +++ b/src/osal.c @@ -22,7 +22,7 @@ static int waitstatus2errcode(DWORD result) { case WAIT_OBJECT_0: return MDB_SUCCESS; case WAIT_FAILED: - return GetLastError(); + return mdbx_get_errno_checked(); case WAIT_ABANDONED: return ERROR_ABANDONED_WAIT_0; case WAIT_IO_COMPLETION: @@ -167,7 +167,7 @@ void 
mdbx_memalign_free(void *ptr) { int mdbx_mutex_init(mdbx_mutex_t *mutex) { #if defined(_WIN32) || defined(_WIN64) *mutex = CreateMutex(NULL, FALSE, NULL); - return *mutex ? MDB_SUCCESS : GetLastError(); + return *mutex ? MDB_SUCCESS : mdbx_get_errno_checked(); #else return pthread_mutex_init(mutex, NULL); #endif @@ -175,7 +175,7 @@ int mdbx_mutex_init(mdbx_mutex_t *mutex) { int mdbx_mutex_destroy(mdbx_mutex_t *mutex) { #if defined(_WIN32) || defined(_WIN64) - return CloseHandle(*mutex) ? MDB_SUCCESS : GetLastError(); + return CloseHandle(*mutex) ? MDB_SUCCESS : mdbx_get_errno_checked(); #else return pthread_mutex_destroy(mutex); #endif @@ -192,7 +192,7 @@ int mdbx_mutex_lock(mdbx_mutex_t *mutex) { int mdbx_mutex_unlock(mdbx_mutex_t *mutex) { #if defined(_WIN32) || defined(_WIN64) - return ReleaseMutex(*mutex) ? MDB_SUCCESS : GetLastError(); + return ReleaseMutex(*mutex) ? MDB_SUCCESS : mdbx_get_errno_checked(); #else return pthread_mutex_unlock(mutex); #endif @@ -203,7 +203,7 @@ int mdbx_mutex_unlock(mdbx_mutex_t *mutex) { int mdbx_cond_init(mdbx_cond_t *cond) { #if defined(_WIN32) || defined(_WIN64) *cond = CreateEvent(NULL, FALSE, FALSE, NULL); - return *cond ? MDB_SUCCESS : GetLastError(); + return *cond ? MDB_SUCCESS : mdbx_get_errno_checked(); #else return pthread_cond_init(cond, NULL); #endif @@ -212,7 +212,7 @@ int mdbx_cond_init(mdbx_cond_t *cond) { #ifndef mdbx_cond_destroy int mdbx_cond_destroy(mdbx_cond_t *cond) { #if defined(_WIN32) || defined(_WIN64) - return CloseHandle(*cond) ? MDB_SUCCESS : GetLastError(); + return CloseHandle(*cond) ? MDB_SUCCESS : mdbx_get_errno_checked(); #else return pthread_cond_destroy(cond); #endif @@ -221,7 +221,7 @@ int mdbx_cond_destroy(mdbx_cond_t *cond) { int mdbx_cond_signal(mdbx_cond_t *cond) { #if defined(_WIN32) || defined(_WIN64) - return SetEvent(*cond) ? MDB_SUCCESS : GetLastError(); + return SetEvent(*cond) ? 
MDB_SUCCESS : mdbx_get_errno_checked(); #else return pthread_cond_signal(cond); #endif @@ -286,14 +286,14 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode, CreationDisposition, FlagsAndAttributes, NULL); if (*fd == INVALID_HANDLE_VALUE) - return GetLastError(); - if ((flags & O_CREAT) && GetLastError() != ERROR_ALREADY_EXISTS) { + return mdbx_get_errno_checked(); + if ((flags & O_CREAT) && mdbx_get_errno_checked() != ERROR_ALREADY_EXISTS) { /* set FILE_ATTRIBUTE_NOT_CONTENT_INDEXED for new file */ DWORD FileAttributes = GetFileAttributesA(pathname); if (FileAttributes == INVALID_FILE_ATTRIBUTES || !SetFileAttributesA(pathname, FileAttributes | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED)) { - int rc = GetLastError(); + int rc = mdbx_get_errno_checked(); CloseHandle(*fd); *fd = INVALID_HANDLE_VALUE; return rc; @@ -318,7 +318,7 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode, int mdbx_closefile(mdbx_filehandle_t fd) { #if defined(_WIN32) || defined(_WIN64) - return CloseHandle(fd) ? MDB_SUCCESS : GetLastError(); + return CloseHandle(fd) ? MDB_SUCCESS : mdbx_get_errno_checked(); #else return (close(fd) == 0) ? MDB_SUCCESS : errno; #endif @@ -336,7 +336,7 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) { DWORD read = 0; if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) { - int rc = GetLastError(); + int rc = mdbx_get_errno_checked(); return (rc == MDB_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc; } #else @@ -363,7 +363,7 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, DWORD written; if (likely(WriteFile(fd, buf, (DWORD)bytes, &written, &ov))) return (bytes == written) ? 
MDB_SUCCESS : ERROR_WRITE_FAULT; - return GetLastError(); + return mdbx_get_errno_checked(); #else int rc; ssize_t written; @@ -418,7 +418,7 @@ int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { #if defined(_WIN32) || defined(_WIN64) DWORD written; if (unlikely(!WriteFile(fd, ptr, (DWORD)chunk, &written, NULL))) - return GetLastError(); + return mdbx_get_errno_checked(); #else ssize_t written = write(fd, ptr, chunk); if (written < 0) { @@ -452,7 +452,7 @@ int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync) { #if defined(_WIN32) || defined(_WIN64) (void)fullsync; - return FlushFileBuffers(fd) ? MDB_SUCCESS : GetLastError(); + return FlushFileBuffers(fd) ? MDB_SUCCESS : mdbx_get_errno_checked(); #elif __GLIBC_PREREQ(2, 16) || _BSD_SOURCE || _XOPEN_SOURCE || \ (__GLIBC_PREREQ(2, 8) && _POSIX_C_SOURCE >= 200112L) for (;;) { @@ -478,7 +478,7 @@ int mdbx_filesize(mdbx_filehandle_t fd, off_t *length) { #if defined(_WIN32) || defined(_WIN64) BY_HANDLE_FILE_INFORMATION info; if (!GetFileInformationByHandle(fd, &info)) - return GetLastError(); + return mdbx_get_errno_checked(); *length = info.nFileSizeLow | (uint64_t)info.nFileIndexHigh << 32; #else struct stat st; @@ -497,7 +497,7 @@ int mdbx_ftruncate(mdbx_filehandle_t fd, off_t length) { li.QuadPart = length; return (SetFilePointerEx(fd, li, NULL, FILE_BEGIN) && SetEndOfFile(fd)) ? MDB_SUCCESS - : GetLastError(); + : mdbx_get_errno_checked(); #else return ftruncate(fd, length) == 0 ? MDB_SUCCESS : errno; #endif @@ -508,7 +508,7 @@ int mdbx_ftruncate(mdbx_filehandle_t fd, off_t length) { int mdbx_thread_key_create(mdbx_thread_key_t *key) { #if defined(_WIN32) || defined(_WIN64) *key = TlsAlloc(); - return (*key != TLS_OUT_OF_INDEXES) ? MDB_SUCCESS : GetLastError(); + return (*key != TLS_OUT_OF_INDEXES) ? 
MDB_SUCCESS : mdbx_get_errno_checked(); #else return pthread_key_create(key, mdbx_rthc_dtor); #endif @@ -551,7 +551,7 @@ int mdbx_thread_create(mdbx_thread_t *thread, void *arg) { #if defined(_WIN32) || defined(_WIN64) *thread = CreateThread(NULL, 0, start_routine, arg, 0, NULL); - return *thread ? MDB_SUCCESS : GetLastError(); + return *thread ? MDB_SUCCESS : mdbx_get_errno_checked(); #else return pthread_create(thread, NULL, start_routine, arg); #endif @@ -573,7 +573,7 @@ int mdbx_msync(void *addr, size_t length, int async) { #if defined(_WIN32) || defined(_WIN64) if (async) return MDB_SUCCESS; - return FlushViewOfFile(addr, length) ? MDB_SUCCESS : GetLastError(); + return FlushViewOfFile(addr, length) ? MDB_SUCCESS : mdbx_get_errno_checked(); #else return (msync(addr, length, async ? MS_ASYNC : MS_SYNC) == 0) ? MDB_SUCCESS : errno; @@ -597,10 +597,10 @@ int mdbx_mmap(void **address, size_t length, int rw, mdbx_filehandle_t fd) { HANDLE h = CreateFileMapping(fd, NULL, rw ? PAGE_READWRITE : PAGE_READONLY, HIGH_DWORD(length), (DWORD)length, NULL); if (!h) - return GetLastError(); + return mdbx_get_errno_checked(); *address = MapViewOfFileEx(h, rw ? FILE_MAP_WRITE : FILE_MAP_READ, 0, 0, length, *address); - int rc = (*address != MAP_FAILED) ? MDB_SUCCESS : GetLastError(); + int rc = (*address != MAP_FAILED) ? MDB_SUCCESS : mdbx_get_errno_checked(); CloseHandle(h); return rc; #else @@ -613,7 +613,7 @@ int mdbx_mmap(void **address, size_t length, int rw, mdbx_filehandle_t fd) { int mdbx_munmap(void *address, size_t length) { #if defined(_WIN32) || defined(_WIN64) (void)length; - return UnmapViewOfFile(address) ? MDB_SUCCESS : GetLastError(); + return UnmapViewOfFile(address) ? MDB_SUCCESS : mdbx_get_errno_checked(); #else return (munmap(address, length) == 0) ? 
MDB_SUCCESS : errno; #endif @@ -621,7 +621,8 @@ int mdbx_munmap(void *address, size_t length) { int mdbx_mlock(const void *address, size_t length) { #if defined(_WIN32) || defined(_WIN64) - return VirtualLock((void *)address, length) ? MDB_SUCCESS : GetLastError(); + return VirtualLock((void *)address, length) ? MDB_SUCCESS + : mdbx_get_errno_checked(); #else return (mlock(address, length) == 0) ? MDB_SUCCESS : errno; #endif diff --git a/src/osal.h b/src/osal.h index 414bde14..39c4de88 100644 --- a/src/osal.h +++ b/src/osal.h @@ -307,6 +307,31 @@ static __inline void mdbx_invalidate_cache(void *addr, size_t nbytes) { /*----------------------------------------------------------------------------*/ +#ifndef mdbx_assert_fail +void mdbx_assert_fail(MDB_env *env, const char *msg, const char *func, + int line); +#endif /* mdbx_assert_fail */ + +#if __GLIBC_PREREQ(2, 1) +#define mdbx_asprintf asprintf +#else +int mdbx_asprintf(char **strp, const char *fmt, ...); +#endif + +#ifdef _MSC_VER +#ifndef snprintf +#define snprintf(buffer, buffer_size, format, ...) 
\ + _snprintf_s(buffer, buffer_size, _TRUNCATE, format, __VA_ARGS__) +#endif /* snprintf */ + +#ifndef vsnprintf +#define vsnprintf(buffer, buffer_size, format, args) \ + _vsnprintf_s(buffer, buffer_size, _TRUNCATE, format, args) +#endif /* vsnprintf */ +#endif /* _MSC_VER */ + +/*----------------------------------------------------------------------------*/ + /* max bytes to write in one call */ #define MAX_WRITE UINT32_C(0x3fff0000) @@ -331,6 +356,30 @@ static __inline char *mdbx_strdup(const char *str) { #endif } +static __inline int mdbx_get_errno(void) { +#if defined(_WIN32) || defined(_WIN64) + DWORD rc = GetLastError(); +#else + int rc = errno; +#endif + return rc; +} + +static __inline int __mdbx_get_errno_checked(const char *file, unsigned line) { +#if defined(_WIN32) || defined(_WIN64) + DWORD rc = GetLastError(); + if (unlikely(rc == MDBX_EINVAL)) + mdbx_assert_fail(nullptr, "unexpected ERROR_INVALID_PARAMETER", file, line); +#else + int rc = errno; + if (unlikely(rc == MDBX_EINVAL)) + mdbx_assert_fail(nullptr, "unexpected EINVAL", file, line); +#endif + return rc; +} + +#define mdbx_get_errno_checked() __mdbx_get_errno_checked(__FILE__, __LINE__) + int mdbx_memalign_alloc(size_t alignment, size_t bytes, void **result); void mdbx_memalign_free(void *ptr); @@ -387,19 +436,6 @@ void mdbx_osal_jitter(bool tiny); /*----------------------------------------------------------------------------*/ -#ifndef mdbx_assert_fail -void mdbx_assert_fail(MDB_env *env, const char *msg, const char *func, - int line); -#endif /* mdbx_assert_fail */ - -#if __GLIBC_PREREQ(2, 1) -#define mdbx_asprintf asprintf -#else -int mdbx_asprintf(char **strp, const char *fmt, ...); -#endif - -/*----------------------------------------------------------------------------*/ - #if defined(_WIN32) || defined(_WIN64) #undef MDBX_OSAL_LOCK #define MDBX_OSAL_LOCK_SIGN MDBX_TETRAD('f', 'l', 'c', 'k') @@ -433,20 +469,6 @@ int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid); 
/*----------------------------------------------------------------------------*/ -#ifdef _MSC_VER -#ifndef snprintf -#define snprintf(buffer, buffer_size, format, ...) \ - _snprintf_s(buffer, buffer_size, _TRUNCATE, format, __VA_ARGS__) -#endif /* snprintf */ - -#ifndef vsnprintf -#define vsnprintf(buffer, buffer_size, format, args) \ - _vsnprintf_s(buffer, buffer_size, _TRUNCATE, format, args) -#endif /* vsnprintf */ -#endif /* _MSC_VER */ - -/*----------------------------------------------------------------------------*/ - #if (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) && \ (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ !(defined(__GNUC__) || defined(__clang__))) @@ -460,6 +482,8 @@ int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid); 'size_t' to 'LONGLONG' */ #pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ 'std::size_t', possible loss of data */ +#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ + 'long', possible loss of data */ #pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) #pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) #elif defined(__APPLE__) From 5fdad46cb97fc4c86bf496812cce28f524ea35e9 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 15 May 2017 13:03:09 +0300 Subject: [PATCH 112/303] mdbx: add translation by Yandex. --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 611067c0..a23da3b7 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,8 @@ Extended LMDB, aka "Расширенная LMDB". 
[![Build Status](https://travis-ci.org/ReOpen/libmdbx.svg?branch=devel)](https://travis-ci.org/ReOpen/libmdbx) [![Build status](https://ci.appveyor.com/api/projects/status/v21jlh5kfmk85r7t/branch/devel?svg=true)](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/devel) -English version by Google [is here](https://translate.googleusercontent.com/translate_c?act=url&ie=UTF8&sl=ru&tl=en&u=https://github.com/ReOpen/libmdbx/tree/devel). +English version [by Google](https://translate.googleusercontent.com/translate_c?act=url&ie=UTF8&sl=ru&tl=en&u=https://github.com/ReOpen/libmdbx/tree/devel) +and [by Yandex](https://translate.yandex.ru/translate?url=https%3A%2F%2Fgithub.com%2FReOpen%2Flibmdbx%2Ftree%2Fdevel&lang=ru-en). ## Кратко From ed46246931ec386a5079a96b79f9c30b7112b6c0 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 15 May 2017 21:18:52 +0300 Subject: [PATCH 113/303] mdbx: rework mdbx_dbi_open(). --- mdbx.h | 52 ------------- src/mdbx.c | 214 +++++++++++++++++++++++++++++------------------------ 2 files changed, 116 insertions(+), 150 deletions(-) diff --git a/mdbx.h b/mdbx.h index 75c1d177..5a2f078b 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1137,58 +1137,6 @@ LIBMDBX_API int mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); */ LIBMDBX_API int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); -/* Set a custom key comparison function for a database. - * - * The comparison function is called whenever it is necessary to compare a - * key specified by the application with a key currently stored in the - *database. - * If no comparison function is specified, and no special key flags were - *specified - * with mdbx_dbi_open(), the keys are compared lexically, with shorter keys - *collating - * before longer keys. - * Warning: This function must be called before any data access functions are - *used, - * otherwise data corruption may occur. The same comparison function must be - *used by every - * program accessing the database, every time the database is used. 
- * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] cmp A MDB_cmp_func function - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified. - */ -LIBMDBX_API int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); - -/* Set a custom data comparison function for a MDB_DUPSORT database. - * - * This comparison function is called whenever it is necessary to compare a - *data - * item specified by the application with a data item currently stored in the - *database. - * This function only takes effect if the database was opened with the - *MDB_DUPSORT - * flag. - * If no comparison function is specified, and no special key flags were - *specified - * with mdbx_dbi_open(), the data items are compared lexically, with shorter - *items collating - * before longer items. - * Warning: This function must be called before any data access functions are - *used, - * otherwise data corruption may occur. The same comparison function must be - *used by every - * program accessing the database, every time the database is used. - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] cmp A MDB_cmp_func function - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified. - */ -LIBMDBX_API int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp); - /* Get items from a database. * * This function retrieves key/data pairs from the database. The address diff --git a/src/mdbx.c b/src/mdbx.c index 03c9864e..c8e5b70d 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2435,12 +2435,11 @@ size_t mdbx_txn_id(MDB_txn *txn) { /** Export or close DBI handles opened in this txn. 
*/ static void mdbx_dbis_update(MDB_txn *txn, int keep) { - int i; MDB_dbi n = txn->mt_numdbs; MDB_env *env = txn->mt_env; unsigned char *tdbflags = txn->mt_dbflags; - for (i = n; --i >= CORE_DBS;) { + for (unsigned i = n; --i >= CORE_DBS;) { if (tdbflags[i] & DB_NEW) { if (keep) { env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; @@ -8706,144 +8705,177 @@ static MDB_cmp_func *mdbx_default_datacmp(unsigned flags) { : mdbx_cmp_memn)); } -/** Set the default comparison functions for a database. - * Called immediately after a database is opened to set the defaults. - * The user can then override them with #mdbx_set_compare() or - * #mdbx_set_dupsort(). - * @param[in] txn A transaction handle returned by #mdbx_txn_begin() - * @param[in] dbi A database handle returned by #mdbx_dbi_open() - */ -static void mdbx_default_cmp(MDB_txn *txn, MDB_dbi dbi) { - unsigned flags = txn->mt_dbs[dbi].md_flags; - txn->mt_dbxs[dbi].md_cmp = mdbx_default_keycmp(flags); - txn->mt_dbxs[dbi].md_dcmp = mdbx_default_datacmp(flags); +static int mdbx_dbi_bind(MDB_txn *txn, const MDB_dbi dbi, unsigned user_flags, + MDB_cmp_func *keycmp, MDB_cmp_func *datacmp) { + /* LY: so, accepting only three cases for the table's flags: + * 1) user_flags and both comparators are zero + * = assume that a by-default mode/flags is requested for reading; + * 2) user_flags exactly the same + * = assume that the target mode/flags are requested properly; + * 3) user_flags differs, but table is empty and MDB_CREATE is provided + * = assume that a properly create request with custom flags; + */ + if ((user_flags ^ txn->mt_dbs[dbi].md_flags) & PERSISTENT_FLAGS) { + /* flags ara differs, check other conditions */ + if (!user_flags && (!keycmp || keycmp == txn->mt_dbxs[dbi].md_cmp) && + (!datacmp || datacmp == txn->mt_dbxs[dbi].md_dcmp)) { + /* no comparators were provided and flags are zero, + * seems that is case #1 above */ + user_flags = txn->mt_dbs[dbi].md_flags; + } else if ((user_flags & MDB_CREATE) && 
txn->mt_dbs[dbi].md_entries == 0) { + if (txn->mt_flags & MDB_TXN_RDONLY) + return /* FIXME: return extended info */ MDBX_EACCESS; + /* make sure flags changes get committed */ + txn->mt_dbs[dbi].md_flags = user_flags & PERSISTENT_FLAGS; + txn->mt_flags |= MDB_TXN_DIRTY; + } else { + return /* FIXME: return extended info */ MDB_INCOMPATIBLE; + } + } + + if (!txn->mt_dbxs[dbi].md_cmp || MDB_DEBUG) { + if (!keycmp) + keycmp = mdbx_default_keycmp(user_flags); + assert(!txn->mt_dbxs[dbi].md_cmp || txn->mt_dbxs[dbi].md_cmp == keycmp); + txn->mt_dbxs[dbi].md_cmp = keycmp; + } + + if (!txn->mt_dbxs[dbi].md_dcmp || MDB_DEBUG) { + if (!datacmp) + datacmp = mdbx_default_datacmp(user_flags); + assert(!txn->mt_dbxs[dbi].md_dcmp || txn->mt_dbxs[dbi].md_dcmp == datacmp); + txn->mt_dbxs[dbi].md_dcmp = datacmp; + } + + return MDB_SUCCESS; } -int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, - MDB_dbi *dbi) { - MDB_val key, data; - MDB_dbi i; - MDB_cursor mc; - int rc, dbflag, exact; - unsigned unused = 0, seq; - char *namedup; - size_t len; - - if (unlikely(!txn || !dbi)) +int mdbx_dbi_open_ex(MDB_txn *txn, const char *table_name, unsigned user_flags, + MDB_dbi *dbi, MDB_cmp_func *keycmp, + MDB_cmp_func *datacmp) { + if (unlikely(!txn || !dbi || (user_flags & ~VALID_FLAGS) != 0)) return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(flags & ~VALID_FLAGS)) - return MDBX_EINVAL; - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) return MDB_BAD_TXN; - /* main DB? */ - if (!name) { + /* main table? 
*/ + if (!table_name) { *dbi = MAIN_DBI; - if (flags & PERSISTENT_FLAGS) { - uint16_t f2 = flags & PERSISTENT_FLAGS; - /* make sure flag changes get committed */ - if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != - txn->mt_dbs[MAIN_DBI].md_flags) { - txn->mt_dbs[MAIN_DBI].md_flags |= f2; - txn->mt_flags |= MDB_TXN_DIRTY; - } - } - mdbx_default_cmp(txn, MAIN_DBI); - return MDB_SUCCESS; + return mdbx_dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); } if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { - mdbx_default_cmp(txn, MAIN_DBI); + txn->mt_dbxs[MAIN_DBI].md_cmp = + mdbx_default_keycmp(txn->mt_dbs[MAIN_DBI].md_flags); + txn->mt_dbxs[MAIN_DBI].md_dcmp = + mdbx_default_datacmp(txn->mt_dbs[MAIN_DBI].md_flags); } /* Is the DB already open? */ - len = strlen(name); - for (i = CORE_DBS; i < txn->mt_numdbs; i++) { - if (!txn->mt_dbxs[i].md_name.mv_size) { + size_t len = strlen(table_name); + MDB_dbi scan, slot = txn->mt_numdbs; + for (scan = txn->mt_numdbs; --scan >= CORE_DBS;) { + if (!txn->mt_dbxs[scan].md_name.mv_size) { /* Remember this free slot */ - if (!unused) - unused = i; + slot = scan; continue; } - if (len == txn->mt_dbxs[i].md_name.mv_size && - !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) { - *dbi = i; - return MDB_SUCCESS; + if (len == txn->mt_dbxs[scan].md_name.mv_size && + !strncmp(table_name, txn->mt_dbxs[scan].md_name.mv_data, len)) { + *dbi = scan; + return mdbx_dbi_bind(txn, scan, user_flags, keycmp, datacmp); } } - /* If no free slot and max hit, fail */ - if (!unused && unlikely(txn->mt_numdbs >= txn->mt_env->me_maxdbs)) + /* Fail, if no free slot and max hit */ + if (unlikely(slot >= txn->mt_env->me_maxdbs)) return MDB_DBS_FULL; - /* Cannot mix named databases with some mainDB flags */ + /* Cannot mix named table with some main-table flags */ if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT | MDB_INTEGERKEY))) - return (flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; + return (user_flags & MDB_CREATE) ? 
MDB_INCOMPATIBLE : MDB_NOTFOUND; /* Find the DB info */ - dbflag = DB_NEW | DB_VALID | DB_USRVALID; - exact = 0; + int exact = 0; + MDB_val key, data; key.mv_size = len; - key.mv_data = (void *)name; + key.mv_data = (void *)table_name; + MDB_cursor mc; mdbx_cursor_init(&mc, txn, MAIN_DBI, NULL); - rc = mdbx_cursor_set(&mc, &key, &data, MDB_SET, &exact); - if (likely(rc == MDB_SUCCESS)) { - /* make sure this is actually a DB */ + int rc = mdbx_cursor_set(&mc, &key, &data, MDB_SET, &exact); + if (unlikely(rc != MDB_SUCCESS)) { + if (rc != MDB_NOTFOUND || !(user_flags & MDB_CREATE)) + return rc; + } else { + /* make sure this is actually a table */ MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if (unlikely((node->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) return MDB_INCOMPATIBLE; - } else if (!(rc == MDB_NOTFOUND && (flags & MDB_CREATE))) { - return rc; } - /* FIXME: locking to avoid races ? */ - /* Done here so we cannot fail after creating a new DB */ - if (unlikely((namedup = mdbx_strdup(name)) == NULL)) + char *namedup = mdbx_strdup(table_name); + if (unlikely(!namedup)) return MDBX_ENOMEM; + /* FIXME: lock here (to avoid races !!!) 
*/ + + unsigned dbflag = DB_NEW | DB_VALID | DB_USRVALID; if (unlikely(rc)) { - MDB_db db_dummy; /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ + assert(rc == MDB_NOTFOUND); + MDB_db db_dummy; memset(&db_dummy, 0, sizeof(db_dummy)); db_dummy.md_root = P_INVALID; - db_dummy.md_flags = flags & PERSISTENT_FLAGS; + db_dummy.md_flags = user_flags & PERSISTENT_FLAGS; data.mv_size = sizeof(db_dummy); data.mv_data = &db_dummy; - WITH_CURSOR_TRACKING(mc, rc = mdbx_cursor_put(&mc, &key, &data, F_SUBDATA)); + WITH_CURSOR_TRACKING(mc, rc = mdbx_cursor_put(&mc, &key, &data, + F_SUBDATA | MDB_NOOVERWRITE)); + + if (unlikely(rc != MDB_SUCCESS)) + goto bailout; + dbflag |= DB_DIRTY; } - if (unlikely(rc)) { + /* Got info, register DBI in this txn */ + txn->mt_dbxs[slot].md_name.mv_data = namedup; + txn->mt_dbxs[slot].md_name.mv_size = len; + txn->mt_dbxs[slot].md_cmp = nullptr; + txn->mt_dbxs[slot].md_dcmp = nullptr; + txn->mt_dbflags[slot] = dbflag; + txn->mt_dbiseqs[slot] = (txn->mt_env->me_dbiseqs[slot] += 1); + + memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); + rc = mdbx_dbi_bind(txn, slot, user_flags, keycmp, datacmp); + if (unlikely(rc != MDB_SUCCESS)) { + assert((dbflag & DB_DIRTY) == 0); + /* cleanup slot */ + txn->mt_dbxs[slot].md_name.mv_data = NULL; + txn->mt_dbxs[slot].md_name.mv_size = 0; + txn->mt_dbflags[slot] = 0; + bailout: free(namedup); } else { - /* Got info, register DBI in this txn */ - unsigned slot = unused ? 
unused : txn->mt_numdbs; - txn->mt_dbxs[slot].md_name.mv_data = namedup; - txn->mt_dbxs[slot].md_name.mv_size = len; - txn->mt_dbflags[slot] = dbflag; - /* txn-> and env-> are the same in read txns, use - * tmp variable to avoid undefined assignment - */ - seq = ++txn->mt_env->me_dbiseqs[slot]; - txn->mt_dbiseqs[slot] = seq; - - memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); *dbi = slot; - mdbx_default_cmp(txn, slot); - if (!unused) { + if (slot == txn->mt_numdbs) txn->mt_numdbs++; - } } + /* FIXME: unlock here (to avoid races !!!) */ return rc; } +int mdbx_dbi_open(MDB_txn *txn, const char *table_name, unsigned table_flags, + MDB_dbi *dbi) { + return mdbx_dbi_open_ex(txn, table_name, table_flags, dbi, nullptr, nullptr); +} + int __cold mdbx_dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *arg, size_t bytes) { if (unlikely(!arg || !txn)) @@ -10214,20 +10246,6 @@ int mdbx_is_dirty(const MDB_txn *txn, const void *ptr) { return MDBX_RESULT_TRUE; } -int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, - MDB_dbi *pdbi, MDB_cmp_func *keycmp, - MDB_cmp_func *datacmp) { - int rc = mdbx_dbi_open(txn, name, flags, pdbi); - if (likely(rc == MDB_SUCCESS)) { - MDB_dbi dbi = *pdbi; - unsigned md_flags = txn->mt_dbs[dbi].md_flags; - txn->mt_dbxs[dbi].md_cmp = keycmp ? keycmp : mdbx_default_keycmp(md_flags); - txn->mt_dbxs[dbi].md_dcmp = - datacmp ? datacmp : mdbx_default_datacmp(md_flags); - } - return rc; -} - int mdbx_dbi_sequence(MDB_txn *txn, MDB_dbi dbi, uint64_t *result, uint64_t increment) { if (unlikely(!txn)) From 6d7bfeb87adac25cf82d097c70b673211ac6971c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 17 May 2017 20:25:16 +0300 Subject: [PATCH 114/303] mdbx: add 'pipefail' to makefile's check target. 
--- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cbe96991..6c0d99c8 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,7 @@ TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 MDBX_SRC := mdbx.h mdbx_osal.h $(addprefix src/, mdbx.c osal.c lck-posix.c defs.h bits.h osal.h midl.h) +SHELL := /bin/bash .PHONY: mdbx all install clean check coverage @@ -66,7 +67,7 @@ clean: rm -rf $(TOOLS) test/test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err check: test/test - test/test --pathname=tmp.db --dont-cleanup-after basic | tee test.log | tail -n 42 && ./mdbx_chk -vn tmp.db + (set -o pipefail; test/test --pathname=tmp.db --dont-cleanup-after basic | tee test.log | tail -n 42) && ./mdbx_chk -vn tmp.db mdbx.o: $(MDBX_SRC) Makefile $(CC) $(CFLAGS) -c src/mdbx.c -o $@ From bfa3e864b6eaa60fa8475eb32c9d8ab6b4496bfa Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 17 May 2017 15:46:44 +0300 Subject: [PATCH 115/303] test: fix logging va_copy() bug. --- test/log.cc | 33 ++++++++++++++++++++++----------- test/log.h | 8 +++++--- test/test.h | 2 +- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/test/log.cc b/test/log.cc index 69a7558c..b42ae4f6 100644 --- a/test/log.cc +++ b/test/log.cc @@ -18,8 +18,8 @@ static void fflushall() { fflush(nullptr); } void failure(const char *fmt, ...) { va_list ap; - fflush(NULL); va_start(ap, fmt); + fflushall(); logging::output(logging::failure, fmt, ap); va_end(ap); fflushall(); @@ -74,7 +74,7 @@ const char *level2str(const loglevel level) { } } -bool output(loglevel priority, const char *format, ...) { +bool output(const loglevel priority, const char *format, ...) { if (priority < level) return false; @@ -85,7 +85,7 @@ bool output(loglevel priority, const char *format, ...) 
{ return true; } -bool output(loglevel priority, const char *format, va_list ap) { +bool output(const logging::loglevel priority, const char *format, va_list ap) { if (last) { putc('\n', last); fflush(last); @@ -112,6 +112,10 @@ bool output(loglevel priority, const char *format, va_list ap) { tm.tm_year - 100, tm.tm_mon + 1, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, chrono::fractional2us(now.fractional), osal_getpid(), prefix.c_str(), level2str(priority), suffix.c_str()); + + va_list ones; + if (priority >= error) + va_copy(ones, ap); vfprintf(last, format, ap); size_t len = strlen(format); @@ -135,13 +139,16 @@ bool output(loglevel priority, const char *format, va_list ap) { break; } - if (priority >= error && last != stderr) { - fprintf(stderr, "[ %05u %-10s %.4s ] %s", osal_getpid(), prefix.c_str(), - level2str(priority), suffix.c_str()); - vfprintf(stderr, format, ap); - if (end != '\n') - putc('\n', stderr); - fflush(stderr); + if (priority >= error) { + if (last != stderr) { + fprintf(stderr, "[ %05u %-10s %.4s ] %s", osal_getpid(), prefix.c_str(), + level2str(priority), suffix.c_str()); + vfprintf(stderr, format, ones); + if (end != '\n') + putc('\n', stderr); + fflush(stderr); + } + va_end(ones); } return true; @@ -258,6 +265,10 @@ void log_error(const char *msg, ...) 
{ logging::last = nullptr; } -void log_touble(const char *where, const char *what, int errnum) { +void log_trouble(const char *where, const char *what, int errnum) { log_error("%s: %s %s", where, what, test_strerror(errnum)); } + +bool log_enabled(const logging::loglevel priority) { + return (priority >= logging::level); +} diff --git a/test/log.h b/test/log.h index 67ad5e1c..dbadb567 100644 --- a/test/log.h +++ b/test/log.h @@ -47,8 +47,9 @@ const char *level2str(const loglevel level); void setup(loglevel level, const std::string &prefix); void setup(const std::string &prefix); -bool output(loglevel priority, const char *format, va_list ap); -bool __printf_args(2, 3) output(loglevel priority, const char *format, ...); +bool output(const loglevel priority, const char *format, va_list ap); +bool __printf_args(2, 3) + output(const loglevel priority, const char *format, ...); bool feed(const char *format, va_list ap); bool __printf_args(1, 2) feed(const char *format, ...); @@ -78,7 +79,8 @@ void __printf_args(1, 2) log_notice(const char *msg, ...); void __printf_args(1, 2) log_warning(const char *msg, ...); void __printf_args(1, 2) log_error(const char *msg, ...); -void log_touble(const char *where, const char *what, int errnum); +void log_trouble(const char *where, const char *what, int errnum); +bool log_enabled(const logging::loglevel priority); #ifdef _DEBUG #define TRACE(...) log_trace(__VA_ARGS__) diff --git a/test/test.h b/test/test.h index 79c26479..f1a039f7 100644 --- a/test/test.h +++ b/test/test.h @@ -61,7 +61,7 @@ struct txn_deleter : public std::unary_function { void operator()(MDB_txn *txn) const { int rc = mdbx_txn_abort(txn); if (rc) - log_touble(__func__, "mdbx_txn_abort()", rc); + log_trouble(__func__, "mdbx_txn_abort()", rc); } }; From 015a242c89eee5f25a5f71803af6e8efb296b6da Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 17 May 2017 17:23:54 +0300 Subject: [PATCH 116/303] mdbx: rollback weak checkpoint or MDB_CORRUPTED. 
--- src/mdbx.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index c8e5b70d..8b5b38ed 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1398,8 +1398,9 @@ static __inline MDB_meta *mdbx_env_meta_flipflop(const MDB_env *env, } static __inline int mdbx_meta_lt(const MDB_meta *a, const MDB_meta *b) { - return (META_IS_STEADY(a) == META_IS_STEADY(b)) ? a->mm_txnid < b->mm_txnid - : META_IS_STEADY(b); + if (META_IS_STEADY(a) == META_IS_STEADY(b)) + return a->mm_txnid < b->mm_txnid; + return META_IS_STEADY(b); } /** Find oldest txnid still referenced. */ @@ -3815,6 +3816,39 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { if (err) return err; + MDB_meta *const head = mdbx_meta_head(env); + if (head->mm_txnid != meta->mm_txnid) { + mdbx_trace("head->mm_txnid (%" PRIuPTR ") != (%" PRIuPTR + ") meta->mm_txnid", + head->mm_txnid, meta->mm_txnid); + if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { + assert(META_IS_STEADY(meta) && !META_IS_STEADY(head)); + if (env->me_flags & MDB_RDONLY) { + mdbx_trace("exclusive, but read-only, unable recovery/rollback"); + return MDB_CORRUPTED /* LY: could not recovery/rollback */; + } + + /* LY: rollback weak checkpoint */ + MDB_meta rollback = *head; + rollback.mm_txnid = 0; + if (rollback.mm_txnid == meta->mm_txnid) + rollback = *meta; + err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDB_meta), + (uint8_t *)head - (uint8_t *)env->me_map); + if (err) + return err; + } else if (!env->me_lck) { + /* LY: without-lck (read-only) mode, so it is imposible that other + * process made weak checkpoint. 
*/ + mdbx_trace("without-lck, unable recovery/rollback"); + return MDB_CORRUPTED; + } else { + /* LY: assume just have a collision with other running process, + * or someone make a weak checkpoint */ + mdbx_trace("assume collision or online weak checkpoint"); + } + } + mdbx_env_setup_limits(env, env->me_psize); return rc; } @@ -3872,6 +3906,7 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { err = mdbx_mmap(&addr, size, true, env->me_lfd); if (unlikely(err != MDB_SUCCESS)) return err; + assert(addr != nullptr); env->me_lck = addr; #ifdef MADV_NOHUGEPAGE From 677625ead1c60183109c11d8e55f89d98d3e30b7 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 17 May 2017 20:54:16 +0300 Subject: [PATCH 117/303] mdbx: add MDBX_WANNA_RECOVERY. --- mdbx.h | 4 ++++ src/mdbx.c | 14 ++++++++------ src/tools/mdbx_chk.c | 2 ++ 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/mdbx.h b/mdbx.h index 5a2f078b..261ff1e0 100644 --- a/mdbx.h +++ b/mdbx.h @@ -336,6 +336,10 @@ typedef enum MDB_cursor_op { * - ABI version mismatch (rare case); */ #define MDBX_EBADSIGN (-30420) +/* Database should be recovered, but this could be done automatically + * right now (e.g. in readonly mode and so forth). */ +#define MDBX_WANNA_RECOVERY (-30419) + /* Statistics for a database in the environment */ typedef struct MDBX_stat { unsigned ms_psize; /* Size of a database page. 
diff --git a/src/mdbx.c b/src/mdbx.c index 8b5b38ed..89829b4b 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -718,14 +718,14 @@ static const char *__mdbx_strerr(int errnum) { "MDB_KEYEXIST: Key/data pair already exists", "MDB_NOTFOUND: No matching key/data pair found", "MDB_PAGE_NOTFOUND: Requested page not found", - "MDB_CORRUPTED: Located page was wrong data", + "MDB_CORRUPTED: Database is corrupted", "MDB_PANIC: Update of meta page failed or environment had fatal error", "MDB_VERSION_MISMATCH: DB version mismatch libmdbx", "MDB_INVALID: File is not an LMDB file", "MDB_MAP_FULL: Environment mapsize limit reached", "MDB_DBS_FULL: Too may DBI (maxdbs reached)", "MDB_READERS_FULL: Too many readers (maxreaders reached)", - NULL /* -30789 unused in MDBX */, + NULL /* MDB_TLS_FULL (-30789): unused in MDBX */, "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too " "big", "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", @@ -752,6 +752,9 @@ static const char *__mdbx_strerr(int errnum) { return "MDBX_EMULTIVAL: Unable to update multi-value for the given key"; case MDBX_EBADSIGN: return "MDBX_EBADSIGN: Wrong signature of a runtime object(s)"; + case MDBX_WANNA_RECOVERY: + return "MDBX_WANNA_RECOVERY: Database should be recovered, but this could " + "be done in a read-only mode"; default: return NULL; } @@ -3818,14 +3821,13 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { MDB_meta *const head = mdbx_meta_head(env); if (head->mm_txnid != meta->mm_txnid) { - mdbx_trace("head->mm_txnid (%" PRIuPTR ") != (%" PRIuPTR - ") meta->mm_txnid", + mdbx_trace("head->mm_txnid (%" PRIuPTR ") != (%" PRIuPTR ") meta->mm_txnid", head->mm_txnid, meta->mm_txnid); if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { assert(META_IS_STEADY(meta) && !META_IS_STEADY(head)); if (env->me_flags & MDB_RDONLY) { mdbx_trace("exclusive, but read-only, unable recovery/rollback"); - return MDB_CORRUPTED /* LY: could not recovery/rollback */; + 
return MDBX_WANNA_RECOVERY /* LY: could not recovery/rollback */; } /* LY: rollback weak checkpoint */ @@ -3841,7 +3843,7 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { /* LY: without-lck (read-only) mode, so it is imposible that other * process made weak checkpoint. */ mdbx_trace("without-lck, unable recovery/rollback"); - return MDB_CORRUPTED; + return MDBX_WANNA_RECOVERY; } else { /* LY: assume just have a collision with other running process, * or someone make a weak checkpoint */ diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index f8effef2..b99f3b2b 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -691,6 +691,8 @@ int main(int argc, char *argv[]) { rc = mdbx_env_open_ex(env, envname, envflags, 0664, &exclusive); if (rc) { error("mdbx_env_open failed, error %d %s\n", rc, mdbx_strerror(rc)); + if (rc == MDBX_WANNA_RECOVERY && (envflags & MDB_RDONLY)) + print("Please run %s in the read-write mode (with '-w' option).\n", prog); goto bailout; } if (verbose) From ec114d06b7a061bb7adbbeaf7ca1b3b754fafe97 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 17 May 2017 18:49:19 +0300 Subject: [PATCH 118/303] test: add log_extra(). --- test/log.cc | 10 ++++++++++ test/log.h | 1 + test/test.cc | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/test/log.cc b/test/log.cc index b42ae4f6..4f1df4e4 100644 --- a/test/log.cc +++ b/test/log.cc @@ -205,6 +205,16 @@ local_suffix::~local_suffix() { suffix.erase(trim_pos); } } /* namespace log */ +void log_extra(const char *msg, ...) { + if (logging::extra >= logging::level) { + va_list ap; + va_start(ap, msg); + logging::output(logging::extra, msg, ap); + va_end(ap); + } else + logging::last = nullptr; +} + void log_trace(const char *msg, ...) 
{ if (logging::trace >= logging::level) { va_list ap; diff --git a/test/log.h b/test/log.h index dbadb567..3ffc21d2 100644 --- a/test/log.h +++ b/test/log.h @@ -72,6 +72,7 @@ public: } /* namespace log */ +void __printf_args(1, 2) log_extra(const char *msg, ...); void __printf_args(1, 2) log_trace(const char *msg, ...); void __printf_args(1, 2) log_verbose(const char *msg, ...); void __printf_args(1, 2) log_info(const char *msg, ...); diff --git a/test/test.cc b/test/test.cc index a225ee34..4e8052e7 100644 --- a/test/test.cc +++ b/test/test.cc @@ -67,7 +67,7 @@ static void mdbx_debug_logger(int type, const char *function, int line, level = logging::failure; } - if (logging::output(level, "mdbx: ")) + if (logging::output(level, "mdbx: %s: ", function)) logging::feed(msg, args); if (type & MDBX_DBG_ASSERT) abort(); From 00be62e235d504ef26074cd2212930045b01642c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 17 May 2017 20:06:57 +0300 Subject: [PATCH 119/303] mdbx: cleanup formating. --- mdbx.h | 805 +++++++++++++++++++++++++---------------------------- src/bits.h | 31 +-- src/mdbx.c | 46 ++- 3 files changed, 420 insertions(+), 462 deletions(-) diff --git a/mdbx.h b/mdbx.h index 261ff1e0..a6405a53 100644 --- a/mdbx.h +++ b/mdbx.h @@ -105,15 +105,13 @@ extern "C" { /* Opaque structure for a database environment. * * A DB environment supports multiple databases, all residing in the same - * shared-memory map. - */ + * shared-memory map. */ typedef struct MDB_env MDB_env; /* Opaque structure for a transaction handle. * * All database operations require a transaction handle. Transactions may be - * read-only or read-write. - */ + * read-only or read-write. */ typedef struct MDB_txn MDB_txn; /* A handle for an individual database in the DB environment. */ @@ -131,8 +129,7 @@ typedef struct MDB_cursor MDB_cursor; * * Key sizes must be between 1 and mdbx_env_get_maxkeysize() inclusive. * The same applies to data sizes in databases with the MDB_DUPSORT flag. 
- * Other data items can in theory be from 0 to 0xffffffff bytes long. - */ + * Other data items can in theory be from 0 to 0xffffffff bytes long. */ #ifndef HAVE_STRUCT_IOVEC struct iovec { void *iov_base; @@ -189,7 +186,7 @@ typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); /* use sorted duplicates */ #define MDB_DUPSORT 0x04u /* numeric keys in native byte order, either unsigned int or mdbx_size_t. - * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdbx_size_t.) + * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdbx_size_t.) * The keys must all be of the same size. */ #define MDB_INTEGERKEY 0x08u /* with MDB_DUPSORT, sorted dup items have fixed size */ @@ -206,16 +203,14 @@ typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); #define MDB_NOOVERWRITE 0x10u /* Only for MDB_DUPSORT * For put: don't write if the key and data pair already exist. - * For mdbx_cursor_del: remove all duplicate data items. - */ + * For mdbx_cursor_del: remove all duplicate data items. */ #define MDB_NODUPDATA 0x20u /* For mdbx_cursor_put: overwrite the current key/data pair - * MDBX allows this flag for mdbx_put() for explicit overwrite/update without + * MDBX allows this flag for mdbx_put() for explicit overwrite/update without * insertion. */ #define MDB_CURRENT 0x40u /* For put: Just reserve space for data, don't copy it. Return a - * pointer to the reserved space. - */ + * pointer to the reserved space. */ #define MDB_RESERVE 0x10000u /* Data is being appended, don't split full pages. */ #define MDB_APPEND 0x20000u @@ -231,45 +226,38 @@ typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); /* Cursor Get operations. * - * This is the set of all operations for retrieving data - * using a cursor. - */ + * This is the set of all operations for retrieving data + * using a cursor. */ typedef enum MDB_cursor_op { MDB_FIRST, /* Position at first key/data item */ - MDB_FIRST_DUP, /* Position at first data item of current key. 
- Only for MDB_DUPSORT */ - MDB_GET_BOTH, /* Position at key/data pair. Only for MDB_DUPSORT */ - MDB_GET_BOTH_RANGE, /* position at key, nearest data. Only for - MDB_DUPSORT */ + MDB_FIRST_DUP, /* MDB_DUPSORT-only: Position at first data item + * of current key. */ + MDB_GET_BOTH, /* MDB_DUPSORT-only: Position at key/data pair. */ + MDB_GET_BOTH_RANGE, /* MDB_DUPSORT-only: position at key, nearest data. */ MDB_GET_CURRENT, /* Return key/data at current cursor position */ - MDB_GET_MULTIPLE, /* Return key and up to a page of duplicate data items - from current cursor position. Move - cursor to prepare - for MDB_NEXT_MULTIPLE. Only for - MDB_DUPFIXED */ + MDB_GET_MULTIPLE, /* MDB_DUPFIXED-only: Return key and up to a page of + * duplicate data items from current cursor position. + * Move cursor to prepare for MDB_NEXT_MULTIPLE.*/ MDB_LAST, /* Position at last key/data item */ - MDB_LAST_DUP, /* Position at last data item of current key. - Only for MDB_DUPSORT */ + MDB_LAST_DUP, /* MDB_DUPSORT-only: Position at last data item + * of current key. */ MDB_NEXT, /* Position at next data item */ - MDB_NEXT_DUP, /* Position at next data item of current key. - Only for MDB_DUPSORT */ - MDB_NEXT_MULTIPLE, /* Return key and up to a page of duplicate data items - from next cursor position. Move - cursor to prepare - for MDB_NEXT_MULTIPLE. Only for - MDB_DUPFIXED */ + MDB_NEXT_DUP, /* MDB_DUPSORT-only: Position at next data item + * of current key. */ + MDB_NEXT_MULTIPLE, /* MDB_DUPFIXED-only: Return key and up to a page of + * duplicate data items from next cursor position. + * Move cursor to prepare for MDB_NEXT_MULTIPLE. */ MDB_NEXT_NODUP, /* Position at first data item of next key */ MDB_PREV, /* Position at previous data item */ - MDB_PREV_DUP, /* Position at previous data item of current key. - Only for MDB_DUPSORT */ + MDB_PREV_DUP, /* MDB_DUPSORT-only: Position at previous data item + * of current key. 
*/ MDB_PREV_NODUP, /* Position at last data item of previous key */ MDB_SET, /* Position at specified key */ - MDB_SET_KEY, /* Position at specified key, return key + data */ - MDB_SET_RANGE, /* Position at first key greater than or equal to specified - key. */ - MDB_PREV_MULTIPLE /* Position at previous page and return key and up to - a page of duplicate data items. - Only for MDB_DUPFIXED */ + MDB_SET_KEY, /* Position at specified key, return both key and data */ + MDB_SET_RANGE, /* Position at first key greater than or equal to + * specified key. */ + MDB_PREV_MULTIPLE /* MDB_DUPFIXED-only: Position at previous page and + * return key and up to a page of duplicate data items. */ } MDB_cursor_op; /* Return Codes @@ -343,8 +331,7 @@ typedef enum MDB_cursor_op { /* Statistics for a database in the environment */ typedef struct MDBX_stat { unsigned ms_psize; /* Size of a database page. - This is currently the - same for all databases. */ + * This is currently the same for all databases. */ unsigned ms_depth; /* Depth (height) of the B-tree */ size_t ms_branch_pages; /* Number of internal (non-leaf) pages */ size_t ms_leaf_pages; /* Number of leaf pages */ @@ -370,8 +357,7 @@ typedef struct MDBX_envinfo { * [out] major if non-NULL, the library major version number is copied here * [out] minor if non-NULL, the library minor version number is copied here * [out] patch if non-NULL, the library patch version number is copied here - * Returns "version string" The library version as a string - */ + * Returns "version string" The library version as a string */ LIBMDBX_API const char *mdbx_version(int *major, int *minor, int *patch); /* Return a string describing a given error code. 
@@ -384,8 +370,7 @@ LIBMDBX_API const char *mdbx_version(int *major, int *minor, int *patch); * * [in] err The error code * - * Returns "error message" The description of the error - */ + * Returns "error message" The description of the error */ LIBMDBX_API const char *mdbx_strerror(int errnum); LIBMDBX_API const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen); @@ -400,8 +385,7 @@ LIBMDBX_API const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen); * * [out] env The address where the new handle will be stored * - * Returns A non-zero error value on failure and 0 on success. - */ + * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_create(MDB_env **env); /* Open an environment handle. @@ -441,137 +425,136 @@ LIBMDBX_API int mdbx_env_create(MDB_env **env); * Flush system buffers to disk only once per transaction, omit the * metadata flush. Defer that until the system flushes files to *disk, - * or next non-MDB_RDONLY commit or mdbx_env_sync(). This + * or next non-MDB_RDONLY commit or mdbx_env_sync(). This *optimization - * maintains database integrity, but a system crash may undo the + * maintains database integrity, but a system crash may undo the *last - * committed transaction. I.e. it preserves the ACI (atomicity, - * consistency, isolation) but not D (durability) database + * committed transaction. I.e. it preserves the ACI (atomicity, + * consistency, isolation) but not D (durability) database *property. - * This flag may be changed at any time using + * This flag may be changed at any time using *mdbx_env_set_flags(). - * - MDB_NOSYNC - * Don't flush system buffers to disk when committing a + * - MDB_NOSYNC + * Don't flush system buffers to disk when committing a *transaction. 
- * This optimization means a system crash can corrupt the database + * This optimization means a system crash can corrupt the database *or - * lose the last transactions if buffers are not yet flushed to + * lose the last transactions if buffers are not yet flushed to *disk. - * The risk is governed by how often the system flushes dirty + * The risk is governed by how often the system flushes dirty *buffers - * to disk and how often mdbx_env_sync() is called. However, if + * to disk and how often mdbx_env_sync() is called. However, if *the - * filesystem preserves write order and the MDB_WRITEMAP flag is + * filesystem preserves write order and the MDB_WRITEMAP flag is *not - * used, transactions exhibit ACI (atomicity, consistency, + * used, transactions exhibit ACI (atomicity, consistency, *isolation) - * properties and only lose D (durability). I.e. database + * properties and only lose D (durability). I.e. database *integrity - * is maintained, but a system crash may undo the final + * is maintained, but a system crash may undo the final *transactions. - * Note that (MDB_NOSYNC | MDB_WRITEMAP) leaves the system with + * Note that (MDB_NOSYNC | MDB_WRITEMAP) leaves the system with *no - * hint for when to write transactions to disk, unless + * hint for when to write transactions to disk, unless *mdbx_env_sync() - * is called. (MDB_MAPASYNC | MDB_WRITEMAP) may be preferable. - * This flag may be changed at any time using + * is called. (MDB_MAPASYNC | MDB_WRITEMAP) may be preferable. + * This flag may be changed at any time using *mdbx_env_set_flags(). - * - MDB_MAPASYNC - * When using MDB_WRITEMAP, use asynchronous flushes to disk. - * As with MDB_NOSYNC, a system crash can then corrupt the - * database or lose the last transactions. Calling + * - MDB_MAPASYNC + * When using MDB_WRITEMAP, use asynchronous flushes to disk. + * As with MDB_NOSYNC, a system crash can then corrupt the + * database or lose the last transactions. 
Calling *mdbx_env_sync() - * ensures on-disk database integrity until next commit. - * This flag may be changed at any time using + * ensures on-disk database integrity until next commit. + * This flag may be changed at any time using *mdbx_env_set_flags(). - * - MDB_NOTLS - * Don't use Thread-Local Storage. Tie reader locktable slots to - * MDB_txn objects instead of to threads. I.e. mdbx_txn_reset() + * - MDB_NOTLS + * Don't use Thread-Local Storage. Tie reader locktable slots to + * MDB_txn objects instead of to threads. I.e. mdbx_txn_reset() *keeps - * the slot reseved for the MDB_txn object. A thread may use + * the slot reserved for the MDB_txn object. A thread may use *parallel - * read-only transactions. A read-only transaction may span threads + * read-only transactions. A read-only transaction may span threads *if - * the user synchronizes its use. Applications that multiplex + * the user synchronizes its use. Applications that multiplex *many - * user threads over individual OS threads need this option. Such + * user threads over individual OS threads need this option. Such *an - * application must also serialize the write transactions in an + * application must also serialize the write transactions in an *OS - * thread, since LMDB's write locking is unaware of the user + * thread, since LMDB's write locking is unaware of the user *threads. - * - MDB_NOLOCK - * Don't do any locking. If concurrent access is anticipated, the - * caller must manage all concurrency itself. For proper + * - MDB_NOLOCK + * Don't do any locking. If concurrent access is anticipated, the + * caller must manage all concurrency itself. For proper *operation - * the caller must enforce single-writer semantics, and must + * the caller must enforce single-writer semantics, and must *ensure - * that no readers are using old transactions while a writer is - * active.
The simplest approach is to use an exclusive lock so + * that no readers are using old transactions while a writer is + * active. The simplest approach is to use an exclusive lock so *that - * no readers may be active at all when a writer begins. - * - MDB_NORDAHEAD - * Turn off readahead. Most operating systems perform readahead + * no readers may be active at all when a writer begins. + * - MDB_NORDAHEAD + * Turn off readahead. Most operating systems perform readahead *on - * read requests by default. This option turns it off if the OS - * supports it. Turning it off may help random read performance - * when the DB is larger than RAM and system RAM is full. - * - MDB_NOMEMINIT - * Don't initialize malloc'd memory before writing to unused + * read requests by default. This option turns it off if the OS + * supports it. Turning it off may help random read performance + * when the DB is larger than RAM and system RAM is full. + * - MDB_NOMEMINIT + * Don't initialize malloc'd memory before writing to unused *spaces - * in the data file. By default, memory for pages written to the + * in the data file. By default, memory for pages written to the *data - * file is obtained using malloc. While these pages may be reused + * file is obtained using malloc. While these pages may be reused *in - * subsequent transactions, freshly malloc'd pages will be + * subsequent transactions, freshly malloc'd pages will be *initialized - * to zeroes before use. This avoids persisting leftover data from + * to zeroes before use. This avoids persisting leftover data from *other - * code (that used the heap and subsequently freed the memory) into + * code (that used the heap and subsequently freed the memory) into *the - * data file. Note that many other system libraries may allocate - * and free memory from the heap for arbitrary uses. E.g., stdio + * data file. Note that many other system libraries may allocate + * and free memory from the heap for arbitrary uses. 
E.g., stdio *may - * use the heap for file I/O buffers. This initialization step has + * use the heap for file I/O buffers. This initialization step has *a - * modest performance cost so some applications may want to + * modest performance cost so some applications may want to *disable - * it using this flag. This option can be a problem for + * it using this flag. This option can be a problem for *applications - * which handle sensitive data like passwords, and it makes + * which handle sensitive data like passwords, and it makes *memory - * checkers like Valgrind noisy. This flag is not needed with + * checkers like Valgrind noisy. This flag is not needed with *MDB_WRITEMAP, - * which writes directly to the mmap instead of using malloc for + * which writes directly to the mmap instead of using malloc for *pages. The - * initialization is also skipped if MDB_RESERVE is used; the - * caller is expected to overwrite all of the memory that was - * reserved in that case. - * This flag may be changed at any time using + * initialization is also skipped if MDB_RESERVE is used; the + * caller is expected to overwrite all of the memory that was + * reserved in that case. + * This flag may be changed at any time using *mdbx_env_set_flags(). * - #MDBX_COALESCE - * Aim to coalesce records while reclaiming FreeDB. - * This flag may be changed at any time using + * Aim to coalesce records while reclaiming FreeDB. + * This flag may be changed at any time using *mdbx_env_set_flags(). * - #MDBX_LIFORECLAIM - * LIFO policy for reclaiming FreeDB records. This significantly + * LIFO policy for reclaiming FreeDB records. This significantly *reduce - * write IPOS in case MDB_NOSYNC with periodically checkpoints. + * write IPOS in case MDB_NOSYNC with periodically checkpoints. * [in] mode The UNIX permissions to set on created files and *semaphores. - * This parameter is ignored on Windows. - * Returns A non-zero error value on failure and 0 on success. 
Some possible - * errors are: - * - MDB_VERSION_MISMATCH - the version of the LMDB library doesn't + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDB_VERSION_MISMATCH - the version of the LMDB library doesn't *match the - * version that created the database environment. - * - MDB_INVALID - the environment file headers are corrupted. - * - ENOENT - the directory specified by the path parameter doesn't + * version that created the database environment. + * - MDB_INVALID - the environment file headers are corrupted. + * - ENOENT - the directory specified by the path parameter doesn't *exist. - * - EACCES - the user didn't have permission to access the environment + * - EACCES - the user didn't have permission to access the environment *files. - * - EAGAIN - the environment was locked by another process. - */ + * - EAGAIN - the environment was locked by another process. */ LIBMDBX_API int mdbx_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode); LIBMDBX_API int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, @@ -589,8 +572,8 @@ LIBMDBX_API int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, * [in] path The directory in which the copy will reside. This * directory must already exist and be writable but must otherwise be * empty. - * Returns A non-zero error value on failure and 0 on success. - */ + * + * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_copy(MDB_env *env, const char *path); /* Copy an LMDB environment to the specified file descriptor. @@ -604,8 +587,8 @@ LIBMDBX_API int mdbx_env_copy(MDB_env *env, const char *path); * must have already been opened successfully. * [in] fd The filedescriptor to write the copy to. It must * have already been opened for Write access. - * Returns A non-zero error value on failure and 0 on success. - */ + * + * Returns A non-zero error value on failure and 0 on success. 
*/ LIBMDBX_API int mdbx_env_copyfd(MDB_env *env, mdbx_filehandle_t fd); /* Copy an LMDB environment to the specified path, with options. @@ -623,18 +606,18 @@ LIBMDBX_API int mdbx_env_copyfd(MDB_env *env, mdbx_filehandle_t fd); * [in] flags Special options for this operation. This parameter * must be set to 0 or by bitwise OR'ing together one or more of the * values described here. - * - MDB_CP_COMPACT - Perform compaction while copying: omit free - * pages and sequentially renumber all pages in output. This + * - MDB_CP_COMPACT - Perform compaction while copying: omit free + * pages and sequentially renumber all pages in output. This *option - * consumes more CPU and runs more slowly than the default. - * Currently it fails if the environment has suffered a page + * consumes more CPU and runs more slowly than the default. + * Currently it fails if the environment has suffered a page *leak. - * Returns A non-zero error value on failure and 0 on success. - */ + * + * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags); /* Copy an LMDB environment to the specified file descriptor, - * with options. + * with options. * * This function may be used to make a backup of an existing environment. * No lockfile is created, since it gets recreated at need. See @@ -648,8 +631,8 @@ LIBMDBX_API int mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags); * have already been opened for Write access. * [in] flags Special options for this operation. * See mdbx_env_copy2() for options. - * Returns A non-zero error value on failure and 0 on success. - */ + * + * Returns A non-zero error value on failure and 0 on success. 
*/ LIBMDBX_API int mdbx_env_copyfd2(MDB_env *env, mdbx_filehandle_t fd, unsigned flags); @@ -657,7 +640,7 @@ LIBMDBX_API int mdbx_env_copyfd2(MDB_env *env, mdbx_filehandle_t fd, * * [in] env An environment handle returned by mdbx_env_create() * [out] stat The address of an MDB_stat structure - * where the statistics will be copied + * where the statistics will be copied */ LIBMDBX_API int mdbx_env_stat(MDB_env *env, MDBX_stat *stat, size_t bytes); @@ -665,7 +648,7 @@ LIBMDBX_API int mdbx_env_stat(MDB_env *env, MDBX_stat *stat, size_t bytes); * * [in] env An environment handle returned by mdbx_env_create() * [out] stat The address of an MDB_envinfo structure - * where the information will be copied + * where the information will be copied */ LIBMDBX_API int mdbx_env_info(MDB_env *env, MDBX_envinfo *info, size_t bytes); @@ -679,13 +662,13 @@ LIBMDBX_API int mdbx_env_info(MDB_env *env, MDBX_envinfo *info, size_t bytes); * [in] env An environment handle returned by mdbx_env_create() * [in] force If non-zero, force a synchronous flush. Otherwise * if the environment has the MDB_NOSYNC flag set the flushes - * will be omitted, and with MDB_MAPASYNC they will be asynchronous. - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EACCES - the environment is read-only. - * - EINVAL - an invalid parameter was specified. - * - EIO - an error occurred during synchronization. - */ + * will be omitted, and with MDB_MAPASYNC they will be asynchronous. + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EACCES - the environment is read-only. + * - EINVAL - an invalid parameter was specified. + * - EIO - an error occurred during synchronization. */ LIBMDBX_API int mdbx_env_sync(MDB_env *env, int force); /* Close the environment and release the memory map. 
@@ -713,20 +696,20 @@ LIBMDBX_API void mdbx_env_close(MDB_env *env); * [in] env An environment handle returned by mdbx_env_create() * [in] flags The flags to change, bitwise OR'ed together * [in] onoff A non-zero value sets the flags, zero clears them. - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff); /* Get environment flags. * * [in] env An environment handle returned by mdbx_env_create() * [out] flags The address of an integer to store the flags - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_flags(MDB_env *env, unsigned *flags); /* Return the path that was used in mdbx_env_open(). @@ -735,10 +718,10 @@ LIBMDBX_API int mdbx_env_get_flags(MDB_env *env, unsigned *flags); * [out] path Address of a string pointer to contain the path. This * is the actual string in the environment, not a copy. It should not be * altered in any way. - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_path(MDB_env *env, const char **path); /* Return the filedescriptor for the given environment. 
@@ -749,10 +732,10 @@ LIBMDBX_API int mdbx_env_get_path(MDB_env *env, const char **path); * * [in] env An environment handle returned by mdbx_env_create() * [out] fd Address of a int to contain the descriptor. - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *fd); /* Set the size of the memory map to use for this environment. @@ -782,12 +765,11 @@ LIBMDBX_API int mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *fd); *space. * [in] env An environment handle returned by mdbx_env_create() * [in] size The size in bytes - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified, or the environment - *has - * an active write transaction. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EINVAL - an invalid parameter was specified, + * or the environment has an active write transaction. */ LIBMDBX_API int mdbx_env_set_mapsize(MDB_env *env, size_t size); /* Set the maximum number of threads/reader slots for the environment. @@ -803,21 +785,21 @@ LIBMDBX_API int mdbx_env_set_mapsize(MDB_env *env, size_t size); *mdbx_env_open(). * [in] env An environment handle returned by mdbx_env_create() * [in] readers The maximum number of reader lock table slots - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified, or the environment is - *already open. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EINVAL - an invalid parameter was specified, + * or the environment is already open. 
*/ LIBMDBX_API int mdbx_env_set_maxreaders(MDB_env *env, unsigned readers); /* Get the maximum number of threads/reader slots for the environment. * * [in] env An environment handle returned by mdbx_env_create() * [out] readers Address of an integer to store the number of readers - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers); /* Set the maximum number of named databases for the environment. @@ -833,11 +815,11 @@ LIBMDBX_API int mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers); * does a linear search of the opened slots. * [in] env An environment handle returned by mdbx_env_create() * [in] dbs The maximum number of databases - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified, or the environment is - *already open. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EINVAL - an invalid parameter was specified, + * or the environment is already open. */ LIBMDBX_API int mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); /* Get the maximum size of keys and MDB_DUPSORT data we can write. @@ -852,8 +834,8 @@ LIBMDBX_API int mdbx_get_maxkeysize(size_t pagesize); * * [in] env An environment handle returned by mdbx_env_create() * [in] ctx An arbitrary pointer for whatever the application needs. - * Returns A non-zero error value on failure and 0 on success. - */ + * + * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_set_userctx(MDB_env *env, void *ctx); /* Get the application information associated with the MDB_env. 
@@ -877,8 +859,8 @@ typedef void MDB_assert_func(MDB_env *env, const char *msg, * Note: This hack should become obsolete as lmdb's error handling matures. * [in] env An environment handle returned by mdbx_env_create(). * [in] func An MDB_assert_func function, or 0. - * Returns A non-zero error value on failure and 0 on success. - */ + * + * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); /* Create a transaction for use with the environment. @@ -891,28 +873,28 @@ LIBMDBX_API int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); * Note: Cursors may not span transactions. * [in] env An environment handle returned by mdbx_env_create() * [in] parent If this parameter is non-NULL, the new transaction - * will be a nested transaction, with the transaction indicated by \b parent + * will be a nested transaction, with the transaction indicated by parent * as its parent. Transactions may be nested to any level. A parent * transaction and its cursors may not issue any other operations than * mdbx_txn_commit and mdbx_txn_abort while it has active child transactions. * [in] flags Special options for this transaction. This parameter * must be set to 0 or by bitwise OR'ing together one or more of the * values described here. - * - MDB_RDONLY - * This transaction will not perform any write operations. + * - MDB_RDONLY + * This transaction will not perform any write operations. * [out] txn Address where the new MDB_txn handle will be stored - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - MDB_PANIC - a fatal error occurred earlier and the environment - * must be shut down. - * - MDB_MAP_RESIZED - another process wrote data beyond this - *MDB_env's - * mapsize and this environment's map must be resized as well. - * See mdbx_env_set_mapsize(). 
- * - MDB_READERS_FULL - a read-only transaction was requested and - * the reader lock table is full. See mdbx_env_set_maxreaders(). - * - ENOMEM - out of memory. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + * - MDB_MAP_RESIZED - another process wrote data beyond this + * MDB_env's + * mapsize and this environment's map must be resized as well. + * See mdbx_env_set_mapsize(). + * - MDB_READERS_FULL - a read-only transaction was requested and + * the reader lock table is full. See mdbx_env_set_maxreaders(). + * - ENOMEM - out of memory. */ LIBMDBX_API int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, MDB_txn **txn); @@ -929,8 +911,8 @@ LIBMDBX_API MDB_env *mdbx_txn_env(MDB_txn *txn); * concurrent readers will frequently have the same transaction ID. * * [in] txn A transaction handle returned by mdbx_txn_begin() - * Returns A transaction ID, valid if input is an active transaction. - */ + * + * Returns A transaction ID, valid if input is an active transaction. */ LIBMDBX_API size_t mdbx_txn_id(MDB_txn *txn); /* Commit all the operations of a transaction into the database. @@ -948,13 +930,13 @@ LIBMDBX_API size_t mdbx_txn_id(MDB_txn *txn); * Only write-transactions free cursors. * * [in] txn A transaction handle returned by mdbx_txn_begin() - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified. - * - ENOSPC - no more disk space. - * - EIO - a low-level I/O error occurred while writing. - * - ENOMEM - out of memory. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EINVAL - an invalid parameter was specified. + * - ENOSPC - no more disk space. + * - EIO - a low-level I/O error occurred while writing. + * - ENOMEM - out of memory. 
*/ LIBMDBX_API int mdbx_txn_commit(MDB_txn *txn); /* Abandon all the operations of the transaction instead of saving @@ -1001,22 +983,21 @@ LIBMDBX_API int mdbx_txn_reset(MDB_txn *txn); * released by mdbx_txn_reset(). It must be called before a reset transaction * may be used again. * [in] txn A transaction handle returned by mdbx_txn_begin() - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - MDB_PANIC - a fatal error occurred earlier and the environment - * must be shut down. - * - EINVAL - an invalid parameter was specified. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_txn_renew(MDB_txn *txn); -/* Open a database in the environment. - * A database handle denotes the name and parameters of a database, - * independently of whether such a database exists. - * The database handle may be discarded by calling mdbx_dbi_close(). - * The old database handle is returned if the database was already open. - * The handle may only be closed once. +/* Open a table in the environment. + * A table handle denotes the name and parameters of a table, independently + * of whether such a table exists. The table handle may be discarded by + * calling mdbx_dbi_close(). The old table handle is returned if the table + * was already open. The handle may only be closed once. * - * The database handle will be private to the current transaction until + * The table handle will be private to the current transaction until * the transaction is successfully committed. If the transaction is * aborted the handle will be closed automatically. 
* After a successful commit the handle will reside in the shared @@ -1027,66 +1008,57 @@ LIBMDBX_API int mdbx_txn_renew(MDB_txn *txn); * this function must finish (either commit or abort) before * any other transaction in the process may use this function. * - * To use named databases (with name != NULL), mdbx_env_set_maxdbs() - * must be called before opening the environment. Database names are - * keys in the unnamed database, and may be read but not written. + * To use named table (with name != NULL), mdbx_env_set_maxdbs() + * must be called before opening the environment. Table names are + * keys in the internal unnamed table, and may be read but not written. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] name The name of the database to open. If only a single - * database is needed in the environment, this value may be NULL. - * [in] flags Special options for this database. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - * - MDB_REVERSEKEY - * Keys are strings to be compared in reverse order, from the end - * of the strings to the beginning. By default, Keys are treated as - *strings and - * compared from beginning to end. - * - MDB_DUPSORT - * Duplicate keys may be used in the database. (Or, from another - *perspective, - * keys may have multiple data items, stored in sorted order.) By - *default - * keys must be unique and may have only a single data item. - * - MDB_INTEGERKEY - * Keys are binary integers in native byte order, either unsigned - *int - * or mdbx_size_t, and will be sorted as such. - * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdbx_size_t.) - * The keys must all be of the same size. - * - MDB_DUPFIXED - * This flag may only be used in combination with MDB_DUPSORT. - *This option - * tells the library that the data items for this database are all - *the same - * size, which allows further optimizations in storage and - *retrieval. 
When - * all data items are the same size, the MDB_GET_MULTIPLE, - *MDB_NEXT_MULTIPLE - * and MDB_PREV_MULTIPLE cursor operations may be used to retrieve - *multiple - * items at once. - * - MDB_INTEGERDUP - * This option specifies that duplicate data items are binary - *integers, - * similar to MDB_INTEGERKEY keys. - * - MDB_REVERSEDUP - * This option specifies that duplicate data items should be - *compared as - * strings in reverse order. - * - MDB_CREATE - * Create the named database if it doesn't exist. This option is - *not - * allowed in a read-only transaction or a read-only environment. - * [out] dbi Address where the new MDB_dbi handle will be stored - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - MDB_NOTFOUND - the specified database doesn't exist in the - *environment - * and MDB_CREATE was not specified. - * - MDB_DBS_FULL - too many databases have been opened. See - *mdbx_env_set_maxdbs(). - */ + * [in] txn transaction handle returned by mdbx_txn_begin() + * [in] name The name of the table to open. If only a single + * table is needed in the environment, this value may be NULL. + * [in] flags Special options for this table. This parameter must be set + * to 0 or by bitwise OR'ing together one or more of the values + * described here: + * - MDB_REVERSEKEY + * Keys are strings to be compared in reverse order, from the end + * of the strings to the beginning. By default, Keys are treated as + * strings and compared from beginning to end. + * - MDB_DUPSORT + * Duplicate keys may be used in the table. Or, from another point of + * view, keys may have multiple data items, stored in sorted order. By + * default keys must be unique and may have only a single data item. + * - MDB_INTEGERKEY + * Keys are binary integers in native byte order, either uint32_t or + * uint64_t, and will be sorted as such. The keys must all be of the + * same size.
+ * - MDB_DUPFIXED + * This flag may only be used in combination with MDB_DUPSORT. This + * option tells the library that the data items for this database are + * all the same size, which allows further optimizations in storage and + * retrieval. When all data items are the same size, the MDB_GET_MULTIPLE, + * MDB_NEXT_MULTIPLE and MDB_PREV_MULTIPLE cursor operations may be used + * to retrieve multiple items at once. + * - MDB_INTEGERDUP + * This option specifies that duplicate data items are binary integers, + * similar to MDB_INTEGERKEY keys. + * - MDB_REVERSEDUP + * This option specifies that duplicate data items should be compared as + * strings in reverse order (the comparison is performed in the direction + * from the last byte to the first). + * - MDB_CREATE + * Create the named database if it doesn't exist. This option is not + * allowed in a read-only transaction or a read-only environment. + * + * [out] dbi Address where the new MDB_dbi handle will be stored + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDB_NOTFOUND - the specified database doesn't exist in the + * environment and MDB_CREATE was not specified. + * - MDB_DBS_FULL - too many databases have been opened. + * See mdbx_env_set_maxdbs(). */ +LIBMDBX_API int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, + MDB_dbi *dbi, MDB_cmp_func *keycmp, + MDB_cmp_func *datacmp); LIBMDBX_API int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi); @@ -1095,11 +1067,11 @@ LIBMDBX_API int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, * [in] txn A transaction handle returned by mdbx_txn_begin() * [in] dbi A database handle returned by mdbx_dbi_open() * [out] stat The address of an MDB_stat structure - * where the statistics will be copied - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified. 
- */ + * where the statistics will be copied + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, size_t bytes); @@ -1108,8 +1080,8 @@ LIBMDBX_API int mdbx_dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, * [in] txn A transaction handle returned by mdbx_txn_begin() * [in] dbi A database handle returned by mdbx_dbi_open() * [out] flags Address where the flags will be returned. - * Returns A non-zero error value on failure and 0 on success. - */ + * + * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); /* Close a database handle. Normally unnecessary. Use with care: @@ -1137,15 +1109,15 @@ LIBMDBX_API int mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); * [in] dbi A database handle returned by mdbx_dbi_open() * [in] del 0 to empty the DB, 1 to delete it from the * environment and close the DB handle. - * Returns A non-zero error value on failure and 0 on success. - */ + * + * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); /* Get items from a database. * * This function retrieves key/data pairs from the database. The address - * and length of the data associated with the specified \b key are returned - * in the structure to which \b data refers. + * and length of the data associated with the specified key are returned + * in the structure to which data refers. * If the database supports duplicate keys (MDB_DUPSORT) then the * first data item for the key will be returned. Retrieval of other * items requires the use of mdbx_cursor_get(). @@ -1156,15 +1128,16 @@ LIBMDBX_API int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); * any modification attempts will cause a SIGSEGV. 
* Note: Values returned from the database are valid only until a * subsequent update operation, or the end of the transaction. + * * [in] txn A transaction handle returned by mdbx_txn_begin() * [in] dbi A database handle returned by mdbx_dbi_open() * [in] key The key to search for in the database * [out] data The data corresponding to the key - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - MDB_NOTFOUND - the key was not in the database. - * - EINVAL - an invalid parameter was specified. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDB_NOTFOUND - the key was not in the database. + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); @@ -1174,50 +1147,45 @@ LIBMDBX_API int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, * is to enter the new key/data pair, replacing any previously existing key * if duplicates are disallowed, or adding a duplicate data item if * duplicates are allowed (MDB_DUPSORT). + * * [in] txn A transaction handle returned by mdbx_txn_begin() * [in] dbi A database handle returned by mdbx_dbi_open() * [in] key The key to store in the database * [in,out] data The data to store - * [in] flags Special options for this operation. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - * - MDB_NODUPDATA - enter the new key/data pair only if it does not - * already appear in the database. This flag may only be - *specified - * if the database was opened with MDB_DUPSORT. The function - *will - * return MDB_KEYEXIST if the key/data pair already appears in - *the - * database. - * - MDB_NOOVERWRITE - enter the new key/data pair only if the key - * does not already appear in the database. 
The function will - *return - * MDB_KEYEXIST if the key already appears in the database, even - *if - * the database supports duplicates (MDB_DUPSORT). The \b data - * parameter will be set to point to the existing item. - * - MDB_RESERVE - reserve space for data of the given size, but - * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later - before - * the next update operation or the transaction ends. This saves - * an extra memcpy if the data is being generated later. - * LMDB does nothing else with this memory, the caller is - *expected - * to modify all of the space requested. This flag must not be - * specified if the database was opened with MDB_DUPSORT. - * - MDB_APPEND - append the given key/data pair to the end of the - * database. This option allows fast bulk loading when keys are - * already known to be in the correct order. Loading unsorted - *keys - * with this flag will cause a MDB_KEYEXIST error. - * - MDB_APPENDDUP - as above, but for sorted dup data. - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). - * - MDB_TXN_FULL - the transaction has too many dirty pages. - * - EACCES - an attempt was made to write in a read-only transaction. - * - EINVAL - an invalid parameter was specified. - */ + * [in] flags Special options for this operation. This parameter must be + * set to 0 or by bitwise OR'ing together one or more of the values + * described here. + * + * - MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be specified + * if the database was opened with MDB_DUPSORT. The function will + * return MDB_KEYEXIST if the key/data pair already appears in the + * database. + * - MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. 
The function will return + * MDB_KEYEXIST if the key already appears in the database, even if + * the database supports duplicates (MDB_DUPSORT). The data + * parameter will be set to point to the existing item. + * - MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. + * LMDB does nothing else with this memory, the caller is expected + * to modify all of the space requested. This flag must not be + * specified if the database was opened with MDB_DUPSORT. + * - MDB_APPEND - append the given key/data pair to the end of the + * database. This option allows fast bulk loading when keys are + * already known to be in the correct order. Loading unsorted keys + * with this flag will cause a MDB_KEYEXIST error. + * - MDB_APPENDDUP - as above, but for sorted dup data. + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). + * - MDB_TXN_FULL - the transaction has too many dirty pages. + * - EACCES - an attempt was made to write in a read-only transaction. + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); @@ -1244,11 +1212,11 @@ LIBMDBX_API int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, * [in] dbi A database handle returned by mdbx_dbi_open() * [in] key The key to delete from the database * [in] data The data to delete - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EACCES - an attempt was made to write in a read-only transaction. - * - EINVAL - an invalid parameter was specified. 
- */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EACCES - an attempt was made to write in a read-only transaction. + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); @@ -1276,10 +1244,10 @@ LIBMDBX_API int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, * [in] txn A transaction handle returned by mdbx_txn_begin() * [in] dbi A database handle returned by mdbx_dbi_open() * [out] cursor Address where the new MDB_cursor handle will be stored - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); @@ -1301,10 +1269,10 @@ LIBMDBX_API void mdbx_cursor_close(MDB_cursor *cursor); * This may be done whether the previous transaction is live or dead. * [in] txn A transaction handle returned by mdbx_txn_begin() * [in] cursor A cursor handle returned by mdbx_cursor_open() - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EINVAL - an invalid parameter was specified. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); /* Return the cursor's transaction handle. @@ -1323,9 +1291,9 @@ LIBMDBX_API MDB_dbi mdbx_cursor_dbi(MDB_cursor *cursor); * * This function retrieves key/data pairs from the database. 
The address and *length - * of the key are returned in the object to which \b key refers (except for + * of the key are returned in the object to which key refers (except for *the - * case of the MDB_SET option, in which the \b key object is unchanged), and + * case of the MDB_SET option, in which the key object is unchanged), and * the address and length of the data are returned in the object to which \b *data * refers. @@ -1334,11 +1302,11 @@ LIBMDBX_API MDB_dbi mdbx_cursor_dbi(MDB_cursor *cursor); * [in,out] key The key for a retrieved item * [in,out] data The data of a retrieved item * [in] op A cursor operation MDB_cursor_op - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - MDB_NOTFOUND - no matching key found. - * - EINVAL - an invalid parameter was specified. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDB_NOTFOUND - no matching key found. + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, MDB_cursor_op op); @@ -1353,65 +1321,65 @@ LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * [in] data The data operated on. * [in] flags Options for this operation. This parameter * must be set to 0 or one of the values described here. - * - MDB_CURRENT - replace the item at the current cursor position. - * The \b key parameter must still be provided, and must match + * - MDB_CURRENT - replace the item at the current cursor position. + * The key parameter must still be provided, and must match *it. - * If using sorted duplicates (MDB_DUPSORT) the data item must + * If using sorted duplicates (MDB_DUPSORT) the data item must *still - * sort into the same place. This is intended to be used when the - * new data is the same size as the old. Otherwise it will simply - * perform a delete of the old record followed by an insert. 
- * - MDB_NODUPDATA - enter the new key/data pair only if it does not - * already appear in the database. This flag may only be + * sort into the same place. This is intended to be used when the + * new data is the same size as the old. Otherwise it will simply + * perform a delete of the old record followed by an insert. + * - MDB_NODUPDATA - enter the new key/data pair only if it does not + * already appear in the database. This flag may only be *specified - * if the database was opened with MDB_DUPSORT. The function + * if the database was opened with MDB_DUPSORT. The function *will - * return MDB_KEYEXIST if the key/data pair already appears in + * return MDB_KEYEXIST if the key/data pair already appears in *the - * database. - * - MDB_NOOVERWRITE - enter the new key/data pair only if the key - * does not already appear in the database. The function will + * database. + * - MDB_NOOVERWRITE - enter the new key/data pair only if the key + * does not already appear in the database. The function will *return - * MDB_KEYEXIST if the key already appears in the database, even + * MDB_KEYEXIST if the key already appears in the database, even *if - * the database supports duplicates (MDB_DUPSORT). - * - MDB_RESERVE - reserve space for data of the given size, but - * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later - before - * the next update operation or the transaction ends. This saves - * an extra memcpy if the data is being generated later. This + * the database supports duplicates (MDB_DUPSORT). + * - MDB_RESERVE - reserve space for data of the given size, but + * don't copy the given data. Instead, return a pointer to the + * reserved space, which the caller can fill in later - before + * the next update operation or the transaction ends. This saves + * an extra memcpy if the data is being generated later. 
This *flag - * must not be specified if the database was opened with + * must not be specified if the database was opened with *MDB_DUPSORT. - * - MDB_APPEND - append the given key/data pair to the end of the - * database. No key comparisons are performed. This option allows - * fast bulk loading when keys are already known to be in the - * correct order. Loading unsorted keys with this flag will cause - * a MDB_KEYEXIST error. - * - MDB_APPENDDUP - as above, but for sorted dup data. - * - MDB_MULTIPLE - store multiple contiguous data elements in a - * single request. This flag may only be specified if the + * - MDB_APPEND - append the given key/data pair to the end of the + * database. No key comparisons are performed. This option allows + * fast bulk loading when keys are already known to be in the + * correct order. Loading unsorted keys with this flag will cause + * a MDB_KEYEXIST error. + * - MDB_APPENDDUP - as above, but for sorted dup data. + * - MDB_MULTIPLE - store multiple contiguous data elements in a + * single request. This flag may only be specified if the *database - * was opened with MDB_DUPFIXED. The \b data argument must be an - * array of two MDB_vals. The mv_size of the first MDB_val must + * was opened with MDB_DUPFIXED. The data argument must be an + * array of two MDB_vals. The mv_size of the first MDB_val must *be - * the size of a single data element. The mv_data of the first + * the size of a single data element. The mv_data of the first *MDB_val - * must point to the beginning of the array of contiguous data + * must point to the beginning of the array of contiguous data *elements. - * The mv_size of the second MDB_val must be the count of the + * The mv_size of the second MDB_val must be the count of the *number - * of data elements to store. On return this field will be set to - * the count of the number of elements actually written. The + * of data elements to store. 
On return this field will be set to + * the count of the number of elements actually written. The *mv_data - * of the second MDB_val is unused. - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). - * - MDB_TXN_FULL - the transaction has too many dirty pages. - * - EACCES - an attempt was made to write in a read-only transaction. - * - EINVAL - an invalid parameter was specified. - */ + * of the second MDB_val is unused. + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). + * - MDB_TXN_FULL - the transaction has too many dirty pages. + * - EACCES - an attempt was made to write in a read-only transaction. + * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, unsigned flags); @@ -1421,14 +1389,14 @@ LIBMDBX_API int mdbx_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * [in] cursor A cursor handle returned by mdbx_cursor_open() * [in] flags Options for this operation. This parameter * must be set to 0 or one of the values described here. - * - MDB_NODUPDATA - delete all of the data items for the current key. - * This flag may only be specified if the database was opened with + * - MDB_NODUPDATA - delete all of the data items for the current key. + * This flag may only be specified if the database was opened with *MDB_DUPSORT. - * Returns A non-zero error value on failure and 0 on success. Some possible - * errors are: - * - EACCES - an attempt was made to write in a read-only transaction. - * - EINVAL - an invalid parameter was specified. - */ + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - EACCES - an attempt was made to write in a read-only transaction. 
+ * - EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); /* Return count of duplicates for current key. @@ -1438,8 +1406,8 @@ LIBMDBX_API int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); * [in] cursor A cursor handle returned by mdbx_cursor_open() * [out] countp Address where the count will be stored * - * Returns A non-zero error value on failure and 0 on success. - * Some possible errors are: + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: * - EINVAL - cursor is not initialized, * or an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_count(MDB_cursor *cursor, size_t *countp); @@ -1562,6 +1530,7 @@ LIBMDBX_API void mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); * a laggard readers to allowing reclaiming of freeDB. * * [in] env An environment handle returned by mdbx_env_create(). + * * Returns A #MDBX_oom_func function or NULL if disabled. 
*/ LIBMDBX_API MDBX_oom_func *mdbx_env_get_oomfunc(MDB_env *env); @@ -1619,10 +1588,6 @@ LIBMDBX_API int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, LIBMDBX_API int mdbx_is_dirty(const MDB_txn *txn, const void *ptr); -LIBMDBX_API int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, - MDB_dbi *dbi, MDB_cmp_func *keycmp, - MDB_cmp_func *datacmp); - LIBMDBX_API int mdbx_dbi_sequence(MDB_txn *txn, MDB_dbi dbi, uint64_t *result, uint64_t increment); diff --git a/src/bits.h b/src/bits.h index 7d6fca09..8e8542ae 100644 --- a/src/bits.h +++ b/src/bits.h @@ -431,25 +431,20 @@ struct MDB_txn { /* Array of sequence numbers for each DB handle */ unsigned *mt_dbiseqs; -/** @defgroup mt_dbflag Transaction DB Flags -* @ingroup internal -* @{ -*/ -#define DB_DIRTY 0x01 /**< DB was written in this txn */ -#define DB_STALE 0x02 /**< Named-DB record is older than txnID */ -#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ -#define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ -#define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ -#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ - /** @} */ - /** In write txns, array of cursors for each DB */ +/* Transaction DB Flags */ +#define DB_DIRTY 0x01 /* DB was written in this txn */ +#define DB_STALE 0x02 /* Named-DB record is older than txnID */ +#define DB_NEW 0x04 /* Named-DB handle opened in this txn */ +#define DB_VALID 0x08 /* DB handle is valid, see also #MDB_VALID */ +#define DB_USRVALID 0x10 /* As #DB_VALID, but not set for #FREE_DBI */ +#define DB_DUPDATA 0x20 /* DB is #MDB_DUPSORT data */ + /* In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; - /** Array of flags for each DB */ - unsigned char *mt_dbflags; - /** Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. 
- */ + /* Array of flags for each DB */ + uint8_t *mt_dbflags; + /* Number of DB records in use, or 0 when the txn is finished. + * This number only ever increments until the txn finishes; we + * don't decrement it when individual DB handles are closed. */ MDB_dbi mt_numdbs; /** @defgroup mdbx_txn Transaction Flags diff --git a/src/mdbx.c b/src/mdbx.c index 89829b4b..0cc74d73 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2437,7 +2437,7 @@ size_t mdbx_txn_id(MDB_txn *txn) { return txn->mt_txnid; } -/** Export or close DBI handles opened in this txn. */ +/* Export or close DBI handles opened in this txn. */ static void mdbx_dbis_update(MDB_txn *txn, int keep) { MDB_dbi n = txn->mt_numdbs; MDB_env *env = txn->mt_env; @@ -2463,11 +2463,10 @@ static void mdbx_dbis_update(MDB_txn *txn, int keep) { env->me_numdbs = n; } -/** End a transaction, except successful commit of a nested transaction. +/* End a transaction, except successful commit of a nested transaction. * May be called twice for readonly txns: First reset it, then abort. * @param[in] txn the transaction handle to end - * @param[in] mode why and how to end the transaction - */ + * @param[in] mode why and how to end the transaction */ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { MDB_env *env = txn->mt_env; static const char *const names[] = MDB_END_NAMES; @@ -4717,8 +4716,7 @@ static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { memcpy(&md_flags, ((char *)data.mv_data + offsetof(MDB_db, md_flags)), sizeof(uint16_t)); /* The txn may not know this DBI, or another process may - * have dropped and recreated the DB with other flags. - */ + * have dropped and recreated the DB with other flags. 
*/ if (unlikely((mc->mc_db->md_flags & PERSISTENT_FLAGS) != md_flags)) return MDB_INCOMPATIBLE; memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); @@ -10067,24 +10065,24 @@ int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, /* если данные совпадают, то ничего делать не надо */ goto bailout; #if 0 /* LY: исправлено в mdbx_cursor_put(), здесь в качестве памятки */ - MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA) - && mc.mc_xcursor->mx_db.md_entries > 1) { - /* Если у ключа больше одного значения, то - * сначала удаляем найденое "старое" значение. - * - * Этого можно не делать, так как MDBX уже - * обучен корректно обрабатывать такие ситуации. - * - * Однако, следует помнить, что в LMDB при - * совпадении размера данных, значение будет - * просто перезаписано с нарушением - * упорядоченности, что сломает поиск. */ - rc = mdbx_cursor_del(&mc, 0); - if (rc != MDB_SUCCESS) - goto bailout; - flags -= MDB_CURRENT; - } + MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + if (F_ISSET(leaf->mn_flags, F_DUPDATA) && + mc.mc_xcursor->mx_db.md_entries > 1) { + /* Если у ключа больше одного значения, то + * сначала удаляем найденое "старое" значение. + * + * Этого можно не делать, так как MDBX уже + * обучен корректно обрабатывать такие ситуации. + * + * Однако, следует помнить, что в LMDB при + * совпадении размера данных, значение будет + * просто перезаписано с нарушением + * упорядоченности, что сломает поиск. */ + rc = mdbx_cursor_del(&mc, 0); + if (rc != MDB_SUCCESS) + goto bailout; + flags -= MDB_CURRENT; + } #endif } } else { From 7a4541f5982c33c6c6ea8e69a19bb70585be2947 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 17 May 2017 20:09:40 +0300 Subject: [PATCH 120/303] mdbx: minor cleanup. 
--- src/mdbx.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 0cc74d73..d9a7da2c 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4636,7 +4636,8 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { } if (unlikely(!IS_LEAF(mp))) { - mdbx_debug("internal error, index points to a %02X page!?", mp->mp_flags); + mdbx_debug("internal error, index points to a page with 0x%02x flags!?", + mp->mp_flags); mc->mc_txn->mt_flags |= MDB_TXN_ERROR; return MDB_CORRUPTED; } @@ -9187,9 +9188,9 @@ int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { break; } } - if (first) { + if (first) rc = func("(no active readers)\n", ctx); - } + return rc; } @@ -9453,7 +9454,7 @@ static void __hot mdbx_midl_xmerge(MDB_IDL idl, MDB_IDL merge) { #define SMALL 8 #define MIDL_SWAP(a, b) \ { \ - itmp = (a); \ + MDB_ID itmp = (a); \ (a) = (b); \ (b) = itmp; \ } @@ -9462,7 +9463,7 @@ static void __hot mdbx_midl_sort(MDB_IDL ids) { /* Max possible depth of int-indexed tree * 2 items/level */ int istack[sizeof(int) * CHAR_BIT * 2]; int i, j, k, l, ir, jstack; - MDB_ID a, itmp; + MDB_ID a; ir = (int)ids[0]; l = 1; @@ -9542,19 +9543,17 @@ static unsigned __hot mdbx_mid2l_search(MDB_ID2L ids, MDB_ID id) { if (val < 0) { n = pivot; - } else if (val > 0) { base = cursor; n -= pivot + 1; - } else { return cursor; } } - if (val > 0) { + if (val > 0) ++cursor; - } + return cursor; } @@ -9562,29 +9561,20 @@ static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id) { unsigned x, i; x = mdbx_mid2l_search(ids, id->mid); + if (x < 1) + return /* internal error */ -2; - if (x < 1) { - /* internal error */ - return -2; - } + if (x <= ids[0].mid && ids[x].mid == id->mid) + return /* duplicate */ -1; - if (x <= ids[0].mid && ids[x].mid == id->mid) { - /* duplicate */ - return -1; - } - - if (ids[0].mid >= MDB_IDL_UM_MAX) { - /* too big */ - return -2; - - } else { - /* insert id */ 
- ids[0].mid++; - for (i = (unsigned)ids[0].mid; i > x; i--) - ids[i] = ids[i - 1]; - ids[x] = *id; - } + if (ids[0].mid >= MDB_IDL_UM_MAX) + return /* too big */ -2; + /* insert id */ + ids[0].mid++; + for (i = (unsigned)ids[0].mid; i > x; i--) + ids[i] = ids[i - 1]; + ids[x] = *id; return 0; } From c479c5ff154bfaf21c48150069c7662df2e6574e Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 18 May 2017 02:21:39 +0300 Subject: [PATCH 121/303] mdbx: rework mdbx_env_sync(). Change-Id: I6192e8a333b607d1e46eadddb73863943635f1ec --- src/mdbx.c | 107 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 62 insertions(+), 45 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index d9a7da2c..0d33fab6 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -656,7 +656,8 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, unsigned nflags); static int mdbx_read_header(MDB_env *env, MDB_meta *meta); -static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending); +static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, + MDB_meta *pending); static void mdbx_env_close0(MDB_env *env); static MDB_node *mdbx_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); @@ -1724,7 +1725,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { me_flags |= MDBX_UTTERLY_NOSYNC; mdbx_assert(env, env->me_sync_pending > 0); - if (mdbx_env_sync0(env, me_flags, &meta) == MDB_SUCCESS) { + if (mdbx_env_sync_locked(env, me_flags, &meta) == MDB_SUCCESS) { txnid_t snap = mdbx_find_oldest(env, NULL); if (snap > oldest) { continue; @@ -1969,10 +1970,6 @@ fail: } int mdbx_env_sync(MDB_env *env, int force) { - int rc; - MDB_meta *head; - unsigned flags; - if (unlikely(!env)) return MDBX_EINVAL; @@ -1982,49 +1979,60 @@ int mdbx_env_sync(MDB_env *env, int force) { if (unlikely(!env->me_lck)) return MDB_PANIC; - flags = env->me_flags & ~MDB_NOMETASYNC; + unsigned flags = env->me_flags & ~MDB_NOMETASYNC; if 
(unlikely(flags & (MDB_RDONLY | MDB_FATAL_ERROR))) return MDBX_EACCESS; - head = mdbx_meta_head(env); - if (!META_IS_WEAK(head) && env->me_sync_pending == 0 && - env->me_mapsize == head->mm_mapsize) - /* LY: nothing to do */ - return MDB_SUCCESS; - - if (force || head->mm_mapsize != env->me_mapsize || - (env->me_sync_threshold && - env->me_sync_pending >= env->me_sync_threshold)) - flags &= MDB_WRITEMAP; - - /* LY: early sync before acquiring the mutex to reduce writer's latency */ - if (env->me_sync_pending > env->me_psize * 16 && (flags & MDB_NOSYNC) == 0) { - assert(((flags ^ env->me_flags) & MDB_WRITEMAP) == 0); - if (flags & MDB_WRITEMAP) { - size_t used_size = env->me_psize * (head->mm_last_pg + 1); - rc = mdbx_msync(env->me_map, used_size, flags & MDB_MAPASYNC); - } else { - rc = mdbx_filesync(env->me_fd, false); - } - if (unlikely(rc != MDB_SUCCESS)) - return rc; - } - - rc = mdbx_txn_lock(env); + int rc = mdbx_txn_lock(env); if (unlikely(rc != MDB_SUCCESS)) return rc; - /* LY: head may be changed while the mutex has been acquired. 
*/ - head = mdbx_meta_head(env); - rc = MDB_SUCCESS; - if (META_IS_WEAK(head) || env->me_sync_pending != 0 || + MDB_meta *head = mdbx_meta_head(env); + if (!META_IS_STEADY(head) || env->me_sync_pending || env->me_mapsize != head->mm_mapsize) { - MDB_meta meta = *head; - rc = mdbx_env_sync0(env, flags, &meta); + + if (force || head->mm_mapsize != env->me_mapsize || + (env->me_sync_threshold && + env->me_sync_pending >= env->me_sync_threshold)) + flags &= MDB_WRITEMAP /* clear flags for full steady sync */; + + if (env->me_sync_pending > env->me_psize * 16 && + (flags & MDB_NOSYNC) == 0) { + assert(((flags ^ env->me_flags) & MDB_WRITEMAP) == 0); + size_t used_size = env->me_psize * (head->mm_last_pg + 1); + mdbx_txn_unlock(env); + + /* LY: pre-sync without holding lock to reduce latency for writer(s) */ + if (flags & MDB_WRITEMAP) { + rc = mdbx_msync(env->me_map, used_size, flags & MDB_MAPASYNC); + } else { + rc = mdbx_filesync(env->me_fd, false); + } + if (unlikely(rc != MDB_SUCCESS)) + return rc; + + rc = mdbx_txn_lock(env); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + + /* LY: head may be changed. 
*/ + head = mdbx_meta_head(env); + } + + if (!META_IS_STEADY(head) || env->me_sync_pending || + env->me_mapsize != head->mm_mapsize) { + MDB_meta meta = *head; + rc = mdbx_env_sync_locked(env, flags, &meta); + if (unlikely(rc != MDB_SUCCESS)) { + mdbx_txn_unlock(env); + return rc; + } + } } mdbx_txn_unlock(env); - return rc; + assert(rc == MDB_SUCCESS); + return MDB_SUCCESS; } /** Back up parent txn's cursors, then grab the originals for tracking */ @@ -3266,7 +3274,7 @@ int mdbx_txn_commit(MDB_txn *txn) { meta.mm_txnid = txn->mt_txnid; meta.mm_canary = txn->mt_canary; - rc = mdbx_env_sync0(env, env->me_flags | txn->mt_flags, &meta); + rc = mdbx_env_sync_locked(env, env->me_flags | txn->mt_flags, &meta); } if (unlikely(rc != MDB_SUCCESS)) goto fail; @@ -3396,7 +3404,8 @@ static int __cold mdbx_env_init_meta(MDB_env *env, MDB_meta *meta) { return rc; } -static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { +static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, + MDB_meta *pending) { int rc; MDB_meta *head = mdbx_meta_head(env); size_t prev_mapsize = head->mm_mapsize; @@ -3484,10 +3493,15 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { : "Legacy"); if (env->me_flags & MDB_WRITEMAP) { - /* LY: 'invalidate' the meta, - * but mdbx_meta_head_r() will be confused/retired in collision case. */ - target->mm_datasync_sign = MDB_DATASIGN_WEAK; - target->mm_txnid = 0; + /* LY: 'invalidate' the meta. 
*/ + mdbx_jitter4testing(true); + if (target->mm_datasync_sign != MDB_DATASIGN_WEAK || + target->mm_txnid != pending->mm_txnid) { + target->mm_datasync_sign = MDB_DATASIGN_WEAK; + mdbx_jitter4testing(true); + target->mm_txnid = 0; + mdbx_jitter4testing(true); + } /* LY: update info */ target->mm_mapsize = pending->mm_mapsize; target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; @@ -3495,8 +3509,11 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { target->mm_last_pg = pending->mm_last_pg; target->mm_canary = pending->mm_canary; /* LY: 'commit' the meta */ + mdbx_jitter4testing(true); target->mm_txnid = pending->mm_txnid; + mdbx_jitter4testing(true); target->mm_datasync_sign = pending->mm_datasync_sign; + mdbx_jitter4testing(true); } else { pending->mm_magic = MDB_MAGIC; pending->mm_version = MDB_DATA_VERSION; From 4481555c90ee61f8f61e59ed7d2a5eb9d364f3aa Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 17 May 2017 22:17:24 +0300 Subject: [PATCH 122/303] mdbx: refine mdbx_dkey() API. --- mdbx.h | 3 +- src/mdbx.c | 97 +++++++++++++++++++++++++++++------------------------- 2 files changed, 54 insertions(+), 46 deletions(-) diff --git a/mdbx.h b/mdbx.h index a6405a53..a561d751 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1463,7 +1463,8 @@ LIBMDBX_API int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); * Returns 0 on success, non-zero on failure. */ LIBMDBX_API int mdbx_reader_check(MDB_env *env, int *dead); -LIBMDBX_API char *mdbx_dkey(MDB_val *key, char *buf, const size_t bufsize); +LIBMDBX_API char *mdbx_dkey(const MDB_val *key, char *const buf, + const size_t bufsize); LIBMDBX_API int mdbx_env_close_ex(MDB_env *env, int dont_sync); diff --git a/src/mdbx.c b/src/mdbx.c index 0d33fab6..dd106605 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -324,16 +324,19 @@ txnid_t mdbx_debug_edge; /** The version number for a database's lockfile format. */ #define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) +/* Key size which fits in a #DKBUF. 
*/ #define DKBUF_MAXKEYSIZE 511 /* FIXME */ - /** Key size which fits in a #DKBUF. - * @ingroup debug - */ -#define DKBUF char kbuf[DKBUF_MAXKEYSIZE] -/** Display a key in hex. - * @ingroup debug - * Invoke a function to display a key in hex. - */ -#define DKEY(x) mdbx_dkey(x, kbuf, sizeof(kbuf)) + +#if MDB_DEBUG +#define DKBUF char _kbuf[DKBUF_MAXKEYSIZE * 4 + 2] +#define DKEY(x) mdbx_dkey(x, _kbuf, DKBUF_MAXKEYSIZE * 2 + 1) +#define DVAL(x) \ + mdbx_dkey(x, _kbuf + DKBUF_MAXKEYSIZE * 2 + 1, DKBUF_MAXKEYSIZE * 2 + 1) +#else +#define DKBUF ((void)(0)) +#define DKEY(x) ("-") +#define DVAL(x) ("-") +#endif /** An invalid page number. * Mainly used to denote an empty tree. @@ -837,43 +840,46 @@ static __inline pgno_t mdbx_dbg_pgno(MDB_page *mp) { return ret; } -/** Display a key in hexadecimal and return the address of the result. -* @param[in] key the key to display -* @param[in] buf the buffer to write into. Should always be #DKBUF. -* @return The key in hexadecimal form. -*/ -char *mdbx_dkey(MDB_val *key, char *buf, const size_t bufsize) { -#ifdef _MSC_VER - (void)key; - (void)buf; - return "FIXME: mdbx_dkey()"; -#else - char *ptr = buf; - unsigned i; - +/* Dump a key in ascii or hexadecimal. */ +char *mdbx_dkey(const MDB_val *key, char *const buf, const size_t bufsize) { if (!key) - return ""; + return ""; + if (!buf || bufsize < 4) + return nullptr; + if (!key->iov_len) + return ""; const uint8_t *const data = key->mv_data; bool is_ascii = true; + unsigned i; for (i = 0; is_ascii && i < key->mv_size; i++) if (data[i] < ' ' || data[i] > 127) is_ascii = false; - if (is_ascii) - snprintf(buf, bufsize, "%.*s", - (key->mv_size > INT_MAX) ? INT_MAX : (int)key->mv_size, data); - else { - buf[0] = '\0'; + if (is_ascii) { + int len = + snprintf(buf, bufsize, "%.*s", + (key->mv_size > INT_MAX) ? 
INT_MAX : (int)key->mv_size, data); + assert(len > 0 && (unsigned)len < bufsize); + (void)len; + } else { + char *const detent = buf + bufsize - 2; + char *ptr = buf; + *ptr++ = '<'; for (i = 0; i < key->mv_size; i++) { - int len = snprintf(ptr, bufsize - (ptr - buf), "%02x", data[i]); - if (len < 1) + const ptrdiff_t left = detent - ptr; + assert(left > 0); + int len = snprintf(ptr, left, "%02x", data[i]); + if (len < 0 || len >= left) break; ptr += len; } + if (ptr < detent) { + ptr[0] = '>'; + ptr[1] = '\0'; + } } return buf; -#endif /* _MSC_VER */ } #if 0 /* LY: debug stuff */ @@ -4660,7 +4666,7 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { } mdbx_debug("found leaf page %" PRIuPTR " for key [%s]", mp->mp_pgno, - key ? DKEY(key) : "null"); + DKEY(key)); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; @@ -5318,8 +5324,8 @@ set1: /* The key already matches in all other cases */ if (op == MDB_SET_RANGE || op == MDB_SET_KEY) MDB_GET_KEY(leaf, key); - mdbx_debug("==> cursor placed on key [%s]", DKEY(key)); + mdbx_debug("==> cursor placed on key [%s], data [%s]", DKEY(key), DVAL(data)); return rc; } @@ -5627,9 +5633,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, env = mc->mc_txn->mt_env; - /* Check this first so counter will always be zero on any - * early failures. - */ + /* Check this first so counter will always be zero on any early failures. */ if (flags & MDB_MULTIPLE) { dcount = data[1].mv_size; data[1].mv_size = 0; @@ -5640,6 +5644,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (flags & MDB_RESERVE) { if (unlikely(mc->mc_db->md_flags & (MDB_DUPSORT | MDB_REVERSEDUP))) return MDB_INCOMPATIBLE; + data->mv_data = nullptr; } nospill = flags & MDB_NOSPILL; @@ -5670,9 +5675,10 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return MDB_BAD_VALSIZE; } - mdbx_debug("==> put db %d key [%s], size %" PRIuPTR ", data size %" PRIuPTR - "", - DDBI(mc), DKEY(key), key ? 
key->mv_size : 0, data->mv_size); + mdbx_debug("==> put db %d key [%s], size %" PRIuPTR + ", data [%s] size %" PRIuPTR, + DDBI(mc), DKEY(key), key ? key->mv_size : 0, + DVAL((flags & MDB_RESERVE) ? nullptr : data), data->mv_size); int dupdata_flag = 0; if (flags & MDB_CURRENT) { @@ -6151,8 +6157,8 @@ new_sub: } return rc; bad_sub: - if (unlikely(rc == - MDB_KEYEXIST)) /* should not happen, we deleted that item */ + if (unlikely(rc == MDB_KEYEXIST)) + /* should not happen, we deleted that item */ rc = MDB_PROBLEM; } mc->mc_txn->mt_flags |= MDB_TXN_ERROR; @@ -6391,7 +6397,7 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, " key size %" PRIuPTR " [%s]", IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mdbx_dbg_pgno(mp), indx, data ? data->mv_size : 0, - key ? key->mv_size : 0, key ? DKEY(key) : "null"); + key ? key->mv_size : 0, DKEY(key)); if (IS_LEAF2(mp)) { mdbx_cassert(mc, key); @@ -6891,7 +6897,7 @@ static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { mp = mc->mc_pg[mc->mc_top]; node = NODEPTR(mp, indx); ptr = mp->mp_ptrs[indx]; - { + if (MDB_DEBUG) { MDB_val k2; char kbuf2[DKBUF_MAXKEYSIZE * 2 + 1]; k2.mv_data = NODEKEY(node); @@ -7658,7 +7664,8 @@ static int mdbx_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, int rc, exact = 0; DKBUF; - mdbx_debug("====> delete db %u key [%s]", dbi, DKEY(key)); + mdbx_debug("====> delete db %u key [%s], data [%s]", dbi, DKEY(key), + DVAL(data)); mdbx_cursor_init(&mc, txn, dbi, &mx); From 1431e3e5bf67227450cbf6b1e0e9cf77e935ce34 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 18 May 2017 14:14:59 +0300 Subject: [PATCH 123/303] test: fix logging for stupid MSVC. 
--- test/log.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/test/log.cc b/test/log.cc index 4f1df4e4..2254e3f0 100644 --- a/test/log.cc +++ b/test/log.cc @@ -114,6 +114,7 @@ bool output(const logging::loglevel priority, const char *format, va_list ap) { prefix.c_str(), level2str(priority), suffix.c_str()); va_list ones; + memset(&ones, 0, sizeof(ones)) /* zap MSVC and other stupid compilers */; if (priority >= error) va_copy(ones, ap); vfprintf(last, format, ap); From 6ee3aebc3260a3693f4e85243feb90782044fbbb Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 18 May 2017 14:00:40 +0300 Subject: [PATCH 124/303] mdbx: use flock() for share/exclusive lck. --- src/lck-posix.c | 84 +++++++++++++++++++++++++++++-------------------- src/osal.h | 1 + 2 files changed, 51 insertions(+), 34 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index aae8020e..40972a03 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -74,6 +74,50 @@ static int mdbx_lck_op(mdbx_filehandle_t fd, int op, int lck, off_t offset, } } +static __inline int mdbx_lck_exclusive(int lfd) { + assert(lfd != INVALID_HANDLE_VALUE); + if (flock(lfd, LOCK_EX | LOCK_NB)) + return errno; + return mdbx_lck_op(lfd, F_SETLK, F_WRLCK, 0, 1); +} + +static __inline int mdbx_lck_shared(int lfd) { + assert(lfd != INVALID_HANDLE_VALUE); + while (flock(lfd, LOCK_SH)) { + int rc = errno; + if (rc != EINTR) + return rc; + } + return mdbx_lck_op(lfd, F_SETLKW, F_RDLCK, 0, 1); +} + +int mdbx_lck_downgrade(MDB_env *env) { return mdbx_lck_shared(env->me_lfd); } + +int mdbx_rpid_set(MDB_env *env) { + return mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, env->me_pid, 1); +} + +int mdbx_rpid_clear(MDB_env *env) { + return mdbx_lck_op(env->me_lfd, F_SETLKW, F_UNLCK, env->me_pid, 1); +} + +/* Checks reader by pid. + * + * Returns: + * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) + * MDBX_RESULT_FALSE, if pid is dead (lock acquired) + * or otherwise the errcode. 
*/ +int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid) { + int rc = mdbx_lck_op(env->me_lfd, F_GETLK, F_WRLCK, pid, 1); + if (rc == 0) + return MDBX_RESULT_FALSE; + if (rc < 0 && -rc == pid) + return MDBX_RESULT_TRUE; + return rc; +} + +/*---------------------------------------------------------------------------*/ + static int mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, int rc); int mdbx_lck_init(MDB_env *env) { @@ -117,8 +161,7 @@ bailout: void mdbx_lck_destroy(MDB_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* try get exclusive access */ - if (env->me_lck && - mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, 0, LCK_WHOLE) == 0) { + if (env->me_lck && mdbx_lck_exclusive(env->me_lfd) == 0) { /* got exclusive, drown mutexes */ int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex); if (rc == 0) @@ -169,20 +212,20 @@ static int internal_seize_lck(int lfd) { assert(lfd != INVALID_HANDLE_VALUE); /* try exclusive access */ - int rc = mdbx_lck_op(lfd, F_SETLK, F_WRLCK, 0, 1); + int rc = mdbx_lck_exclusive(lfd); if (rc == 0) /* got exclusive */ return MDBX_RESULT_TRUE; - if (rc == EAGAIN || rc == EACCES || rc == EBUSY) { + if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) { /* get shared access */ - rc = mdbx_lck_op(lfd, F_SETLKW, F_RDLCK, 0, 1); + rc = mdbx_lck_shared(lfd); if (rc == 0) { /* got shared, try exclusive again */ - rc = mdbx_lck_op(lfd, F_SETLK, F_WRLCK, 0, 1); + rc = mdbx_lck_exclusive(lfd); if (rc == 0) /* now got exclusive */ return MDBX_RESULT_TRUE; - if (rc == EAGAIN || rc == EACCES || rc == EBUSY) + if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) /* unable exclusive, but stay shared */ return MDBX_RESULT_FALSE; } @@ -217,33 +260,6 @@ int mdbx_lck_seize(MDB_env *env) { return internal_seize_lck(env->me_lfd); } -int mdbx_lck_downgrade(MDB_env *env) { - return mdbx_lck_op(env->me_lfd, F_SETLK, F_RDLCK, 0, 1); -} - -int mdbx_rpid_set(MDB_env *env) { - return mdbx_lck_op(env->me_lfd, F_SETLK, 
F_WRLCK, env->me_pid, 1); -} - -int mdbx_rpid_clear(MDB_env *env) { - return mdbx_lck_op(env->me_lfd, F_SETLKW, F_UNLCK, env->me_pid, 1); -} - -/* Checks reader by pid. - * - * Returns: - * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) - * MDBX_RESULT_FALSE, if pid is dead (lock acquired) - * or otherwise the errcode. */ -int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid) { - int rc = mdbx_lck_op(env->me_lfd, F_GETLK, F_WRLCK, pid, 1); - if (rc == 0) - return MDBX_RESULT_FALSE; - if (rc < 0 && -rc == pid) - return MDBX_RESULT_TRUE; - return rc; -} - #if !__GLIBC_PREREQ(2, 12) && !defined(pthread_mutex_consistent) #define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) #endif diff --git a/src/osal.h b/src/osal.h index 39c4de88..95037d85 100644 --- a/src/osal.h +++ b/src/osal.h @@ -68,6 +68,7 @@ typedef SSIZE_T ssize_t; #define THREAD_RESULT DWORD #else #include +#include #include #include #include From 818e742c58afee4bbfbc4bfc3e5b7bb49cfece37 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 18 May 2017 04:58:02 +0300 Subject: [PATCH 125/303] mdbx: cleanup mdbx_env_create(). --- mdbx.h | 2 +- src/mdbx.c | 28 +++++++++++++--------------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/mdbx.h b/mdbx.h index a561d751..6a51768d 100644 --- a/mdbx.h +++ b/mdbx.h @@ -386,7 +386,7 @@ LIBMDBX_API const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen); * [out] env The address where the new handle will be stored * * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_create(MDB_env **env); +LIBMDBX_API int mdbx_env_create(MDB_env **penv); /* Open an environment handle. 
* diff --git a/src/mdbx.c b/src/mdbx.c index dd106605..ca241781 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3614,25 +3614,23 @@ static void __cold mdbx_env_setup_limits(MDB_env *env, size_t pagesize) { assert(env->me_maxkey_limit > 42 && env->me_maxkey_limit < pagesize); } -int __cold mdbx_env_create(MDB_env **env) { - MDB_env *e; - - e = calloc(1, sizeof(MDB_env)); - if (!e) +int __cold mdbx_env_create(MDB_env **penv) { + MDB_env *env = calloc(1, sizeof(MDB_env)); + if (!env) return MDBX_ENOMEM; - e->me_maxreaders = DEFAULT_READERS; - e->me_maxdbs = e->me_numdbs = CORE_DBS; - e->me_fd = INVALID_HANDLE_VALUE; - e->me_lfd = INVALID_HANDLE_VALUE; - e->me_pid = mdbx_getpid(); - mdbx_env_setup_limits(e, e->me_os_psize = mdbx_syspagesize()); - if (!is_power2(e->me_os_psize)) + env->me_maxreaders = DEFAULT_READERS; + env->me_maxdbs = env->me_numdbs = CORE_DBS; + env->me_fd = INVALID_HANDLE_VALUE; + env->me_lfd = INVALID_HANDLE_VALUE; + env->me_pid = mdbx_getpid(); + mdbx_env_setup_limits(env, env->me_os_psize = mdbx_syspagesize()); + if (!is_power2(env->me_os_psize)) return MDB_INCOMPATIBLE; - VALGRIND_CREATE_MEMPOOL(e, 0, 0); - e->me_signature = MDBX_ME_SIGNATURE; - *env = e; + VALGRIND_CREATE_MEMPOOL(env, 0, 0); + env->me_signature = MDBX_ME_SIGNATURE; + *penv = env; return MDB_SUCCESS; } From fd8be9928285e65d15107b801580285967bac304 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 18 May 2017 05:04:31 +0300 Subject: [PATCH 126/303] mdbx: cleanup mdbx_env_init_metas(). 
--- src/mdbx.c | 73 +++++++++++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index ca241781..f53dd308 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3364,49 +3364,45 @@ static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { } /* Fill in most of the zeroed MDB_meta for an empty database environment */ -static void __cold mdbx_env_init_meta0(MDB_env *env, MDB_meta *meta) { - meta->mm_magic = MDB_MAGIC; - meta->mm_version = MDB_DATA_VERSION; - meta->mm_mapsize = env->me_mapsize; - meta->mm_psize = env->me_psize; - meta->mm_last_pg = NUM_METAS - 1; - meta->mm_flags = env->me_flags & 0xffff; - meta->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ - meta->mm_dbs[FREE_DBI].md_root = P_INVALID; - meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; - meta->mm_datasync_sign = mdbx_meta_sign(meta); +static void __cold mdbx_meta_model(const MDB_env *env, MDB_meta *model) { + memset(model, 0, sizeof(*model)); + model->mm_magic = MDB_MAGIC; + model->mm_version = MDB_DATA_VERSION; + model->mm_mapsize = env->me_mapsize; + model->mm_psize = env->me_psize; + model->mm_last_pg = NUM_METAS - 1; + model->mm_flags = (uint16_t)env->me_flags; + model->mm_flags |= MDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ + model->mm_dbs[FREE_DBI].md_root = P_INVALID; + model->mm_dbs[MAIN_DBI].md_root = P_INVALID; + model->mm_datasync_sign = mdbx_meta_sign(model); } -/** Write the environment parameters of a freshly created DB environment. - * @param[in] env the environment handle - * @param[in] meta the #MDB_meta to write - * @return 0 on success, non-zero on failure. - */ -static int __cold mdbx_env_init_meta(MDB_env *env, MDB_meta *meta) { - MDB_page *p, *q; - int rc; - unsigned psize; - - mdbx_debug("writing new meta page"); +/* Write the environment parameters of a freshly created DB environment. 
*/ +static int __cold mdbx_env_init_metas(const MDB_env *env, MDB_meta *model) { + mdbx_debug("writing new meta pages"); assert(offsetof(MDB_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); - psize = env->me_psize; - - p = calloc(NUM_METAS, psize); - if (!p) + unsigned page_size = env->me_psize; + MDB_page *first = calloc(NUM_METAS, page_size); + if (!first) return MDBX_ENOMEM; - p->mp_pgno = 0; - p->mp_flags = P_META; - *(MDB_meta *)PAGEDATA(p) = *meta; + first->mp_pgno = 0; + first->mp_flags = P_META; + MDB_meta *first_meta = (MDB_meta *)PAGEDATA(first); - q = (MDB_page *)((char *)p + psize); - q->mp_pgno = 1; - q->mp_flags = P_META; - *(MDB_meta *)PAGEDATA(q) = *meta; + MDB_page *second = (MDB_page *)((char *)first + page_size); + second->mp_pgno = 1; + second->mp_flags = P_META; + MDB_meta *second_meta = (MDB_meta *)PAGEDATA(second); - rc = mdbx_pwrite(env->me_fd, p, psize * NUM_METAS, 0); + *first_meta = *model; + model->mm_txnid += 1; + *second_meta = *model; - free(p); + int rc = mdbx_pwrite(env->me_fd, first, page_size * NUM_METAS, 0); + + free(first); return rc; } @@ -3801,8 +3797,7 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { env->me_psize = env->me_os_psize; if (env->me_psize > MAX_PAGESIZE) env->me_psize = MAX_PAGESIZE; - memset(meta, 0, sizeof(*meta)); - mdbx_env_init_meta0(env, meta); + mdbx_meta_model(env, meta); meta->mm_mapsize = DEFAULT_MAPSIZE; } else { env->me_psize = meta->mm_psize; @@ -3825,7 +3820,7 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { if (rc == MDBX_RESULT_TRUE) { /* mdbx_env_map() may grow the datafile. Write the metapages * first, so the file will be valid if initialization fails. 
*/ - err = mdbx_env_init_meta(env, meta); + err = mdbx_env_init_metas(env, meta); if (unlikely(err != MDB_SUCCESS)) return err; @@ -8449,7 +8444,7 @@ static int __cold mdbx_env_copyfd1(MDB_env *env, mdbx_filehandle_t fd) { mp->mp_pgno = 0; mp->mp_flags = P_META; mm = (MDB_meta *)PAGEDATA(mp); - mdbx_env_init_meta0(env, mm); + mdbx_meta_model(env, mm); mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); mp->mp_pgno = 1; From d2af39fd7391a3a3ac36ee0600584f9a4f188ef5 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 18 May 2017 15:36:32 +0300 Subject: [PATCH 127/303] mdbx: more for without-lck mode (not completed). --- src/mdbx.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index f53dd308..077281e8 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2146,15 +2146,17 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { mdbx_assert(env, r->mr_tid == mdbx_thread_self()); } } else { - mdbx_assert(env, env->me_flags & MDB_NOTLS); + mdbx_assert(env, !env->me_lck || (env->me_flags & MDB_NOTLS)); } if (likely(r)) { if (unlikely(r->mr_pid != env->me_pid || r->mr_txnid != ~(txnid_t)0)) return MDB_BAD_RSLOT; - } else { + } else if (env->me_lck) { mdbx_pid_t pid = env->me_pid; mdbx_tid_t tid = mdbx_thread_self(); + mdbx_assert(env, env->me_lck->mti_magic == MDB_MAGIC); + mdbx_assert(env, env->me_lck->mti_format == MDB_LOCK_FORMAT); rc = mdbx_rdt_lock(env); if (unlikely(MDBX_IS_ERROR(rc))) @@ -2212,8 +2214,10 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { mdbx_jitter4testing(false); const txnid_t snap = meta->mm_txnid; mdbx_jitter4testing(false); - r->mr_txnid = snap; - mdbx_jitter4testing(false); + if (r) { + r->mr_txnid = snap; + mdbx_jitter4testing(false); + } mdbx_coherent_barrier(); mdbx_jitter4testing(true); @@ -4097,7 +4101,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, } } - if ((env->me_flags & MDB_NOTLS) == 0) { + if (env->me_lck && (env->me_flags & MDB_NOTLS) 
== 0) { rc = mdbx_rthc_alloc(&env->me_txkey, &env->me_lck->mti_readers[0], &env->me_lck->mti_readers[env->me_maxreaders]); if (unlikely(rc != MDB_SUCCESS)) From b4fd0500d1b3fadd13940370795055bd5cfc3d4b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 18 May 2017 15:36:53 +0300 Subject: [PATCH 128/303] mdbx: more lck-debug. --- src/lck-posix.c | 15 ++++++++++++--- src/mdbx.c | 2 +- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index 40972a03..5aa818d8 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -162,7 +162,7 @@ void mdbx_lck_destroy(MDB_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* try get exclusive access */ if (env->me_lck && mdbx_lck_exclusive(env->me_lfd) == 0) { - /* got exclusive, drown mutexes */ + mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_); int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex); if (rc == 0) rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex); @@ -188,22 +188,31 @@ static int mdbx_robust_unlock(MDB_env *env, pthread_mutex_t *mutex) { } int mdbx_rdt_lock(MDB_env *env) { - return mdbx_robust_lock(env, &env->me_lck->mti_rmutex); + mdbx_trace(">>"); + int rc = mdbx_robust_lock(env, &env->me_lck->mti_rmutex); + mdbx_trace("<< rc %d", rc); + return rc; } void mdbx_rdt_unlock(MDB_env *env) { + mdbx_trace(">>"); int rc = mdbx_robust_unlock(env, &env->me_lck->mti_rmutex); + mdbx_trace("<< rc %d", rc); if (unlikely(MDBX_IS_ERROR(rc))) mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); } int mdbx_txn_lock(MDB_env *env) { + mdbx_trace(">>"); int rc = mdbx_robust_lock(env, &env->me_lck->mti_wmutex); + mdbx_trace("<< rc %d", rc); return MDBX_IS_ERROR(rc) ? 
rc : MDB_SUCCESS; } void mdbx_txn_unlock(MDB_env *env) { + mdbx_trace(">>"); int rc = mdbx_robust_unlock(env, &env->me_lck->mti_wmutex); + mdbx_trace("<< rc %d", rc); if (unlikely(MDBX_IS_ERROR(rc))) mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); } @@ -296,7 +305,7 @@ static int __cold mdbx_mutex_failed(MDB_env *env, mdbx_mutex_t *mutex, int rc) { } #endif /* MDB_USE_ROBUST */ - mdbx_error("lock mutex failed, %s", mdbx_strerror(rc)); + mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(rc)); if (rc != EDEADLK) { env->me_flags |= MDB_FATAL_ERROR; rc = MDB_PANIC; diff --git a/src/mdbx.c b/src/mdbx.c index 077281e8..8e666202 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3955,7 +3955,7 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { if (rc == MDBX_RESULT_TRUE) { /* LY: exlcusive mode, init lck */ - memset(env->me_lck, 0, sizeof(MDBX_lockinfo)); + memset(env->me_lck, 0, size); err = mdbx_lck_init(env); if (err) return err; From 0810f4e105ecb442d9f74d9fc59b0ce1c6d3d40a Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 18 May 2017 22:11:47 +0300 Subject: [PATCH 129/303] mdbx: fix MDBX_CORRUPED error (nasty copy&paste bug). 
--- src/mdbx.c | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 8e666202..699d0bff 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3419,7 +3419,7 @@ static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, mdbx_assert(env, pending != METAPAGE_1(env) && pending != METAPAGE_2(env)); mdbx_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0); - mdbx_assert(env, META_IS_WEAK(head) || env->me_sync_pending != 0 || + mdbx_assert(env, !META_IS_STEADY(head) || env->me_sync_pending != 0 || env->me_mapsize != prev_mapsize); pending->mm_mapsize = env->me_mapsize; @@ -3812,11 +3812,10 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { env->me_mapsize = meta->mm_mapsize; else { /* Make sure mapsize >= committed data size. Even when using - * mm_mapsize, which could be broken in old files (ITS#7789). - */ - size_t minsize = (meta->mm_last_pg + 1) * meta->mm_psize; - if (env->me_mapsize < minsize) - env->me_mapsize = minsize; + * mm_mapsize, which could be broken in old files (ITS#7789). 
*/ + size_t usedsize = (meta->mm_last_pg + 1) * meta->mm_psize; + if (env->me_mapsize < usedsize) + env->me_mapsize = usedsize; meta->mm_mapsize = env->me_mapsize; } @@ -3831,10 +3830,25 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { err = mdbx_ftruncate(env->me_fd, env->me_mapsize); if (unlikely(err != MDB_SUCCESS)) return err; + } else { + off_t size; + err = mdbx_filesize(env->me_fd, &size); + if (unlikely(err != MDB_SUCCESS)) + return err; + + if (size != (off_t)env->me_mapsize) { + mdbx_trace("filesize mismatch"); + if ((env->me_flags & MDB_RDONLY) || + lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) + return MDBX_WANNA_RECOVERY /* LY: could not mdbx_ftruncate */; + + err = mdbx_ftruncate(env->me_fd, env->me_mapsize); + if (unlikely(err != MDB_SUCCESS)) + return err; + } } - const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; - err = mdbx_env_map(env, NULL, usedsize); + err = mdbx_env_map(env, NULL, env->me_mapsize); if (err) return err; @@ -3878,7 +3892,6 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { /* Open and/or initialize the lock region for the environment. */ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { - off_t size; assert(env->me_fd != INVALID_HANDLE_VALUE); assert(env->me_lfd == INVALID_HANDLE_VALUE); @@ -3899,6 +3912,7 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { mdbx_debug("lck-setup: %s ", (rc == MDBX_RESULT_TRUE) ? 
"exclusive" : "shared"); + off_t size; err = mdbx_filesize(env->me_lfd, &size); if (unlikely(err != MDB_SUCCESS)) return err; @@ -5751,6 +5765,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (unlikely(rc2 = mdbx_page_new(mc, P_LEAF, 1, &np))) { return rc2; } + assert(np->mp_flags & P_LEAF); mdbx_cursor_push(mc, np); mc->mc_db->md_root = np->mp_pgno; mc->mc_db->md_depth++; @@ -6292,7 +6307,7 @@ static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, if (unlikely((rc = mdbx_page_alloc(mc, num, &np, MDBX_ALLOC_ALL)))) return rc; - mdbx_debug("allocated new mpage %" PRIuPTR ", page size %u", np->mp_pgno, + mdbx_debug("allocated new page #%" PRIuPTR ", size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize); np->mp_flags = flags | P_DIRTY; np->mp_lower = (PAGEHDRSZ - PAGEBASE); @@ -6308,7 +6323,7 @@ static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, } *mp = np; - return 0; + return MDB_SUCCESS; } /** Calculate the size of a leaf node. @@ -8908,7 +8923,7 @@ int mdbx_dbi_open_ex(MDB_txn *txn, const char *table_name, unsigned user_flags, txn->mt_dbflags[slot] = dbflag; txn->mt_dbiseqs[slot] = (txn->mt_env->me_dbiseqs[slot] += 1); - memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); + txn->mt_dbs[slot] = *(MDB_db *)data.mv_data; rc = mdbx_dbi_bind(txn, slot, user_flags, keycmp, datacmp); if (unlikely(rc != MDB_SUCCESS)) { assert((dbflag & DB_DIRTY) == 0); From d9eeac45b2fde15e655dad7cbab8f016b0504a9c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 19 May 2017 00:31:54 +0300 Subject: [PATCH 130/303] mdbx: fix extra-logging without NDEBUG, but with MDBX_DEBUG=0. --- src/bits.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bits.h b/src/bits.h index 8e8542ae..f99572a2 100644 --- a/src/bits.h +++ b/src/bits.h @@ -679,14 +679,14 @@ void mdbx_panic(const char *fmt, ...) 
#define mdbx_debug_enabled(type) \ unlikely(mdbx_runtime_flags &(type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA))) -#else -#ifndef NDEBUG -#define mdbx_debug_enabled(type) (1) #else #define mdbx_debug_enabled(type) (0) -#endif /* NDEBUG */ #define mdbx_audit_enabled() (0) +#ifndef NDEBUG +#define mdbx_assert_enabled() (1) +#else #define mdbx_assert_enabled() (0) +#endif /* NDEBUG */ #endif /* MDB_DEBUG */ #define mdbx_print(fmt, ...) \ From 82d3595b760b9cc3c213411e3e941bc05cc3f364 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 22 May 2017 14:02:33 +0300 Subject: [PATCH 131/303] mdbx: add MDBX_EKEYMISMATCH. --- mdbx.h | 4 ++++ src/mdbx.c | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/mdbx.h b/mdbx.h index 6a51768d..aa9c2653 100644 --- a/mdbx.h +++ b/mdbx.h @@ -328,6 +328,10 @@ typedef enum MDB_cursor_op { * right now (e.g. in readonly mode and so forth). */ #define MDBX_WANNA_RECOVERY (-30419) +/* The given key value is mismatched to the current cursor position, + * when mdbx_cursor_put() called with MDB_CURRENT option. */ +#define MDBX_EKEYMISMATCH (-30418) + /* Statistics for a database in the environment */ typedef struct MDBX_stat { unsigned ms_psize; /* Size of a database page. 
diff --git a/src/mdbx.c b/src/mdbx.c index 699d0bff..997bf987 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -759,6 +759,9 @@ static const char *__mdbx_strerr(int errnum) { case MDBX_WANNA_RECOVERY: return "MDBX_WANNA_RECOVERY: Database should be recovered, but this could " "be done in a read-only mode"; + case MDBX_EKEYMISMATCH: + return "MDBX_EKEYMISMATCH: The given key value is mismatched to the " + "current cursor position"; default: return NULL; } @@ -5729,7 +5732,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, mc->mc_ki[mc->mc_top]++; } else { /* new key is <= last key */ - rc = MDB_KEYEXIST; + rc = MDBX_EKEYMISMATCH; } } } else { From d0793a1dafceada630701b68c02f82d41a9c0d4e Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 19 May 2017 16:20:02 +0300 Subject: [PATCH 132/303] mdbx: fix MDB_DUPSORT with MDB_CURRENT update bug. --- mdbx.h | 35 +++++++++++++++++------------- src/mdbx.c | 63 +++++++++++++++++++++++++++++++++++------------------- 2 files changed, 61 insertions(+), 37 deletions(-) diff --git a/mdbx.h b/mdbx.h index aa9c2653..3572f438 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1317,24 +1317,28 @@ LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, /* Store by cursor. * * This function stores key/data pairs into the database. + * * The cursor is positioned at the new item, or on failure usually near it. * Note: Earlier documentation incorrectly said errors would leave the * state of the cursor unchanged. - * [in] cursor A cursor handle returned by mdbx_cursor_open() - * [in] key The key operated on. - * [in] data The data operated on. - * [in] flags Options for this operation. This parameter - * must be set to 0 or one of the values described here. - * - MDB_CURRENT - replace the item at the current cursor position. - * The key parameter must still be provided, and must match - *it. - * If using sorted duplicates (MDB_DUPSORT) the data item must - *still - * sort into the same place. 
This is intended to be used when the - * new data is the same size as the old. Otherwise it will simply - * perform a delete of the old record followed by an insert. - * - MDB_NODUPDATA - enter the new key/data pair only if it does not - * already appear in the database. This flag may only be + * + * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [in] key The key operated on. + * [in] data The data operated on. + * [in] flags Options for this operation. This parameter + * must be set to 0 or one of the values described here. + * + * - MDB_CURRENT - replace the item at the current cursor position. The + * key parameter must still be provided, and must match it. + * + * If using sorted duplicates (MDB_DUPSORT) the data item + * must still sort into the same place. This is intended to + * be used when the new data is the same size as the old. + * Otherwise it will simply perform a delete of the old + * record followed by an insert. + * + * - MDB_NODUPDATA - enter the new key/data pair only if it does not already + * appear in the database. This flag may only be *specified * if the database was opened with MDB_DUPSORT. The function *will @@ -1380,6 +1384,7 @@ LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: + * - MDBX_EKEYMISMATCH * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). * - MDB_TXN_FULL - the transaction has too many dirty pages. * - EACCES - an attempt was made to write in a read-only transaction. 
diff --git a/src/mdbx.c b/src/mdbx.c index 997bf987..7e130618 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -5622,7 +5622,7 @@ static int mdbx_cursor_touch(MDB_cursor *mc) { return rc; } -/** Do not spill pages to disk if txn is getting full, may fail instead */ +/* Do not spill pages to disk if txn is getting full, may fail instead */ #define MDB_NOSPILL 0x8000 int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, @@ -5696,15 +5696,29 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, int dupdata_flag = 0; if (flags & MDB_CURRENT) { - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return MDBX_EINVAL; + /* Опция MDB_CURRENT означает, что запрошено обновление текущей записи, + * на которой сейчас стоит курсор. Проверяем что переданный ключ совпадает + * со значением в текущей позиции курсора. + * Здесь проще вызвать mdbx_cursor_get(), так как для обслуживания таблиц + * с MDB_DUPSORT также требуется текущий размер данных. */ + MDB_val current_key, current_data; + rc = mdbx_cursor_get(mc, &current_key, &current_data, MDB_GET_CURRENT); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + if (mc->mc_dbx->md_cmp(key, &current_key) != 0) + return MDBX_EKEYMISMATCH; + if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdbx_cassert(mc, mc->mc_xcursor != NULL && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); - if (mc->mc_xcursor->mx_db.md_entries > 1) { + /* Если за ключом более одного значения, либо если размер данных + * отличается, то вместо inplace обновления требуется удаление и + * последующая вставка.
*/ + if (mc->mc_xcursor->mx_db.md_entries > 1 || + current_data.mv_size != data->mv_size) { rc = mdbx_cursor_del(mc, 0); if (rc != MDB_SUCCESS) return rc; @@ -5712,14 +5726,15 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } } - rc = MDB_SUCCESS; - } else if (mc->mc_db->md_root == P_INVALID) { + } + + if (mc->mc_db->md_root == P_INVALID) { /* new database, cursor has nothing to point to */ mc->mc_snum = 0; mc->mc_top = 0; mc->mc_flags &= ~C_INITIALIZED; rc = MDB_NO_ROOT; - } else { + } else if ((flags & MDB_CURRENT) == 0) { int exact = 0; MDB_val d2; if (flags & MDB_APPEND) { @@ -5790,8 +5805,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if ((mc->mc_db->md_flags & MDB_DUPSORT) && LEAFSIZE(key, data) > env->me_nodemax) { /* Too big for a node, insert in sub-DB. Set up an empty - * "old sub-page" for prep_subDB to expand to a full page. - */ + * "old sub-page" for prep_subDB to expand to a full page. */ fp_flags = P_LEAF | P_DIRTY; fp = env->me_pbuf; fp->mp_leaf2_ksize = (uint16_t)data->mv_size; /* used if MDB_DUPFIXED */ @@ -5810,8 +5824,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, memcpy(ptr, key->mv_data, ksize); fix_parent: /* if overwriting slot 0 of leaf, need to - * update branch key if there is a parent page - */ + * update branch key if there is a parent page */ if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { unsigned dtop = 1; mc->mc_top--; @@ -5841,20 +5854,13 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, /* Prepare (sub-)page/sub-DB to accept the new item, * if needed. fp: old sub-page or a header faking * it. mp: new (sub-)page. offset: growth in page - * size. xdata: node data with new page or DB. - */ + * size. xdata: node data with new page or DB. 
*/ unsigned i, offset = 0; MDB_page *mp = fp = xdata.mv_data = env->me_pbuf; mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; /* Was a single item before, must convert now */ if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - /* Just overwrite the current item */ - if (flags & MDB_CURRENT) { - if ((flags & MDB_NODUPDATA) && !mc->mc_dbx->md_dcmp(data, &olddata)) - return MDB_KEYEXIST; - goto current; - } /* does data match? */ if (!mc->mc_dbx->md_dcmp(data, &olddata)) { @@ -5864,6 +5870,10 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, goto current; } + /* Just overwrite the current item */ + if (flags & MDB_CURRENT) + goto current; + /* Back up original data item */ dupdata_flag = 1; dkey.mv_size = olddata.mv_size; @@ -6029,16 +6039,25 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if ((rc2 = mdbx_ovpage_free(mc, omp)) != MDB_SUCCESS) return rc2; } else if (data->mv_size == olddata.mv_size) { + assert(EVEN(key->mv_size) == EVEN(leaf->mn_ksize)); /* same size, just replace it. Note that we could * also reuse this node if the new data is smaller, - * but instead we opt to shrink the node in that case. - */ + * but instead we opt to shrink the node in that case. 
*/ if (F_ISSET(flags, MDB_RESERVE)) data->mv_data = olddata.mv_data; else if (!(mc->mc_flags & C_SUB)) memcpy(olddata.mv_data, data->mv_data, data->mv_size); else { - memcpy(NODEKEY(leaf), key->mv_data, key->mv_size); + assert(NUMKEYS(mc->mc_pg[mc->mc_top]) == 1); + assert(mc->mc_pg[mc->mc_top]->mp_upper == + mc->mc_pg[mc->mc_top]->mp_lower); + assert(IS_LEAF(mc->mc_pg[mc->mc_top]) && + !IS_LEAF2(mc->mc_pg[mc->mc_top])); + assert(NODEDSZ(leaf) == 0); + assert(leaf->mn_flags == 0); + memcpy(NODEKEY(leaf), key->mv_data, leaf->mn_ksize = key->mv_size); + assert((char *)NODEDATA(leaf) + NODEDSZ(leaf) < + (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); goto fix_parent; } return MDB_SUCCESS; From 0c9832fc2892fdbb7287f9828ba13637129e7830 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 14:40:37 +0300 Subject: [PATCH 133/303] ci: add TESTDB into Makefile. --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 6c0d99c8..82c3311d 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,7 @@ XCFLAGS ?= -DNDEBUG=1 -DMDB_DEBUG=0 -DLIBMDBX_EXPORTS=1 CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden CFLAGS += -D_GNU_SOURCE=1 -std=gnu99 -pthread $(XCFLAGS) CXXFLAGS = -std=c++11 $(filter-out -std=gnu99,$(CFLAGS)) +TESTDB ?= /tmp/mdbx-check.db # LY: '--no-as-needed,-lrt' for ability to built with modern glibc, but then run with the old LDFLAGS ?= -Wl,--gc-sections,-z,relro,-O,--no-as-needed,-lrt @@ -67,7 +68,7 @@ clean: rm -rf $(TOOLS) test/test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err check: test/test - (set -o pipefail; test/test --pathname=tmp.db --dont-cleanup-after basic | tee test.log | tail -n 42) && ./mdbx_chk -vn tmp.db + rm -f $(TESTDB) && (set -o pipefail; test/test --pathname=$(TESTDB) --dont-cleanup-after basic | tee test.log | tail -n 42) && ./mdbx_chk -vn $(TESTDB) mdbx.o: $(MDBX_SRC) Makefile $(CC) $(CFLAGS) -c src/mdbx.c -o $@ From 
994481e4f4f452a624462e88f52ea20c934d697a Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 22 May 2017 19:57:54 +0300 Subject: [PATCH 134/303] mdbx: replace size_t with uint64_t/uint32_t in API. --- mdbx.h | 37 ++++++++++++++++++------------------- src/mdbx.c | 4 ++-- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/mdbx.h b/mdbx.h index 3572f438..685686b9 100644 --- a/mdbx.h +++ b/mdbx.h @@ -115,7 +115,7 @@ typedef struct MDB_env MDB_env; typedef struct MDB_txn MDB_txn; /* A handle for an individual database in the DB environment. */ -typedef unsigned MDB_dbi; +typedef uint32_t MDB_dbi; /* Opaque structure for navigating through a database */ typedef struct MDB_cursor MDB_cursor; @@ -185,9 +185,8 @@ typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); #define MDB_REVERSEKEY 0x02u /* use sorted duplicates */ #define MDB_DUPSORT 0x04u -/* numeric keys in native byte order, either unsigned int or mdbx_size_t. - * (lmdb expects 32-bit int <= size_t <= 32/64-bit mdbx_size_t.) - * The keys must all be of the same size. */ +/* numeric keys in native byte order, either uint32_t or uint64_t. + * The keys must all be of the same size. */ #define MDB_INTEGERKEY 0x08u /* with MDB_DUPSORT, sorted dup items have fixed size */ #define MDB_DUPFIXED 0x10u @@ -334,23 +333,23 @@ typedef enum MDB_cursor_op { /* Statistics for a database in the environment */ typedef struct MDBX_stat { - unsigned ms_psize; /* Size of a database page. - * This is currently the same for all databases. */ - unsigned ms_depth; /* Depth (height) of the B-tree */ - size_t ms_branch_pages; /* Number of internal (non-leaf) pages */ - size_t ms_leaf_pages; /* Number of leaf pages */ - size_t ms_overflow_pages; /* Number of overflow pages */ - size_t ms_entries; /* Number of data items */ + uint32_t ms_psize; /* Size of a database page. + * This is currently the same for all databases. 
*/ + uint32_t ms_depth; /* Depth (height) of the B-tree */ + uint64_t ms_branch_pages; /* Number of internal (non-leaf) pages */ + uint64_t ms_leaf_pages; /* Number of leaf pages */ + uint64_t ms_overflow_pages; /* Number of overflow pages */ + uint64_t ms_entries; /* Number of data items */ } MDBX_stat; /* Information about the environment */ typedef struct MDBX_envinfo { void *me_mapaddr; /* Address of map, if fixed */ - size_t me_mapsize; /* Size of the data memory map */ - size_t me_last_pgno; /* ID of the last used page */ + uint64_t me_mapsize; /* Size of the data memory map */ + uint64_t me_last_pgno; /* ID of the last used page */ uint64_t me_last_txnid; /* ID of the last committed transaction */ - unsigned me_maxreaders; /* max reader slots in the environment */ - unsigned me_numreaders; /* max reader slots used in the environment */ + uint32_t me_maxreaders; /* max reader slots in the environment */ + uint32_t me_numreaders; /* max reader slots used in the environment */ uint64_t me_tail_txnid; /* ID of the last reader transaction */ uint64_t me_meta1_txnid, me_meta1_sign; uint64_t me_meta2_txnid, me_meta2_sign; @@ -917,7 +916,7 @@ LIBMDBX_API MDB_env *mdbx_txn_env(MDB_txn *txn); * [in] txn A transaction handle returned by mdbx_txn_begin() * * Returns A transaction ID, valid if input is an active transaction. */ -LIBMDBX_API size_t mdbx_txn_id(MDB_txn *txn); +LIBMDBX_API uint64_t mdbx_txn_id(MDB_txn *txn); /* Commit all the operations of a transaction into the database. * @@ -1419,7 +1418,7 @@ LIBMDBX_API int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); * possible errors are: * - EINVAL - cursor is not initialized, * or an invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_count(MDB_cursor *cursor, size_t *countp); +LIBMDBX_API int mdbx_cursor_count(MDB_cursor *cursor, uint64_t *countp); /* Compare two data items according to a particular database. 
* @@ -1523,7 +1522,7 @@ LIBMDBX_API int mdbx_txn_straggler(MDB_txn *txn, int *percent); * 1 on success (reader was killed), * >1 on success (reader was SURE killed). */ typedef int(MDBX_oom_func)(MDB_env *env, int pid, mdbx_tid_t thread_id, - size_t txn, unsigned gap, int retry); + uint64_t txn, unsigned gap, int retry); /* Set the OOM callback. * @@ -1560,7 +1559,7 @@ typedef void MDBX_debug_func(int type, const char *function, int line, LIBMDBX_API int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn); -typedef int MDBX_pgvisitor_func(size_t pgno, unsigned pgnumber, void *ctx, +typedef int MDBX_pgvisitor_func(uint64_t pgno, unsigned pgnumber, void *ctx, const char *dbi, const char *type, int nentries, int payload_bytes, int header_bytes, int unused_bytes); diff --git a/src/mdbx.c b/src/mdbx.c index 7e130618..513d72cf 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2452,7 +2452,7 @@ MDB_env *mdbx_txn_env(MDB_txn *txn) { return txn->mt_env; } -size_t mdbx_txn_id(MDB_txn *txn) { +uint64_t mdbx_txn_id(MDB_txn *txn) { if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) return ~(txnid_t)0; return txn->mt_txnid; @@ -6839,7 +6839,7 @@ int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { } /* Return the count of duplicate data items for the current key */ -int mdbx_cursor_count(MDB_cursor *mc, size_t *countp) { +int mdbx_cursor_count(MDB_cursor *mc, uint64_t *countp) { if (unlikely(mc == NULL || countp == NULL)) return MDBX_EINVAL; From 0eee938c5fca906994e0f9b9edc7883ea1546eea Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 22 May 2017 19:59:16 +0300 Subject: [PATCH 135/303] mdbx: add MDBX_EIO. 
--- mdbx_osal.h | 3 ++- src/osal.c | 13 +++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/mdbx_osal.h b/mdbx_osal.h index da0e2ca4..438b1a95 100644 --- a/mdbx_osal.h +++ b/mdbx_osal.h @@ -101,6 +101,7 @@ typedef DWORD mdbx_tid_t; #define MDBX_ENOMEM ERROR_OUTOFMEMORY #define MDBX_EROFS ERROR_FILE_READ_ONLY #define MDBX_ENOSYS ERROR_NOT_SUPPORTED +#define MDBX_EIO ERROR_WRITE_FAULT #else @@ -118,7 +119,7 @@ typedef pthread_t mdbx_tid_t; #define MDBX_ENOMEM ENOMEM #define MDBX_EROFS EROFS #define MDBX_ENOSYS ENOSYS - +#define MDBX_EIO EIO #endif /*--------------------------------------------------------------------------*/ diff --git a/src/osal.c b/src/osal.c index 459b9f49..0e887dd4 100644 --- a/src/osal.c +++ b/src/osal.c @@ -139,7 +139,7 @@ int mdbx_asprintf(char **strp, const char *fmt, ...) { int mdbx_memalign_alloc(size_t alignment, size_t bytes, void **result) { #if _MSC_VER *result = _aligned_malloc(bytes, alignment); - return *result ? MDB_SUCCESS : ERROR_OUTOFMEMORY; + return *result ? MDB_SUCCESS : MDBX_ENOMEM /* ERROR_OUTOFMEMORY */; #elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L *result = memalign(alignment, bytes); return *result ? MDB_SUCCESS : errno; @@ -343,7 +343,7 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) { ssize_t read = pread(fd, buf, bytes, offset); if (read < 0) { int rc = errno; - return (rc == MDB_SUCCESS) ? /* paranoia */ EIO : rc; + return (rc == MDB_SUCCESS) ? /* paranoia */ MDBX_EIO : rc; } #endif return (bytes == (size_t)read) ? MDB_SUCCESS : MDBX_ENODATA; @@ -362,7 +362,7 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, DWORD written; if (likely(WriteFile(fd, buf, (DWORD)bytes, &written, &ov))) - return (bytes == written) ? MDB_SUCCESS : ERROR_WRITE_FAULT; + return (bytes == written) ? 
MDB_SUCCESS : MDBX_EIO /* ERROR_WRITE_FAULT */; return mdbx_get_errno_checked(); #else int rc; @@ -373,7 +373,7 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, return MDB_SUCCESS; rc = errno; } while (rc == EINTR); - return (written < 0) ? rc : EIO /* Use which error code (ENOSPC)? */; + return (written < 0) ? rc : MDBX_EIO /* Use which error code (ENOSPC)? */; #endif } @@ -388,7 +388,8 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, written += iov[i].iov_len; offset += iov[i].iov_len; } - return (expected_written == written) ? MDB_SUCCESS : ERROR_WRITE_FAULT; + return (expected_written == written) ? MDB_SUCCESS + : MDBX_EIO /* ERROR_WRITE_FAULT */; #else int rc; ssize_t written; @@ -398,7 +399,7 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, return MDB_SUCCESS; rc = errno; } while (rc == EINTR); - return (written < 0) ? rc : EIO /* Use which error code? */; + return (written < 0) ? rc : MDBX_EIO /* Use which error code? */; #endif } From 398b839b98d7c449dfe79070b1bfda7f88e488a1 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 22 May 2017 20:02:23 +0300 Subject: [PATCH 136/303] mdbx: drops old/obsolete API (mdbx_env_copy and mdbx_env_copydf). --- mdbx.h | 39 ++++----------------------------------- src/mdbx.c | 22 +++++++--------------- src/tools/mdbx_copy.c | 4 ++-- 3 files changed, 13 insertions(+), 52 deletions(-) diff --git a/mdbx.h b/mdbx.h index 685686b9..23302783 100644 --- a/mdbx.h +++ b/mdbx.h @@ -563,37 +563,6 @@ LIBMDBX_API int mdbx_env_open(MDB_env *env, const char *path, unsigned flags, LIBMDBX_API int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, mode_t mode, int *exclusive); -/* Copy an LMDB environment to the specified path. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. 
- * Note: This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under caveats_sec. - * [in] env An environment handle returned by mdbx_env_create(). It - * must have already been opened successfully. - * [in] path The directory in which the copy will reside. This - * directory must already exist and be writable but must otherwise be - * empty. - * - * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_copy(MDB_env *env, const char *path); - -/* Copy an LMDB environment to the specified file descriptor. - * - * This function may be used to make a backup of an existing environment. - * No lockfile is created, since it gets recreated at need. - * Note: This call can trigger significant file size growth if run in - * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under caveats_sec. - * [in] env An environment handle returned by mdbx_env_create(). It - * must have already been opened successfully. - * [in] fd The filedescriptor to write the copy to. It must - * have already been opened for Write access. - * - * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_copyfd(MDB_env *env, mdbx_filehandle_t fd); - /* Copy an LMDB environment to the specified path, with options. * * This function may be used to make a backup of an existing environment. @@ -617,14 +586,14 @@ LIBMDBX_API int mdbx_env_copyfd(MDB_env *env, mdbx_filehandle_t fd); *leak. * * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags); +LIBMDBX_API int mdbx_env_copy(MDB_env *env, const char *path, unsigned flags); /* Copy an LMDB environment to the specified file descriptor, * with options. * * This function may be used to make a backup of an existing environment. 
* No lockfile is created, since it gets recreated at need. See - * mdbx_env_copy2() for further details. + * mdbx_env_copy() for further details. * Note: This call can trigger significant file size growth if run in * parallel with write transactions, because it employs a read-only * transaction. See long-lived transactions under caveats_sec. @@ -633,10 +602,10 @@ LIBMDBX_API int mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags); * [in] fd The filedescriptor to write the copy to. It must * have already been opened for Write access. * [in] flags Special options for this operation. - * See mdbx_env_copy2() for options. + * See mdbx_env_copy() for options. * * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_copyfd2(MDB_env *env, mdbx_filehandle_t fd, +LIBMDBX_API int mdbx_env_copy2fd(MDB_env *env, mdbx_filehandle_t fd, unsigned flags); /* Return statistics about the LMDB environment. diff --git a/src/mdbx.c b/src/mdbx.c index 513d72cf..a511bf25 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -8448,7 +8448,7 @@ done: } /** Copy environment with compaction. */ -static int __cold mdbx_env_copyfd1(MDB_env *env, mdbx_filehandle_t fd) { +static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { MDB_meta *mm; MDB_page *mp; mdbx_copy my; @@ -8548,7 +8548,7 @@ done2: } /** Copy environment as-is. 
*/ -static int __cold mdbx_env_copyfd0(MDB_env *env, mdbx_filehandle_t fd) { +static int __cold mdbx_env_copy_asis(MDB_env *env, mdbx_filehandle_t fd) { MDB_txn *txn = NULL; int rc; @@ -8585,19 +8585,15 @@ bailout: return rc; } -int __cold mdbx_env_copyfd2(MDB_env *env, mdbx_filehandle_t fd, +int __cold mdbx_env_copy2fd(MDB_env *env, mdbx_filehandle_t fd, unsigned flags) { if (flags & MDB_CP_COMPACT) - return mdbx_env_copyfd1(env, fd); + return mdbx_env_compact(env, fd); else - return mdbx_env_copyfd0(env, fd); + return mdbx_env_copy_asis(env, fd); } -int __cold mdbx_env_copyfd(MDB_env *env, mdbx_filehandle_t fd) { - return mdbx_env_copyfd2(env, fd, 0); -} - -int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) { +int __cold mdbx_env_copy(MDB_env *env, const char *path, unsigned flags) { int rc, len; char *lck_pathname; mdbx_filehandle_t newfd = INVALID_HANDLE_VALUE; @@ -8627,7 +8623,7 @@ int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) { (void)fcntl(newfd, F_SETFL, rc | O_DIRECT); #endif } - rc = mdbx_env_copyfd2(env, newfd, flags); + rc = mdbx_env_copy2fd(env, newfd, flags); } if (!(env->me_flags & MDB_NOSUBDIR)) @@ -8642,10 +8638,6 @@ int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) { return rc; } -int __cold mdbx_env_copy(MDB_env *env, const char *path) { - return mdbx_env_copy2(env, path, 0); -} - int __cold mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff) { if (unlikely(flags & ~CHANGEABLE)) return MDBX_EINVAL; diff --git a/src/tools/mdbx_copy.c b/src/tools/mdbx_copy.c index aea8be5f..9eb3c49c 100644 --- a/src/tools/mdbx_copy.c +++ b/src/tools/mdbx_copy.c @@ -63,9 +63,9 @@ int main(int argc, char *argv[]) { if (rc == MDB_SUCCESS) { act = "copying"; if (argc == 2) - rc = mdbx_env_copyfd2(env, STDOUT_FILENO, cpflags); + rc = mdbx_env_copy2fd(env, STDOUT_FILENO, cpflags); else - rc = mdbx_env_copy2(env, argv[2], cpflags); + rc = mdbx_env_copy(env, argv[2], cpflags); } if (rc) 
fprintf(stderr, "%s: %s failed, error %d (%s)\n", progname, act, rc, From 277bdfb4c49e5dfedf649fd7ba6c55205f7409ef Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 22 May 2017 20:02:36 +0300 Subject: [PATCH 137/303] mdbx: cleanup formatting for mdbx.h --- mdbx.h | 967 +++++++++++++++++++++++++++------------------------------ 1 file changed, 461 insertions(+), 506 deletions(-) diff --git a/mdbx.h b/mdbx.h index 23302783..67c0ee89 100644 --- a/mdbx.h +++ b/mdbx.h @@ -43,8 +43,7 @@ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #pragma once /* *INDENT-OFF* */ @@ -66,17 +65,17 @@ extern "C" { #endif -/** Library major version */ +/* Library major version */ #define MDBX_VERSION_MAJOR 0 -/** Library minor version */ +/* Library minor version */ #define MDBX_VERSION_MINOR 2 -/** Library patch version */ +/* Library patch version */ #define MDBX_VERSION_PATCH 0 -/** Combine args a,b,c into a single integer for easy version comparisons */ +/* Combine args a,b,c into a single integer for easy version comparisons */ #define MDB_VERINT(a, b, c) (((a) << 24) | ((b) << 16) | (c)) -/** The full library version as a single integer */ +/* The full library version as a single integer */ #define MDBX_VERSION_FULL \ MDB_VERINT(MDBX_VERSION_MAJOR, MDBX_VERSION_MINOR, MDBX_VERSION_PATCH) @@ -360,6 +359,7 @@ typedef struct MDBX_envinfo { * [out] major if non-NULL, the library major version number is copied here * [out] minor if non-NULL, the library minor version number is copied here * [out] patch if non-NULL, the library patch version number is copied here + * * Returns "version string" The library version as a string */ LIBMDBX_API const char *mdbx_version(int *major, int 
*minor, int *patch); @@ -395,6 +395,7 @@ LIBMDBX_API int mdbx_env_create(MDB_env **penv); * * If this function fails, mdbx_env_close() must be called to discard * the MDB_env handle. + * * [in] env An environment handle returned by mdbx_env_create() * [in] path The directory in which the database files reside. * This directory must already exist and be writable. @@ -424,140 +425,108 @@ LIBMDBX_API int mdbx_env_create(MDB_env **penv); * Incompatible with nested transactions. * Do not mix processes with and without MDB_WRITEMAP on the same * environment. This can defeat durability (mdbx_env_sync etc). + * * - MDB_NOMETASYNC * Flush system buffers to disk only once per transaction, omit the - * metadata flush. Defer that until the system flushes files to - *disk, - * or next non-MDB_RDONLY commit or mdbx_env_sync(). This - *optimization - * maintains database integrity, but a system crash may undo the - *last - * committed transaction. I.e. it preserves the ACI (atomicity, - * consistency, isolation) but not D (durability) database - *property. - * This flag may be changed at any time using - *mdbx_env_set_flags(). + * metadata flush. Defer that until the system flushes files to disk, + * or next non-MDB_RDONLY commit or mdbx_env_sync(). This optimization + * maintains database integrity, but a system crash may undo the last + * committed transaction. I.e. it preserves the ACI (atomicity, + * consistency, isolation) but not D (durability) database property. + * This flag may be changed at any time using mdbx_env_set_flags(). + * * - MDB_NOSYNC - * Don't flush system buffers to disk when committing a - *transaction. - * This optimization means a system crash can corrupt the database - *or - * lose the last transactions if buffers are not yet flushed to - *disk. - * The risk is governed by how often the system flushes dirty - *buffers - * to disk and how often mdbx_env_sync() is called. 
However, if - *the - * filesystem preserves write order and the MDB_WRITEMAP flag is - *not - * used, transactions exhibit ACI (atomicity, consistency, - *isolation) - * properties and only lose D (durability). I.e. database - *integrity - * is maintained, but a system crash may undo the final - *transactions. - * Note that (MDB_NOSYNC | MDB_WRITEMAP) leaves the system with - *no - * hint for when to write transactions to disk, unless - *mdbx_env_sync() - * is called. (MDB_MAPASYNC | MDB_WRITEMAP) may be preferable. - * This flag may be changed at any time using - *mdbx_env_set_flags(). + * Don't flush system buffers to disk when committing a transaction. + * This optimization means a system crash can corrupt the database or + * lose the last transactions if buffers are not yet flushed to disk. + * The risk is governed by how often the system flushes dirty buffers + * to disk and how often mdbx_env_sync() is called. However, if the + * filesystem preserves write order and the MDB_WRITEMAP and/or + * MDBX_LIFORECLAIM flags are not used, transactions exhibit ACI + * (atomicity, consistency, isolation) properties and only lose D + * (durability). I.e. database integrity is maintained, but a system + * crash may undo the final transactions. + * + * Note that (MDB_NOSYNC | MDB_WRITEMAP) leaves the system with no + * hint for when to write transactions to disk. + * Therefore the (MDB_MAPASYNC | MDB_WRITEMAP) may be preferable. + * This flag may be changed at any time using mdbx_env_set_flags(). + * + * - MDBX_UTTERLY_NOSYNC (internally MDB_NOSYNC | MDB_MAPASYNC) + * FIXME: TODO + * * - MDB_MAPASYNC - * When using MDB_WRITEMAP, use asynchronous flushes to disk. - * As with MDB_NOSYNC, a system crash can then corrupt the - * database or lose the last transactions. Calling - *mdbx_env_sync() - * ensures on-disk database integrity until next commit. - * This flag may be changed at any time using - *mdbx_env_set_flags(). 
+ * When using MDB_WRITEMAP, use asynchronous flushes to disk. As with + * MDB_NOSYNC, a system crash can then corrupt the database or lose + * the last transactions. Calling mdbx_env_sync() ensures on-disk + * database integrity until next commit. This flag may be changed at + * any time using mdbx_env_set_flags(). + * * - MDB_NOTLS - * Don't use Thread-Local Storage. Tie reader locktable slots to - * MDB_txn objects instead of to threads. I.e. mdbx_txn_reset() - *keeps - * the slot reseved for the MDB_txn object. A thread may use - *parallel - * read-only transactions. A read-only transaction may span threads - *if - * the user synchronizes its use. Applications that multiplex - *many - * user threads over individual OS threads need this option. Such - *an - * application must also serialize the write transactions in an - *OS - * thread, since LMDB's write locking is unaware of the user - *threads. - * - MDB_NOLOCK - * Don't do any locking. If concurrent access is anticipated, the - * caller must manage all concurrency itself. For proper - *operation - * the caller must enforce single-writer semantics, and must - *ensure - * that no readers are using old transactions while a writer is - * active. The simplest approach is to use an exclusive lock so - *that - * no readers may be active at all when a writer begins. + * Don't use Thread-Local Storage. Tie reader locktable slots to + * MDB_txn objects instead of to threads. I.e. mdbx_txn_reset() keeps + * the slot reseved for the MDB_txn object. A thread may use parallel + * read-only transactions. A read-only transaction may span threads if + * the user synchronizes its use. Applications that multiplex many + * user threads over individual OS threads need this option. Such an + * application must also serialize the write transactions in an OS + * thread, since LMDB's write locking is unaware of the user threads. + * + * - MDB_NOLOCK (don't supported by MDBX) + * Don't do any locking. 
If concurrent access is anticipated, the + * caller must manage all concurrency itself. For proper operation + * the caller must enforce single-writer semantics, and must ensure + * that no readers are using old transactions while a writer is + * active. The simplest approach is to use an exclusive lock so that + * no readers may be active at all when a writer begins. + * * - MDB_NORDAHEAD - * Turn off readahead. Most operating systems perform readahead - *on - * read requests by default. This option turns it off if the OS - * supports it. Turning it off may help random read performance - * when the DB is larger than RAM and system RAM is full. + * Turn off readahead. Most operating systems perform readahead on + * read requests by default. This option turns it off if the OS + * supports it. Turning it off may help random read performance + * when the DB is larger than RAM and system RAM is full. + * * - MDB_NOMEMINIT - * Don't initialize malloc'd memory before writing to unused - *spaces - * in the data file. By default, memory for pages written to the - *data - * file is obtained using malloc. While these pages may be reused - *in - * subsequent transactions, freshly malloc'd pages will be - *initialized - * to zeroes before use. This avoids persisting leftover data from - *other - * code (that used the heap and subsequently freed the memory) into - *the - * data file. Note that many other system libraries may allocate - * and free memory from the heap for arbitrary uses. E.g., stdio - *may - * use the heap for file I/O buffers. This initialization step has - *a - * modest performance cost so some applications may want to - *disable - * it using this flag. This option can be a problem for - *applications - * which handle sensitive data like passwords, and it makes - *memory - * checkers like Valgrind noisy. This flag is not needed with - *MDB_WRITEMAP, - * which writes directly to the mmap instead of using malloc for - *pages. 
The - * initialization is also skipped if MDB_RESERVE is used; the - * caller is expected to overwrite all of the memory that was - * reserved in that case. - * This flag may be changed at any time using - *mdbx_env_set_flags(). - * - #MDBX_COALESCE - * Aim to coalesce records while reclaiming FreeDB. - * This flag may be changed at any time using - *mdbx_env_set_flags(). - * - #MDBX_LIFORECLAIM - * LIFO policy for reclaiming FreeDB records. This significantly - *reduce - * write IPOS in case MDB_NOSYNC with periodically checkpoints. - * [in] mode The UNIX permissions to set on created files and - *semaphores. + * Don't initialize malloc'd memory before writing to unused spaces + * in the data file. By default, memory for pages written to the data + * file is obtained using malloc. While these pages may be reused in + * subsequent transactions, freshly malloc'd pages will be initialized + * to zeroes before use. This avoids persisting leftover data from other + * code (that used the heap and subsequently freed the memory) into the + * data file. Note that many other system libraries may allocate and free + * memory from the heap for arbitrary uses. E.g., stdio may use the heap + * for file I/O buffers. This initialization step has a modest performance + * cost so some applications may want to disable it using this flag. This + * option can be a problem for applications which handle sensitive data + * like passwords, and it makes memory checkers like Valgrind noisy. This + * flag is not needed with MDB_WRITEMAP, which writes directly to the + * mmap instead of using malloc for pages. The initialization is also + * skipped if MDB_RESERVE is used; the caller is expected to overwrite + * all of the memory that was reserved in that case. This flag may be + * changed at any time using mdbx_env_set_flags(). + * + * - MDBX_COALESCE + * Aim to coalesce records while reclaiming FreeDB. This flag may be + * changed at any time using mdbx_env_set_flags(). 
+ * FIXME: TODO + * + * - MDBX_LIFORECLAIM + * LIFO policy for reclaiming FreeDB records. This significantly reduces + * write IOPS in case of MDB_NOSYNC with periodic checkpoints. + * FIXME: TODO + * + * [in] mode The UNIX permissions to set on created files. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_VERSION_MISMATCH - the version of the LMDB library doesn't - *match the - * version that created the database environment. - * - MDB_INVALID - the environment file headers are corrupted. - * - ENOENT - the directory specified by the path parameter doesn't - *exist. - * - EACCES - the user didn't have permission to access the environment - *files. - * - EAGAIN - the environment was locked by another process. */ + * - MDB_VERSION_MISMATCH - the version of the LMDB library doesn't match the + * version that created the database environment. + * - MDB_INVALID - the environment file headers are corrupted. + * - MDBX_ENOENT - the directory specified by the path parameter + * doesn't exist. + * - MDBX_EACCES - the user didn't have permission to access + * the environment files. + * - MDBX_EAGAIN - the environment was locked by another process. */ LIBMDBX_API int mdbx_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode); LIBMDBX_API int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, @@ -567,23 +536,25 @@ LIBMDBX_API int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, * * This function may be used to make a backup of an existing environment. * No lockfile is created, since it gets recreated at need. - * Note: This call can trigger significant file size growth if run in + * NOTE: This call can trigger significant file size growth if run in * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under caveats_sec. - * [in] env An environment handle returned by mdbx_env_create().
It - * must have already been opened successfully. - * [in] path The directory in which the copy will reside. This - * directory must already exist and be writable but must otherwise be - * empty. - * [in] flags Special options for this operation. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - * - MDB_CP_COMPACT - Perform compaction while copying: omit free - * pages and sequentially renumber all pages in output. This - *option - * consumes more CPU and runs more slowly than the default. - * Currently it fails if the environment has suffered a page - *leak. + * transaction. See long-lived transactions under "Caveats" section. + * + * [in] env An environment handle returned by mdbx_env_create(). It must + * have already been opened successfully. + * [in] path The directory in which the copy will reside. This directory must + * already exist and be writable but must otherwise be empty. + * [in] flags Special options for this operation. This parameter must be set + * to 0 or by bitwise OR'ing together one or more of the values + * described here: + * + * - MDB_CP_COMPACT + * Perform compaction while copying: omit free pages and sequentially + * renumber all pages in output. This option consumes a little bit more + * CPU for processing, but may run quicker than the default, on + * account of skipping free pages. + * + * NOTE: Currently it fails if the environment has suffered a page leak. * * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_copy(MDB_env *env, const char *path, unsigned flags); @@ -594,15 +565,17 @@ LIBMDBX_API int mdbx_env_copy(MDB_env *env, const char *path, unsigned flags); * This function may be used to make a backup of an existing environment. * No lockfile is created, since it gets recreated at need. See * mdbx_env_copy() for further details.
- * Note: This call can trigger significant file size growth if run in + * + * NOTE: This call can trigger significant file size growth if run in * parallel with write transactions, because it employs a read-only - * transaction. See long-lived transactions under caveats_sec. - * [in] env An environment handle returned by mdbx_env_create(). It - * must have already been opened successfully. - * [in] fd The filedescriptor to write the copy to. It must - * have already been opened for Write access. - * [in] flags Special options for this operation. - * See mdbx_env_copy() for options. + * transaction. See long-lived transactions under "Caveats" section. + * + * [in] env An environment handle returned by mdbx_env_create(). It must + * have already been opened successfully. + * [in] fd The filedescriptor to write the copy to. It must have already + * been opened for Write access. + * [in] flags Special options for this operation. See mdbx_env_copy() for + * options. * * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_copy2fd(MDB_env *env, mdbx_filehandle_t fd, @@ -610,18 +583,16 @@ LIBMDBX_API int mdbx_env_copy2fd(MDB_env *env, mdbx_filehandle_t fd, /* Return statistics about the LMDB environment. * - * [in] env An environment handle returned by mdbx_env_create() - * [out] stat The address of an MDB_stat structure - * where the statistics will be copied - */ + * [in] env An environment handle returned by mdbx_env_create() + * [out] stat The address of an MDB_stat structure where the statistics + * will be copied */ LIBMDBX_API int mdbx_env_stat(MDB_env *env, MDBX_stat *stat, size_t bytes); /* Return information about the LMDB environment. 
* - * [in] env An environment handle returned by mdbx_env_create() - * [out] stat The address of an MDB_envinfo structure - * where the information will be copied - */ + * [in] env An environment handle returned by mdbx_env_create() + * [out] stat The address of an MDB_envinfo structure + * where the information will be copied */ LIBMDBX_API int mdbx_env_info(MDB_env *env, MDBX_envinfo *info, size_t bytes); /* Flush the data buffers to disk. @@ -631,33 +602,34 @@ LIBMDBX_API int mdbx_env_info(MDB_env *env, MDBX_envinfo *info, size_t bytes); * the OS buffers upon commit as well, unless the environment was * opened with MDB_NOSYNC or in part MDB_NOMETASYNC. This call is * not valid if the environment was opened with MDB_RDONLY. - * [in] env An environment handle returned by mdbx_env_create() - * [in] force If non-zero, force a synchronous flush. Otherwise - * if the environment has the MDB_NOSYNC flag set the flushes - * will be omitted, and with MDB_MAPASYNC they will be asynchronous. + * + * [in] env An environment handle returned by mdbx_env_create() + * [in] force If non-zero, force a synchronous flush. Otherwise if the + * environment has the MDB_NOSYNC flag set the flushes will be + * omitted, and with MDB_MAPASYNC they will be asynchronous. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EACCES - the environment is read-only. - * - EINVAL - an invalid parameter was specified. - * - EIO - an error occurred during synchronization. */ + * - MDBX_EACCES - the environment is read-only. + * - MDBX_EINVAL - an invalid parameter was specified. + * - MDBX_EIO - an error occurred during synchronization. */ LIBMDBX_API int mdbx_env_sync(MDB_env *env, int force); /* Close the environment and release the memory map. * * Only a single thread may call this function. All transactions, databases, * and cursors must already be closed before calling this function. 
Attempts - * to - * use any such handles after calling this function will cause a SIGSEGV. + * to use any such handles after calling this function will cause a SIGSEGV. * The environment handle will be freed and must not be used again after this * call. - * [in] env An environment handle returned by mdbx_env_create() - * [in] dont_sync A dont'sync flag, if non-zero the last checkpoint - * (meta-page update) will be kept "as is" and may be still "weak" - * in NOSYNC/MAPASYNC modes. Such "weak" checkpoint will be ignored - * on opening next time, and transactions since the last non-weak - * checkpoint (meta-page update) will rolledback for consistency guarantee. - */ + * + * [in] env An environment handle returned by mdbx_env_create() + * [in] dont_sync A dont'sync flag, if non-zero the last checkpoint (meta-page + * update) will be kept "as is" and may be still "weak" in the + * NOSYNC/MAPASYNC modes. Such "weak" checkpoint will be ignored + * on opening next time, and transactions since the last non-weak + * checkpoint (meta-page update) will rolledback for consistency + * guarantee. */ LIBMDBX_API void mdbx_env_close(MDB_env *env); /* Set environment flags. @@ -665,49 +637,49 @@ LIBMDBX_API void mdbx_env_close(MDB_env *env); * This may be used to set some flags in addition to those from * mdbx_env_open(), or to unset these flags. If several threads * change the flags at the same time, the result is undefined. - * [in] env An environment handle returned by mdbx_env_create() - * [in] flags The flags to change, bitwise OR'ed together - * [in] onoff A non-zero value sets the flags, zero clears them. + * + * [in] env An environment handle returned by mdbx_env_create() + * [in] flags The flags to change, bitwise OR'ed together + * [in] onoff A non-zero value sets the flags, zero clears them. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - an invalid parameter was specified. 
*/ + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff); /* Get environment flags. * - * [in] env An environment handle returned by mdbx_env_create() - * [out] flags The address of an integer to store the flags + * [in] env An environment handle returned by mdbx_env_create() + * [out] flags The address of an integer to store the flags * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_flags(MDB_env *env, unsigned *flags); /* Return the path that was used in mdbx_env_open(). * - * [in] env An environment handle returned by mdbx_env_create() - * [out] path Address of a string pointer to contain the path. This - * is the actual string in the environment, not a copy. It should not be - * altered in any way. + * [in] env An environment handle returned by mdbx_env_create() + * [out] path Address of a string pointer to contain the path. + * This is the actual string in the environment, not a copy. + * It should not be altered in any way. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_path(MDB_env *env, const char **path); -/* Return the filedescriptor for the given environment. +/* Return the file descriptor for the given environment. * - * This function may be called after fork(), so the descriptor can be - * closed before exec*(). Other LMDB file descriptors have FD_CLOEXEC. - * (Until LMDB 0.9.18, only the lockfile had that.) + * NOTE: All MDBX file descriptors have FD_CLOEXEC and + * couldn't be used after exec() and/or fork().
* - * [in] env An environment handle returned by mdbx_env_create() - * [out] fd Address of a int to contain the descriptor. + * [in] env An environment handle returned by mdbx_env_create() + * [out] fd Address of a int to contain the descriptor. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *fd); /* Set the size of the memory map to use for this environment. @@ -717,10 +689,9 @@ LIBMDBX_API int mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *fd); * of the database. The value should be chosen as large as possible, * to accommodate future growth of the database. * This function should be called after mdbx_env_create() and before - *mdbx_env_open(). - * It may be called at later times if no transactions are active in - * this process. Note that the library does not check for this condition, - * the caller must ensure it explicitly. + * mdbx_env_open(). It may be called at later times if no transactions + * are active in this process. Note that the library does not check for + * this condition, the caller must ensure it explicitly. * * The new size takes effect immediately for the current process but * will not be persisted to any others until a write transaction has been @@ -732,36 +703,36 @@ LIBMDBX_API int mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *fd); * return MDB_MAP_RESIZED. This function may be called with a size * of zero to adopt the new size. * - * Any attempt to set a size smaller than the space already consumed - * by the environment will be silently changed to the current size of the used - *space. 
- * [in] env An environment handle returned by mdbx_env_create() - * [in] size The size in bytes + * Any attempt to set a size smaller than the space already consumed by the + * environment will be silently changed to the current size of the used space. + * + * [in] env An environment handle returned by mdbx_env_create() + * [in] size The size in bytes * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - an invalid parameter was specified, - * or the environment has an active write transaction. */ + * - MDBX_EINVAL - an invalid parameter was specified, + * or the environment has an active write transaction. */ LIBMDBX_API int mdbx_env_set_mapsize(MDB_env *env, size_t size); /* Set the maximum number of threads/reader slots for the environment. * * This defines the number of slots in the lock table that is used to track - *readers in the - * the environment. The default is 126. + * readers in the environment. The default is 61. * Starting a read-only transaction normally ties a lock table slot to the * current thread until the environment closes or the thread exits. If * MDB_NOTLS is in use, mdbx_txn_begin() instead ties the slot to the * MDB_txn object until it or the MDB_env object is destroyed. * This function may only be called after mdbx_env_create() and before - *mdbx_env_open(). - * [in] env An environment handle returned by mdbx_env_create() - * [in] readers The maximum number of reader lock table slots + * mdbx_env_open(). + * + * [in] env An environment handle returned by mdbx_env_create() + * [in] readers The maximum number of reader lock table slots * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - an invalid parameter was specified, - * or the environment is already open. */ + * - MDBX_EINVAL - an invalid parameter was specified, + * or the environment is already open.
*/ LIBMDBX_API int mdbx_env_set_maxreaders(MDB_env *env, unsigned readers); /* Get the maximum number of threads/reader slots for the environment. @@ -771,7 +742,7 @@ LIBMDBX_API int mdbx_env_set_maxreaders(MDB_env *env, unsigned readers); * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers); /* Set the maximum number of named databases for the environment. @@ -780,25 +751,26 @@ LIBMDBX_API int mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers); * environment. Simpler applications that use the environment as a single * unnamed database can ignore this option. * This function may only be called after mdbx_env_create() and before - *mdbx_env_open(). + * mdbx_env_open(). * * Currently a moderate number of slots are cheap but a huge number gets * expensive: 7-120 words per transaction, and every mdbx_dbi_open() * does a linear search of the opened slots. - * [in] env An environment handle returned by mdbx_env_create() - * [in] dbs The maximum number of databases + * + * [in] env An environment handle returned by mdbx_env_create() + * [in] dbs The maximum number of databases * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - an invalid parameter was specified, - * or the environment is already open. */ + * - MDBX_EINVAL - an invalid parameter was specified, + * or the environment is already open. */ LIBMDBX_API int mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); /* Get the maximum size of keys and MDB_DUPSORT data we can write. * * [in] env An environment handle returned by mdbx_env_create() - * Returns The maximum size of a key we can write - */ + * + * Returns The maximum size of a key we can write. 
*/ LIBMDBX_API int mdbx_env_get_maxkeysize(MDB_env *env); LIBMDBX_API int mdbx_get_maxkeysize(size_t pagesize); @@ -813,22 +785,22 @@ LIBMDBX_API int mdbx_env_set_userctx(MDB_env *env, void *ctx); /* Get the application information associated with the MDB_env. * * [in] env An environment handle returned by mdbx_env_create() - * Returns The pointer set by mdbx_env_set_userctx(). - */ + * Returns The pointer set by mdbx_env_set_userctx(). */ LIBMDBX_API void *mdbx_env_get_userctx(MDB_env *env); /* A callback function for most LMDB assert() failures, * called before printing the message and aborting. * * [in] env An environment handle returned by mdbx_env_create(). - * [in] msg The assertion message, not including newline. - */ + * [in] msg The assertion message, not including newline. */ typedef void MDB_assert_func(MDB_env *env, const char *msg, const char *function, unsigned line); /* Set or reset the assert() callback of the environment. + * * Disabled if liblmdb is buillt with MDB_DEBUG=0. - * Note: This hack should become obsolete as lmdb's error handling matures. + * NOTE: This hack should become obsolete as lmdb's error handling matures. + * * [in] env An environment handle returned by mdbx_env_create(). * [in] func An MDB_assert_func function, or 0. * @@ -837,36 +809,39 @@ LIBMDBX_API int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); /* Create a transaction for use with the environment. * - * The transaction handle may be discarded using mdbx_txn_abort() or - *mdbx_txn_commit(). - * Note: A transaction and its cursors must only be used by a single + * The transaction handle may be discarded using mdbx_txn_abort() + * or mdbx_txn_commit(). + * NOTE: A transaction and its cursors must only be used by a single * thread, and a thread may only have a single transaction at a time. * If MDB_NOTLS is in use, this does not apply to read-only transactions. - * Note: Cursors may not span transactions. 
- * [in] env An environment handle returned by mdbx_env_create() - * [in] parent If this parameter is non-NULL, the new transaction - * will be a nested transaction, with the transaction indicated by parent - * as its parent. Transactions may be nested to any level. A parent - * transaction and its cursors may not issue any other operations than - * mdbx_txn_commit and mdbx_txn_abort while it has active child transactions. - * [in] flags Special options for this transaction. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. + * NOTE: Cursors may not span transactions. + * + * [in] env An environment handle returned by mdbx_env_create() + * [in] parent If this parameter is non-NULL, the new transaction will be + * a nested transaction, with the transaction indicated by parent + * as its parent. Transactions may be nested to any level. + * A parent transaction and its cursors may not issue any other + * operations than mdbx_txn_commit and mdbx_txn_abort while it + * has active child transactions. + * [in] flags Special options for this transaction. This parameter + * must be set to 0 or by bitwise OR'ing together one or more + * of the values described here. + * * - MDB_RDONLY - * This transaction will not perform any write operations. + * This transaction will not perform any write operations. + * * [out] txn Address where the new MDB_txn handle will be stored * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_PANIC - a fatal error occurred earlier and the environment - * must be shut down. - * - MDB_MAP_RESIZED - another process wrote data beyond this - * MDB_env's - * mapsize and this environment's map must be resized as well. - * See mdbx_env_set_mapsize(). - * - MDB_READERS_FULL - a read-only transaction was requested and - * the reader lock table is full. See mdbx_env_set_maxreaders(). - * - ENOMEM - out of memory. 
*/ + * - MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + * - MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's + * mapsize and this environment's map must be resized + * as well. See mdbx_env_set_mapsize(). + * - MDB_READERS_FULL - a read-only transaction was requested and the reader + * lock table is full. See mdbx_env_set_maxreaders(). + * - MDBX_ENOMEM - out of memory. */ LIBMDBX_API int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, MDB_txn **txn); @@ -892,61 +867,50 @@ LIBMDBX_API uint64_t mdbx_txn_id(MDB_txn *txn); * The transaction handle is freed. It and its cursors must not be used * again after this call, except with mdbx_cursor_renew(). * - * Note: MDBX-mode: * A cursor must be closed explicitly always, before * or after its transaction ends. It can be reused with * mdbx_cursor_renew() before finally closing it. * - * Note: LMDB-compatible mode: - * Earlier documentation incorrectly said all cursors would be freed. - * Only write-transactions free cursors. - * * [in] txn A transaction handle returned by mdbx_txn_begin() * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - an invalid parameter was specified. - * - ENOSPC - no more disk space. - * - EIO - a low-level I/O error occurred while writing. - * - ENOMEM - out of memory. */ + * - MDBX_EINVAL - an invalid parameter was specified. + * - MDBX_ENOSPC - no more disk space. + * - MDBX_EIO - a low-level I/O error occurred while writing. + * - MDBX_ENOMEM - out of memory. */ LIBMDBX_API int mdbx_txn_commit(MDB_txn *txn); -/* Abandon all the operations of the transaction instead of saving - * them. +/* Abandon all the operations of the transaction instead of saving them. * * The transaction handle is freed. It and its cursors must not be used * again after this call, except with mdbx_cursor_renew(). 
* - * Note: MDBX-mode: - * A cursor must be closed explicitly always, before - * or after its transaction ends. It can be reused with - * mdbx_cursor_renew() before finally closing it. + * A cursor must be closed explicitly always, before or after its transaction + * ends. It can be reused with mdbx_cursor_renew() before finally closing it. * - * Note: LMDB-compatible mode: - * Earlier documentation incorrectly said all cursors would be freed. - * Only write-transactions free cursors. - * - * [in] txn A transaction handle returned by mdbx_txn_begin() - */ + * [in] txn A transaction handle returned by mdbx_txn_begin(). */ LIBMDBX_API int mdbx_txn_abort(MDB_txn *txn); /* Reset a read-only transaction. * * Abort the transaction like mdbx_txn_abort(), but keep the transaction - * handle. mdbx_txn_renew() may reuse the handle. This saves allocation - * overhead if the process will start a new read-only transaction soon, - * and also locking overhead if MDB_NOTLS is in use. The reader table + * handle. Therefore mdbx_txn_renew() may reuse the handle. This saves + * allocation overhead if the process will start a new read-only transaction + * soon, and also locking overhead if MDB_NOTLS is in use. The reader table * lock is released, but the table slot stays tied to its thread or * MDB_txn. Use mdbx_txn_abort() to discard a reset handle, and to free * its lock table slot if MDB_NOTLS is in use. + * * Cursors opened within the transaction must not be used * again after this call, except with mdbx_cursor_renew(). + * * Reader locks generally don't interfere with writers, but they keep old * versions of database pages allocated. Thus they prevent the old pages * from being reused when writers commit new data, and so under heavy load * the database size may grow much more rapidly than otherwise. 
- * [in] txn A transaction handle returned by mdbx_txn_begin() - */ + * + * [in] txn A transaction handle returned by mdbx_txn_begin() */ LIBMDBX_API int mdbx_txn_reset(MDB_txn *txn); /* Renew a read-only transaction. @@ -954,16 +918,18 @@ LIBMDBX_API int mdbx_txn_reset(MDB_txn *txn); * This acquires a new reader lock for a transaction handle that had been * released by mdbx_txn_reset(). It must be called before a reset transaction * may be used again. + * * [in] txn A transaction handle returned by mdbx_txn_begin() * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_PANIC - a fatal error occurred earlier and the environment - * must be shut down. - * - EINVAL - an invalid parameter was specified. */ + * - MDB_PANIC - a fatal error occurred earlier and the environment + * must be shut down. + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_txn_renew(MDB_txn *txn); /* Open a table in the environment. + * * A table handle denotes the name and parameters of a table, independently * of whether such a table exists. The table handle may be discarded by * calling mdbx_dbi_close(). The old table handle is returned if the table @@ -1024,10 +990,10 @@ LIBMDBX_API int mdbx_txn_renew(MDB_txn *txn); * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_NOTFOUND - the specified database doesn't exist in the - * environment and MDB_CREATE was not specified. - * - MDB_DBS_FULL - too many databases have been opened. - * See mdbx_env_set_maxdbs(). */ + * - MDB_NOTFOUND - the specified database doesn't exist in the + * environment and MDB_CREATE was not specified. + * - MDB_DBS_FULL - too many databases have been opened. + * See mdbx_env_set_maxdbs(). 
*/ LIBMDBX_API int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi, MDB_cmp_func *keycmp, MDB_cmp_func *datacmp); @@ -1036,29 +1002,30 @@ LIBMDBX_API int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, /* Retrieve statistics for a database. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [out] stat The address of an MDB_stat structure - * where the statistics will be copied + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [out] stat The address of an MDB_stat structure where the statistics + * will be copied * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, size_t bytes); /* Retrieve the DB flags for a database handle. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [out] flags Address where the flags will be returned. + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [out] flags Address where the flags will be returned. * * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); -/* Close a database handle. Normally unnecessary. Use with care: +/* Close a database handle. Normally unnecessary. * - * This call is not mutex protected. Handles should only be closed by + * Use with care: + * FIXME: This call is not mutex protected. Handles should only be closed by * a single thread, and only if no other threads are going to reference * the database handle or one of its cursors any further. 
Do not close * a handle if an existing transaction has modified its database. @@ -1069,18 +1036,19 @@ LIBMDBX_API int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); * reuse the handle value. Usually it's better to set a bigger * mdbx_env_set_maxdbs(), unless that value would be large. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] env An environment handle returned by mdbx_env_create() + * [in] dbi A database handle returned by mdbx_dbi_open() */ LIBMDBX_API int mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); /* Empty or delete+close a database. * * See mdbx_dbi_close() for restrictions about closing the DB handle. - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] del 0 to empty the DB, 1 to delete it from the - * environment and close the DB handle. + * + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] del 0 to empty the DB, 1 to delete it from the environment + * and close the DB handle. * * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); @@ -1094,22 +1062,23 @@ LIBMDBX_API int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); * first data item for the key will be returned. Retrieval of other * items requires the use of mdbx_cursor_get(). * - * Note: The memory pointed to by the returned values is owned by the + * NOTE: The memory pointed to by the returned values is owned by the * database. The caller need not dispose of the memory, and may not * modify it in any way. For values returned in a read-only transaction * any modification attempts will cause a SIGSEGV. 
- * Note: Values returned from the database are valid only until a + * + * NOTE: Values returned from the database are valid only until a * subsequent update operation, or the end of the transaction. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] key The key to search for in the database - * [out] data The data corresponding to the key + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] key The key to search for in the database + * [out] data The data corresponding to the key * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_NOTFOUND - the key was not in the database. - * - EINVAL - an invalid parameter was specified. */ + * - MDB_NOTFOUND - the key was not in the database. + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); @@ -1120,44 +1089,56 @@ LIBMDBX_API int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, * if duplicates are disallowed, or adding a duplicate data item if * duplicates are allowed (MDB_DUPSORT). * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] key The key to store in the database - * [in,out] data The data to store - * [in] flags Special options for this operation. This parameter must be - * set to 0 or by bitwise OR'ing together one or more of the values - * described here. + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] key The key to store in the database + * [in,out] data The data to store + * [in] flags Special options for this operation. This parameter must be + * set to 0 or by bitwise OR'ing together one or more of the + * values described here. 
* - * - MDB_NODUPDATA - enter the new key/data pair only if it does not - * already appear in the database. This flag may only be specified - * if the database was opened with MDB_DUPSORT. The function will - * return MDB_KEYEXIST if the key/data pair already appears in the - * database. - * - MDB_NOOVERWRITE - enter the new key/data pair only if the key - * does not already appear in the database. The function will return - * MDB_KEYEXIST if the key already appears in the database, even if - * the database supports duplicates (MDB_DUPSORT). The data - * parameter will be set to point to the existing item. - * - MDB_RESERVE - reserve space for data of the given size, but - * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later - before - * the next update operation or the transaction ends. This saves - * an extra memcpy if the data is being generated later. - * LMDB does nothing else with this memory, the caller is expected - * to modify all of the space requested. This flag must not be - * specified if the database was opened with MDB_DUPSORT. - * - MDB_APPEND - append the given key/data pair to the end of the - * database. This option allows fast bulk loading when keys are - * already known to be in the correct order. Loading unsorted keys - * with this flag will cause a MDB_KEYEXIST error. - * - MDB_APPENDDUP - as above, but for sorted dup data. + * - MDB_NODUPDATA + * Enter the new key/data pair only if it does not already appear + * in the database. This flag may only be specified if the database + * was opened with MDB_DUPSORT. The function will return MDB_KEYEXIST + * if the key/data pair already appears in the database. + * + * - MDB_NOOVERWRITE + * Enter the new key/data pair only if the key does not already appear + * in the database. The function will return MDB_KEYEXIST if the key + * already appears in the database, even if the database supports + * duplicates (MDB_DUPSORT). 
The data parameter will be set to point + * to the existing item. + * + * - MDB_CURRENT + * Update a single existing entry, but do not add new ones. The function + * will return MDB_NOTFOUND if the given key does not exist in the database, + * or MDBX_EMULTIVAL if the given key has duplicates. + * + * - MDB_RESERVE + * Reserve space for data of the given size, but don't copy the given + * data. Instead, return a pointer to the reserved space, which the + * caller can fill in later - before the next update operation or the + * transaction ends. This saves an extra memcpy if the data is being + * generated later. MDBX does nothing else with this memory, the caller + * is expected to modify all of the space requested. This flag must not + * be specified if the database was opened with MDB_DUPSORT. + * + * - MDB_APPEND + * Append the given key/data pair to the end of the database. This option + * allows fast bulk loading when keys are already known to be in the + * correct order. Loading unsorted keys with this flag will cause + * a MDBX_EKEYMISMATCH error. + * + * - MDB_APPENDDUP + * As above, but for sorted dup data. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). - * - MDB_TXN_FULL - the transaction has too many dirty pages. - * - EACCES - an attempt was made to write in a read-only transaction. - * - EINVAL - an invalid parameter was specified. */ + * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). + * - MDB_TXN_FULL - the transaction has too many dirty pages. + * - MDBX_EACCES - an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL - an invalid parameter was specified. 
*/ LIBMDBX_API int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); @@ -1165,30 +1146,22 @@ LIBMDBX_API int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, * * This function removes key/data pairs from the database. * - * MDBX-mode: * The data parameter is NOT ignored regardless the database does * support sorted duplicate data items or not. If the data parameter * is non-NULL only the matching data item will be deleted. * - * LMDB-compatible mode: - * If the database does not support sorted duplicate data items - * (MDB_DUPSORT) the data parameter is ignored. - * If the database supports sorted duplicates and the data parameter - * is NULL, all of the duplicate data items for the key will be - * deleted. Otherwise, if the data parameter is non-NULL - * only the matching data item will be deleted. - * * This function will return MDB_NOTFOUND if the specified key/data * pair is not in the database. - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] key The key to delete from the database - * [in] data The data to delete + * + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] key The key to delete from the database + * [in] data The data to delete * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EACCES - an attempt was made to write in a read-only transaction. - * - EINVAL - an invalid parameter was specified. */ + * - MDBX_EACCES - an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); @@ -1199,27 +1172,17 @@ LIBMDBX_API int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, * when its transaction has ended, except with mdbx_cursor_renew(). 
* It can be discarded with mdbx_cursor_close(). * - * MDBX-mode: * A cursor must be closed explicitly always, before * or after its transaction ends. It can be reused with * mdbx_cursor_renew() before finally closing it. * - * LMDB-compatible mode: - * A cursor in a write-transaction can be closed before its transaction - * ends, and will otherwise be closed when its transaction ends. - * A cursor in a read-only transaction must be closed explicitly, before - * or after its transaction ends. It can be reused with - * mdbx_cursor_renew() before finally closing it. - * Note: Earlier documentation said that cursors in every transaction - * were closed when the transaction committed or aborted. - * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [out] cursor Address where the new MDB_cursor handle will be stored + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [out] cursor Address where the new MDB_cursor handle will be stored * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); @@ -1227,68 +1190,61 @@ LIBMDBX_API int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, * * The cursor handle will be freed and must not be used again after this call. * Its transaction must still be live if it is a write-transaction. - * [in] cursor A cursor handle returned by mdbx_cursor_open() - */ + * + * [in] cursor A cursor handle returned by mdbx_cursor_open() */ LIBMDBX_API void mdbx_cursor_close(MDB_cursor *cursor); /* Renew a cursor handle. * * A cursor is associated with a specific transaction and database. 
- * Cursors that are only used in read-only - * transactions may be re-used, to avoid unnecessary malloc/free overhead. - * The cursor may be associated with a new read-only transaction, and - * referencing the same database handle as it was created with. + * Cursors that are only used in read-only transactions may be re-used, + * to avoid unnecessary malloc/free overhead. The cursor may be associated + * with a new read-only transaction, and referencing the same database handle + * as it was created with. + * * This may be done whether the previous transaction is live or dead. * [in] txn A transaction handle returned by mdbx_txn_begin() * [in] cursor A cursor handle returned by mdbx_cursor_open() * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); /* Return the cursor's transaction handle. * - * [in] cursor A cursor handle returned by mdbx_cursor_open() - */ + * [in] cursor A cursor handle returned by mdbx_cursor_open() */ LIBMDBX_API MDB_txn *mdbx_cursor_txn(MDB_cursor *cursor); /* Return the cursor's database handle. * - * [in] cursor A cursor handle returned by mdbx_cursor_open() - */ + * [in] cursor A cursor handle returned by mdbx_cursor_open() */ LIBMDBX_API MDB_dbi mdbx_cursor_dbi(MDB_cursor *cursor); /* Retrieve by cursor. * * This function retrieves key/data pairs from the database. The address and - *length - * of the key are returned in the object to which key refers (except for - *the - * case of the MDB_SET option, in which the key object is unchanged), and - * the address and length of the data are returned in the object to which \b - *data - * refers. - * See mdbx_get() for restrictions on using the output values. 
- * [in] cursor A cursor handle returned by mdbx_cursor_open() - * [in,out] key The key for a retrieved item - * [in,out] data The data of a retrieved item - * [in] op A cursor operation MDB_cursor_op + * length of the key are returned in the object to which key refers (except + * for the case of the MDB_SET option, in which the key object is unchanged), + * and the address and length of the data are returned in the object to which + * data refers. See mdbx_get() for restrictions on using the output values. + * + * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [in,out] key The key for a retrieved item + * [in,out] data The data of a retrieved item + * [in] op A cursor operation MDB_cursor_op * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDB_NOTFOUND - no matching key found. - * - EINVAL - an invalid parameter was specified. */ + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, MDB_cursor_op op); /* Store by cursor. * - * This function stores key/data pairs into the database. - * - * The cursor is positioned at the new item, or on failure usually near it. - * Note: Earlier documentation incorrectly said errors would leave the - * state of the cursor unchanged. + * This function stores key/data pairs into the database. The cursor is + * positioned at the new item, or on failure usually near it. * * [in] cursor A cursor handle returned by mdbx_cursor_open() * [in] key The key operated on. @@ -1296,107 +1252,105 @@ LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * [in] flags Options for this operation. This parameter * must be set to 0 or one of the values described here. * - * - MDB_CURRENT - replace the item at the current cursor position. The - * key parameter must still be provided, and must match it. + * - MDB_CURRENT + * Replace the item at the current cursor position. 
The key parameter + * must still be provided, and must match it, otherwise the function + * return MDBX_EKEYMISMATCH. * - * If using sorted duplicates (MDB_DUPSORT) the data item - * must still sort into the same place. This is intended to - * be used when the new data is the same size as the old. - * Otherwise it will simply perform a delete of the old - * record followed by an insert. + * NOTE: MDBX unlike LMDB allows you to change the size of the data and + * automatically handles reordering for sorted duplicates (MDB_DUPSORT). * - * - MDB_NODUPDATA - enter the new key/data pair only if it does not already - * appear in the database. This flag may only be - *specified - * if the database was opened with MDB_DUPSORT. The function - *will - * return MDB_KEYEXIST if the key/data pair already appears in - *the - * database. - * - MDB_NOOVERWRITE - enter the new key/data pair only if the key - * does not already appear in the database. The function will - *return - * MDB_KEYEXIST if the key already appears in the database, even - *if - * the database supports duplicates (MDB_DUPSORT). - * - MDB_RESERVE - reserve space for data of the given size, but - * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later - before - * the next update operation or the transaction ends. This saves - * an extra memcpy if the data is being generated later. This - *flag - * must not be specified if the database was opened with - *MDB_DUPSORT. - * - MDB_APPEND - append the given key/data pair to the end of the - * database. No key comparisons are performed. This option allows - * fast bulk loading when keys are already known to be in the - * correct order. Loading unsorted keys with this flag will cause - * a MDB_KEYEXIST error. - * - MDB_APPENDDUP - as above, but for sorted dup data. - * - MDB_MULTIPLE - store multiple contiguous data elements in a - * single request. 
This flag may only be specified if the - *database - * was opened with MDB_DUPFIXED. The data argument must be an - * array of two MDB_vals. The mv_size of the first MDB_val must - *be - * the size of a single data element. The mv_data of the first - *MDB_val - * must point to the beginning of the array of contiguous data - *elements. - * The mv_size of the second MDB_val must be the count of the - *number - * of data elements to store. On return this field will be set to - * the count of the number of elements actually written. The - *mv_data - * of the second MDB_val is unused. + * - MDB_NODUPDATA + * Enter the new key/data pair only if it does not already appear in the + * database. This flag may only be specified if the database was opened + * with MDB_DUPSORT. The function will return MDB_KEYEXIST if the + * key/data pair already appears in the database. + * + * - MDB_NOOVERWRITE + * Enter the new key/data pair only if the key does not already appear + * in the database. The function will return MDB_KEYEXIST if the key + * already appears in the database, even if the database supports + * duplicates (MDB_DUPSORT). + * + * - MDB_RESERVE + * Reserve space for data of the given size, but don't copy the given + * data. Instead, return a pointer to the reserved space, which the + * caller can fill in later - before the next update operation or the + * transaction ends. This saves an extra memcpy if the data is being + * generated later. This flag must not be specified if the database + * was opened with MDB_DUPSORT. + * + * - MDB_APPEND + * Append the given key/data pair to the end of the database. No key + * comparisons are performed. This option allows fast bulk loading when + * keys are already known to be in the correct order. Loading unsorted + * keys with this flag will cause a MDB_KEYEXIST error. + * + * - MDB_APPENDDUP + * As above, but for sorted dup data. + * + * - MDB_MULTIPLE + * Store multiple contiguous data elements in a single request. 
This flag + * may only be specified if the database was opened with MDB_DUPFIXED. + * The data argument must be an array of two MDB_vals. The mv_size of the + * first MDB_val must be the size of a single data element. The mv_data + * of the first MDB_val must point to the beginning of the array of + * contiguous data elements. The mv_size of the second MDB_val must be + * the count of the number of data elements to store. On return this + * field will be set to the count of the number of elements actually + * written. The mv_data of the second MDB_val is unused. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EKEYMISMATCH - * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). - * - MDB_TXN_FULL - the transaction has too many dirty pages. - * - EACCES - an attempt was made to write in a read-only transaction. - * - EINVAL - an invalid parameter was specified. */ + * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). + * - MDB_TXN_FULL - the transaction has too many dirty pages. + * - MDBX_EACCES - an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, unsigned flags); /* Delete current key/data pair * * This function deletes the key/data pair to which the cursor refers. - * [in] cursor A cursor handle returned by mdbx_cursor_open() - * [in] flags Options for this operation. This parameter - * must be set to 0 or one of the values described here. - * - MDB_NODUPDATA - delete all of the data items for the current key. - * This flag may only be specified if the database was opened with - *MDB_DUPSORT. + * + * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [in] flags Options for this operation. This parameter must be set to 0 + * or one of the values described here. 
+ * + * - MDB_NODUPDATA + * Delete all of the data items for the current key. This flag may only + * be specified if the database was opened with MDB_DUPSORT. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EACCES - an attempt was made to write in a read-only transaction. - * - EINVAL - an invalid parameter was specified. */ + * - MDBX_EACCES - an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); /* Return count of duplicates for current key. * - * This call is only valid on databases that support sorted duplicate - * data items MDB_DUPSORT. - * [in] cursor A cursor handle returned by mdbx_cursor_open() - * [out] countp Address where the count will be stored + * This call is only valid on databases that support sorted duplicate data + * items MDB_DUPSORT. + * + * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [out] countp Address where the count will be stored * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - EINVAL - cursor is not initialized, - * or an invalid parameter was specified. */ + * - MDBX_EINVAL - cursor is not initialized, or an invalid parameter + * was specified. */ LIBMDBX_API int mdbx_cursor_count(MDB_cursor *cursor, uint64_t *countp); /* Compare two data items according to a particular database. * * This returns a comparison as if the two data items were keys in the * specified database. 
- * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] a The first item to compare - * [in] b The second item to compare + * + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] a The first item to compare + * [in] b The second item to compare * * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ LIBMDBX_API int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, @@ -1406,10 +1360,11 @@ LIBMDBX_API int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, * * This returns a comparison as if the two items were data items of * the specified database. The database must have the MDB_DUPSORT flag. - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] a The first item to compare - * [in] b The second item to compare + * + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] a The first item to compare + * [in] b The second item to compare * * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ LIBMDBX_API int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, @@ -1417,25 +1372,25 @@ LIBMDBX_API int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, /* A callback function used to print a message from the library. * - * [in] msg The string to be printed. - * [in] ctx An arbitrary context pointer for the callback. + * [in] msg The string to be printed. + * [in] ctx An arbitrary context pointer for the callback. * * Returns < 0 on failure, >= 0 on success. */ typedef int(MDB_msg_func)(const char *msg, void *ctx); /* Dump the entries in the reader lock table. 
* - * [in] env An environment handle returned by mdbx_env_create() - * [in] func A MDB_msg_func function - * [in] ctx Anything the message function needs + * [in] env An environment handle returned by mdbx_env_create() + * [in] func A MDB_msg_func function + * [in] ctx Anything the message function needs * * Returns < 0 on failure, >= 0 on success. */ LIBMDBX_API int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); /* Check for stale entries in the reader lock table. * - * [in] env An environment handle returned by mdbx_env_create() - * [out] dead Number of stale slots that were cleared + * [in] env An environment handle returned by mdbx_env_create() + * [out] dead Number of stale slots that were cleared * * Returns 0 on success, non-zero on failure. */ LIBMDBX_API int mdbx_reader_check(MDB_env *env, int *dead); @@ -1450,27 +1405,27 @@ LIBMDBX_API int mdbx_env_close_ex(MDB_env *env, int dont_sync); * in the environment. * * Data is always written to disk when mdbx_txn_commit() is called, - * but the operating system may keep it buffered. LMDB always flushes + * but the operating system may keep it buffered. MDBX always flushes * the OS buffers upon commit as well, unless the environment was * opened with MDB_NOSYNC or in part MDB_NOMETASYNC. * - * The default is 0, than mean no any threshold checked, - * and no additional flush will be made. + * The default is 0, which means no threshold is checked, and no additional + * flush will be made. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] bytes The size in bytes of summary changes - * when a synchronous flush would be made. + * [in] env An environment handle returned by mdbx_env_create() + * [in] bytes The size in bytes of summary changes when a synchronous + * flush would be made. * * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_set_syncbytes(MDB_env *env, size_t bytes); -/* Returns a lag of the reading. 
+/* Returns a lag of the reading for the given transaction. * * Returns an information for estimate how much given read-only * transaction is lagging relative the to actual head. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [out] percent Percentage of page allocation in the database. + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [out] percent Percentage of page allocation in the database. * * Returns Number of transactions committed after the given was started for * read, or -1 on failure. */ @@ -1479,27 +1434,27 @@ LIBMDBX_API int mdbx_txn_straggler(MDB_txn *txn, int *percent); /* A callback function for killing a laggard readers, * but also could waiting ones. Called in case of MDB_MAP_FULL error. * - * [in] env An environment handle returned by mdbx_env_create(). - * [in] pid pid of the reader process. - * [in] thread_id thread_id of the reader thread. - * [in] txn Transaction number on which stalled. - * [in] gap a lag from the last commited txn. - * [in] retry a retry number, less that zero for notify end of OOM-loop. + * [in] env An environment handle returned by mdbx_env_create(). + * [in] pid pid of the reader process. + * [in] tid thread_id of the reader thread. + * [in] txn Transaction number on which stalled. + * [in] gap A lag from the last committed txn. + * [in] retry A retry number, less than zero to notify the end of the OOM-loop. * * Returns -1 on failure (reader is not killed), * 0 on a race condition (no such reader), * 1 on success (reader was killed), * >1 on success (reader was SURE killed). */ -typedef int(MDBX_oom_func)(MDB_env *env, int pid, mdbx_tid_t thread_id, - uint64_t txn, unsigned gap, int retry); +typedef int(MDBX_oom_func)(MDB_env *env, int pid, mdbx_tid_t tid, uint64_t txn, + unsigned gap, int retry); /* Set the OOM callback. * * Callback will be called only on out-of-pages case for killing * a laggard readers to allowing reclaiming of freeDB. 
* - * [in] env An environment handle returned by mdbx_env_create(). - * [in] oomfunc A #MDBX_oom_func function or NULL to disable. */ + * [in] env An environment handle returned by mdbx_env_create(). + * [in] oomfunc A MDBX_oom_func function or NULL to disable. */ LIBMDBX_API void mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); /* Get the current oom_func callback. @@ -1507,9 +1462,9 @@ LIBMDBX_API void mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); * Callback will be called only on out-of-pages case for killing * a laggard readers to allowing reclaiming of freeDB. * - * [in] env An environment handle returned by mdbx_env_create(). + * [in] env An environment handle returned by mdbx_env_create(). * - * Returns A #MDBX_oom_func function or NULL if disabled. */ + * Returns A MDBX_oom_func function or NULL if disabled. */ LIBMDBX_API MDBX_oom_func *mdbx_env_get_oomfunc(MDB_env *env); #define MDBX_DBG_ASSERT 1 From 8b9e391dd08bd8ed8755750d5ae91620aaf88e8f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 14:44:53 +0300 Subject: [PATCH 138/303] mdbx: cleanup internals (mostly formatting, but not only). --- src/bits.h | 468 +++++++------- src/mdbx.c | 1420 +++++++++++++++++++---------------------- src/tools/mdbx_chk.c | 3 +- src/tools/mdbx_copy.c | 3 +- src/tools/mdbx_dump.c | 3 +- src/tools/mdbx_load.c | 3 +- src/tools/mdbx_stat.c | 3 +- 7 files changed, 872 insertions(+), 1031 deletions(-) diff --git a/src/bits.h b/src/bits.h index f99572a2..3acc23ba 100644 --- a/src/bits.h +++ b/src/bits.h @@ -9,8 +9,7 @@ * * A copy of this license is available in the file LICENSE in the * top-level directory of the distribution or, alternatively, at - * . - */ + * . */ #pragma once /* *INDENT-OFF* */ @@ -143,14 +142,14 @@ /*----------------------------------------------------------------------------*/ -/** handle for the DB used to track free pages. */ +/* handle for the DB used to track free pages. 
*/ #define FREE_DBI 0 -/** handle for the default DB. */ +/* handle for the default DB. */ #define MAIN_DBI 1 -/** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ +/* Number of DBs in metapage (free and main) - also hardcoded elsewhere */ #define CORE_DBS 2 -/** Number of meta pages - also hardcoded elsewhere */ +/* Number of meta pages - also hardcoded elsewhere */ #define NUM_METAS 2 /* A generic unsigned ID number. These were entryIDs in back-bdb. @@ -181,100 +180,92 @@ typedef MDB_ID txnid_t; */ typedef MDB_ID *MDB_IDL; -/* An ID2 is an ID/pointer pair. -*/ +/* An ID2 is an ID/pointer pair. */ typedef struct MDB_ID2 { MDB_ID mid; /* The ID */ void *mptr; /* The pointer */ } MDB_ID2; /* An ID2L is an ID2 List, a sorted array of ID2s. -* The first element's \b mid member is a count of how many actual -* elements are in the array. The \b mptr member of the first element is -* unused. -* The array is sorted in ascending order by \b mid. -*/ + * The first element's mid member is a count of how many actual + * elements are in the array. The mptr member of the first element is + * unused. The array is sorted in ascending order by mid. */ typedef MDB_ID2 *MDB_ID2L; -/** Used for offsets within a single page. -* Since memory pages are typically 4 or 8KB in size, 12-13 bits, -* this is plenty. -*/ +/* Used for offsets within a single page. + * Since memory pages are typically 4 or 8KB in size, 12-13 bits, + * this is plenty. */ typedef uint16_t indx_t; #pragma pack(push, 1) -/** The information we store in a single slot of the reader table. -* In addition to a transaction ID, we also record the process and -* thread ID that owns a slot, so that we can detect stale information, -* e.g. threads or processes that went away without cleaning up. -* @note We currently don't check for stale records. We simply re-init -* the table when we know that we're the only process opening the -* lock file. 
-*/ +/* The information we store in a single slot of the reader table. + * In addition to a transaction ID, we also record the process and + * thread ID that owns a slot, so that we can detect stale information, + * e.g. threads or processes that went away without cleaning up. + * NOTE: We currently don't check for stale records. We simply re-init + * the table when we know that we're the only process opening the + * lock file. */ typedef struct MDB_rxbody { - /** Current Transaction ID when this transaction began, or (txnid_t)-1. - * Multiple readers that start at the same time will probably have the - * same ID here. Again, it's not important to exclude them from - * anything; all we need to know is which version of the DB they - * started from so we can avoid overwriting any data used in that - * particular version. - */ + /* Current Transaction ID when this transaction began, or (txnid_t)-1. + * Multiple readers that start at the same time will probably have the + * same ID here. Again, it's not important to exclude them from + * anything; all we need to know is which version of the DB they + * started from so we can avoid overwriting any data used in that + * particular version. */ volatile txnid_t mrb_txnid; - /** The process ID of the process owning this reader txn. */ + /* The process ID of the process owning this reader txn. */ volatile mdbx_pid_t mrb_pid; - /** The thread ID of the thread owning this txn. */ + /* The thread ID of the thread owning this txn. */ volatile mdbx_tid_t mrb_tid; } MDB_rxbody; -/** The actual reader record, with cacheline padding. */ +/* The actual reader record, with cacheline padding. 
*/ typedef struct MDB_reader { union { MDB_rxbody mrx; -/** shorthand for mrb_txnid */ +/* shorthand for mrb_txnid */ #define mr_txnid mru.mrx.mrb_txnid #define mr_pid mru.mrx.mrb_pid #define mr_tid mru.mrx.mrb_tid - /** cache line alignment */ + /* cache line alignment */ char pad[(sizeof(MDB_rxbody) + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1)]; } mru; } MDB_reader; -/** Information about a single database in the environment. */ +/* Information about a single database in the environment. */ typedef struct MDB_db { - uint32_t md_xsize; /**< also ksize for LEAF2 pages */ - uint16_t md_flags; /**< @ref mdbx_dbi_open */ - uint16_t md_depth; /**< depth of this tree */ + uint32_t md_xsize; /* also ksize for LEAF2 pages */ + uint16_t md_flags; /* see mdbx_dbi_open */ + uint16_t md_depth; /* depth of this tree */ uint64_t md_seq; /* table sequence counter */ - pgno_t md_branch_pages; /**< number of internal pages */ - pgno_t md_leaf_pages; /**< number of leaf pages */ - pgno_t md_overflow_pages; /**< number of overflow pages */ - size_t md_entries; /**< number of data items */ - pgno_t md_root; /**< the root page of this tree */ + pgno_t md_branch_pages; /* number of internal pages */ + pgno_t md_leaf_pages; /* number of leaf pages */ + pgno_t md_overflow_pages; /* number of overflow pages */ + pgno_t md_root; /* the root page of this tree */ + uint64_t md_entries; /* number of data items */ } MDB_db; -/** Meta page content. -* A meta page is the start point for accessing a database snapshot. -* Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). -*/ +/* Meta page content. + * A meta page is the start point for accessing a database snapshot. + * Pages 0-1 are meta pages. Transaction N writes meta page (N % 2). */ typedef struct MDB_meta { - /** Stamp identifying this as an LMDB file. It must be set - * to #MDB_MAGIC. */ + /* Stamp identifying this as an LMDB file. It must be set + * to MDB_MAGIC. 
*/ uint32_t mm_magic; - /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ + /* Version number of this file. Must be set to MDB_DATA_VERSION. */ uint32_t mm_version; - size_t mm_mapsize; /**< size of mmap region */ - MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ - /** The size of pages used in this DB */ + size_t mm_mapsize; /* size of mmap region */ + MDB_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ + /* The size of pages used in this DB */ #define mm_psize mm_dbs[FREE_DBI].md_xsize -/** Any persistent environment flags. @ref mdbx_env */ +/* Any persistent environment flags, see mdbx_env */ #define mm_flags mm_dbs[FREE_DBI].md_flags - /** Last used page in the datafile. - * Actually the file may be shorter if the freeDB lists the final pages. - */ + /* Last used page in the datafile. + * Actually the file may be shorter if the freeDB lists the final pages. */ pgno_t mm_last_pg; - volatile txnid_t mm_txnid; /**< txnid that committed this page */ + volatile txnid_t mm_txnid; /* txnid that committed this page */ #define MDB_DATASIGN_NONE 0u #define MDB_DATASIGN_WEAK 1u volatile uint64_t mm_datasync_sign; @@ -285,69 +276,61 @@ typedef struct MDB_meta { volatile mdbx_canary mm_canary; } MDB_meta; -/** Common header for all page types. The page type depends on #mp_flags. -* -* #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with -* sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages -* omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. -* -* #P_OVERFLOW records occupy one or more contiguous pages where only the -* first has a page header. They hold the real data of #F_BIGDATA nodes. -* -* #P_SUBP sub-pages are small leaf "pages" with duplicate data. -* A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. -* (Duplicate data can also go in sub-databases, which use normal pages.) 
-* -* #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. -* -* Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once -* in the snapshot: Either used by a database or listed in a freeDB record. -*/ +/* Common header for all page types. The page type depends on mp_flags. + * + * P_BRANCH and P_LEAF pages have unsorted 'MDB_node's at the end, with + * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages + * omit mp_ptrs and pack sorted MDB_DUPFIXED values after the page header. + * + * P_OVERFLOW records occupy one or more contiguous pages where only the + * first has a page header. They hold the real data of F_BIGDATA nodes. + * + * P_SUBP sub-pages are small leaf "pages" with duplicate data. + * A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page. + * (Duplicate data can also go in sub-databases, which use normal pages.) + * + * P_META pages contain MDB_meta, the start point of an LMDB snapshot. + * + * Each non-metapage up to MDB_meta.mm_last_pg is reachable exactly once + * in the snapshot: Either used by a database or listed in a freeDB record. */ typedef struct MDB_page { #define mp_pgno mp_p.p_pgno #define mp_next mp_p.p_next union { - pgno_t p_pgno; /**< page number */ - struct MDB_page *p_next; /**< for in-memory list of freed pages */ + pgno_t p_pgno; /* page number */ + struct MDB_page *p_next; /* for in-memory list of freed pages */ } mp_p; - uint16_t mp_leaf2_ksize; /**< key size if this is a LEAF2 page */ - /** @defgroup mdbx_page Page Flags - * @ingroup internal - * Flags for the page headers. 
- * @{ - */ -#define P_BRANCH 0x01 /**< branch page */ -#define P_LEAF 0x02 /**< leaf page */ -#define P_OVERFLOW 0x04 /**< overflow page */ -#define P_META 0x08 /**< meta page */ -#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ -#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ -#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ -#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */ -#define P_KEEP 0x8000 /**< leave this page alone during spill */ - /** @} */ - uint16_t mp_flags; /**< @ref mdbx_page */ + uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ +#define P_BRANCH 0x01 /* branch page */ +#define P_LEAF 0x02 /* leaf page */ +#define P_OVERFLOW 0x04 /* overflow page */ +#define P_META 0x08 /* meta page */ +#define P_DIRTY 0x10 /* dirty page, also set for P_SUBP pages */ +#define P_LEAF2 0x20 /* for MDB_DUPFIXED records */ +#define P_SUBP 0x40 /* for MDB_DUPSORT sub-pages */ +#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ +#define P_KEEP 0x8000 /* leave this page alone during spill */ + uint16_t mp_flags; #define mp_lower mp_pb.pb.pb_lower #define mp_upper mp_pb.pb.pb_upper #define mp_pages mp_pb.pb_pages union { struct { - indx_t pb_lower; /**< lower bound of free space */ - indx_t pb_upper; /**< upper bound of free space */ + indx_t pb_lower; /* lower bound of free space */ + indx_t pb_upper; /* upper bound of free space */ } pb; - uint32_t pb_pages; /**< number of overflow pages */ + uint32_t pb_pages; /* number of overflow pages */ } mp_pb; - indx_t mp_ptrs[1]; /**< dynamic size */ + indx_t mp_ptrs[1]; /* dynamic size */ } MDB_page; -/** Size of the page header, excluding dynamic data at the end */ +/* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDB_page, mp_ptrs)) -/** Buffer for a stack-allocated meta page. -* The members define size and alignment, and silence type -* aliasing warnings. 
They are not used directly; that could -* mean incorrectly using several union members in parallel. -*/ +/* Buffer for a stack-allocated meta page. + * The members define size and alignment, and silence type + * aliasing warnings. They are not used directly; that could + * mean incorrectly using several union members in parallel. */ typedef union MDB_metabuf { MDB_page mb_page; struct { @@ -386,9 +369,9 @@ typedef struct MDBX_lockinfo { * The information here is mostly static/read-only. There is * only a single copy of this record in the environment. */ typedef struct MDB_dbx { - MDB_val md_name; /**< name of the database */ - MDB_cmp_func *md_cmp; /**< function for comparing keys */ - MDB_cmp_func *md_dcmp; /**< function for comparing data items */ + MDB_val md_name; /* name of the database */ + MDB_cmp_func *md_cmp; /* function for comparing keys */ + MDB_cmp_func *md_dcmp; /* function for comparing data items */ } MDB_dbx; /* A database transaction. @@ -396,24 +379,24 @@ typedef struct MDB_dbx { struct MDB_txn { #define MDBX_MT_SIGNATURE (0x93D53A31) unsigned mt_signature; - MDB_txn *mt_parent; /**< parent of a nested txn */ - /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ + MDB_txn *mt_parent; /* parent of a nested txn */ + /* Nested txn under this txn, set together with flag MDB_TXN_HAS_CHILD */ MDB_txn *mt_child; - pgno_t mt_next_pgno; /**< next unallocated page */ + pgno_t mt_next_pgno; /* next unallocated page */ /* The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; - MDB_env *mt_env; /**< the DB environment */ - /** The list of reclaimed txns from freeDB */ + MDB_env *mt_env; /* the DB environment */ + /* The list of reclaimed txns from freeDB */ MDB_IDL mt_lifo_reclaimed; /* The list of pages that became unused during this transaction. 
*/ MDB_IDL mt_free_pgs; /* The list of loose pages that became unused and may be reused - * in this transaction, linked through #NEXT_LOOSE_PAGE(page). */ + * in this transaction, linked through NEXT_LOOSE_PAGE(page). */ MDB_page *mt_loose_pgs; - /** Number of loose pages (#mt_loose_pgs) */ - int mt_loose_count; + /* Number of loose pages (mt_loose_pgs) */ + unsigned mt_loose_count; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ @@ -435,130 +418,114 @@ struct MDB_txn { #define DB_DIRTY 0x01 /* DB was written in this txn */ #define DB_STALE 0x02 /* Named-DB record is older than txnID */ #define DB_NEW 0x04 /* Named-DB handle opened in this txn */ -#define DB_VALID 0x08 /* DB handle is valid, see also #MDB_VALID */ -#define DB_USRVALID 0x10 /* As #DB_VALID, but not set for #FREE_DBI */ -#define DB_DUPDATA 0x20 /* DB is #MDB_DUPSORT data */ +#define DB_VALID 0x08 /* DB handle is valid, see also MDB_VALID */ +#define DB_USRVALID 0x10 /* As DB_VALID, but not set for FREE_DBI */ +#define DB_DUPDATA 0x20 /* DB is MDB_DUPSORT data */ /* In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; /* Array of flags for each DB */ uint8_t *mt_dbflags; /* Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. */ + * This number only ever increments until the txn finishes; we + * don't decrement it when individual DB handles are closed. 
*/ MDB_dbi mt_numdbs; -/** @defgroup mdbx_txn Transaction Flags -* @ingroup internal -* @{ -*/ -/** #mdbx_txn_begin() flags */ +/* Transaction Flags */ +/* mdbx_txn_begin() flags */ #define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC | MDB_NOSYNC | MDB_RDONLY) #define MDB_TXN_NOMETASYNC \ - MDB_NOMETASYNC /**< don't sync meta for this txn on commit */ -#define MDB_TXN_NOSYNC MDB_NOSYNC /**< don't sync this txn on commit */ -#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ + MDB_NOMETASYNC /* don't sync meta for this txn on commit */ +#define MDB_TXN_NOSYNC MDB_NOSYNC /* don't sync this txn on commit */ +#define MDB_TXN_RDONLY MDB_RDONLY /* read-only transaction */ /* internal txn flags */ -#define MDB_TXN_WRITEMAP \ - MDB_WRITEMAP /**< copy of #MDB_env flag in writers \ - */ -#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ -#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ -#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ -#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ -#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */ -/** most operations on the txn are currently illegal */ +#define MDB_TXN_WRITEMAP MDB_WRITEMAP /* copy of MDB_env flag in writers */ +#define MDB_TXN_FINISHED 0x01 /* txn is finished or never began */ +#define MDB_TXN_ERROR 0x02 /* txn is unusable after an error */ +#define MDB_TXN_DIRTY 0x04 /* must write, even if dirty list is empty */ +#define MDB_TXN_SPILLS 0x08 /* txn or a parent has spilled pages */ +#define MDB_TXN_HAS_CHILD 0x10 /* txn has an MDB_txn.mt_child */ +/* most operations on the txn are currently illegal */ #define MDB_TXN_BLOCKED (MDB_TXN_FINISHED | MDB_TXN_ERROR | MDB_TXN_HAS_CHILD) - /** @} */ - unsigned mt_flags; /**< @ref mdbx_txn */ - /** #dirty_list room: Array size - \#dirty pages visible to this txn. - * Includes ancestor txns' dirty pages not hidden by other txns' - * dirty/spilled pages. 
Thus commit(nested txn) has room to merge - * dirty_list into mt_parent after freeing hidden mt_parent pages. - */ + unsigned mt_flags; + /* dirty_list room: Array size - dirty pages visible to this txn. + * Includes ancestor txns' dirty pages not hidden by other txns' + * dirty/spilled pages. Thus commit(nested txn) has room to merge + * dirty_list into mt_parent after freeing hidden mt_parent pages. */ unsigned mt_dirty_room; mdbx_canary mt_canary; }; -/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. -* At 4 keys per node, enough for 2^64 nodes, so there's probably no need to -* raise this on a 64 bit machine. -*/ +/* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. + * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to + * raise this on a 64 bit machine. */ #define CURSOR_STACK 32 struct MDB_xcursor; -/** Cursors are used for all DB operations. -* A cursor holds a path of (page pointer, key index) from the DB -* root to a position in the DB, plus other state. #MDB_DUPSORT -* cursors include an xcursor to the current data item. Write txns -* track their cursors and keep them up to date when data moves. -* Exception: An xcursor's pointer to a #P_SUBP page can be stale. -* (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). -*/ +/* Cursors are used for all DB operations. + * A cursor holds a path of (page pointer, key index) from the DB + * root to a position in the DB, plus other state. MDB_DUPSORT + * cursors include an xcursor to the current data item. Write txns + * track their cursors and keep them up to date when data moves. + * Exception: An xcursor's pointer to a P_SUBP page can be stale. + * (A node with F_DUPDATA but no F_SUBDATA contains a subpage). 
*/ struct MDB_cursor { #define MDBX_MC_SIGNATURE (0xFE05D5B1) #define MDBX_MC_READY4CLOSE (0x2817A047) #define MDBX_MC_WAIT4EOT (0x90E297A7) unsigned mc_signature; - /** Next cursor on this DB in this txn */ + /* Next cursor on this DB in this txn */ MDB_cursor *mc_next; - /** Backup of the original cursor if this cursor is a shadow */ + /* Backup of the original cursor if this cursor is a shadow */ MDB_cursor *mc_backup; - /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ + /* Context used for databases with MDB_DUPSORT, otherwise NULL */ struct MDB_xcursor *mc_xcursor; - /** The transaction that owns this cursor */ + /* The transaction that owns this cursor */ MDB_txn *mc_txn; - /** The database handle this cursor operates on */ + /* The database handle this cursor operates on */ MDB_dbi mc_dbi; - /** The database record for this cursor */ + /* The database record for this cursor */ MDB_db *mc_db; - /** The database auxiliary record for this cursor */ + /* The database auxiliary record for this cursor */ MDB_dbx *mc_dbx; - /** The @ref mt_dbflag for this database */ + /* The mt_dbflag for this database */ uint8_t *mc_dbflag; - uint16_t mc_snum; /**< number of pushed pages */ - uint16_t mc_top; /**< index of top page, normally mc_snum-1 */ - /** @defgroup mdbx_cursor Cursor Flags - * @ingroup internal - * Cursor state flags. 
- * @{ - */ -#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ -#define C_EOF 0x02 /**< No more data */ -#define C_SUB 0x04 /**< Cursor is a sub-cursor */ -#define C_DEL 0x08 /**< last op was a cursor_del */ -#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ -#define C_RECLAIMING 0x80 /**< FreeDB lookup is prohibited */ - /** @} */ - unsigned mc_flags; /**< @ref mdbx_cursor */ - MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ - indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ + uint16_t mc_snum; /* number of pushed pages */ + uint16_t mc_top; /* index of top page, normally mc_snum-1 */ + /* Cursor state flags. */ +#define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ +#define C_EOF 0x02 /* No more data */ +#define C_SUB 0x04 /* Cursor is a sub-cursor */ +#define C_DEL 0x08 /* last op was a cursor_del */ +#define C_UNTRACK 0x40 /* Un-track cursor when closing */ +#define C_RECLAIMING 0x80 /* FreeDB lookup is prohibited */ + unsigned mc_flags; /* see mdbx_cursor */ + MDB_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ + indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ }; -/** Context for sorted-dup records. -* We could have gone to a fully recursive design, with arbitrarily -* deep nesting of sub-databases. But for now we only handle these -* levels - main DB, optional sub-DB, sorted-duplicate DB. -*/ +/* Context for sorted-dup records. + * We could have gone to a fully recursive design, with arbitrarily + * deep nesting of sub-databases. But for now we only handle these + * levels - main DB, optional sub-DB, sorted-duplicate DB. 
*/ typedef struct MDB_xcursor { - /** A sub-cursor for traversing the Dup DB */ + /* A sub-cursor for traversing the Dup DB */ MDB_cursor mx_cursor; - /** The database record for this Dup DB */ + /* The database record for this Dup DB */ MDB_db mx_db; - /** The auxiliary DB record for this Dup DB */ + /* The auxiliary DB record for this Dup DB */ MDB_dbx mx_dbx; - /** The @ref mt_dbflag for this Dup DB */ - unsigned char mx_dbflag; + /* The mt_dbflag for this Dup DB */ + uint8_t mx_dbflag; } MDB_xcursor; -/** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */ +/* Check if there is an inited xcursor, so XCURSOR_REFRESH() is proper */ #define XCURSOR_INITED(mc) \ ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) -/** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed -* when the node which contains the sub-page may have moved. Called -* with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top]. -*/ +/* Update sub-page pointer, if any, in mc->mc_xcursor. + * Needed when the node which contains the sub-page may have moved. + * Called with mp = mc->mc_pg[mc->mc_top], ki = mc->mc_ki[mc->mc_top]. 
*/ #define XCURSOR_REFRESH(mc, mp, ki) \ do { \ MDB_page *xr_pg = (mp); \ @@ -567,88 +534,87 @@ typedef struct MDB_xcursor { (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ } while (0) -/** State of FreeDB old pages, stored in the MDB_env */ +/* State of FreeDB old pages, stored in the MDB_env */ typedef struct MDB_pgstate { - pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ - txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ + pgno_t *mf_pghead; /* Reclaimed freeDB pages, or NULL before use */ + txnid_t mf_pglast; /* ID of last used record, or 0 if !mf_pghead */ } MDB_pgstate; #define MDBX_LOCKINFO_WHOLE_SIZE \ ((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \ ~((size_t)MDBX_CACHELINE_SIZE - 1)) -/** Lockfile format signature: version, features and field layout */ +/* Lockfile format signature: version, features and field layout */ #define MDB_LOCK_FORMAT \ (((uint64_t)(MDBX_OSAL_LOCK_SIGN) << 32) + \ ((MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1) << 16) + \ (MDB_LOCK_VERSION) /* Flags which describe functionality */) -/** The database environment. */ +/* The database environment. */ struct MDB_env { #define MDBX_ME_SIGNATURE (0x9A899641) unsigned me_signature; - mdbx_filehandle_t me_fd; /**< The main data file */ - mdbx_filehandle_t me_lfd; /**< The lock file */ -/** Failed to update the meta page. Probably an I/O error. */ + mdbx_filehandle_t me_fd; /* The main data file */ + mdbx_filehandle_t me_lfd; /* The lock file */ +/* Failed to update the meta page. Probably an I/O error. */ #define MDB_FATAL_ERROR 0x80000000U -/** Some fields are initialized. */ +/* Some fields are initialized. 
*/ #define MDB_ENV_ACTIVE 0x20000000U -/** me_txkey is set */ +/* me_txkey is set */ #define MDB_ENV_TXKEY 0x10000000U - uint32_t me_flags; /**< @ref mdbx_env */ - unsigned me_psize; /**< DB page size, inited from me_os_psize */ - unsigned me_os_psize; /**< OS page size, from mdbx_syspagesize() */ - unsigned me_maxreaders; /**< size of the reader table */ - /** Max #MDBX_lockinfo.mti_numreaders of interest to #mdbx_env_close() */ + uint32_t me_flags; /* see mdbx_env */ + unsigned me_psize; /* DB page size, inited from me_os_psize */ + unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_maxreaders; /* size of the reader table */ + /* Max MDBX_lockinfo.mti_numreaders of interest to mdbx_env_close() */ unsigned me_close_readers; - MDB_dbi me_numdbs; /**< number of DBs opened */ - MDB_dbi me_maxdbs; /**< size of the DB table */ - mdbx_pid_t me_pid; /**< process ID of this env */ - char *me_path; /**< path to the DB files */ - char *me_map; /**< the memory map of the data file */ - MDBX_lockinfo *me_lck; /**< the memory map of the lock file, never NULL */ - void *me_pbuf; /**< scratch area for DUPSORT put() */ - MDB_txn *me_txn; /**< current write transaction */ - MDB_txn *me_txn0; /**< prealloc'd write transaction */ - size_t me_mapsize; /**< size of the data memory map */ - pgno_t me_maxpg; /**< me_mapsize / me_psize */ - MDB_dbx *me_dbxs; /**< array of static DB info */ - uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ - unsigned *me_dbiseqs; /**< array of dbi sequence numbers */ - mdbx_thread_key_t me_txkey; /**< thread-key for readers */ - txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ - MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ + MDB_dbi me_numdbs; /* number of DBs opened */ + MDB_dbi me_maxdbs; /* size of the DB table */ + mdbx_pid_t me_pid; /* process ID of this env */ + char *me_path; /* path to the DB files */ + char *me_map; /* the memory map of the data file */ + 
MDBX_lockinfo *me_lck; /* the memory map of the lock file, never NULL */ + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDB_txn *me_txn; /* current write transaction */ + MDB_txn *me_txn0; /* prealloc'd write transaction */ + size_t me_mapsize; /* size of the data memory map */ + pgno_t me_maxpg; /* me_mapsize / me_psize */ + MDB_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDB_db.md_flags */ + unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + mdbx_thread_key_t me_txkey; /* thread-key for readers */ + txnid_t me_pgoldest; /* ID of oldest reader last time we looked */ + MDB_pgstate me_pgstate; /* state of old pages from freeDB */ #define me_pglast me_pgstate.mf_pglast #define me_pghead me_pgstate.mf_pghead - MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ - /** IDL of pages that became unused in a write txn */ + MDB_page *me_dpages; /* list of malloc'd blocks for re-use */ + /* IDL of pages that became unused in a write txn */ MDB_IDL me_free_pgs; - /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ + /* ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. 
*/ MDB_ID2L me_dirty_list; - /** Max number of freelist items that can fit in a single overflow page */ + /* Max number of freelist items that can fit in a single overflow page */ unsigned me_maxfree_1pg; - /** Max size of a node on a page */ + /* Max size of a node on a page */ unsigned me_nodemax; - unsigned me_maxkey_limit; /**< max size of a key */ - int me_live_reader; /**< have liveness lock in reader table */ - void *me_userctx; /**< User-settable context */ + unsigned me_maxkey_limit; /* max size of a key */ + int me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ #if MDB_DEBUG - MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ + MDB_assert_func *me_assert_func; /* Callback for assertion failures */ #endif - uint64_t me_sync_pending; /**< Total dirty/commited bytes since the last - mdbx_env_sync() */ - uint64_t - me_sync_threshold; /**< Treshold of above to force synchronous flush */ - MDBX_oom_func *me_oom_func; /**< Callback for kicking laggard readers */ + uint64_t me_sync_pending; /* Total dirty/non-sync'ed bytes + * since the last mdbx_env_sync() */ + uint64_t me_sync_threshold; /* Threshold of above to force synchronous flush */ + MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */ #ifdef USE_VALGRIND int me_valgrind_handle; #endif }; -/** Nested transaction */ +/* Nested transaction */ typedef struct MDB_ntxn { - MDB_txn mnt_txn; /**< the transaction */ - MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ + MDB_txn mnt_txn; /* the transaction */ + MDB_pgstate mnt_pgstate; /* parent transaction's saved freestate */ } MDB_ntxn; /*----------------------------------------------------------------------------*/ @@ -692,7 +658,7 @@ void mdbx_panic(const char *fmt, ...) #define mdbx_print(fmt, ...) 
\ mdbx_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) -/*****************************************/ +/*----------------------------------------------------------------------------*/ #define mdbx_trace(fmt, ...) \ do { \ @@ -743,7 +709,7 @@ void mdbx_panic(const char *fmt, ...) fmt "\n", ##__VA_ARGS__); \ } while (0) -/*****************************************/ +/*----------------------------------------------------------------------------*/ #define mdbx_debug(fmt, ...) \ do { \ diff --git a/src/mdbx.c b/src/mdbx.c index a511bf25..51953ea5 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -33,8 +33,7 @@ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "./bits.h" #include "./midl.h" @@ -158,90 +157,77 @@ __cold void mdbx_rthc_remove(mdbx_thread_key_t key) { /*----------------------------------------------------------------------------*/ -/** Search for an ID in an IDL. - * @param[in] ids The IDL to search. - * @param[in] id The ID to search for. - * @return The index of the first ID greater than or equal to \b id. - */ -static unsigned mdbx_midl_search(MDB_IDL ids, MDB_ID id); +/* Search for an ID in an IDL. + * [in] ids The IDL to search. + * [in] id The ID to search for. + * Returns The index of the first ID greater than or equal to id. */ +static unsigned mdbx_midl_search(MDB_IDL ids, pgno_t id); -/** Allocate an IDL. +/* Allocate an IDL. * Allocates memory for an IDL of the given size. - * @return IDL on success, NULL on failure. - */ + * Returns IDL on success, NULL on failure. */ static MDB_IDL mdbx_midl_alloc(int num); -/** Free an IDL. - * @param[in] ids The IDL to free. - */ +/* Free an IDL. + * [in] ids The IDL to free. 
*/ static void mdbx_midl_free(MDB_IDL ids); -/** Shrink an IDL. +/* Shrink an IDL. * Return the IDL to the default size if it has grown larger. - * @param[in,out] idp Address of the IDL to shrink. - */ + * [in,out] idp Address of the IDL to shrink. */ static void mdbx_midl_shrink(MDB_IDL *idp); -/** Make room for num additional elements in an IDL. - * @param[in,out] idp Address of the IDL. - * @param[in] num Number of elements to make room for. - * @return 0 on success, MDBX_ENOMEM on failure. - */ +/* Make room for num additional elements in an IDL. + * [in,out] idp Address of the IDL. + * [in] num Number of elements to make room for. + * Returns 0 on success, MDBX_ENOMEM on failure. */ static int mdbx_midl_need(MDB_IDL *idp, unsigned num); -/** Append an ID onto an IDL. - * @param[in,out] idp Address of the IDL to append to. - * @param[in] id The ID to append. - * @return 0 on success, MDBX_ENOMEM if the IDL is too large. - */ -static int mdbx_midl_append(MDB_IDL *idp, MDB_ID id); +/* Append an ID onto an IDL. + * [in,out] idp Address of the IDL to append to. + * [in] id The ID to append. + * Returns 0 on success, MDBX_ENOMEM if the IDL is too large. */ +static int mdbx_midl_append(MDB_IDL *idp, pgno_t id); -/** Append an IDL onto an IDL. - * @param[in,out] idp Address of the IDL to append to. - * @param[in] app The IDL to append. - * @return 0 on success, MDBX_ENOMEM if the IDL is too large. - */ +/* Append an IDL onto an IDL. + * [in,out] idp Address of the IDL to append to. + * [in] app The IDL to append. + * Returns 0 on success, MDBX_ENOMEM if the IDL is too large. */ static int mdbx_midl_append_list(MDB_IDL *idp, MDB_IDL app); -/** Append an ID range onto an IDL. - * @param[in,out] idp Address of the IDL to append to. - * @param[in] id The lowest ID to append. - * @param[in] n Number of IDs to append. - * @return 0 on success, MDBX_ENOMEM if the IDL is too large. 
- */ -static int mdbx_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n); +/* Append an ID range onto an IDL. + * [in,out] idp Address of the IDL to append to. + * [in] id The lowest ID to append. + * [in] n Number of IDs to append. + * Returns 0 on success, MDBX_ENOMEM if the IDL is too large. */ +static int mdbx_midl_append_range(MDB_IDL *idp, pgno_t id, unsigned n); -/** Merge an IDL onto an IDL. The destination IDL must be big enough. - * @param[in] idl The IDL to merge into. - * @param[in] merge The IDL to merge. - */ +/* Merge an IDL onto an IDL. The destination IDL must be big enough. + * [in] idl The IDL to merge into. + * [in] merge The IDL to merge. */ static void mdbx_midl_xmerge(MDB_IDL idl, MDB_IDL merge); -/** Sort an IDL. - * @param[in,out] ids The IDL to sort. - */ +/* Sort an IDL. + * [in,out] ids The IDL to sort. */ static void mdbx_midl_sort(MDB_IDL ids); -/** Search for an ID in an ID2L. - * @param[in] ids The ID2L to search. - * @param[in] id The ID to search for. - * @return The index of the first ID2 whose \b mid member is greater than - * or equal to \b id. - */ -static unsigned mdbx_mid2l_search(MDB_ID2L ids, MDB_ID id); +/* Search for an ID in an ID2L. + * [in] ids The ID2L to search. + * [in] id The ID to search for. + * Returns The index of the first ID2 whose mid member is greater than + * or equal to id. */ +static unsigned mdbx_mid2l_search(MDB_ID2L ids, pgno_t id); -/** Insert an ID2 into a ID2L. - * @param[in,out] ids The ID2L to insert into. - * @param[in] id The ID2 to insert. - * @return 0 on success, -1 if the ID was already present in the ID2L. - */ +/* Insert an ID2 into a ID2L. + * [in,out] ids The ID2L to insert into. + * [in] id The ID2 to insert. + * Returns 0 on success, -1 if the ID was already present in the ID2L. */ static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id); -/** Append an ID2 into a ID2L. - * @param[in,out] ids The ID2L to append into. - * @param[in] id The ID2 to append. 
- * @return 0 on success, -2 if the ID2L is too big. - */ +/* Append an ID2 into a ID2L. + * [in,out] ids The ID2L to append into. + * [in] id The ID2 to append. + * Returns 0 on success, -2 if the ID2L is too big. */ static int mdbx_mid2l_append(MDB_ID2L ids, MDB_ID2 *id); /*----------------------------------------------------------------------------*/ @@ -269,7 +255,7 @@ int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn); txnid_t mdbx_debug_edge; #endif -/** Features under development */ +/* Features under development */ #ifndef MDB_DEVEL #define MDB_DEVEL 0 #endif @@ -281,50 +267,47 @@ txnid_t mdbx_debug_edge; #define DDBI(mc) \ (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) -/** @brief The maximum size of a database page. +/* The maximum size of a database page. * - * It is 32k or 64k, since value-PAGEBASE must fit in - * #MDB_page.%mp_upper. + * It is 32k or 64k, since value-PAGEBASE must fit in + * MDB_page.mp_upper. * - * LMDB will use database pages < OS pages if needed. - * That causes more I/O in write transactions: The OS must - * know (read) the whole page before writing a partial page. + * LMDB will use database pages < OS pages if needed. + * That causes more I/O in write transactions: The OS must + * know (read) the whole page before writing a partial page. * - * Note that we don't currently support Huge pages. On Linux, - * regular data files cannot use Huge pages, and in general - * Huge pages aren't actually pageable. We rely on the OS - * demand-pager to read our data and page it out when memory - * pressure from other processes is high. So until OSs have - * actual paging support for Huge pages, they're not viable. - */ + * Note that we don't currently support Huge pages. On Linux, + * regular data files cannot use Huge pages, and in general + * Huge pages aren't actually pageable. We rely on the OS + * demand-pager to read our data and page it out when memory + * pressure from other processes is high. 
So until OSs have + * actual paging support for Huge pages, they're not viable. */ #define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) -/** The minimum number of keys required in a database page. - * Setting this to a larger value will place a smaller bound on the - * maximum size of a data item. Data items larger than this size will - * be pushed into overflow pages instead of being stored directly in - * the B-tree node. This value used to default to 4. With a page size - * of 4096 bytes that meant that any item larger than 1024 bytes would - * go into an overflow page. That also meant that on average 2-3KB of - * each overflow page was wasted space. The value cannot be lower than - * 2 because then there would no longer be a tree structure. With this - * value, items larger than 2KB will go into overflow pages, and on - * average only 1KB will be wasted. - */ +/* The minimum number of keys required in a database page. + * Setting this to a larger value will place a smaller bound on the + * maximum size of a data item. Data items larger than this size will + * be pushed into overflow pages instead of being stored directly in + * the B-tree node. This value used to default to 4. With a page size + * of 4096 bytes that meant that any item larger than 1024 bytes would + * go into an overflow page. That also meant that on average 2-3KB of + * each overflow page was wasted space. The value cannot be lower than + * 2 because then there would no longer be a tree structure. With this + * value, items larger than 2KB will go into overflow pages, and on + * average only 1KB will be wasted. */ #define MDB_MINKEYS 2 -/** A stamp that identifies a file as an LMDB file. - * There's nothing special about this value other than that it is easily - * recognizable, and it will reflect any byte order mismatches. - */ +/* A stamp that identifies a file as an LMDB file. 
+ * There's nothing special about this value other than that it is easily + * recognizable, and it will reflect any byte order mismatches. */ #define MDB_MAGIC 0xBEEFC0DE -/** The version number for a database's datafile format. */ +/* The version number for a database's datafile format. */ #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) -/** The version number for a database's lockfile format. */ +/* The version number for a database's lockfile format. */ #define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) -/* Key size which fits in a #DKBUF. */ +/* Key size which fits in a DKBUF. */ #define DKBUF_MAXKEYSIZE 511 /* FIXME */ #if MDB_DEBUG @@ -338,140 +321,116 @@ txnid_t mdbx_debug_edge; #define DVAL(x) ("-") #endif -/** An invalid page number. - * Mainly used to denote an empty tree. - */ +/* An invalid page number. + * Mainly used to denote an empty tree. */ #define P_INVALID (~(pgno_t)0) -/** Test if the flags \b f are set in a flag word \b w. */ +/* Test if the flags f are set in a flag word w. */ #define F_ISSET(w, f) (((w) & (f)) == (f)) -/** Round \b n up to an even number. */ +/* Round n up to an even number. */ #define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ -/** Default size of memory map. - * This is certainly too small for any actual applications. Apps should - *always set - * the size explicitly using #mdbx_env_set_mapsize(). - */ +/* Default size of memory map. + * This is certainly too small for any actual applications. Apps should + * always set the size explicitly using mdbx_env_set_mapsize(). */ #define DEFAULT_MAPSIZE 1048576 -/** @defgroup readers Reader Lock Table - * Readers don't acquire any locks for their data access. Instead, they - * simply record their transaction ID in the reader table. The reader - * mutex is needed just to find an empty slot in the reader table. 
The - * slot's address is saved in thread-specific data so that subsequent - *read - * transactions started by the same thread need no further locking to - *proceed. +/* Reader Lock Table * - * If #MDB_NOTLS is set, the slot address is not saved in thread-specific - *data. + * Readers don't acquire any locks for their data access. Instead, they + * simply record their transaction ID in the reader table. The reader + * mutex is needed just to find an empty slot in the reader table. The + * slot's address is saved in thread-specific data so that subsequent + * read transactions started by the same thread need no further locking to + * proceed. * - * No reader table is used if the database is on a read-only filesystem, - *or - * if #MDB_NOLOCK is set. + * If MDB_NOTLS is set, the slot address is not saved in thread-specific data. + * No reader table is used if the database is on a read-only filesystem. * - * Since the database uses multi-version concurrency control, readers - *don't - * actually need any locking. This table is used to keep track of which - * readers are using data from which old transactions, so that we'll know - * when a particular old transaction is no longer in use. Old - *transactions - * that have discarded any data pages can then have those pages reclaimed - * for use by a later write transaction. + * Since the database uses multi-version concurrency control, readers don't + * actually need any locking. This table is used to keep track of which + * readers are using data from which old transactions, so that we'll know + * when a particular old transaction is no longer in use. Old transactions + * that have discarded any data pages can then have those pages reclaimed + * for use by a later write transaction. * - * The lock table is constructed such that reader slots are aligned with - *the - * processor's cache line size. Any slot is only ever used by one thread. 
- * This alignment guarantees that there will be no contention or cache - * thrashing as threads update their own slot info, and also eliminates - * any need for locking when accessing a slot. + * The lock table is constructed such that reader slots are aligned with the + * processor's cache line size. Any slot is only ever used by one thread. + * This alignment guarantees that there will be no contention or cache + * thrashing as threads update their own slot info, and also eliminates + * any need for locking when accessing a slot. * - * A writer thread will scan every slot in the table to determine the - *oldest - * outstanding reader transaction. Any freed pages older than this will - *be - * reclaimed by the writer. The writer doesn't use any locks when - *scanning - * this table. This means that there's no guarantee that the writer will - * see the most up-to-date reader info, but that's not required for - *correct - * operation - all we need is to know the upper bound on the oldest - *reader, - * we don't care at all about the newest reader. So the only consequence - *of - * reading stale information here is that old pages might hang around a - * while longer before being reclaimed. That's actually good anyway, - *because - * the longer we delay reclaiming old pages, the more likely it is that a - * string of contiguous pages can be found after coalescing old pages - *from - * many old transactions together. - * @{ - */ -/** Number of slots in the reader table. - * This value was chosen somewhat arbitrarily. 126 readers plus a - * couple mutexes fit exactly into 8KB on my development machine. - * Applications should set the table size using - *#mdbx_env_set_maxreaders(). - */ -#define DEFAULT_READERS 126 + * A writer thread will scan every slot in the table to determine the oldest + * outstanding reader transaction. Any freed pages older than this will be + * reclaimed by the writer. The writer doesn't use any locks when scanning + * this table. 
This means that there's no guarantee that the writer will + * see the most up-to-date reader info, but that's not required for correct + * operation - all we need is to know the upper bound on the oldest reader, + * we don't care at all about the newest reader. So the only consequence of + * reading stale information here is that old pages might hang around a + * while longer before being reclaimed. That's actually good anyway, because + * the longer we delay reclaiming old pages, the more likely it is that a + * string of contiguous pages can be found after coalescing old pages from + * many old transactions together. */ -/** Address of first usable data byte in a page, after the header */ +/* Number of slots in the reader table. + * This value was chosen somewhat arbitrarily. The 61 is a prime number, + * and such readers plus a couple mutexes fit into single 4KB page. + * Applications should set the table size using mdbx_env_set_maxreaders(). */ +#define DEFAULT_READERS 61 + +/* Address of first usable data byte in a page, after the header */ #define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) -/** ITS#7713, change PAGEBASE to handle 65536 byte pages */ +/* ITS#7713, change PAGEBASE to handle 65536 byte pages */ #define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) -/** Number of nodes on a page */ +/* Number of nodes on a page */ #define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1) -/** The amount of space remaining in the page */ +/* The amount of space remaining in the page */ #define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) -/** The percentage of space used in the page, in tenths of a percent. */ +/* The percentage of space used in the page, in tenths of a percent. */ #define PAGEFILL(env, p) \ (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ ((env)->me_psize - PAGEHDRSZ)) -/** The minimum page fill factor, in tenths of a percent. - * Pages emptier than this are candidates for merging. 
- */ +/* The minimum page fill factor, in tenths of a percent. + * Pages emptier than this are candidates for merging. */ #define FILL_THRESHOLD 250 -/** Test if a page is a leaf page */ +/* Test if a page is a leaf page */ #define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) -/** Test if a page is a LEAF2 page */ +/* Test if a page is a LEAF2 page */ #define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) -/** Test if a page is a branch page */ +/* Test if a page is a branch page */ #define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) -/** Test if a page is an overflow page */ +/* Test if a page is an overflow page */ #define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) -/** Test if a page is a sub page */ +/* Test if a page is a sub page */ #define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) -/** The number of overflow pages needed to store the given size. */ +/* The number of overflow pages needed to store the given size. */ #define OVPAGES(size, psize) ((PAGEHDRSZ - 1 + (size)) / (psize) + 1) -/** Link in #MDB_txn.%mt_loose_pgs list. - * Kept outside the page header, which is needed when reusing the page. - */ +/* Link in MDB_txn.mt_loose_pgs list. + * Kept outside the page header, which is needed when reusing the page. */ #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) -/** Header for a single key/data pair within a page. - * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. +/* Header for a single key/data pair within a page. + * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. * We guarantee 2-byte alignment for 'MDB_node's. * - * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child - * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used + * mn_lo and mn_hi are used for data size on leaf nodes, and for child + * pgno on branch nodes. On 64 bit platforms, mn_flags is also used * for pgno. (Branch nodes have no flags). 
Lo and hi are in host byte * order in case some accesses can be optimized to 32-bit word access. * - * Leaf node flags describe node contents. #F_BIGDATA says the node's + * Leaf node flags describe node contents. F_BIGDATA says the node's * data part is the page number of an overflow page with actual data. - * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in - * a sub-page/sub-database, and named databases (just #F_SUBDATA). - */ + * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in + * a sub-page/sub-database, and named databases (just F_SUBDATA). */ typedef struct MDB_node { /* part of data size or pgno */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ @@ -479,36 +438,31 @@ typedef struct MDB_node { #else uint16_t mn_hi, mn_lo; #endif -/** @defgroup mdbx_node Node Flags - * @ingroup internal - * Flags for node headers. - */ -#define F_BIGDATA 0x01 /**< data put on overflow page */ -#define F_SUBDATA 0x02 /**< data is a sub-database */ -#define F_DUPDATA 0x04 /**< data has duplicates */ +/* mdbx_node Flags */ +#define F_BIGDATA 0x01 /* data put on overflow page */ +#define F_SUBDATA 0x02 /* data is a sub-database */ +#define F_DUPDATA 0x04 /* data has duplicates */ -/** valid flags for #mdbx_node_add() */ +/* valid flags for mdbx_node_add() */ #define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDB_RESERVE | MDB_APPEND) - uint16_t mn_flags; /**< @ref mdbx_node */ - uint16_t mn_ksize; /**< key size */ - uint8_t mn_data[1]; /**< key and data are appended here */ + uint16_t mn_flags; /* see mdbx_node */ + uint16_t mn_ksize; /* key size */ + uint8_t mn_data[1]; /* key and data are appended here */ } MDB_node; -/** Size of the node header, excluding dynamic data at the end */ +/* Size of the node header, excluding dynamic data at the end */ #define NODESIZE offsetof(MDB_node, mn_data) -/** Bit position of top word in page number, for shifting mn_flags */ +/* Bit position of top word in page number, for shifting mn_flags */ #define 
PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) -/** Size of a node in a branch page with a given key. - * This is just the node header plus the key, there is no data. - */ +/* Size of a node in a branch page with a given key. + * This is just the node header plus the key, there is no data. */ #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) -/** Size of a node in a leaf page with a given key and data. - * This is node header plus key plus data size. - */ +/* Size of a node in a leaf page with a given key and data. + * This is node header plus key plus data size. */ #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) /* Address of node i in page p */ @@ -517,17 +471,18 @@ static __inline MDB_node *NODEPTR(MDB_page *p, unsigned i) { return (MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); } -/** Address of the key for the node */ +/* Address of the key for the node */ #define NODEKEY(node) (void *)((node)->mn_data) -/** Address of the data for a node */ +/* Address of the data for a node */ #define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) -/** Get the page number pointed to by a branch node */ +/* Get the page number pointed to by a branch node */ #define NODEPGNO(node) \ ((node)->mn_lo | ((pgno_t)(node)->mn_hi << 16) | \ (PGNO_TOPWORD ? 
((pgno_t)(node)->mn_flags << PGNO_TOPWORD) : 0)) -/** Set the page number in a branch node */ + +/* Set the page number in a branch node */ #define SETPGNO(node, pgno) \ do { \ (node)->mn_lo = (uint16_t)(pgno); \ @@ -536,18 +491,20 @@ static __inline MDB_node *NODEPTR(MDB_page *p, unsigned i) { (node)->mn_flags = (uint16_t)((pgno) >> PGNO_TOPWORD); \ } while (0) -/** Get the size of the data in a leaf node */ +/* Get the size of the data in a leaf node */ #define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) -/** Set the size of the data for a leaf node */ + +/* Set the size of the data for a leaf node */ #define SETDSZ(node, size) \ do { \ (node)->mn_lo = (uint16_t)(size); \ (node)->mn_hi = (uint16_t)((size) >> 16); \ } while (0) -/** The size of a key in a node */ + +/* The size of a key in a node */ #define NODEKSZ(node) ((node)->mn_ksize) -/** Copy a page number from src to dst */ +/* Copy a page number from src to dst */ #if UNALIGNED_OK #define COPY_PGNO(dst, src) dst = src #elif SIZE_MAX > 4294967295UL @@ -572,14 +529,12 @@ static __inline MDB_node *NODEPTR(MDB_page *p, unsigned i) { } while (0) #endif /* UNALIGNED_OK */ -/** The address of a key in a LEAF2 page. - * LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate - *sub-DBs. - * There are no node headers, keys are stored contiguously. - */ +/* The address of a key in a LEAF2 page. + * LEAF2 pages are used for MDB_DUPFIXED sorted-duplicate sub-DBs. + * There are no node headers, keys are stored contiguously. */ #define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i) * (ks))) -/** Set the \b node's key into \b keyptr, if requested. */ +/* Set the node's key into keyptr, if requested. */ #define MDB_GET_KEY(node, keyptr) \ { \ if ((keyptr) != NULL) { \ @@ -588,32 +543,32 @@ static __inline MDB_node *NODEPTR(MDB_page *p, unsigned i) { } \ } -/** Set the \b node's key into \b key. */ +/* Set the node's key into key. 
*/ #define MDB_GET_KEY2(node, key) \ { \ key.mv_size = NODEKSZ(node); \ key.mv_data = NODEKEY(node); \ } -#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ +#define MDB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) -/** #mdbx_dbi_open() flags */ +/* mdbx_dbi_open() flags */ #define VALID_FLAGS \ (MDB_REVERSEKEY | MDB_DUPSORT | MDB_INTEGERKEY | MDB_DUPFIXED | \ MDB_INTEGERDUP | MDB_REVERSEDUP | MDB_CREATE) -/** max number of pages to commit in one writev() call */ +/* max number of pages to commit in one writev() call */ #define MDB_COMMIT_PAGES 64 #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ #undef MDB_COMMIT_PAGES #define MDB_COMMIT_PAGES IOV_MAX #endif -/** Check \b txn and \b dbi arguments to a function */ +/* Check txn and dbi arguments to a function */ #define TXN_DBI_EXIST(txn, dbi, validity) \ ((dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) -/** Check for misused \b dbi handles */ +/* Check for misused dbi handles */ #define TXN_DBI_CHANGED(txn, dbi) \ ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) @@ -638,11 +593,11 @@ enum { MDB_END_FAIL_BEGIN, MDB_END_FAIL_BEGINCHILD }; -#define MDB_END_OPMASK 0x0F /**< mask for #mdbx_txn_end() operation number */ -#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ -#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ -#define MDB_END_EOTDONE 0x40 /**< txn's cursors already closed */ -#define MDB_END_SLOT 0x80 /**< release any reader slot if #MDB_NOTLS */ +#define MDB_END_OPMASK 0x0F /* mask for mdbx_txn_end() operation number */ +#define MDB_END_UPDATE 0x10 /* update env state (DBIs) */ +#define MDB_END_FREE 0x20 /* free txn unless it is MDB_env.me_txn0 */ +#define MDB_END_EOTDONE 0x40 /* txn's cursors already closed */ +#define MDB_END_SLOT 0x80 /* release any reader slot if MDB_NOTLS */ static int mdbx_txn_end(MDB_txn *txn, unsigned mode); static int 
mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); @@ -654,7 +609,7 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags); static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); -#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */ +#define MDB_SPLIT_REPLACE MDB_APPENDDUP /* newkey is not new */ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, unsigned nflags); @@ -700,12 +655,10 @@ static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); static int mdbx_drop0(MDB_cursor *mc, int subs); -/** @cond */ static MDB_cmp_func mdbx_cmp_memn, mdbx_cmp_memnr, mdbx_cmp_int_ai, mdbx_cmp_int_a2, mdbx_cmp_int_ua; -/** @endcond */ -/** Return the library version info. */ +/* Return the library version info. */ const char *mdbx_version(int *major, int *minor, int *patch) { if (major) *major = MDBX_VERSION_MAJOR; @@ -836,7 +789,7 @@ void __cold mdbx_debug_log(int type, const char *function, int line, va_end(args); } -/** Return the page number of \b mp which may be sub-page, for debug output */ +/* Return the page number of mp which may be sub-page, for debug output */ static __inline pgno_t mdbx_dbg_pgno(MDB_page *mp) { pgno_t ret; COPY_PGNO(ret, mp->mp_pgno); @@ -886,109 +839,116 @@ char *mdbx_dkey(const MDB_val *key, char *const buf, const size_t bufsize) { } #if 0 /* LY: debug stuff */ -static const char * -mdbx_leafnode_type(MDB_node *n) -{ - static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; - return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : - tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)]; +static const char *mdbx_leafnode_type(MDB_node *n) { + static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; + return F_ISSET(n->mn_flags, F_BIGDATA) ? 
": overflow page" + : tp[F_ISSET(n->mn_flags, F_DUPDATA)] + [F_ISSET(n->mn_flags, F_SUBDATA)]; } -/** Display all the keys in the page. */ -static void -mdbx_page_list(MDB_page *mp) -{ - pgno_t pgno = mdbx_dbg_pgno(mp); - const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : ""; - MDB_node *node; - unsigned i, nkeys, nsize, total = 0; - MDB_val key; - DKBUF; +/* Display all the keys in the page. */ +static void mdbx_page_list(MDB_page *mp) { + pgno_t pgno = mdbx_dbg_pgno(mp); + const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : ""; + MDB_node *node; + unsigned i, nkeys, nsize, total = 0; + MDB_val key; + DKBUF; - switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) { - case P_BRANCH: type = "Branch page"; break; - case P_LEAF: type = "Leaf page"; break; - case P_LEAF|P_SUBP: type = "Sub-page"; break; - case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; - case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; - case P_OVERFLOW: - mdbx_print("Overflow page %" PRIuPTR " pages %u%s\n", - pgno, mp->mp_pages, state); - return; - case P_META: - mdbx_print("Meta-page %" PRIuPTR " txnid %" PRIuPTR "\n", - pgno, ((MDB_meta *)PAGEDATA(mp))->mm_txnid); - return; - default: - mdbx_print("Bad page %" PRIuPTR " flags 0x%X\n", pgno, mp->mp_flags); - return; - } + switch (mp->mp_flags & + (P_BRANCH | P_LEAF | P_LEAF2 | P_META | P_OVERFLOW | P_SUBP)) { + case P_BRANCH: + type = "Branch page"; + break; + case P_LEAF: + type = "Leaf page"; + break; + case P_LEAF | P_SUBP: + type = "Sub-page"; + break; + case P_LEAF | P_LEAF2: + type = "LEAF2 page"; + break; + case P_LEAF | P_LEAF2 | P_SUBP: + type = "LEAF2 sub-page"; + break; + case P_OVERFLOW: + mdbx_print("Overflow page %" PRIu64 " pages %u%s\n", pgno, mp->mp_pages, + state); + return; + case P_META: + mdbx_print("Meta-page %" PRIu64 " txnid %" PRIu64 "\n", pgno, + ((MDB_meta *)PAGEDATA(mp))->mm_txnid); + return; + default: + mdbx_print("Bad page %" PRIu64 " flags 0x%X\n", pgno, 
mp->mp_flags); + return; + } - nkeys = NUMKEYS(mp); - mdbx_print("%s %" PRIuPTR " numkeys %u%s\n", type, pgno, nkeys, state); + nkeys = NUMKEYS(mp); + mdbx_print("%s %" PRIu64 " numkeys %u%s\n", type, pgno, nkeys, state); - for (i=0; imp_leaf2_ksize; - key.mv_data = LEAF2KEY(mp, i, nsize); - total += nsize; - mdbx_print("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); - continue; - } - node = NODEPTR(mp, i); - key.mv_size = node->mn_ksize; - key.mv_data = node->mn_data; - nsize = NODESIZE + key.mv_size; - if (IS_BRANCH(mp)) { - mdbx_print("key %u: page %" PRIuPTR ", %s\n", i, NODEPGNO(node), DKEY(&key)); - total += nsize; - } else { - if (F_ISSET(node->mn_flags, F_BIGDATA)) - nsize += sizeof(pgno_t); - else - nsize += NODEDSZ(node); - total += nsize; - nsize += sizeof(indx_t); - mdbx_print("key %u: nsize %u, %s%s\n", - i, nsize, DKEY(&key), mdbx_leafnode_type(node)); - } - total = EVEN(total); - } - mdbx_print("Total: header %u + contents %u + unused %u\n", - IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp)); + for (i = 0; i < nkeys; i++) { + if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ + key.mv_size = nsize = mp->mp_leaf2_ksize; + key.mv_data = LEAF2KEY(mp, i, nsize); + total += nsize; + mdbx_print("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); + continue; + } + node = NODEPTR(mp, i); + key.mv_size = node->mn_ksize; + key.mv_data = node->mn_data; + nsize = NODESIZE + key.mv_size; + if (IS_BRANCH(mp)) { + mdbx_print("key %u: page %" PRIu64 ", %s\n", i, NODEPGNO(node), + DKEY(&key)); + total += nsize; + } else { + if (F_ISSET(node->mn_flags, F_BIGDATA)) + nsize += sizeof(pgno_t); + else + nsize += NODEDSZ(node); + total += nsize; + nsize += sizeof(indx_t); + mdbx_print("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key), + mdbx_leafnode_type(node)); + } + total = EVEN(total); + } + mdbx_print("Total: header %u + contents %u + unused %u\n", + IS_LEAF2(mp) ? 
PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, + SIZELEFT(mp)); } -static void -mdbx_cursor_chk(MDB_cursor *mc) -{ - unsigned i; - MDB_node *node; - MDB_page *mp; +static void mdbx_cursor_chk(MDB_cursor *mc) { + unsigned i; + MDB_node *node; + MDB_page *mp; - if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; - for (i=0; imc_top; i++) { - mp = mc->mc_pg[i]; - node = NODEPTR(mp, mc->mc_ki[i]); - if (unlikely(NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)) - mdbx_print("oops!\n"); - } - if (unlikely(mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))) - mdbx_print("ack!\n"); - if (XCURSOR_INITED(mc)) { - node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && - mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { - mdbx_print("blah!\n"); - } - } + if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) + return; + for (i = 0; i < mc->mc_top; i++) { + mp = mc->mc_pg[i]; + node = NODEPTR(mp, mc->mc_ki[i]); + if (unlikely(NODEPGNO(node) != mc->mc_pg[i + 1]->mp_pgno)) + mdbx_print("oops!\n"); + } + if (unlikely(mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))) + mdbx_print("ack!\n"); + if (XCURSOR_INITED(mc)) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (((node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) && + mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { + mdbx_print("blah!\n"); + } + } } #endif /* 0 */ -/** Count all the pages in each DB and in the freelist - * and make sure it matches the actual number of pages - * being used. - * All named DBs must be open for a correct count. - */ +/* Count all the pages in each DB and in the freelist and make sure + * it matches the actual number of pages being used. + * All named DBs must be open for a correct count. 
*/ static void mdbx_audit(MDB_txn *txn) { MDB_cursor mc; MDB_val key, data; @@ -1049,10 +1009,9 @@ int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { return txn->mt_dbxs[dbi].md_dcmp(a, b); } -/** Allocate memory for a page. +/* Allocate memory for a page. * Re-use old malloc'd pages first for singletons, otherwise just malloc. - * Set #MDB_TXN_ERROR on failure. - */ + * Set MDB_TXN_ERROR on failure. */ static MDB_page *mdbx_page_malloc(MDB_txn *txn, unsigned num) { MDB_env *env = txn->mt_env; size_t size = env->me_psize; @@ -1087,17 +1046,16 @@ static MDB_page *mdbx_page_malloc(MDB_txn *txn, unsigned num) { return np; } -/** Free a single page. +/* Free a single page. * Saves single pages to a list, for future reuse. - * (This is not used for multi-page overflow pages.) - */ + * (This is not used for multi-page overflow pages.) */ static __inline void mdbx_page_free(MDB_env *env, MDB_page *mp) { mp->mp_next = env->me_dpages; VALGRIND_MEMPOOL_FREE(env, mp); env->me_dpages = mp; } -/** Free a dirty page */ +/* Free a dirty page */ static void mdbx_dpage_free(MDB_env *env, MDB_page *dp) { if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { mdbx_page_free(env, dp); @@ -1108,7 +1066,7 @@ static void mdbx_dpage_free(MDB_env *env, MDB_page *dp) { } } -/** Return all dirty pages to dpage list */ +/* Return all dirty pages to dpage list */ static void mdbx_dlist_free(MDB_txn *txn) { MDB_env *env = txn->mt_env; MDB_ID2L dl = txn->mt_u.dirty_list; @@ -1137,7 +1095,8 @@ static void __cold mdbx_kill_page(MDB_env *env, pgno_t pgno) { } } -/** Loosen or free a single page. +/* Loosen or free a single page. + * * Saves single pages to a list for future reuse * in this same txn. It has been pulled from the freeDB * and already resides on the dirty list, but has been @@ -1145,8 +1104,7 @@ static void __cold mdbx_kill_page(MDB_env *env, pgno_t pgno) { * from the freeDB. * * If the page wasn't dirtied in this txn, just add it - * to this txn's free list. 
- */ + * to this txn's free list. */ static int mdbx_page_loose(MDB_cursor *mc, MDB_page *mp) { int loose = 0; pgno_t pgno = mp->mp_pgno; @@ -1155,8 +1113,8 @@ static int mdbx_page_loose(MDB_cursor *mc, MDB_page *mp) { if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { if (txn->mt_parent) { MDB_ID2 *dl = txn->mt_u.dirty_list; - /* If txn has a parent, make sure the page is in our - * dirty list. */ + /* If txn has a parent, + * make sure the page is in our dirty list. */ if (dl[0].mid) { unsigned x = mdbx_mid2l_search(dl, pgno); if (x <= dl[0].mid && dl[x].mid == pgno) { @@ -1195,13 +1153,15 @@ static int mdbx_page_loose(MDB_cursor *mc, MDB_page *mp) { return MDB_SUCCESS; } -/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. - * @param[in] mc A cursor handle for the current operation. - * @param[in] pflags Flags of the pages to update: - * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. - * @param[in] all No shortcuts. Needed except after a full #mdbx_page_flush(). - * @return 0 on success, non-zero on failure. - */ +/* Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. + * + * [in] mc A cursor handle for the current operation. + * [in] pflags Flags of the pages to update: + * - P_DIRTY to set P_KEEP, + * - P_DIRTY|P_KEEP to clear it. + * [in] all No shortcuts. Needed except after a full mdbx_page_flush(). + * + * Returns 0 on success, non-zero on failure. */ static int mdbx_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { const unsigned Mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP; MDB_txn *txn = mc->mc_txn; @@ -1261,24 +1221,26 @@ mark_done: static int mdbx_page_flush(MDB_txn *txn, int keep); -/** Spill pages from the dirty list back to disk. - * This is intended to prevent running into #MDB_TXN_FULL situations, +/* Spill pages from the dirty list back to disk. 
+ * This is intended to prevent running into MDB_TXN_FULL situations, * but note that they may still occur in a few cases: - * 1) our estimate of the txn size could be too small. Currently this - * seems unlikely, except with a large number of #MDB_MULTIPLE items. - * 2) child txns may run out of space if their parents dirtied a - * lot of pages and never spilled them. TODO: we probably should do - * a preemptive spill during #mdbx_txn_begin() of a child txn, if - * the parent's dirty_room is below a given threshold. + * + * 1) our estimate of the txn size could be too small. Currently this + * seems unlikely, except with a large number of MDB_MULTIPLE items. + * + * 2) child txns may run out of space if their parents dirtied a + * lot of pages and never spilled them. TODO: we probably should do + * a preemptive spill during mdbx_txn_begin() of a child txn, if + * the parent's dirty_room is below a given threshold. * * Otherwise, if not using nested txns, it is expected that apps will - * not run into #MDB_TXN_FULL any more. The pages are flushed to disk + * not run into MDB_TXN_FULL any more. The pages are flushed to disk * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. * If the txn never references them again, they can be left alone. * If the txn only reads them, they can be used without any fuss. * If the txn writes them again, they can be dirtied immediately without - * going thru all of the work of #mdbx_page_touch(). Such references are - * handled by #mdbx_page_unspill(). + * going thru all of the work of mdbx_page_touch(). Such references are + * handled by mdbx_page_unspill(). * * Also note, we never spill DB root pages, nor pages of active cursors, * because we'll need these back again soon anyway. And in nested txns, @@ -1287,12 +1249,12 @@ static int mdbx_page_flush(MDB_txn *txn, int keep); * the child hasn't committed yet, and we'd have no way to undo it if * the child aborted. 
* - * @param[in] m0 cursor A cursor handle identifying the transaction and - * database for which we are checking space. - * @param[in] key For a put operation, the key being stored. - * @param[in] data For a put operation, the data being stored. - * @return 0 on success, non-zero on failure. - */ + * [in] m0 cursor A cursor handle identifying the transaction and + * database for which we are checking space. + * [in] key For a put operation, the key being stored. + * [in] data For a put operation, the data being stored. + * + * Returns 0 on success, non-zero on failure. */ static int mdbx_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) { MDB_txn *txn = m0->mc_txn; MDB_page *dp; @@ -1354,8 +1316,8 @@ static int mdbx_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) { dp = dl[i].mptr; if (dp->mp_flags & (P_LOOSE | P_KEEP)) continue; - /* Can't spill twice, make sure it's not already in a parent's - * spill list. */ + /* Can't spill twice, + * make sure it's not already in a parent's spill list. */ if (txn->mt_parent) { MDB_txn *tx2; for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { @@ -1393,11 +1355,9 @@ bailout: static __inline uint64_t mdbx_meta_sign(MDB_meta *meta) { uint64_t sign = MDB_DATASIGN_NONE; #if 0 /* TODO */ - sign = hippeus_hash64( - &meta->mm_mapsize, - sizeof(MDB_meta) - offsetof(MDB_meta, mm_mapsize), - meta->mm_version | (uint64_t) MDB_MAGIC << 32 - ); + sign = hippeus_hash64(&meta->mm_mapsize, + sizeof(MDB_meta) - offsetof(MDB_meta, mm_mapsize), + meta->mm_version | (uint64_t)MDB_MAGIC << 32); #else (void)meta; #endif @@ -1416,7 +1376,7 @@ static __inline int mdbx_meta_lt(const MDB_meta *a, const MDB_meta *b) { return META_IS_STEADY(b); } -/** Find oldest txnid still referenced. */ +/* Find oldest txnid still referenced. 
*/ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { const MDB_meta *const a = METAPAGE_1(env); const MDB_meta *const b = METAPAGE_2(env); @@ -1440,7 +1400,7 @@ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { return env->me_pgoldest = oldest; } -/** Add a page to the txn's dirty list */ +/* Add a page to the txn's dirty list */ static void mdbx_page_dirty(MDB_txn *txn, MDB_page *mp) { MDB_ID2 mid; int rc, (*insert)(MDB_ID2L, MDB_ID2 *); @@ -1457,22 +1417,22 @@ static void mdbx_page_dirty(MDB_txn *txn, MDB_page *mp) { txn->mt_dirty_room--; } -/** Allocate page numbers and memory for writing. Maintain me_pglast, - * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. +/* Allocate page numbers and memory for writing. Maintain me_pglast, + * me_pghead and mt_next_pgno. Set MDB_TXN_ERROR on failure. * * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. * Do not modify the freedB, just merge freeDB records into me_pghead[] * and move me_pglast to say which records were consumed. Only this * function can create me_pghead and move me_pglast/mt_next_pgno. - * @param[in] mc cursor A cursor handle identifying the transaction and - * database for which we are allocating. - * @param[in] num the number of pages to allocate. - * @param[out] mp Address of the allocated page(s). Requests for multiple - *pages - * will always be satisfied by a single contiguous chunk of memory. - * @return 0 on success, non-zero on failure. - */ + * + * [in] mc cursor A cursor handle identifying the transaction and + * database for which we are allocating. + * [in] num the number of pages to allocate. + * [out] mp Address of the allocated page(s). Requests for multiple pages + * will always be satisfied by a single contiguous chunk of memory. 
+ * + * Returns 0 on success, non-zero on failure.*/ #define MDBX_ALLOC_CACHE 1 #define MDBX_ALLOC_GC 2 @@ -1796,11 +1756,10 @@ done: return MDB_SUCCESS; } -/** Copy the used portions of a non-overflow page. - * @param[in] dst page to copy into - * @param[in] src page to copy from - * @param[in] psize size of a page - */ +/* Copy the used portions of a non-overflow page. + * [in] dst page to copy into + * [in] src page to copy from + * [in] psize size of a page */ static void mdbx_page_copy(MDB_page *dst, MDB_page *src, unsigned psize) { enum { Align = sizeof(pgno_t) }; indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; @@ -1817,14 +1776,15 @@ static void mdbx_page_copy(MDB_page *dst, MDB_page *src, unsigned psize) { } } -/** Pull a page off the txn's spill list, if present. +/* Pull a page off the txn's spill list, if present. + * * If a page being referenced was spilled to disk in this txn, bring * it back and make it dirty/writable again. - * @param[in] txn the transaction handle. - * @param[in] mp the page being referenced. It must not be dirty. - * @param[out] ret the writable page, if any. ret is unchanged if - * mp wasn't spilled. - */ + * + * [in] txn the transaction handle. + * [in] mp the page being referenced. It must not be dirty. + * [out] ret the writable page, if any. + * ret is unchanged if mp wasn't spilled. */ static int mdbx_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) { MDB_env *env = txn->mt_env; const MDB_txn *tx2; @@ -1876,11 +1836,12 @@ static int mdbx_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) { return MDB_SUCCESS; } -/** Touch a page: make it dirty and re-insert into tree with updated pgno. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc cursor pointing to the page to be touched - * @return 0 on success, non-zero on failure. - */ +/* Touch a page: make it dirty and re-insert into tree with updated pgno. + * Set MDB_TXN_ERROR on failure. 
+ * + * [in] mc cursor pointing to the page to be touched + * + * Returns 0 on success, non-zero on failure. */ static int mdbx_page_touch(MDB_cursor *mc) { MDB_page *mp = mc->mc_pg[mc->mc_top], *np; MDB_txn *txn = mc->mc_txn; @@ -2044,7 +2005,7 @@ int mdbx_env_sync(MDB_env *env, int force) { return MDB_SUCCESS; } -/** Back up parent txn's cursors, then grab the originals for tracking */ +/* Back up parent txn's cursors, then grab the originals for tracking */ static int mdbx_cursor_shadow(MDB_txn *src, MDB_txn *dst) { MDB_cursor *mc, *bk; MDB_xcursor *mx; @@ -2080,11 +2041,12 @@ static int mdbx_cursor_shadow(MDB_txn *src, MDB_txn *dst) { return MDB_SUCCESS; } -/** Close this write txn's cursors, give parent txn's cursors back to parent. - * @param[in] txn the transaction handle. - * @param[in] merge true to keep changes to parent cursors, false to revert. - * @return 0 on success, non-zero on failure. - */ +/* Close this write txn's cursors, give parent txn's cursors back to parent. + * + * [in] txn the transaction handle. + * [in] merge true to keep changes to parent cursors, false to revert. + * + * Returns 0 on success, non-zero on failure. */ static void mdbx_cursors_eot(MDB_txn *txn, unsigned merge) { MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; MDB_xcursor *mx; @@ -2127,7 +2089,7 @@ static void mdbx_cursors_eot(MDB_txn *txn, unsigned merge) { } } -/* Common code for #mdbx_txn_begin() and #mdbx_txn_renew(). */ +/* Common code for mdbx_txn_begin() and mdbx_txn_renew(). 
*/ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { MDB_env *env = txn->mt_env; unsigned i, nr; @@ -2382,7 +2344,7 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, } txn->mt_dbxs = env->me_dbxs; /* static */ txn->mt_dbs = (MDB_db *)((char *)txn + tsize); - txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; + txn->mt_dbflags = (uint8_t *)txn + size - env->me_maxdbs; txn->mt_flags = flags; txn->mt_env = env; @@ -2462,7 +2424,7 @@ uint64_t mdbx_txn_id(MDB_txn *txn) { static void mdbx_dbis_update(MDB_txn *txn, int keep) { MDB_dbi n = txn->mt_numdbs; MDB_env *env = txn->mt_env; - unsigned char *tdbflags = txn->mt_dbflags; + uint8_t *tdbflags = txn->mt_dbflags; for (unsigned i = n; --i >= CORE_DBS;) { if (tdbflags[i] & DB_NEW) { @@ -2486,8 +2448,8 @@ static void mdbx_dbis_update(MDB_txn *txn, int keep) { /* End a transaction, except successful commit of a nested transaction. * May be called twice for readonly txns: First reset it, then abort. - * @param[in] txn the transaction handle to end - * @param[in] mode why and how to end the transaction */ + * [in] txn the transaction handle to end + * [in] mode why and how to end the transaction */ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { MDB_env *env = txn->mt_env; static const char *const names[] = MDB_END_NAMES; @@ -2632,7 +2594,7 @@ static int mdbx_prep_backlog(MDB_txn *txn, MDB_cursor *mc) { return MDB_SUCCESS; } -/** Save the freelist as of this transaction to the freeDB. +/* Save the freelist as of this transaction to the freeDB. * This changes the freelist. Keep trying until it stabilizes. */ static int mdbx_freelist_save(MDB_txn *txn) { @@ -2784,8 +2746,7 @@ again: /* LY: other troubles... */ goto bailout; - /* LY: freedb is empty, will look any free txn-id in high2low order. - */ + /* LY: freedb is empty, will look any free txn-id in high2low order. */ if (unlikely(env->me_pglast < 1)) { /* LY: not any txn in the past of freedb. 
*/ rc = MDB_MAP_FULL; @@ -2948,11 +2909,10 @@ bailout: return rc; } -/** Flush (some) dirty pages to the map, after clearing their dirty flag. - * @param[in] txn the transaction that's being committed - * @param[in] keep number of initial pages in dirty_list to keep dirty. - * @return 0 on success, non-zero on failure. - */ +/* Flush (some) dirty pages to the map, after clearing their dirty flag. + * [in] txn the transaction that's being committed + * [in] keep number of initial pages in dirty_list to keep dirty. + * Returns 0 on success, non-zero on failure. */ static int mdbx_page_flush(MDB_txn *txn, int keep) { MDB_env *env = txn->mt_env; MDB_ID2L dl = txn->mt_u.dirty_list; @@ -3542,8 +3502,7 @@ static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, * is wrapped by wmutex, all of these changes will become visible * after the wmutex is unlocked. Since the DB is multi-version, * readers will get consistent data regardless of how fresh or - * how stale their view of these values is. - */ + * how stale their view of these values is. */ /* LY: step#3 - sync meta-pages. */ if ((flags & (MDB_NOSYNC | MDB_NOMETASYNC)) == 0) { @@ -3711,8 +3670,7 @@ int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { return MDBX_EINVAL; /* If env is already open, caller is responsible for making - * sure there are no active txns. - */ + * sure there are no active txns. */ if (env->me_map) { int rc; MDB_meta *meta; @@ -3995,10 +3953,9 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { return rc; } -/** Only a subset of the @ref mdbx_env flags can be changed - * at runtime. Changing other flags requires closing the - * environment and re-opening it with the new flags. - */ +/* Only a subset of the mdbx_env flags can be changed + * at runtime. Changing other flags requires closing the + * environment and re-opening it with the new flags. 
*/ #define CHANGEABLE \ (MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC | MDB_NOMEMINIT | \ MDBX_COALESCE | MDBX_PAGEPERTURB) @@ -4048,8 +4005,8 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, rc = MDB_SUCCESS; flags |= env->me_flags; if (flags & MDB_RDONLY) { - /* LY: silently ignore irrelevant flags when we're only getting read - * access */ + /* LY: silently ignore irrelevant flags when + * we're only getting read access */ flags &= ~(MDB_WRITEMAP | MDB_MAPASYNC | MDB_NOSYNC | MDB_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM | MDB_NOMEMINIT); } else { @@ -4136,7 +4093,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, txn->mt_dbs = (MDB_db *)((char *)txn + tsize); txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); - txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); + txn->mt_dbflags = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); txn->mt_env = env; txn->mt_dbxs = env->me_dbxs; txn->mt_flags = MDB_TXN_FINISHED; @@ -4176,7 +4133,7 @@ int __cold mdbx_env_open(MDB_env *env, const char *path, unsigned flags, return mdbx_env_open_ex(env, path, flags, mode, NULL); } -/** Destroy resources from mdbx_env_open(), clear our readers & DBIs */ +/* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ static void __cold mdbx_env_close0(MDB_env *env) { if (!(env->me_flags & MDB_ENV_ACTIVE)) return; @@ -4272,7 +4229,7 @@ void __cold mdbx_env_close(MDB_env *env) { mdbx_env_close_ex(env, 0); } #define mdbx_cmp2int(a, b) (((a) > (b)) - ((b) > (a))) #endif -/** Compare two items pointing at aligned unsigned int's. */ +/* Compare two items pointing at aligned unsigned int's. 
*/ static int __hot mdbx_cmp_int_ai(const MDB_val *a, const MDB_val *b) { mdbx_assert(NULL, a->mv_size == b->mv_size); mdbx_assert(NULL, 0 == (uintptr_t)a->mv_data % sizeof(int) && @@ -4289,7 +4246,7 @@ static int __hot mdbx_cmp_int_ai(const MDB_val *a, const MDB_val *b) { } } -/** Compare two items pointing at 2-byte aligned unsigned int's. */ +/* Compare two items pointing at 2-byte aligned unsigned int's. */ static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { mdbx_assert(NULL, a->mv_size == b->mv_size); mdbx_assert(NULL, 0 == (uintptr_t)a->mv_data % sizeof(uint16_t) && @@ -4332,10 +4289,9 @@ static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { #endif /* UNALIGNED_OK */ } -/** Compare two items pointing at unsigneds of unknown alignment. +/* Compare two items pointing at unsigneds of unknown alignment. * - * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp. - */ + * This is also set as MDB_INTEGERDUP|MDB_DUPFIXED's MDB_dbx.md_dcmp. 
*/ static int __hot mdbx_cmp_int_ua(const MDB_val *a, const MDB_val *b) { mdbx_assert(NULL, a->mv_size == b->mv_size); #if UNALIGNED_OK @@ -4372,18 +4328,18 @@ static int __hot mdbx_cmp_int_ua(const MDB_val *a, const MDB_val *b) { #endif /* UNALIGNED_OK */ } -/** Compare two items lexically */ +/* Compare two items lexically */ static int __hot mdbx_cmp_memn(const MDB_val *a, const MDB_val *b) { /* LY: assumes that length of keys are NOT equal for most cases, * if no then branch-prediction should mitigate the problem */ #if 0 - /* LY: without branch instructions on x86, - * but isn't best for equal length of keys */ - int diff_len = mdbx_cmp2int(a->mv_size, b->mv_size); + /* LY: without branch instructions on x86, + * but isn't best for equal length of keys */ + int diff_len = mdbx_cmp2int(a->mv_size, b->mv_size); #else /* LY: best when length of keys are equal, * but got a branch-penalty otherwise */ - if (unlikely(a->mv_size == b->mv_size)) + if (likely(a->mv_size == b->mv_size)) return memcmp(a->mv_data, b->mv_data, a->mv_size); int diff_len = (a->mv_size < b->mv_size) ? -1 : 1; #endif @@ -4392,7 +4348,7 @@ static int __hot mdbx_cmp_memn(const MDB_val *a, const MDB_val *b) { return likely(diff_data) ? diff_data : diff_len; } -/** Compare two items in reverse byte order */ +/* Compare two items in reverse byte order */ static int __hot mdbx_cmp_memnr(const MDB_val *a, const MDB_val *b) { const uint8_t *pa, *pb, *end; @@ -4409,13 +4365,12 @@ static int __hot mdbx_cmp_memnr(const MDB_val *a, const MDB_val *b) { return mdbx_cmp2int(a->mv_size, b->mv_size); } -/** Search for key within a page, using binary search. +/* Search for key within a page, using binary search. * Returns the smallest entry larger or equal to the key. * If exactp is non-null, stores whether the found entry was an exact match * in *exactp (1 or 0). * Updates the cursor index with the index of the found entry. - * If no entry larger or equal to the key is found, returns NULL. 
- */ + * If no entry larger or equal to the key is found, returns NULL. */ static MDB_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) { unsigned i = 0, nkeys; @@ -4496,21 +4451,19 @@ static MDB_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, return IS_LEAF2(mp) ? node : NODEPTR(mp, i); } -#if 0 -static void -mdbx_cursor_adjust(MDB_cursor *mc, func) -{ - MDB_cursor *m2; +#if 0 /* unused for now */ +static void mdbx_cursor_adjust(MDB_cursor *mc, func) { + MDB_cursor *m2; - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { - func(mc, m2); - } - } + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { + func(mc, m2); + } + } } #endif -/** Pop a page off the top of the cursor's stack. */ +/* Pop a page off the top of the cursor's stack. */ static void mdbx_cursor_pop(MDB_cursor *mc) { if (mc->mc_snum) { mdbx_debug("popped page %" PRIuPTR " off db %d cursor %p", @@ -4525,9 +4478,8 @@ static void mdbx_cursor_pop(MDB_cursor *mc) { } } -/** Push a page onto the top of the cursor's stack. - * Set #MDB_TXN_ERROR on failure. - */ +/* Push a page onto the top of the cursor's stack. + * Set MDB_TXN_ERROR on failure. */ static int mdbx_cursor_push(MDB_cursor *mc, MDB_page *mp) { mdbx_debug("pushing page %" PRIuPTR " on db %d cursor %p", mp->mp_pgno, DDBI(mc), (void *)mc); @@ -4544,16 +4496,17 @@ static int mdbx_cursor_push(MDB_cursor *mc, MDB_page *mp) { return MDB_SUCCESS; } -/** Find the address of the page corresponding to a given page number. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc the cursor accessing the page. - * @param[in] pgno the page number for the page to retrieve. - * @param[out] ret address of a pointer where the page's address will be - * stored. - * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, - * 0=mapped page. 
- * @return 0 on success, non-zero on failure. - */ +/* Find the address of the page corresponding to a given page number. + * Set MDB_TXN_ERROR on failure. + * + * [in] mc the cursor accessing the page. + * [in] pgno the page number for the page to retrieve. + * [out] ret address of a pointer where the page's address will be + * stored. + * [out] lvl dirty_list inheritance level of found page. 1=current txn, + * 0=mapped page. + * + * Returns 0 on success, non-zero on failure. */ static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) { MDB_txn *txn = mc->mc_txn; @@ -4605,9 +4558,8 @@ done: return MDB_SUCCESS; } -/** Finish #mdbx_page_search() / #mdbx_page_search_lowest(). - * The cursor is at the root page, set up the rest of it. - */ +/* Finish mdbx_page_search() / mdbx_page_search_lowest(). + * The cursor is at the root page, set up the rest of it. */ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { MDB_page *mp = mc->mc_pg[mc->mc_top]; int rc; @@ -4621,8 +4573,7 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { NUMKEYS(mp)); /* Don't assert on branch pages in the FreeDB. We can get here * while in the process of rebalancing a FreeDB branch page; we must - * let that proceed. ITS#8336 - */ + * let that proceed. ITS#8336 */ mdbx_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); mdbx_debug("found index 0 to page %" PRIuPTR "", NODEPGNO(NODEPTR(mp, 0))); @@ -4687,12 +4638,11 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { return MDB_SUCCESS; } -/** Search for the lowest key under the current branch page. +/* Search for the lowest key under the current branch page. * This just bypasses a NUMKEYS check in the current page * before calling mdbx_page_search_root(), because the callers * are all in situations where the current page is known to - * be underfilled. - */ + * be underfilled. 
*/ static int mdbx_page_search_lowest(MDB_cursor *mc) { MDB_page *mp = mc->mc_pg[mc->mc_top]; MDB_node *node = NODEPTR(mp, 0); @@ -4707,24 +4657,25 @@ static int mdbx_page_search_lowest(MDB_cursor *mc) { return mdbx_page_search_root(mc, NULL, MDB_PS_FIRST); } -/** Search for the page a given key should be in. +/* Search for the page a given key should be in. * Push it and its parent pages on the cursor stack. - * @param[in,out] mc the cursor for this operation. - * @param[in] key the key to search for, or NULL for first/last page. - * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB - * are touched (updated with new page numbers). - * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. - * This is used by #mdbx_cursor_first() and #mdbx_cursor_last(). - * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. - * @return 0 on success, non-zero on failure. - */ + * + * [in,out] mc the cursor for this operation. + * [in] key the key to search for, or NULL for first/last page. + * [in] flags If MDB_PS_MODIFY is set, visited pages in the DB + * are touched (updated with new page numbers). + * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. + * This is used by mdbx_cursor_first() and mdbx_cursor_last(). + * If MDB_PS_ROOTONLY set, just fetch root node, no further + * lookups. + * + * Returns 0 on success, non-zero on failure. */ static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { int rc; pgno_t root; /* Make sure the txn is still viable, then find the root from - * the txn's db table and set it as the root of the cursor's stack. - */ + * the txn's db table and set it as the root of the cursor's stack. */ if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) { mdbx_debug("transaction has failed, must abort"); return MDB_BAD_TXN; @@ -4807,8 +4758,7 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDB_page *mp) { * * Won't create me_pghead: me_pglast must be inited along with it. 
* Unsupported in nested txns: They would need to hide the page - * range in ancestor txns' dirty and spilled lists. - */ + * range in ancestor txns' dirty and spilled lists. */ if (env->me_pghead && !txn->mt_parent && ((mp->mp_flags & P_DIRTY) || (sl && (x = mdbx_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) { @@ -4863,12 +4813,13 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDB_page *mp) { return 0; } -/** Return the data associated with a given node. - * @param[in] mc The cursor for this operation. - * @param[in] leaf The node being read. - * @param[out] data Updated to point to the node's data. - * @return 0 on success, non-zero on failure. - */ +/* Return the data associated with a given node. + * + * [in] mc The cursor for this operation. + * [in] leaf The node being read. + * [out] data Updated to point to the node's data. + * + * Returns 0 on success, non-zero on failure. */ static __inline int mdbx_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) { MDB_page *omp; /* overflow page */ @@ -4881,8 +4832,7 @@ static __inline int mdbx_node_read(MDB_cursor *mc, MDB_node *leaf, return MDB_SUCCESS; } - /* Read overflow data. - */ + /* Read overflow data. */ data->mv_size = NODEDSZ(leaf); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); if (unlikely((rc = mdbx_page_get(mc, pgno, &omp, NULL)) != 0)) { @@ -4918,14 +4868,15 @@ int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { return mdbx_cursor_set(&mc, key, data, MDB_SET, &exact); } -/** Find a sibling for a page. - * Replaces the page at the top of the cursor's stack with the - * specified sibling, if one exists. - * @param[in] mc The cursor for this operation. - * @param[in] move_right Non-zero if the right sibling is requested, - * otherwise the left sibling. - * @return 0 on success, non-zero on failure. - */ +/* Find a sibling for a page. + * Replaces the page at the top of the cursor's stack with the specified + * sibling, if one exists. + * + * [in] mc The cursor for this operation. 
+ * [in] move_right Non-zero if the right sibling is requested, + * otherwise the left sibling. + * + * Returns 0 on success, non-zero on failure. */ static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { int rc; MDB_node *indx; @@ -4974,7 +4925,7 @@ static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { return MDB_SUCCESS; } -/** Move the cursor to the next data item. */ +/* Move the cursor to the next data item. */ static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) { MDB_page *mp; @@ -5063,7 +5014,7 @@ skip: return MDB_SUCCESS; } -/** Move the cursor to the previous data item. */ +/* Move the cursor to the previous data item. */ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) { MDB_page *mp; @@ -5150,7 +5101,7 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, return MDB_SUCCESS; } -/** Set the cursor on a specific data item. */ +/* Set the cursor on a specific data item. */ static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, int *exactp) { int rc; @@ -5343,7 +5294,7 @@ set1: return rc; } -/** Move the cursor to the first item in the database. */ +/* Move the cursor to the first item in the database. */ static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) { int rc; MDB_node *leaf; @@ -5385,7 +5336,7 @@ static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) { return MDB_SUCCESS; } -/** Move the cursor to the last item in the database. */ +/* Move the cursor to the last item in the database. */ static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) { int rc; MDB_node *leaf; @@ -5592,11 +5543,9 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, return rc; } -/** Touch all the pages in the cursor stack. Set mc_top. - * Makes sure all the pages are writable, before attempting a write - *operation. 
- * @param[in] mc The cursor to operate on. - */ +/* Touch all the pages in the cursor stack. Set mc_top. + * Makes sure all the pages are writable, before attempting a write operation. + * [in] mc The cursor to operate on. */ static int mdbx_cursor_touch(MDB_cursor *mc) { int rc = MDB_SUCCESS; @@ -5851,10 +5800,10 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, /* DB has dups? */ if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { - /* Prepare (sub-)page/sub-DB to accept the new item, - * if needed. fp: old sub-page or a header faking - * it. mp: new (sub-)page. offset: growth in page - * size. xdata: node data with new page or DB. */ + /* Prepare (sub-)page/sub-DB to accept the new item, if needed. + * fp: old sub-page or a header faking it. + * mp: new (sub-)page. offset: growth in page size. + * xdata: node data with new page or DB. */ unsigned i, offset = 0; MDB_page *mp = fp = xdata.mv_data = env->me_pbuf; mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; @@ -6313,15 +6262,16 @@ fail: return rc; } -/** Allocate and initialize new pages for a database. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc a cursor on the database being added to. - * @param[in] flags flags defining what type of page is being allocated. - * @param[in] num the number of pages to allocate. This is usually 1, - * unless allocating overflow pages for a large record. - * @param[out] mp Address of a page, or NULL on failure. - * @return 0 on success, non-zero on failure. - */ +/* Allocate and initialize new pages for a database. + * Set MDB_TXN_ERROR on failure. + * + * [in] mc a cursor on the database being added to. + * [in] flags flags defining what type of page is being allocated. + * [in] num the number of pages to allocate. This is usually 1, + * unless allocating overflow pages for a large record. + * [out] mp Address of a page, or NULL on failure. + * + * Returns 0 on success, non-zero on failure. 
*/ static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) { MDB_page *np; @@ -6348,17 +6298,19 @@ static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, return MDB_SUCCESS; } -/** Calculate the size of a leaf node. +/* Calculate the size of a leaf node. + * * The size depends on the environment's page size; if a data item * is too large it will be put onto an overflow page and the node * size will only include the key and not the data. Sizes are always * rounded up to an even number of bytes, to guarantee 2-byte alignment - * of the #MDB_node headers. - * @param[in] env The environment handle. - * @param[in] key The key for the node. - * @param[in] data The data for the node. - * @return The number of bytes needed to store the node. - */ + * of the MDB_node headers. + * + * [in] env The environment handle. + * [in] key The key for the node. + * [in] data The data for the node. + * + * Returns The number of bytes needed to store the node. */ static __inline size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data) { size_t sz; @@ -6372,16 +6324,18 @@ static __inline size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, return EVEN(sz + sizeof(indx_t)); } -/** Calculate the size of a branch node. +/* Calculate the size of a branch node. + * * The size should depend on the environment's page size but since * we currently don't support spilling large keys onto overflow - * pages, it's simply the size of the #MDB_node header plus the + * pages, it's simply the size of the MDB_node header plus the * size of the key. Sizes are always rounded up to an even number - * of bytes, to guarantee 2-byte alignment of the #MDB_node headers. - * @param[in] env The environment handle. - * @param[in] key The key for the node. - * @return The number of bytes needed to store the node. - */ + * of bytes, to guarantee 2-byte alignment of the MDB_node headers. + * + * [in] env The environment handle. + * [in] key The key for the node. 
+ * + * Returns The number of bytes needed to store the node. */ static __inline size_t mdbx_branch_size(MDB_env *env, MDB_val *key) { size_t sz; @@ -6397,22 +6351,22 @@ static __inline size_t mdbx_branch_size(MDB_env *env, MDB_val *key) { return sz + sizeof(indx_t); } -/** Add a node to the page pointed to by the cursor. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc The cursor for this operation. - * @param[in] indx The index on the page where the new node should be added. - * @param[in] key The key for the new node. - * @param[in] data The data for the new node, if any. - * @param[in] pgno The page number, if adding a branch node. - * @param[in] flags Flags for the node. - * @return 0 on success, non-zero on failure. Possible errors are: - *
      - *
    • MDBX_ENOMEM - failed to allocate overflow pages for the node. - *
    • MDB_PAGE_FULL - there is insufficient room in the page. This error - * should never happen since all callers already calculate the - * page's free space before calling this function. - *
    - */ +/* Add a node to the page pointed to by the cursor. + * Set MDB_TXN_ERROR on failure. + * + * [in] mc The cursor for this operation. + * [in] indx The index on the page where the new node should be added. + * [in] key The key for the new node. + * [in] data The data for the new node, if any. + * [in] pgno The page number, if adding a branch node. + * [in] flags Flags for the node. + * + * Returns 0 on success, non-zero on failure. Possible errors are: + * + * MDBX_ENOMEM - failed to allocate overflow pages for the node. + * MDB_PAGE_FULL - there is insufficient room in the page. This error + * should never happen since all callers already calculate + * the page's free space before calling this function. */ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, MDB_val *data, pgno_t pgno, unsigned flags) { unsigned i; @@ -6537,11 +6491,10 @@ full: return MDB_PAGE_FULL; } -/** Delete the specified node from a page. - * @param[in] mc Cursor pointing to the node to delete. - * @param[in] ksize The size of a node. Only used if the page is - * part of a #MDB_DUPFIXED database. - */ +/* Delete the specified node from a page. + * [in] mc Cursor pointing to the node to delete. + * [in] ksize The size of a node. Only used if the page is + * part of a MDB_DUPFIXED database. */ static void mdbx_node_del(MDB_cursor *mc, int ksize) { MDB_page *mp = mc->mc_pg[mc->mc_top]; indx_t indx = mc->mc_ki[mc->mc_top]; @@ -6592,10 +6545,9 @@ static void mdbx_node_del(MDB_cursor *mc, int ksize) { mp->mp_upper += sz; } -/** Compact the main page after deleting a node on a subpage. - * @param[in] mp The main page to operate on. - * @param[in] indx The index of the subpage on the main page. - */ +/* Compact the main page after deleting a node on a subpage. + * [in] mp The main page to operate on. + * [in] indx The index of the subpage on the main page. 
*/ static void mdbx_node_shrink(MDB_page *mp, indx_t indx) { MDB_node *node; MDB_page *sp, *xp; @@ -6635,16 +6587,16 @@ static void mdbx_node_shrink(MDB_page *mp, indx_t indx) { mp->mp_upper += delta; } -/** Initial setup of a sorted-dups cursor. +/* Initial setup of a sorted-dups cursor. + * * Sorted duplicates are implemented as a sub-database for the given key. * The duplicate data items are actually keys of the sub-database. * Operations on the duplicate data items are performed using a sub-cursor * initialized when the sub-database is first accessed. This function does * the preliminary setup of the sub-cursor, filling in the fields that * depend only on the parent DB. - * @param[in] mc The main cursor whose sorted-dups cursor is to be - * initialized. - */ + * + * [in] mc The main cursor whose sorted-dups cursor is to be initialized. */ static void mdbx_xcursor_init0(MDB_cursor *mc) { MDB_xcursor *mx = mc->mc_xcursor; @@ -6663,12 +6615,10 @@ static void mdbx_xcursor_init0(MDB_cursor *mc) { mx->mx_dbx.md_dcmp = NULL; } -/** Final setup of a sorted-dups cursor. - * Sets up the fields that depend on the data from the main cursor. - * @param[in] mc The main cursor whose sorted-dups cursor is to be - *initialized. - * @param[in] node The data containing the #MDB_db record for the - * sorted-dup database. +/* Final setup of a sorted-dups cursor. + * Sets up the fields that depend on the data from the main cursor. + * [in] mc The main cursor whose sorted-dups cursor is to be initialized. + * [in] node The data containing the MDB_db record for the sorted-dup database. */ static void mdbx_xcursor_init1(MDB_cursor *mc, MDB_node *node) { MDB_xcursor *mx = mc->mc_xcursor; @@ -6711,14 +6661,13 @@ static void mdbx_xcursor_init1(MDB_cursor *mc, MDB_node *node) { #endif */ } -/** Fixup a sorted-dups cursor due to underlying update. - * Sets up some fields that depend on the data from the main cursor. 
- * Almost the same as init1, but skips initialization steps if the - * xcursor had already been used. - * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up. - * @param[in] src_mx The xcursor of an up-to-date cursor. - * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item. - */ +/* Fixup a sorted-dups cursor due to underlying update. + * Sets up some fields that depend on the data from the main cursor. + * Almost the same as init1, but skips initialization steps if the + * xcursor had already been used. + * [in] mc The main cursor whose sorted-dups cursor is to be fixed up. + * [in] src_mx The xcursor of an up-to-date cursor. + * [in] new_dupdata True if converting from a non-F_DUPDATA item. */ static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) { MDB_xcursor *mx = mc->mc_xcursor; @@ -6739,7 +6688,7 @@ static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, mx->mx_db.md_root); } -/** Initialize a cursor for a given transaction and database. */ +/* Initialize a cursor for a given transaction and database. */ static void mdbx_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) { mc->mc_signature = MDBX_MC_SIGNATURE; @@ -6912,12 +6861,11 @@ MDB_dbi mdbx_cursor_dbi(MDB_cursor *mc) { return mc->mc_dbi; } -/** Replace the key for a branch node with a new key. - * Set #MDB_TXN_ERROR on failure. - * @param[in] mc Cursor pointing to the node to operate on. - * @param[in] key The new key to use. - * @return 0 on success, non-zero on failure. - */ +/* Replace the key for a branch node with a new key. + * Set MDB_TXN_ERROR on failure. + * [in] mc Cursor pointing to the node to operate on. + * [in] key The new key to use. + * Returns 0 on success, non-zero on failure. 
*/ static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { MDB_page *mp; MDB_node *node; @@ -6983,7 +6931,7 @@ static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); -/** Perform \b act while tracking temporary cursor \b mn */ +/* Perform act while tracking temporary cursor mn */ #define WITH_CURSOR_TRACKING(mn, act) \ do { \ MDB_cursor mc_dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ @@ -7000,8 +6948,7 @@ static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); *tp = tracked->mc_next; \ } while (0) -/** Move a node from csrc to cdst. - */ +/* Move a node from csrc to cdst. */ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { MDB_node *srcnode; MDB_val key, data; @@ -7218,14 +7165,15 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { return MDB_SUCCESS; } -/** Merge one page into another. - * The nodes from the page pointed to by \b csrc will - * be copied to the page pointed to by \b cdst and then - * the \b csrc page will be freed. - * @param[in] csrc Cursor pointing to the source page. - * @param[in] cdst Cursor pointing to the destination page. - * @return 0 on success, non-zero on failure. - */ +/* Merge one page into another. + * + * The nodes from the page pointed to by csrc will be copied to the page + * pointed to by cdst and then the csrc page will be freed. + * + * [in] csrc Cursor pointing to the source page. + * [in] cdst Cursor pointing to the destination page. + * + * Returns 0 on success, non-zero on failure. */ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { MDB_page *psrc, *pdst; MDB_node *srcnode; @@ -7250,8 +7198,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { /* get dst page again now that we've touched it. */ pdst = cdst->mc_pg[cdst->mc_top]; - /* Move all nodes from src to dst. - */ + /* Move all nodes from src to dst. 
*/ j = nkeys = NUMKEYS(pdst); if (IS_LEAF2(psrc)) { key.mv_size = csrc->mc_db->md_xsize; @@ -7300,8 +7247,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { pdst->mp_pgno, NUMKEYS(pdst), (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10); - /* Unlink the src page from parent and add to free list. - */ + /* Unlink the src page from parent and add to free list. */ csrc->mc_top--; mdbx_node_del(csrc, 0); if (csrc->mc_ki[csrc->mc_top] == 0) { @@ -7316,8 +7262,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { psrc = csrc->mc_pg[csrc->mc_top]; /* If not operating on FreeDB, allow this page to be reused - * in this txn. Otherwise just add to free list. - */ + * in this txn. Otherwise just add to free list. */ rc = mdbx_page_loose(csrc, psrc); if (unlikely(rc)) return rc; @@ -7366,10 +7311,9 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { return rc; } -/** Copy the contents of a cursor. - * @param[in] csrc The cursor to copy from. - * @param[out] cdst The cursor to copy to. - */ +/* Copy the contents of a cursor. + * [in] csrc The cursor to copy from. + * [out] cdst The cursor to copy to. */ static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) { unsigned i; @@ -7387,11 +7331,9 @@ static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) { } } -/** Rebalance the tree after a delete operation. - * @param[in] mc Cursor pointing to the page where rebalancing - * should begin. - * @return 0 on success, non-zero on failure. - */ +/* Rebalance the tree after a delete operation. + * [in] mc Cursor pointing to the page where rebalancing should begin. + * Returns 0 on success, non-zero on failure. */ static int mdbx_rebalance(MDB_cursor *mc) { MDB_node *node; int rc, fromleft; @@ -7503,25 +7445,21 @@ static int mdbx_rebalance(MDB_cursor *mc) { } /* The parent (branch page) must have at least 2 pointers, - * otherwise the tree is invalid. - */ + * otherwise the tree is invalid. 
*/ ptop = mc->mc_top - 1; mdbx_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1); /* Leaf page fill factor is below the threshold. * Try to move keys from left or right neighbor, or - * merge with a neighbor page. - */ + * merge with a neighbor page. */ - /* Find neighbors. - */ + /* Find neighbors. */ mdbx_cursor_copy(mc, &mn); mn.mc_xcursor = NULL; oldki = mc->mc_ki[mc->mc_top]; if (mc->mc_ki[ptop] == 0) { - /* We're the leftmost leaf in our parent. - */ + /* We're the leftmost leaf in our parent. */ mdbx_debug("reading right neighbor"); mn.mc_ki[ptop]++; node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); @@ -7532,8 +7470,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); fromleft = 0; } else { - /* There is at least one neighbor to the left. - */ + /* There is at least one neighbor to the left. */ mdbx_debug("reading left neighbor"); mn.mc_ki[ptop]--; node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); @@ -7551,8 +7488,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { /* If the neighbor page is above threshold and has enough keys, * move one key from it. Otherwise we should try to merge them. - * (A branch page must never have less than 2 keys.) - */ + * (A branch page must never have less than 2 keys.) */ if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { rc = mdbx_node_move(&mn, mc, fromleft); @@ -7576,7 +7512,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { return rc; } -/** Complete a delete operation started by #mdbx_cursor_del(). */ +/* Complete a delete operation started by mdbx_cursor_del(). */ static int mdbx_cursor_del0(MDB_cursor *mc) { int rc; MDB_page *mp; @@ -7618,8 +7554,7 @@ static int mdbx_cursor_del0(MDB_cursor *mc) { if (likely(rc == MDB_SUCCESS)) { /* DB is totally empty now, just bail out. * Other cursors adjustments were already done - * by mdbx_rebalance and aren't needed here. - */ + * by mdbx_rebalance and aren't needed here. 
*/ if (!mc->mc_snum) { mc->mc_flags |= C_DEL | C_EOF; return rc; @@ -7652,8 +7587,7 @@ static int mdbx_cursor_del0(MDB_cursor *mc) { /* If this node has dupdata, it may need to be reinited * because its data has moved. * If the xcursor was not initd it must be reinited. - * Else if node points to a subDB, nothing is needed. - */ + * Else if node points to a subDB, nothing is needed. */ if (node->mn_flags & F_DUPDATA) { if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { if (!(node->mn_flags & F_SUBDATA)) @@ -7719,8 +7653,7 @@ static int mdbx_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, * update the parent's separator key(s). If the new sepkey * is larger than the current one, the parent page may * run out of space, triggering a split. We need this - * cursor to be consistent until the end of the rebalance. - */ + * cursor to be consistent until the end of the rebalance. */ mc.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &mc; rc = mdbx_cursor_del(&mc, flags); @@ -7729,17 +7662,16 @@ static int mdbx_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, return rc; } -/** Split a page and insert a new node. - * Set #MDB_TXN_ERROR on failure. - * @param[in,out] mc Cursor pointing to the page and desired insertion index. +/* Split a page and insert a new node. + * Set MDB_TXN_ERROR on failure. + * [in,out] mc Cursor pointing to the page and desired insertion index. * The cursor will be updated to point to the actual page and index where * the node got inserted after the split. - * @param[in] newkey The key for the newly inserted node. - * @param[in] newdata The data for the newly inserted node. - * @param[in] newpgno The page number, if the new node is a branch node. - * @param[in] nflags The #NODE_ADD_FLAGS for the new node. - * @return 0 on success, non-zero on failure. - */ + * [in] newkey The key for the newly inserted node. + * [in] newdata The data for the newly inserted node. 
+ * [in] newpgno The page number, if the new node is a branch node. + * [in] nflags The NODE_ADD_FLAGS for the new node. + * Returns 0 on success, non-zero on failure. */ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, unsigned nflags) { unsigned flags; @@ -7774,8 +7706,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, /* Usually when splitting the root page, the cursor * height is 1. But when called from mdbx_update_key, * the cursor height may be greater because it walks - * up the stack while finding the branch slot to update. - */ + * up the stack while finding the branch slot to update. */ if (mc->mc_top < 1) { if ((rc = mdbx_page_new(mc, P_BRANCH, 1, &pp))) goto done; @@ -7966,8 +7897,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, ptop++; } /* Right page might now have changed parent. - * Check if left page also changed parent. - */ + * Check if left page also changed parent. */ if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { for (i = 0; i < ptop; i++) { @@ -8062,8 +7992,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, } else { mc->mc_pg[mc->mc_top] = rp; mc->mc_ki[ptop]++; - /* Make sure mc_ki is still valid. - */ + /* Make sure mc_ki is still valid. */ if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { for (i = 0; i <= ptop; i++) { @@ -8081,8 +8010,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, if (newindx >= split_indx) { mc->mc_pg[mc->mc_top] = rp; mc->mc_ki[ptop]++; - /* Make sure mc_ki is still valid. - */ + /* Make sure mc_ki is still valid. 
*/ if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { for (i = 0; i <= ptop; i++) { @@ -8209,29 +8137,28 @@ int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, #ifndef MDB_WBUF #define MDB_WBUF (1024 * 1024) #endif -#define MDB_EOF 0x10 /**< #mdbx_env_copyfd1() is done reading */ +#define MDB_EOF 0x10 /* mdbx_env_copyfd1() is done reading */ -/** State needed for a double-buffering compacting copy. */ +/* State needed for a double-buffering compacting copy. */ typedef struct mdbx_copy { MDB_env *mc_env; MDB_txn *mc_txn; mdbx_mutex_t mc_mutex; - mdbx_cond_t mc_cond; /**< Condition variable for #mc_new */ + mdbx_cond_t mc_cond; /* Condition variable for mc_new */ char *mc_wbuf[2]; char *mc_over[2]; int mc_wlen[2]; int mc_olen[2]; pgno_t mc_next_pgno; mdbx_filehandle_t mc_fd; - int mc_toggle; /**< Buffer number in provider */ - int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ - /** Error code. Never cleared if set. Both threads can set nonzero - * to fail the copy. Not mutex-protected, LMDB expects atomic int. - */ + int mc_toggle; /* Buffer number in provider */ + int mc_new; /* (0-2 buffers to write) | (MDB_EOF at end) */ + /* Error code. Never cleared if set. Both threads can set nonzero + * to fail the copy. Not mutex-protected, LMDB expects atomic int. */ volatile int mc_error; } mdbx_copy; -/** Dedicated writer thread for compacting copy. */ +/* Dedicated writer thread for compacting copy. */ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { mdbx_copy *my = arg; char *ptr; @@ -8269,11 +8196,10 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { return (THREAD_RESULT)0; } -/** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. +/* Give buffer and/or MDB_EOF to writer thread, await unused buffer. * - * @param[in] my control structure. - * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). - */ + * [in] my control structure. 
+ * [in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). */ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { mdbx_mutex_lock(&my->mc_mutex); my->mc_new += adjust; @@ -8288,11 +8214,10 @@ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { return my->mc_error; } -/** Depth-first tree traversal for compacting copy. - * @param[in] my control structure. - * @param[in,out] pg database root. - * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. - */ +/* Depth-first tree traversal for compacting copy. + * [in] my control structure. + * [in,out] pg database root. + * [in] flags includes F_DUPDATA if it is a sorted-duplicate sub-DB. */ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { MDB_cursor mc; MDB_node *ni; @@ -8412,8 +8337,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { mc.mc_ki[mc.mc_top] = 0; if (IS_BRANCH(mp)) { /* Whenever we advance to a sibling branch page, - * we must proceed all the way down to its first leaf. - */ + * we must proceed all the way down to its first leaf. */ mdbx_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize); goto again; } else @@ -8447,7 +8371,7 @@ done: return rc; } -/** Copy environment with compaction. */ +/* Copy environment with compaction. */ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { MDB_meta *mm; MDB_page *mp; @@ -8517,8 +8441,7 @@ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { mm->mm_dbs[MAIN_DBI].md_root = new_root; } else { /* When the DB is empty, handle it specially to - * fix any breakage like page leaks from ITS#8174. - */ + * fix any breakage like page leaks from ITS#8174. */ mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; } if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { @@ -8547,7 +8470,7 @@ done2: return rc ? rc : my.mc_error; } -/** Copy environment as-is. */ +/* Copy environment as-is. 
*/ static int __cold mdbx_env_copy_asis(MDB_env *env, mdbx_filehandle_t fd) { MDB_txn *txn = NULL; int rc; @@ -8702,12 +8625,11 @@ int __cold mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *arg) { return MDB_SUCCESS; } -/** Common code for #mdbx_dbi_stat() and #mdbx_env_stat(). - * @param[in] env the environment to operate in. - * @param[in] db the #MDB_db record containing the stats to return. - * @param[out] arg the address of an #MDB_stat structure to receive the stats. - * @return 0, this function always succeeds. - */ +/* Common code for mdbx_dbi_stat() and mdbx_env_stat(). + * [in] env the environment to operate in. + * [in] db the MDB_db record containing the stats to return. + * [out] arg the address of an MDB_stat structure to receive the stats. + * Returns 0, this function always succeeds. */ static int __cold mdbx_stat0(MDB_env *env, MDB_db *db, MDBX_stat *arg) { arg->ms_psize = env->me_psize; arg->ms_depth = db->md_depth; @@ -9021,11 +8943,10 @@ int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags) { return MDB_SUCCESS; } -/** Add all the DB's pages to the free list. - * @param[in] mc Cursor on the DB to free. - * @param[in] subs non-Zero to check for sub-DBs in this DB. - * @return 0 on success, non-zero on failure. - */ +/* Add all the DB's pages to the free list. + * [in] mc Cursor on the DB to free. + * [in] subs non-Zero to check for sub-DBs in this DB. + * Returns 0 on success, non-zero on failure. */ static int mdbx_drop0(MDB_cursor *mc, int subs) { int rc; @@ -9039,8 +8960,7 @@ static int mdbx_drop0(MDB_cursor *mc, int subs) { /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. * This also avoids any P_LEAF2 pages, which have no nodes. * Also if the DB doesn't have sub-DBs and has no overflow - * pages, omit scanning leaves. - */ + * pages, omit scanning leaves. 
*/ if ((mc->mc_flags & C_SUB) || (!subs && !mc->mc_db->md_overflow_pages)) mdbx_cursor_pop(mc); @@ -9093,8 +9013,7 @@ static int mdbx_drop0(MDB_cursor *mc, int subs) { if (unlikely(rc != MDB_NOTFOUND)) goto done; /* no more siblings, go back to beginning - * of previous level. - */ + * of previous level. */ pop: mdbx_cursor_pop(mc); mc->mc_ki[0] = 0; @@ -9204,8 +9123,6 @@ int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { } int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { - unsigned i, snap_nreaders; - MDB_reader *mr; char buf[64]; int rc = 0, first = 1; @@ -9215,9 +9132,9 @@ int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - snap_nreaders = env->me_lck->mti_numreaders; - mr = env->me_lck->mti_readers; - for (i = 0; i < snap_nreaders; i++) { + unsigned snap_nreaders = env->me_lck->mti_numreaders; + MDB_reader *mr = env->me_lck->mti_readers; + for (unsigned i = 0; i < snap_nreaders; i++) { if (mr[i].mr_pid) { txnid_t txnid = mr[i].mr_txnid; if (txnid == ~(txnid_t)0) @@ -9244,9 +9161,8 @@ int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { return rc; } -/** Insert pid into list if not already present. - * return -1 if already present. - */ +/* Insert pid into list if not already present. + * return -1 if already present. 
*/ static int __cold mdbx_pid_insert(mdbx_pid_t *ids, mdbx_pid_t pid) { /* binary search of pid in list */ unsigned base = 0; @@ -9254,7 +9170,7 @@ static int __cold mdbx_pid_insert(mdbx_pid_t *ids, mdbx_pid_t pid) { int val = 0; unsigned n = ids[0]; - while (0 < n) { + while (n > 0) { unsigned pivot = n >> 1; cursor = base + pivot + 1; val = pid - ids[cursor]; @@ -9270,9 +9186,9 @@ static int __cold mdbx_pid_insert(mdbx_pid_t *ids, mdbx_pid_t pid) { } } - if (val > 0) { + if (val > 0) ++cursor; - } + ids[0]++; for (n = ids[0]; n > cursor; n--) ids[n] = ids[n - 1]; @@ -9368,33 +9284,30 @@ static unsigned __hot mdbx_midl_search(MDB_IDL ids, MDB_ID id) { /* * binary search of id in ids * if found, returns position of id - * if not found, returns first position greater than id - */ + * if not found, returns first position greater than id */ unsigned base = 0; unsigned cursor = 1; int val = 0; unsigned n = ids[0]; - while (0 < n) { + while (n > 0) { unsigned pivot = n >> 1; cursor = base + pivot + 1; val = mdbx_cmp2int(ids[cursor], id); if (val < 0) { n = pivot; - } else if (val > 0) { base = cursor; n -= pivot + 1; - } else { return cursor; } } - if (val > 0) { + if (val > 0) ++cursor; - } + return cursor; } @@ -9579,14 +9492,13 @@ static unsigned __hot mdbx_mid2l_search(MDB_ID2L ids, MDB_ID id) { /* * binary search of id in ids * if found, returns position of id - * if not found, returns first position greater than id - */ + * if not found, returns first position greater than id */ unsigned base = 0; unsigned cursor = 1; int val = 0; unsigned n = (unsigned)ids[0].mid; - while (0 < n) { + while (n > 0) { unsigned pivot = n >> 1; cursor = base + pivot + 1; val = mdbx_cmp2int(id, ids[cursor].mid); @@ -9608,21 +9520,19 @@ static unsigned __hot mdbx_mid2l_search(MDB_ID2L ids, MDB_ID id) { } static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id) { - unsigned x, i; - - x = mdbx_mid2l_search(ids, id->mid); - if (x < 1) + unsigned x = mdbx_mid2l_search(ids, id->mid); + if 
(unlikely(x < 1)) return /* internal error */ -2; if (x <= ids[0].mid && ids[x].mid == id->mid) return /* duplicate */ -1; - if (ids[0].mid >= MDB_IDL_UM_MAX) + if (unlikely(ids[0].mid >= MDB_IDL_UM_MAX)) return /* too big */ -2; /* insert id */ ids[0].mid++; - for (i = (unsigned)ids[0].mid; i > x; i--) + for (unsigned i = (unsigned)ids[0].mid; i > x; i--) ids[i] = ids[i - 1]; ids[x] = *id; return 0; @@ -9630,9 +9540,9 @@ static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id) { static int mdbx_mid2l_append(MDB_ID2L ids, MDB_ID2 *id) { /* Too big? */ - if (ids[0].mid >= MDB_IDL_UM_MAX) { + if (unlikely(ids[0].mid >= MDB_IDL_UM_MAX)) return -2; - } + ids[0].mid++; ids[ids[0].mid] = *id; return 0; @@ -9653,17 +9563,16 @@ int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn) { } static txnid_t __cold mdbx_oomkick(MDB_env *env, txnid_t oldest) { - int retry; - txnid_t snap; mdbx_debug("DB size maxed out"); - for (retry = 0;; ++retry) { + int retry; + for (retry = 0; retry < INT_MAX; ++retry) { int reader; if (mdbx_reader_check(env, NULL)) break; - snap = mdbx_find_oldest(env, &reader); + txnid_t snap = mdbx_find_oldest(env, &reader); if (oldest < snap || reader < 0) { if (retry && env->me_oom_func) { /* LY: notify end of oom-loop */ @@ -9736,10 +9645,6 @@ __attribute__((no_sanitize_thread, noinline)) #endif int mdbx_txn_straggler(MDB_txn *txn, int *percent) { - MDB_env *env; - MDB_meta *meta; - txnid_t lag; - if (unlikely(!txn)) return -MDBX_EINVAL; @@ -9749,8 +9654,8 @@ int mdbx_txn_straggler(MDB_txn *txn, int *percent) if (unlikely(!txn->mt_u.reader)) return -1; - env = txn->mt_env; - meta = mdbx_meta_head(env); + MDB_env *env = txn->mt_env; + MDB_meta *meta = mdbx_meta_head(env); if (percent) { size_t maxpg = env->me_maxpg; size_t last = meta->mm_last_pg + 1; @@ -9758,7 +9663,7 @@ int mdbx_txn_straggler(MDB_txn *txn, int *percent) last = env->me_txn0->mt_next_pgno; *percent = (last * 100ull + maxpg / 2) / maxpg; } - lag = meta->mm_txnid - 
txn->mt_u.reader->mr_txnid; + txnid_t lag = meta->mm_txnid - txn->mt_u.reader->mr_txnid; return (lag > INT_MAX) ? INT_MAX : (int)lag; } @@ -9768,7 +9673,7 @@ typedef struct mdbx_walk_ctx { MDBX_pgvisitor_func *mw_visitor; } mdbx_walk_ctx_t; -/** Depth-first tree traversal. */ +/* Depth-first tree traversal. */ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, pgno_t pg, int deep) { MDB_page *mp; @@ -9898,21 +9803,19 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, int __cold mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, void *user) { - mdbx_walk_ctx_t ctx; - int rc; - if (unlikely(!txn)) return MDB_BAD_TXN; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + mdbx_walk_ctx_t ctx; ctx.mw_txn = txn; ctx.mw_user = user; ctx.mw_visitor = visitor; - rc = visitor(0, 2, user, "lmdb", "meta", 2, sizeof(MDB_meta) * 2, - PAGEHDRSZ * 2, - (txn->mt_env->me_psize - sizeof(MDB_meta) - PAGEHDRSZ) * 2); + int rc = visitor(0, 2, user, "lmdb", "meta", 2, sizeof(MDB_meta) * 2, + PAGEHDRSZ * 2, + (txn->mt_env->me_psize - sizeof(MDB_meta) - PAGEHDRSZ) * 2); if (!rc) rc = mdbx_env_walk(&ctx, "free", txn->mt_dbs[FREE_DBI].md_root, 0); if (!rc) @@ -9974,8 +9877,7 @@ int mdbx_cursor_on_first(MDB_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED)) return MDBX_RESULT_FALSE; - unsigned i; - for (i = 0; i < mc->mc_snum; ++i) { + for (unsigned i = 0; i < mc->mc_snum; ++i) { if (mc->mc_ki[i]) return MDBX_RESULT_FALSE; } @@ -9993,8 +9895,7 @@ int mdbx_cursor_on_last(MDB_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED)) return MDBX_RESULT_FALSE; - unsigned i; - for (i = 0; i < mc->mc_snum; ++i) { + for (unsigned i = 0; i < mc->mc_snum; ++i) { unsigned nkeys = NUMKEYS(mc->mc_pg[i]); if (mc->mc_ki[i] < nkeys - 1) return MDBX_RESULT_FALSE; @@ -10053,9 +9954,6 @@ static int mdbx_is_samedata(const MDB_val *a, const MDB_val *b) { */ int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, MDB_val 
*old_data, unsigned flags) { - MDB_cursor mc; - MDB_xcursor mx; - if (unlikely(!key || !old_data || !txn || old_data == new_data)) return MDBX_EINVAL; @@ -10079,6 +9977,8 @@ int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) return (txn->mt_flags & MDB_TXN_RDONLY) ? MDBX_EACCESS : MDB_BAD_TXN; + MDB_cursor mc; + MDB_xcursor mx; mdbx_cursor_init(&mc, txn, dbi, &mx); mc.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &mc; @@ -10104,26 +10004,6 @@ int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, if (mdbx_is_samedata(old_data, new_data)) /* если данные совпадают, то ничего делать не надо */ goto bailout; -#if 0 /* LY: исправлено в mdbx_cursor_put(), здесь в качестве памятки */ - MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); - if (F_ISSET(leaf->mn_flags, F_DUPDATA) && - mc.mc_xcursor->mx_db.md_entries > 1) { - /* Если у ключа больше одного значения, то - * сначала удаляем найденое "старое" значение. - * - * Этого можно не делать, так как MDBX уже - * обучен корректно обрабатывать такие ситуации. - * - * Однако, следует помнить, что в LMDB при - * совпадении размера данных, значение будет - * просто перезаписано с нарушением - * упорядоченности, что сломает поиск. */ - rc = mdbx_cursor_del(&mc, 0); - if (rc != MDB_SUCCESS) - goto bailout; - flags -= MDB_CURRENT; - } -#endif } } else { /* в old_data буфер для сохранения предыдущего значения */ diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index b99f3b2b..e5eb4416 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -11,8 +11,7 @@ * * A copy of this license is available in the file LICENSE in the * top-level directory of the distribution or, alternatively, at - * . - */ + * . 
*/ #include #include diff --git a/src/tools/mdbx_copy.c b/src/tools/mdbx_copy.c index 9eb3c49c..3b413e17 100644 --- a/src/tools/mdbx_copy.c +++ b/src/tools/mdbx_copy.c @@ -11,8 +11,7 @@ * * A copy of this license is available in the file LICENSE in the * top-level directory of the distribution or, alternatively, at - * . - */ + * . */ #include "../../mdbx.h" #include diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index 871dd55e..92cf8905 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -11,8 +11,7 @@ * * A copy of this license is available in the file LICENSE in the * top-level directory of the distribution or, alternatively, at - * . - */ + * . */ #include "../../mdbx.h" #include diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index 5af8a913..dc707504 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -11,8 +11,7 @@ * * A copy of this license is available in the file LICENSE in the * top-level directory of the distribution or, alternatively, at - * . - */ + * . */ #include "../../mdbx.h" #include diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index 46eeb94d..47102e1c 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -11,8 +11,7 @@ * * A copy of this license is available in the file LICENSE in the * top-level directory of the distribution or, alternatively, at - * . - */ + * . */ #include "../../mdbx.h" #include From 5c2042c46678b9b6d141825811dc2b8bf49e5fd7 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 16:46:55 +0300 Subject: [PATCH 139/303] tools: internal minor renames (preparation for changes). 
--- src/tools/mdbx_chk.c | 128 ++++++++++++++++++++++-------------------- src/tools/mdbx_load.c | 16 +++--- 2 files changed, 76 insertions(+), 68 deletions(-) diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index e5eb4416..c0b57551 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -76,8 +76,8 @@ int exclusive = 2; MDB_env *env; MDB_txn *txn, *locktxn; -MDBX_envinfo info; -MDBX_stat stat; +MDBX_envinfo envinfo; +MDBX_stat envstat; size_t maxkeysize, reclaimable_pages, freedb_pages, lastpgno; size_t userdb_count, skipped_subdb; unsigned verbose, quiet; @@ -229,7 +229,7 @@ static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, if (type) { size_t page_bytes = payload_bytes + header_bytes + unused_bytes; - size_t page_size = pgnumber * stat.ms_psize; + size_t page_size = pgnumber * envstat.ms_psize; int index = pagemap_lookup_dbi(dbi); if (index < 0) return ENOMEM; @@ -248,13 +248,13 @@ static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, if (unused_bytes < 0 || (size_t)unused_bytes > page_size) problem_add("page", pgno, "illegal unused-bytes", "%" PRIuPTR " < %i < %" PRIuPTR "", 0, unused_bytes, - stat.ms_psize); + envstat.ms_psize); if (header_bytes < (int)sizeof(long) || - (size_t)header_bytes >= stat.ms_psize - sizeof(long)) + (size_t)header_bytes >= envstat.ms_psize - sizeof(long)) problem_add("page", pgno, "illegal header-length", "%" PRIuPTR " < %i < %" PRIuPTR "", sizeof(long), - header_bytes, stat.ms_psize - sizeof(long)); + header_bytes, envstat.ms_psize - sizeof(long)); if (payload_bytes < 1) { if (nentries > 1) { problem_add("page", pgno, "zero size-of-entry", @@ -301,17 +301,20 @@ static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, return gotsignal ? 
EINTR : MDB_SUCCESS; } -typedef int(visitor)(size_t record_number, MDB_val *key, MDB_val *data); +typedef int(visitor)(const size_t record_number, const MDB_val *key, + const MDB_val *data); static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent); -static int handle_userdb(size_t record_number, MDB_val *key, MDB_val *data) { +static int handle_userdb(const size_t record_number, const MDB_val *key, + const MDB_val *data) { (void)record_number; (void)key; (void)data; return MDB_SUCCESS; } -static int handle_freedb(size_t record_number, MDB_val *key, MDB_val *data) { +static int handle_freedb(const size_t record_number, const MDB_val *key, + const MDB_val *data) { char *bad = ""; size_t pg, prev; ssize_t i, number, span = 0; @@ -320,7 +323,7 @@ static int handle_freedb(size_t record_number, MDB_val *key, MDB_val *data) { if (key->mv_size != sizeof(txnid)) problem_add("entry", record_number, "wrong txn-id size", "key-size %" PRIiPTR "", key->mv_size); - else if (txnid < 1 || txnid > info.me_last_txnid) + else if (txnid < 1 || txnid > envinfo.me_last_txnid) problem_add("entry", record_number, "wrong txn-id", "%" PRIuPTR "", txnid); if (data->mv_size < sizeof(size_t) || data->mv_size % sizeof(size_t)) @@ -337,13 +340,14 @@ static int handle_freedb(size_t record_number, MDB_val *key, MDB_val *data) { data->mv_size); else { freedb_pages += number; - if (info.me_tail_txnid > txnid) + if (envinfo.me_tail_txnid > txnid) reclaimable_pages += number; for (i = number, prev = 1; --i >= 0;) { pg = iptr[i]; - if (pg < 2 /* META_PAGE */ || pg > info.me_last_pgno) + if (pg < 2 /* META_PAGE */ || pg > envinfo.me_last_pgno) problem_add("entry", record_number, "wrong idl entry", - "2 < %" PRIiPTR " < %" PRIiPTR "", pg, info.me_last_pgno); + "2 < %" PRIiPTR " < %" PRIiPTR "", pg, + envinfo.me_last_pgno); else if (pg <= prev) { bad = " [bad sequence]"; problem_add("entry", record_number, "bad sequence", @@ -357,7 +361,7 @@ static int handle_freedb(size_t 
record_number, MDB_val *key, MDB_val *data) { if (verbose > 2 && !only_subdb) { print(" transaction %" PRIuPTR ", %" PRIiPTR " pages, maxspan %" PRIiPTR "%s\n", - *(size_t *)key->mv_data, number, span, bad); + txnid, number, span, bad); if (verbose > 3) { int j = number - 1; while (j >= 0) { @@ -377,7 +381,8 @@ static int handle_freedb(size_t record_number, MDB_val *key, MDB_val *data) { return MDB_SUCCESS; } -static int handle_maindb(size_t record_number, MDB_val *key, MDB_val *data) { +static int handle_maindb(const size_t record_number, const MDB_val *key, + const MDB_val *data) { char *name; int rc; size_t i; @@ -721,58 +726,60 @@ int main(int argc, char *argv[]) { goto bailout; } - rc = mdbx_env_info(env, &info, sizeof(info)); + rc = mdbx_env_info(env, &envinfo, sizeof(envinfo)); if (rc) { error("mdbx_env_info failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } - rc = mdbx_env_stat(env, &stat, sizeof(stat)); + rc = mdbx_env_stat(env, &envstat, sizeof(envstat)); if (rc) { error("mdbx_env_stat failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } - lastpgno = info.me_last_pgno + 1; + lastpgno = envinfo.me_last_pgno + 1; errno = 0; if (verbose) { double k = 1024.0; const char sf[] = "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! 
*/ - for (i = 0; sf[i + 1] && info.me_mapsize / k > 1000.0; ++i) + for (i = 0; sf[i + 1] && envinfo.me_mapsize / k > 1000.0; ++i) k *= 1024; - print(" - map size %" PRIuPTR " (%.2f %cb)\n", info.me_mapsize, - info.me_mapsize / k, sf[i]); - if (info.me_mapaddr) - print(" - mapaddr %p\n", info.me_mapaddr); + print(" - map size %" PRIuPTR " (%.2f %cb)\n", envinfo.me_mapsize, + envinfo.me_mapsize / k, sf[i]); + if (envinfo.me_mapaddr) + print(" - mapaddr %p\n", envinfo.me_mapaddr); print(" - pagesize %u, max keysize %" PRIuPTR ", max readers %u\n", - stat.ms_psize, maxkeysize, info.me_maxreaders); + envstat.ms_psize, maxkeysize, envinfo.me_maxreaders); print(" - transactions: last %" PRIuPTR ", bottom %" PRIuPTR ", lag reading %" PRIiPTR "\n", - info.me_last_txnid, info.me_tail_txnid, - info.me_last_txnid - info.me_tail_txnid); + envinfo.me_last_txnid, envinfo.me_tail_txnid, + envinfo.me_last_txnid - envinfo.me_tail_txnid); - print(" - meta-1: %s %" PRIuPTR ", %s", meta_synctype(info.me_meta1_sign), - info.me_meta1_txnid, meta_lt(info.me_meta1_txnid, info.me_meta1_sign, - info.me_meta2_txnid, info.me_meta2_sign) - ? "tail" - : "head"); - if (info.me_meta1_txnid > info.me_last_txnid) + print(" - meta-1: %s %" PRIuPTR ", %s", + meta_synctype(envinfo.me_meta1_sign), envinfo.me_meta1_txnid, + meta_lt(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, + envinfo.me_meta2_txnid, envinfo.me_meta2_sign) + ? "tail" + : "head"); + if (envinfo.me_meta1_txnid > envinfo.me_last_txnid) print(", rolled-back %" PRIuPTR " (%" PRIuPTR " >>> %" PRIuPTR ")", - info.me_meta1_txnid - info.me_last_txnid, info.me_meta1_txnid, - info.me_last_txnid); + envinfo.me_meta1_txnid - envinfo.me_last_txnid, + envinfo.me_meta1_txnid, envinfo.me_last_txnid); print("\n"); - print(" - meta-2: %s %" PRIuPTR ", %s", meta_synctype(info.me_meta2_sign), - info.me_meta2_txnid, meta_lt(info.me_meta2_txnid, info.me_meta2_sign, - info.me_meta1_txnid, info.me_meta1_sign) - ? 
"tail" - : "head"); - if (info.me_meta2_txnid > info.me_last_txnid) + print(" - meta-2: %s %" PRIuPTR ", %s", + meta_synctype(envinfo.me_meta2_sign), envinfo.me_meta2_txnid, + meta_lt(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign) + ? "tail" + : "head"); + if (envinfo.me_meta2_txnid > envinfo.me_last_txnid) print(", rolled-back %" PRIuPTR " (%" PRIuPTR " >>> %" PRIuPTR ")", - info.me_meta2_txnid - info.me_last_txnid, info.me_meta2_txnid, - info.me_last_txnid); + envinfo.me_meta2_txnid - envinfo.me_last_txnid, + envinfo.me_meta2_txnid, envinfo.me_last_txnid); print("\n"); } @@ -780,34 +787,34 @@ int main(int argc, char *argv[]) { if (verbose) print(" - perform full check last-txn-id with meta-pages\n"); - if (!meta_lt(info.me_meta1_txnid, info.me_meta1_sign, info.me_meta2_txnid, - info.me_meta2_sign) && - info.me_meta1_txnid != info.me_last_txnid) { + if (!meta_lt(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, + envinfo.me_meta2_txnid, envinfo.me_meta2_sign) && + envinfo.me_meta1_txnid != envinfo.me_last_txnid) { print(" - meta-1 txn-id mismatch last-txn-id (%" PRIiPTR " != %" PRIiPTR ")\n", - info.me_meta1_txnid, info.me_last_txnid); + envinfo.me_meta1_txnid, envinfo.me_last_txnid); ++problems_meta; } - if (!meta_lt(info.me_meta2_txnid, info.me_meta2_sign, info.me_meta1_txnid, - info.me_meta1_sign) && - info.me_meta2_txnid != info.me_last_txnid) { + if (!meta_lt(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign) && + envinfo.me_meta2_txnid != envinfo.me_last_txnid) { print(" - meta-2 txn-id mismatch last-txn-id (%" PRIiPTR " != %" PRIiPTR ")\n", - info.me_meta2_txnid, info.me_last_txnid); + envinfo.me_meta2_txnid, envinfo.me_last_txnid); ++problems_meta; } } else if (locktxn) { if (verbose) print(" - perform lite check last-txn-id with meta-pages (not a " "monopolistic mode)\n"); - size_t last = (info.me_meta2_txnid > info.me_meta1_txnid) - ? 
info.me_meta2_txnid - : info.me_meta1_txnid; - if (last != info.me_last_txnid) { + size_t last = (envinfo.me_meta2_txnid > envinfo.me_meta1_txnid) + ? envinfo.me_meta2_txnid + : envinfo.me_meta1_txnid; + if (last != envinfo.me_last_txnid) { print(" - last-meta mismatch last-txn-id (%" PRIiPTR " != %" PRIiPTR ")\n", - last, info.me_last_txnid); + last, envinfo.me_last_txnid); ++problems_meta; } } else if (verbose) { @@ -854,7 +861,7 @@ int main(int argc, char *argv[]) { } if (verbose) { - size_t total_page_bytes = walk.pgcount * stat.ms_psize; + size_t total_page_bytes = walk.pgcount * envstat.ms_psize; print(" - dbi pages: %" PRIuPTR " total", walk.pgcount); if (verbose > 1) for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) @@ -870,7 +877,7 @@ int main(int argc, char *argv[]) { (total_page_bytes - walk.total_payload_bytes) * 100.0 / total_page_bytes); for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { - size_t dbi_bytes = walk.dbi_pages[i] * stat.ms_psize; + size_t dbi_bytes = walk.dbi_pages[i] * envstat.ms_psize; print(" %s: subtotal %" PRIuPTR " bytes (%.1f%%), payload %" PRIuPTR " (%.1f%%), " "unused %" PRIuPTR " (%.1f%%)", @@ -905,13 +912,13 @@ int main(int argc, char *argv[]) { problems_freedb = process_db(0 /* FREE_DBI */, "free", handle_freedb, 0); if (verbose) { - size_t value = info.me_mapsize / stat.ms_psize; + size_t value = envinfo.me_mapsize / envstat.ms_psize; double percent = value / 100.0; print(" - pages info: %" PRIuPTR " total", value); print(", allocated %" PRIuPTR " (%.1f%%)", lastpgno, lastpgno / percent); if (verbose > 1) { - value = info.me_mapsize / stat.ms_psize - lastpgno; + value = envinfo.me_mapsize / envstat.ms_psize - lastpgno; print(", remained %" PRIuPTR " (%.1f%%)", value, value / percent); value = lastpgno - freedb_pages; @@ -926,7 +933,8 @@ int main(int argc, char *argv[]) { reclaimable_pages / percent); } - value = info.me_mapsize / stat.ms_psize - lastpgno + reclaimable_pages; + value = + envinfo.me_mapsize / 
envstat.ms_psize - lastpgno + reclaimable_pages; print(", available %" PRIuPTR " (%.1f%%)\n", value, value / percent); } diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index dc707504..f61db314 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -37,7 +37,7 @@ static char *prog; static int Eof; -static MDBX_envinfo info; +static MDBX_envinfo envinfo; static MDB_val kbuf, dbuf; @@ -108,7 +108,7 @@ static void readhdr(void) { if (ptr) *ptr = '\0'; i = sscanf((char *)dbuf.mv_data + STRLENOF("mapaddr="), "%p", - &info.me_mapaddr); + &envinfo.me_mapaddr); if (i != 1) { fprintf(stderr, "%s: line %" PRIiPTR ": invalid mapaddr %s\n", prog, lineno, (char *)dbuf.mv_data + STRLENOF("mapaddr=")); @@ -120,7 +120,7 @@ static void readhdr(void) { if (ptr) *ptr = '\0'; i = sscanf((char *)dbuf.mv_data + STRLENOF("mapsize="), "%" PRIuPTR "", - &info.me_mapsize); + &envinfo.me_mapsize); if (i != 1) { fprintf(stderr, "%s: line %" PRIiPTR ": invalid mapsize %s\n", prog, lineno, (char *)dbuf.mv_data + STRLENOF("mapsize=")); @@ -132,7 +132,7 @@ static void readhdr(void) { if (ptr) *ptr = '\0'; i = sscanf((char *)dbuf.mv_data + STRLENOF("maxreaders="), "%u", - &info.me_maxreaders); + &envinfo.me_maxreaders); if (i != 1) { fprintf(stderr, "%s: line %" PRIiPTR ": invalid maxreaders %s\n", prog, lineno, (char *)dbuf.mv_data + STRLENOF("maxreaders=")); @@ -361,11 +361,11 @@ int main(int argc, char *argv[]) { mdbx_env_set_maxdbs(env, 2); - if (info.me_maxreaders) - mdbx_env_set_maxreaders(env, info.me_maxreaders); + if (envinfo.me_maxreaders) + mdbx_env_set_maxreaders(env, envinfo.me_maxreaders); - if (info.me_mapsize) - mdbx_env_set_mapsize(env, info.me_mapsize); + if (envinfo.me_mapsize) + mdbx_env_set_mapsize(env, envinfo.me_mapsize); #ifdef MDB_FIXEDMAP if (info.me_mapaddr) From 0cd30792bb818f2202bb6db7496d73110a7c54a0 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 14:49:12 +0300 Subject: [PATCH 140/303] mdbx: split MDB_ID into pgno_t and txnid_t. 
--- src/bits.h | 40 +++---- src/mdbx.c | 258 +++++++++++++++++++++---------------------- src/midl.h | 6 +- src/tools/mdbx_chk.c | 26 +++-- 4 files changed, 158 insertions(+), 172 deletions(-) diff --git a/src/bits.h b/src/bits.h index 3acc23ba..6b176d83 100644 --- a/src/bits.h +++ b/src/bits.h @@ -152,37 +152,27 @@ /* Number of meta pages - also hardcoded elsewhere */ #define NUM_METAS 2 -/* A generic unsigned ID number. These were entryIDs in back-bdb. -* Preferably it should have the same size as a pointer. -*/ -typedef size_t MDB_ID; +/* A page number in the database. + * + * MDBX uses 32 bit for page numbers. This limits database + * size up to 2^44 bytes, in case of 4K pages. */ +typedef uint32_t pgno_t; +#define PRIaPGNO PRIu32 -/** A page number in the database. -* Note that 64 bit page numbers are overkill, since pages themselves -* already represent 12-13 bits of addressable memory, and the OS will -* always limit applications to a maximum of 63 bits of address space. -* -* @note In the #MDB_node structure, we only store 48 bits of this value, -* which thus limits us to only 60 bits of addressable data. -*/ -typedef MDB_ID pgno_t; - -/** A transaction ID. -* See struct MDB_txn.mt_txnid for details. -*/ -typedef MDB_ID txnid_t; +/* A transaction ID. */ +typedef uint64_t txnid_t; +#define PRIaTXN PRIi64 /* An IDL is an ID List, a sorted array of IDs. The first -* element of the array is a counter for how many actual -* IDs are in the list. In the original back-bdb code, IDLs are -* sorted in ascending order. For libmdb IDLs are sorted in -* descending order. -*/ -typedef MDB_ID *MDB_IDL; + * element of the array is a counter for how many actual + * IDs are in the list. In the original back-bdb code, IDLs are + * sorted in ascending order. For libmdb IDLs are sorted in + * descending order. */ +typedef pgno_t *MDB_IDL; /* An ID2 is an ID/pointer pair. 
*/ typedef struct MDB_ID2 { - MDB_ID mid; /* The ID */ + pgno_t mid; /* The ID */ void *mptr; /* The pointer */ } MDB_ID2; diff --git a/src/mdbx.c b/src/mdbx.c index 51953ea5..bf67a0e8 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -506,25 +506,21 @@ static __inline MDB_node *NODEPTR(MDB_page *p, unsigned i) { /* Copy a page number from src to dst */ #if UNALIGNED_OK -#define COPY_PGNO(dst, src) dst = src -#elif SIZE_MAX > 4294967295UL -#define COPY_PGNO(dst, src) \ - do { \ - uint16_t *s, *d; \ - s = (uint16_t *)&(src); \ - d = (uint16_t *)&(dst); \ - *d++ = *s++; \ - *d++ = *s++; \ - *d++ = *s++; \ - *d = *s; \ - } while (0) +#define COPY_PGNO(dst, src) (dst) = (src) +#elif defined(__GNUC__) || __has_builtin(__built_memcmp) +#define COPY_PGNO(dst, src) __built_memcmp(&(dst), &(src), sizeof(pgno_t)); #else #define COPY_PGNO(dst, src) \ do { \ uint16_t *s, *d; \ s = (uint16_t *)&(src); \ d = (uint16_t *)&(dst); \ - *d++ = *s++; \ + if (sizeof(pgno_t) > 6) \ + *d++ = *s++; \ + if (sizeof(pgno_t) > 4) \ + *d++ = *s++; \ + if (sizeof(pgno_t) > 2) \ + *d++ = *s++; \ *d = *s; \ } while (0) #endif /* UNALIGNED_OK */ @@ -952,14 +948,14 @@ static void mdbx_cursor_chk(MDB_cursor *mc) { static void mdbx_audit(MDB_txn *txn) { MDB_cursor mc; MDB_val key, data; - MDB_ID freecount, count; + pgno_t freecount, count; MDB_dbi i; int rc; freecount = 0; mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdbx_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) - freecount += *(MDB_ID *)data.mv_data; + freecount += *(pgno_t *)data.mv_data; mdbx_tassert(txn, rc == MDB_NOTFOUND); count = 0; @@ -992,10 +988,10 @@ static void mdbx_audit(MDB_txn *txn) { } } if (freecount + count + NUM_METAS != txn->mt_next_pgno) { - mdbx_print( - "audit: %lu freecount: %lu count: %lu total: %lu next_pgno: %lu\n", - txn->mt_txnid, freecount, count + NUM_METAS, - freecount + count + NUM_METAS, txn->mt_next_pgno); + mdbx_print("audit: %" PRIaTXN " freecount: %" PRIaPGNO " count: %" PRIaPGNO + " total: %" PRIaPGNO 
" next_pgno: %" PRIaPGNO "\n", + txn->mt_txnid, freecount, count + NUM_METAS, + freecount + count + NUM_METAS, txn->mt_next_pgno); } } @@ -1133,7 +1129,7 @@ static int mdbx_page_loose(MDB_cursor *mc, MDB_page *mp) { } } if (loose) { - mdbx_debug("loosen db %d page %" PRIuPTR "", DDBI(mc), mp->mp_pgno); + mdbx_debug("loosen db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); MDB_page **link = &NEXT_LOOSE_PAGE(mp); if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) { mdbx_kill_page(txn->mt_env, pgno); @@ -1312,7 +1308,7 @@ static int mdbx_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) { /* Save the page IDs of all the pages we're flushing */ /* flush from the tail forward, this saves a lot of shifting later on. */ for (i = dl[0].mid; i && need; i--) { - MDB_ID pn = dl[i].mid << 1; + pgno_t pn = dl[i].mid << 1; dp = dl[i].mptr; if (dp->mp_flags & (P_LOOSE | P_KEEP)) continue; @@ -1470,7 +1466,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { np = txn->mt_loose_pgs; txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); txn->mt_loose_count--; - mdbx_debug("db %d use loose page %" PRIuPTR "", DDBI(mc), np->mp_pgno); + mdbx_debug("db %d use loose page %" PRIaPGNO, DDBI(mc), np->mp_pgno); ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize); *mp = np; return MDB_SUCCESS; @@ -1597,9 +1593,9 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { } } - idl = (MDB_ID *)data.mv_data; + idl = (pgno_t *)data.mv_data; mdbx_tassert(txn, idl[0] == 0 || - data.mv_size == (idl[0] + 1) * sizeof(MDB_ID)); + data.mv_size == (idl[0] + 1) * sizeof(pgno_t)); i = idl[0]; if (!mop) { if (unlikely(!(env->me_pghead = mop = mdbx_midl_alloc(i)))) { @@ -1618,11 +1614,11 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { env->me_pglast = last; if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { - mdbx_debug_extra("IDL read txn %" PRIuPTR " root %" PRIuPTR + mdbx_debug_extra("IDL read txn %" PRIaTXN " root %" PRIaPGNO " num 
%u, IDL", last, txn->mt_dbs[FREE_DBI].md_root, i); for (j = i; j; j--) - mdbx_debug_extra_print(" %" PRIuPTR "", idl[j]); + mdbx_debug_extra_print(" %" PRIaPGNO "", idl[j]); mdbx_debug_extra_print("\n"); } @@ -1684,8 +1680,8 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { * don't make a steady-sync, but only a legacy-mode checkpoint, * just for resume reclaiming only, not for data consistency. */ - mdbx_debug("kick-gc: head %" PRIuPTR "/%c, tail %" PRIuPTR - "/%c, oldest %" PRIuPTR "", + mdbx_debug("kick-gc: head %" PRIaTXN "/%c, tail %" PRIaTXN + "/%c, oldest %" PRIaTXN "", head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N', tail->mm_txnid, META_IS_WEAK(tail) ? 'W' : 'N', oldest); @@ -1862,7 +1858,7 @@ static int mdbx_page_touch(MDB_cursor *mc) { (rc = mdbx_page_alloc(mc, 1, &np, MDBX_ALLOC_ALL)))) goto fail; pgno = np->mp_pgno; - mdbx_debug("touched db %d page %" PRIuPTR " -> %" PRIuPTR "", DDBI(mc), + mdbx_debug("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), mp->mp_pgno, pgno); mdbx_cassert(mc, mp->mp_pgno != pgno); mdbx_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); @@ -2217,7 +2213,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { mdbx_runtime_flags |= MDBX_DBG_TRACE | MDBX_DBG_EXTRA | MDBX_DBG_AUDIT | MDBX_DBG_ASSERT; mdbx_debug_log(MDBX_DBG_EDGE, __FUNCTION__, __LINE__, - "on/off edge (txn %" PRIuPTR ")", txn->mt_txnid); + "on/off edge (txn %" PRIaTXN ")", txn->mt_txnid); } #endif if (unlikely(txn->mt_txnid < meta->mm_txnid)) { @@ -2285,8 +2281,7 @@ int mdbx_txn_renew(MDB_txn *txn) { rc = mdbx_txn_renew0(txn, MDB_TXN_RDONLY); if (rc == MDB_SUCCESS) { - mdbx_debug("renew txn %" PRIuPTR "%c %p on mdbenv %p, root page %" PRIuPTR - "", + mdbx_debug("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "", txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); } @@ -2399,8 +2394,7 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, } else { txn->mt_signature = MDBX_MT_SIGNATURE; *ret = txn; - mdbx_debug("begin txn %" PRIuPTR "%c %p on mdbenv %p, root page %" PRIuPTR - "", + mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "", txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); } @@ -2462,7 +2456,7 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { /* Export or close DBI handles opened in this txn */ mdbx_dbis_update(txn, mode & MDB_END_UPDATE); - mdbx_debug("%s txn %" PRIuPTR "%c %p on mdbenv %p, root page %" PRIuPTR "", + mdbx_debug("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO "", names[mode & MDB_END_OPMASK], txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); @@ -2705,11 +2699,11 @@ again: if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { unsigned i = free_pgs[0]; - mdbx_debug_extra("IDL write txn %" PRIuPTR " root %" PRIuPTR + mdbx_debug_extra("IDL write txn %" PRIaTXN " root %" PRIaPGNO " num %u, IDL", txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); for (; i; i--) - mdbx_debug_extra_print(" %" PRIuPTR "", free_pgs[i]); + mdbx_debug_extra_print(" %" PRIaPGNO "", free_pgs[i]); mdbx_debug_extra_print("\n"); } continue; @@ -2840,7 +2834,7 @@ again: for (;;) { txnid_t id; ssize_t len; - MDB_ID save; + pgno_t save; if (!lifo) { id = *(txnid_t *)key.mv_data; @@ -2859,11 +2853,11 @@ again: txn, cleanup_idx == (txn->mt_lifo_reclaimed ? 
txn->mt_lifo_reclaimed[0] : 0)); - len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; + len = (ssize_t)(data.mv_size / sizeof(pgno_t)) - 1; mdbx_tassert(txn, len >= 0); if (len > mop_len) len = mop_len; - data.mv_size = (len + 1) * sizeof(MDB_ID); + data.mv_size = (len + 1) * sizeof(pgno_t); key.mv_data = &id; key.mv_size = sizeof(id); data.mv_data = mop -= len; @@ -2979,7 +2973,7 @@ static int mdbx_page_flush(MDB_txn *txn, int keep) { wpos = pos; wsize = 0; } - mdbx_debug("committing page %" PRIuPTR "", pgno); + mdbx_debug("committing page %" PRIaPGNO "", pgno); next_pos = pos + size; iov[n].iov_len = size; iov[n].iov_base = (char *)dp; @@ -3096,7 +3090,7 @@ int mdbx_txn_commit(MDB_txn *txn) { pspill[0] = (pgno_t)-1; /* Mark our dirty pages as deleted in parent spill list */ for (i = 0, len = src[0].mid; ++i <= len;) { - MDB_ID pn = src[i].mid << 1; + pgno_t pn = src[i].mid << 1; while (pn > pspill[x]) x--; if (pn == pspill[x]) { @@ -3114,7 +3108,7 @@ int mdbx_txn_commit(MDB_txn *txn) { /* Remove anything in our spill list from parent's dirty list */ if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { for (i = 1; i <= txn->mt_spill_pgs[0]; i++) { - MDB_ID pn = txn->mt_spill_pgs[i]; + pgno_t pn = txn->mt_spill_pgs[i]; if (pn & 1) continue; /* deleted spillpg */ pn >>= 1; @@ -3201,7 +3195,7 @@ int mdbx_txn_commit(MDB_txn *txn) { goto done; mdbx_debug( - "committing txn %" PRIuPTR " %p on mdbenv %p, root page %" PRIuPTR "", + "committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO "", txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); /* Update DB root pointers */ @@ -3283,7 +3277,7 @@ static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { MDB_page *p = (MDB_page *)&buf; if (!F_ISSET(p->mp_flags, P_META)) { - mdbx_debug("page %" PRIuPTR " not a meta-page", p->mp_pgno); + mdbx_debug("page %" PRIaPGNO " not a meta-page", p->mp_pgno); return MDB_INVALID; } @@ -3448,9 +3442,9 @@ static int mdbx_env_sync_locked(MDB_env *env, 
unsigned flags, MDB_meta *stay = mdbx_env_meta_flipflop(env, (MDB_meta *)target); mdbx_debug( - "writing meta %d (%s, was %" PRIuPTR "/%s, stay %s %" PRIuPTR - "/%s), root %" PRIuPTR ", " - "txn_id %" PRIuPTR ", %s", + "writing meta %d (%s, was %" PRIaTXN "/%s, stay %s %" PRIaTXN + "/%s), root %" PRIaPGNO ", " + "txn_id %" PRIaTXN ", %s", offset >= (off_t)env->me_psize, target == head ? "head" : "tail", target->mm_txnid, META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" @@ -3815,7 +3809,7 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { MDB_meta *const head = mdbx_meta_head(env); if (head->mm_txnid != meta->mm_txnid) { - mdbx_trace("head->mm_txnid (%" PRIuPTR ") != (%" PRIuPTR ") meta->mm_txnid", + mdbx_trace("head->mm_txnid (%" PRIaTXN ") != (%" PRIaTXN ") meta->mm_txnid", head->mm_txnid, meta->mm_txnid); if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { assert(META_IS_STEADY(meta) && !META_IS_STEADY(head)); @@ -4111,13 +4105,13 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, mdbx_debug("opened database version %u, pagesize %u", meta->mm_version, env->me_psize); - mdbx_debug("using meta page %d, txn %" PRIuPTR "", toggle, meta->mm_txnid); + mdbx_debug("using meta page %d, txn %" PRIaTXN "", toggle, meta->mm_txnid); mdbx_debug("depth: %u", db->md_depth); - mdbx_debug("entries: %" PRIuPTR "", db->md_entries); - mdbx_debug("branch pages: %" PRIuPTR "", db->md_branch_pages); - mdbx_debug("leaf pages: %" PRIuPTR "", db->md_leaf_pages); - mdbx_debug("overflow pages: %" PRIuPTR "", db->md_overflow_pages); - mdbx_debug("root: %" PRIuPTR "", db->md_root); + mdbx_debug("entries: %" PRIu64 "", db->md_entries); + mdbx_debug("branch pages: %" PRIaPGNO "", db->md_branch_pages); + mdbx_debug("leaf pages: %" PRIaPGNO "", db->md_leaf_pages); + mdbx_debug("overflow pages: %" PRIaPGNO "", db->md_overflow_pages); + mdbx_debug("root: %" PRIaPGNO "", db->md_root); } #endif @@ -4384,7 +4378,7 @@ static 
MDB_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, nkeys = NUMKEYS(mp); - mdbx_debug("searching %u keys in %s %spage %" PRIuPTR "", nkeys, + mdbx_debug("searching %u keys in %s %spage %" PRIaPGNO "", nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mdbx_dbg_pgno(mp)); @@ -4425,7 +4419,7 @@ static MDB_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, if (IS_LEAF(mp)) mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY(&nodekey), rc); else - mdbx_debug("found branch index %u [%s -> %" PRIuPTR "], rc = %i", i, + mdbx_debug("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i, DKEY(&nodekey), NODEPGNO(node), rc); if (rc == 0) break; @@ -4466,7 +4460,7 @@ static void mdbx_cursor_adjust(MDB_cursor *mc, func) { /* Pop a page off the top of the cursor's stack. */ static void mdbx_cursor_pop(MDB_cursor *mc) { if (mc->mc_snum) { - mdbx_debug("popped page %" PRIuPTR " off db %d cursor %p", + mdbx_debug("popped page %" PRIaPGNO " off db %d cursor %p", mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); mc->mc_snum--; @@ -4481,7 +4475,7 @@ static void mdbx_cursor_pop(MDB_cursor *mc) { /* Push a page onto the top of the cursor's stack. * Set MDB_TXN_ERROR on failure. */ static int mdbx_cursor_push(MDB_cursor *mc, MDB_page *mp) { - mdbx_debug("pushing page %" PRIuPTR " on db %d cursor %p", mp->mp_pgno, + mdbx_debug("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, DDBI(mc), (void *)mc); if (unlikely(mc->mc_snum >= CURSOR_STACK)) { @@ -4525,7 +4519,7 @@ static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, * back in from the map (but don't unspill it here, * leave that unless page_touch happens again). 
*/ if (tx2->mt_spill_pgs) { - MDB_ID pn = pgno << 1; + pgno_t pn = pgno << 1; x = mdbx_midl_search(tx2->mt_spill_pgs, pn); if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) goto mapped; @@ -4542,7 +4536,7 @@ static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, } if (unlikely(pgno >= txn->mt_next_pgno)) { - mdbx_debug("page %" PRIuPTR " not found", pgno); + mdbx_debug("page %" PRIaPGNO " not found", pgno); txn->mt_flags |= MDB_TXN_ERROR; return MDB_PAGE_NOTFOUND; } @@ -4569,13 +4563,13 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { MDB_node *node; indx_t i; - mdbx_debug("branch page %" PRIuPTR " has %u keys", mp->mp_pgno, + mdbx_debug("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno, NUMKEYS(mp)); /* Don't assert on branch pages in the FreeDB. We can get here * while in the process of rebalancing a FreeDB branch page; we must * let that proceed. ITS#8336 */ mdbx_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); - mdbx_debug("found index 0 to page %" PRIuPTR "", NODEPGNO(NODEPTR(mp, 0))); + mdbx_debug("found index 0 to page %" PRIaPGNO "", NODEPGNO(NODEPTR(mp, 0))); if (flags & (MDB_PS_FIRST | MDB_PS_LAST)) { i = 0; @@ -4630,7 +4624,7 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { return MDB_CORRUPTED; } - mdbx_debug("found leaf page %" PRIuPTR " for key [%s]", mp->mp_pgno, + mdbx_debug("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno, DKEY(key)); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; @@ -4728,7 +4722,7 @@ static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { mc->mc_snum = 1; mc->mc_top = 0; - mdbx_debug("db %d root page %" PRIuPTR " has flags 0x%X", DDBI(mc), root, + mdbx_debug("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root, mc->mc_pg[0]->mp_flags); if (flags & MDB_PS_MODIFY) { @@ -4748,10 +4742,10 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDB_page *mp) { unsigned x = 0, ovpages = mp->mp_pages; MDB_env *env = 
txn->mt_env; MDB_IDL sl = txn->mt_spill_pgs; - MDB_ID pn = pg << 1; + pgno_t pn = pg << 1; int rc; - mdbx_debug("free ov page %" PRIuPTR " (%u)", pg, ovpages); + mdbx_debug("free ov page %" PRIaPGNO " (%u)", pg, ovpages); /* If the page is dirty or on the spill list we just acquired it, * so we should give it back to our current free list, if any. * Otherwise put it onto the list of pages we freed in this txn. @@ -4836,7 +4830,7 @@ static __inline int mdbx_node_read(MDB_cursor *mc, MDB_node *leaf, data->mv_size = NODEDSZ(leaf); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); if (unlikely((rc = mdbx_page_get(mc, pgno, &omp, NULL)) != 0)) { - mdbx_debug("read overflow page %" PRIuPTR " failed", pgno); + mdbx_debug("read overflow page %" PRIaPGNO " failed", pgno); return rc; } data->mv_data = PAGEDATA(omp); @@ -4887,7 +4881,7 @@ static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { } mdbx_cursor_pop(mc); - mdbx_debug("parent page is page %" PRIuPTR ", index %u", + mdbx_debug("parent page is page %" PRIaPGNO ", index %u", mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); if (move_right @@ -4963,7 +4957,7 @@ static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } - mdbx_debug("cursor_next: top page is %" PRIuPTR " in cursor %p", + mdbx_debug("cursor_next: top page is %" PRIaPGNO " in cursor %p", mdbx_dbg_pgno(mp), (void *)mc); if (mc->mc_flags & C_DEL) { mc->mc_flags ^= C_DEL; @@ -4977,13 +4971,13 @@ static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, return rc; } mp = mc->mc_pg[mc->mc_top]; - mdbx_debug("next page is %" PRIuPTR ", key index %u", mp->mp_pgno, + mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); } else mc->mc_ki[mc->mc_top]++; skip: - mdbx_debug("==> cursor points to page %" PRIuPTR + mdbx_debug("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", mdbx_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); @@ -5053,7 +5047,7 @@ static int 
mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } - mdbx_debug("cursor_prev: top page is %" PRIuPTR " in cursor %p", + mdbx_debug("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mdbx_dbg_pgno(mp), (void *)mc); mc->mc_flags &= ~(C_EOF | C_DEL); @@ -5065,12 +5059,12 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, } mp = mc->mc_pg[mc->mc_top]; mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; - mdbx_debug("prev page is %" PRIuPTR ", key index %u", mp->mp_pgno, + mdbx_debug("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); } else mc->mc_ki[mc->mc_top]--; - mdbx_debug("==> cursor points to page %" PRIuPTR + mdbx_debug("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", mdbx_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); @@ -6279,7 +6273,7 @@ static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, if (unlikely((rc = mdbx_page_alloc(mc, num, &np, MDBX_ALLOC_ALL)))) return rc; - mdbx_debug("allocated new page #%" PRIuPTR ", size %u", np->mp_pgno, + mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize); np->mp_flags = flags | P_DIRTY; np->mp_lower = (PAGEHDRSZ - PAGEBASE); @@ -6381,7 +6375,7 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, mdbx_cassert(mc, mp->mp_upper >= mp->mp_lower); - mdbx_debug("add to %s %spage %" PRIuPTR " index %i, data size %" PRIuPTR + mdbx_debug("add to %s %spage %" PRIaPGNO " index %i, data size %" PRIuPTR " key size %" PRIuPTR " [%s]", IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mdbx_dbg_pgno(mp), indx, data ? 
data->mv_size : 0, @@ -6425,7 +6419,7 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, goto full; if ((rc = mdbx_page_new(mc, P_OVERFLOW, ovpages, &ofp))) return rc; - mdbx_debug("allocated overflow page %" PRIuPTR "", ofp->mp_pgno); + mdbx_debug("allocated overflow page %" PRIaPGNO "", ofp->mp_pgno); flags |= F_BIGDATA; goto update; } else { @@ -6482,7 +6476,7 @@ update: return MDB_SUCCESS; full: - mdbx_debug("not enough room in page %" PRIuPTR ", got %u ptrs", + mdbx_debug("not enough room in page %" PRIaPGNO ", got %u ptrs", mdbx_dbg_pgno(mp), NUMKEYS(mp)); mdbx_debug("upper-lower = %u - %u = %" PRIiPTR "", mp->mp_upper, mp->mp_lower, room); @@ -6503,7 +6497,7 @@ static void mdbx_node_del(MDB_cursor *mc, int ksize) { MDB_node *node; char *base; - mdbx_debug("delete node %u on %s page %" PRIuPTR "", indx, + mdbx_debug("delete node %u on %s page %" PRIaPGNO "", indx, IS_LEAF(mp) ? "leaf" : "branch", mdbx_dbg_pgno(mp)); numkeys = NUMKEYS(mp); mdbx_cassert(mc, indx < numkeys); @@ -6651,7 +6645,7 @@ static void mdbx_xcursor_init1(MDB_cursor *mc, MDB_node *node) { mx->mx_db.md_flags |= MDB_INTEGERKEY; } } - mdbx_debug("Sub-db -%u root page %" PRIuPTR "", mx->mx_cursor.mc_dbi, + mdbx_debug("Sub-db -%u root page %" PRIaPGNO "", mx->mx_cursor.mc_dbi, mx->mx_db.md_root); mx->mx_dbflag = DB_VALID | DB_USRVALID | DB_DUPDATA; /* #if UINT_MAX < SIZE_MAX @@ -6684,7 +6678,7 @@ static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, } mx->mx_db = src_mx->mx_db; mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; - mdbx_debug("Sub-db -%u root page %" PRIuPTR "", mx->mx_cursor.mc_dbi, + mdbx_debug("Sub-db -%u root page %" PRIaPGNO "", mx->mx_cursor.mc_dbi, mx->mx_db.md_root); } @@ -6884,8 +6878,8 @@ static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { char kbuf2[DKBUF_MAXKEYSIZE * 2 + 1]; k2.mv_data = NODEKEY(node); k2.mv_size = node->mn_ksize; - mdbx_debug("update key %u (ofs %u) [%s] to [%s] on page %" PRIuPTR "", indx, - ptr, 
mdbx_dkey(&k2, kbuf2, sizeof(kbuf2)), DKEY(key), + mdbx_debug("update key %u (ofs %u) [%s] to [%s] on page %" PRIaPGNO "", + indx, ptr, mdbx_dkey(&k2, kbuf2, sizeof(kbuf2)), DKEY(key), mp->mp_pgno); } @@ -7027,8 +7021,8 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { return rc; } - mdbx_debug("moving %s node %u [%s] on page %" PRIuPTR - " to node %u on page %" PRIuPTR "", + mdbx_debug("moving %s node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO "", IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch", csrc->mc_ki[csrc->mc_top], DKEY(&key), csrc->mc_pg[csrc->mc_top]->mp_pgno, cdst->mc_ki[cdst->mc_top], @@ -7111,7 +7105,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - mdbx_debug("update separator for source page %" PRIuPTR " to [%s]", + mdbx_debug("update separator for source page %" PRIaPGNO " to [%s]", csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)); mdbx_cursor_copy(csrc, &mn); mn.mc_snum--; @@ -7141,7 +7135,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - mdbx_debug("update separator for destination page %" PRIuPTR " to [%s]", + mdbx_debug("update separator for destination page %" PRIaPGNO " to [%s]", cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)); mdbx_cursor_copy(cdst, &mn); mn.mc_snum--; @@ -7185,7 +7179,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; - mdbx_debug("merging page %" PRIuPTR " into %" PRIuPTR "", psrc->mp_pgno, + mdbx_debug("merging page %" PRIaPGNO " into %" PRIaPGNO "", psrc->mp_pgno, pdst->mp_pgno); mdbx_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ @@ -7243,7 +7237,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { } } - mdbx_debug("dst page %" PRIuPTR " now has %u 
keys (%.1f%% filled)", + mdbx_debug("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)", pdst->mp_pgno, NUMKEYS(pdst), (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10); @@ -7348,7 +7342,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { minkeys = 1; thresh = FILL_THRESHOLD; } - mdbx_debug("rebalancing %s page %" PRIuPTR " (has %u keys, %.1f%% full)", + mdbx_debug("rebalancing %s page %" PRIaPGNO " (has %u keys, %.1f%% full)", IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch", mdbx_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), @@ -7356,7 +7350,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { - mdbx_debug("no need to rebalance page %" PRIuPTR ", above fill threshold", + mdbx_debug("no need to rebalance page %" PRIaPGNO ", above fill threshold", mdbx_dbg_pgno(mc->mc_pg[mc->mc_top])); return MDB_SUCCESS; } @@ -7482,7 +7476,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { fromleft = 1; } - mdbx_debug("found neighbor page %" PRIuPTR " (%u keys, %.1f%% full)", + mdbx_debug("found neighbor page %" PRIaPGNO " (%u keys, %.1f%% full)", mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10); @@ -7692,7 +7686,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, newindx = mc->mc_ki[mc->mc_top]; nkeys = NUMKEYS(mp); - mdbx_debug("-----> splitting %s page %" PRIuPTR + mdbx_debug("-----> splitting %s page %" PRIaPGNO " and adding [%s] at index %i/%i", IS_LEAF(mp) ? 
"leaf" : "branch", mp->mp_pgno, DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys); @@ -7701,7 +7695,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, if ((rc = mdbx_page_new(mc, mp->mp_flags, 1, &rp))) return rc; rp->mp_leaf2_ksize = mp->mp_leaf2_ksize; - mdbx_debug("new right sibling: page %" PRIuPTR "", rp->mp_pgno); + mdbx_debug("new right sibling: page %" PRIaPGNO "", rp->mp_pgno); /* Usually when splitting the root page, the cursor * height is 1. But when called from mdbx_update_key, @@ -7718,7 +7712,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; - mdbx_debug("root split! new root = %" PRIuPTR "", pp->mp_pgno); + mdbx_debug("root split! new root = %" PRIaPGNO "", pp->mp_pgno); new_root = mc->mc_db->md_depth++; /* Add left (implicit) pointer. */ @@ -7736,7 +7730,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, ptop = 0; } else { ptop = mc->mc_top - 1; - mdbx_debug("parent branch page is %" PRIuPTR "", mc->mc_pg[ptop]->mp_pgno); + mdbx_debug("parent branch page is %" PRIaPGNO "", mc->mc_pg[ptop]->mp_pgno); } mdbx_cursor_copy(mc, &mn); @@ -8421,14 +8415,13 @@ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { root = new_root = txn->mt_dbs[MAIN_DBI].md_root; if (root != P_INVALID) { /* Count free pages + freeDB pages. Subtract from last_pg - * to find the new last_pg, which also becomes the new root. - */ - MDB_ID freecount = 0; + * to find the new last_pg, which also becomes the new root. 
*/ + pgno_t freecount = 0; MDB_cursor mc; MDB_val key, data; mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdbx_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) - freecount += *(MDB_ID *)data.mv_data; + freecount += *(pgno_t *)data.mv_data; if (rc != MDB_NOTFOUND) goto finish; freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + @@ -9138,11 +9131,11 @@ int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { if (mr[i].mr_pid) { txnid_t txnid = mr[i].mr_txnid; if (txnid == ~(txnid_t)0) - snprintf(buf, sizeof(buf), "%10d %" PRIxPTR " -\n", (int)mr[i].mr_pid, - (size_t)mr[i].mr_tid); + snprintf(buf, sizeof(buf), "%10" PRIuPTR " %" PRIxPTR " -\n", + (size_t)mr[i].mr_pid, (size_t)mr[i].mr_tid); else - snprintf(buf, sizeof(buf), "%10d %" PRIxPTR " %" PRIuPTR "\n", - (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); + snprintf(buf, sizeof(buf), "%10" PRIuPTR " %" PRIxPTR " %" PRIaTXN "\n", + (size_t)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); if (first) { first = 0; @@ -9264,8 +9257,8 @@ int __cold mdbx_reader_check0(MDB_env *env, int rdt_locked, int *dead) { /* clean it */ for (unsigned j = i; j < snap_nreaders; j++) { if (mr[j].mr_pid == pid) { - mdbx_debug("clear stale reader pid %u txn %" PRIiPTR "", (unsigned)pid, - mr[j].mr_txnid); + mdbx_debug("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN "", + (size_t)pid, mr[j].mr_txnid); mr[j].mr_pid = 0; count++; } @@ -9280,9 +9273,8 @@ int __cold mdbx_reader_check0(MDB_env *env, int rdt_locked, int *dead) { return rc; } -static unsigned __hot mdbx_midl_search(MDB_IDL ids, MDB_ID id) { - /* - * binary search of id in ids +static unsigned __hot mdbx_midl_search(MDB_IDL ids, pgno_t id) { + /* binary search of id in ids * if found, returns position of id * if not found, returns first position greater than id */ unsigned base = 0; @@ -9312,8 +9304,8 @@ static unsigned __hot mdbx_midl_search(MDB_IDL ids, MDB_ID id) { } static MDB_IDL mdbx_midl_alloc(int num) { - MDB_IDL ids = malloc((num + 2) * 
sizeof(MDB_ID)); - if (ids) { + MDB_IDL ids = malloc((num + 2) * sizeof(pgno_t)); + if (likely(ids)) { *ids++ = num; *ids = 0; } @@ -9326,19 +9318,22 @@ static void mdbx_midl_free(MDB_IDL ids) { } static void mdbx_midl_shrink(MDB_IDL *idp) { - MDB_IDL ids = *idp; - if (*(--ids) > MDB_IDL_UM_MAX && - (ids = realloc(ids, (MDB_IDL_UM_MAX + 2) * sizeof(MDB_ID)))) { - *ids++ = MDB_IDL_UM_MAX; - *idp = ids; + MDB_IDL ids = *idp - 1; + if (unlikely(*ids > MDB_IDL_UM_MAX)) { + /* shrink to MDB_IDL_UM_MAX */ + ids = realloc(ids, (MDB_IDL_UM_MAX + 2) * sizeof(pgno_t)); + if (likely(ids)) { + *ids++ = MDB_IDL_UM_MAX; + *idp = ids; + } } } static int mdbx_midl_grow(MDB_IDL *idp, int num) { MDB_IDL idn = *idp - 1; /* grow it */ - idn = realloc(idn, (*idn + num + 2) * sizeof(MDB_ID)); - if (!idn) + idn = realloc(idn, (*idn + num + 2) * sizeof(pgno_t)); + if (unlikely(!idn)) return MDBX_ENOMEM; *idn++ += num; *idp = idn; @@ -9350,7 +9345,8 @@ static int mdbx_midl_need(MDB_IDL *idp, unsigned num) { num += ids[0]; if (num > ids[-1]) { num = (num + num / 4 + (256 + 2)) & -256; - if (!(ids = realloc(ids - 1, num * sizeof(MDB_ID)))) + ids = realloc(ids - 1, num * sizeof(pgno_t)); + if (unlikely(!ids)) return MDBX_ENOMEM; *ids++ = num - 2; *idp = ids; @@ -9358,7 +9354,7 @@ static int mdbx_midl_need(MDB_IDL *idp, unsigned num) { return 0; } -static int mdbx_midl_append(MDB_IDL *idp, MDB_ID id) { +static int mdbx_midl_append(MDB_IDL *idp, pgno_t id) { MDB_IDL ids = *idp; /* Too big? 
*/ if (ids[0] >= ids[-1]) { @@ -9379,13 +9375,13 @@ static int mdbx_midl_append_list(MDB_IDL *idp, MDB_IDL app) { return MDBX_ENOMEM; ids = *idp; } - memcpy(&ids[ids[0] + 1], &app[1], app[0] * sizeof(MDB_ID)); + memcpy(&ids[ids[0] + 1], &app[1], app[0] * sizeof(pgno_t)); ids[0] += app[0]; return 0; } -static int mdbx_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n) { - MDB_ID *ids = *idp, len = ids[0]; +static int mdbx_midl_append_range(MDB_IDL *idp, pgno_t id, unsigned n) { + pgno_t *ids = *idp, len = ids[0]; /* Too big? */ if (len + n > ids[-1]) { if (mdbx_midl_grow(idp, n | MDB_IDL_UM_MAX)) @@ -9400,8 +9396,8 @@ static int mdbx_midl_append_range(MDB_IDL *idp, MDB_ID id, unsigned n) { } static void __hot mdbx_midl_xmerge(MDB_IDL idl, MDB_IDL merge) { - MDB_ID old_id, merge_id, i = merge[0], j = idl[0], k = i + j, total = k; - idl[0] = (MDB_ID)-1; /* delimiter for idl scan below */ + pgno_t old_id, merge_id, i = merge[0], j = idl[0], k = i + j, total = k; + idl[0] = ~(pgno_t)0; /* delimiter for idl scan below */ old_id = idl[j]; while (i) { merge_id = merge[i--]; @@ -9413,11 +9409,10 @@ static void __hot mdbx_midl_xmerge(MDB_IDL idl, MDB_IDL merge) { } /* Quicksort + Insertion sort for small arrays */ - #define SMALL 8 #define MIDL_SWAP(a, b) \ { \ - MDB_ID itmp = (a); \ + pgno_t itmp = (a); \ (a) = (b); \ (b) = itmp; \ } @@ -9426,7 +9421,7 @@ static void __hot mdbx_midl_sort(MDB_IDL ids) { /* Max possible depth of int-indexed tree * 2 items/level */ int istack[sizeof(int) * CHAR_BIT * 2]; int i, j, k, l, ir, jstack; - MDB_ID a; + pgno_t a; ir = (int)ids[0]; l = 1; @@ -9488,9 +9483,8 @@ static void __hot mdbx_midl_sort(MDB_IDL ids) { } } -static unsigned __hot mdbx_mid2l_search(MDB_ID2L ids, MDB_ID id) { - /* - * binary search of id in ids +static unsigned __hot mdbx_mid2l_search(MDB_ID2L ids, pgno_t id) { + /* binary search of id in ids * if found, returns position of id * if not found, returns first position greater than id */ unsigned base = 0; diff --git 
a/src/midl.h b/src/midl.h index b59e024e..35e37ad5 100644 --- a/src/midl.h +++ b/src/midl.h @@ -13,7 +13,7 @@ */ /* IDL sizes - likely should be even bigger - * limiting factors: sizeof(ID), thread stack size */ + * limiting factors: sizeof(pgno_t), thread stack size */ #define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ #define MDB_IDL_DB_SIZE (1 << MDB_IDL_LOGN) #define MDB_IDL_UM_SIZE (1 << (MDB_IDL_LOGN + 1)) @@ -21,7 +21,7 @@ #define MDB_IDL_DB_MAX (MDB_IDL_DB_SIZE - 1) #define MDB_IDL_UM_MAX (MDB_IDL_UM_SIZE - 1) -#define MDB_IDL_SIZEOF(ids) (((ids)[0] + 1) * sizeof(MDB_ID)) +#define MDB_IDL_SIZEOF(ids) (((ids)[0] + 1) * sizeof(pgno_t)) #define MDB_IDL_IS_ZERO(ids) ((ids)[0] == 0) #define MDB_IDL_CPY(dst, src) (memcpy(dst, src, MDB_IDL_SIZEOF(src))) #define MDB_IDL_FIRST(ids) ((ids)[1]) @@ -33,6 +33,6 @@ /* Append ID to IDL. The IDL must be big enough. */ #define mdbx_midl_xappend(idl, id) \ do { \ - MDB_ID *xidl = (idl), xlen = ++(xidl[0]); \ + pgno_t *xidl = (idl), xlen = ++(xidl[0]); \ xidl[xlen] = (id); \ } while (0) diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index c0b57551..3662f4eb 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -26,6 +26,7 @@ #include #include "../../mdbx.h" +#include "../bits.h" #include "../midl.h" typedef struct flagbit { @@ -316,17 +317,18 @@ static int handle_userdb(const size_t record_number, const MDB_val *key, static int handle_freedb(const size_t record_number, const MDB_val *key, const MDB_val *data) { char *bad = ""; - size_t pg, prev; + pgno_t pg, prev; ssize_t i, number, span = 0; - size_t *iptr = data->mv_data, txnid = *(size_t *)key->mv_data; + pgno_t *iptr = data->mv_data; + txnid_t txnid = *(txnid_t *)key->mv_data; - if (key->mv_size != sizeof(txnid)) + if (key->mv_size != sizeof(txnid_t)) problem_add("entry", record_number, "wrong txn-id size", "key-size %" PRIiPTR "", key->mv_size); else if (txnid < 1 || txnid > envinfo.me_last_txnid) - problem_add("entry", record_number, 
"wrong txn-id", "%" PRIuPTR "", txnid); + problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN "", txnid); - if (data->mv_size < sizeof(size_t) || data->mv_size % sizeof(size_t)) + if (data->mv_size < sizeof(pgno_t) || data->mv_size % sizeof(pgno_t)) problem_add("entry", record_number, "wrong idl size", "%" PRIuPTR "", data->mv_size); else { @@ -334,9 +336,9 @@ static int handle_freedb(const size_t record_number, const MDB_val *key, if (number >= MDB_IDL_UM_MAX) problem_add("entry", record_number, "wrong idl length", "%" PRIiPTR "", number); - else if ((number + 1) * sizeof(size_t) != data->mv_size) + else if ((number + 1) * sizeof(pgno_t) != data->mv_size) problem_add("entry", record_number, "mismatch idl length", - "%" PRIiPTR " != %" PRIuPTR "", number * sizeof(size_t), + "%" PRIiPTR " != %" PRIuPTR "", (number + 1) * sizeof(pgno_t), data->mv_size); else { freedb_pages += number; @@ -344,9 +346,9 @@ static int handle_freedb(const size_t record_number, const MDB_val *key, reclaimable_pages += number; for (i = number, prev = 1; --i >= 0;) { pg = iptr[i]; - if (pg < 2 /* META_PAGE */ || pg > envinfo.me_last_pgno) + if (pg < NUM_METAS || pg > envinfo.me_last_pgno) problem_add("entry", record_number, "wrong idl entry", - "2 < %" PRIiPTR " < %" PRIiPTR "", pg, + "%u < %" PRIiPTR " < %" PRIiPTR "", NUM_METAS, pg, envinfo.me_last_pgno); else if (pg <= prev) { bad = " [bad sequence]"; @@ -359,7 +361,7 @@ static int handle_freedb(const size_t record_number, const MDB_val *key, ; } if (verbose > 2 && !only_subdb) { - print(" transaction %" PRIuPTR ", %" PRIiPTR + print(" transaction %" PRIaTXN ", %" PRIiPTR " pages, maxspan %" PRIiPTR "%s\n", txnid, number, span, bad); if (verbose > 3) { @@ -369,9 +371,9 @@ static int handle_freedb(const size_t record_number, const MDB_val *key, for (span = 1; --j >= 0 && iptr[j] == pg + span; span++) ; if (span > 1) - print(" %9zu[%" PRIiPTR "]\n", pg, span); + print(" %9" PRIaPGNO "[%" PRIiPTR "]\n", pg, span); else - print(" 
%9zu\n", pg); + print(" %9" PRIaPGNO "\n", pg); } } } From eff0f92fa7b24751edf3458b05a5758f26b2aafa Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 14:47:49 +0300 Subject: [PATCH 141/303] mdbx-tools: replace size_t/PRIuPTR with uint64_t/PRIu64. --- src/tools/mdbx_chk.c | 192 +++++++++++++++++++++--------------------- src/tools/mdbx_dump.c | 2 +- src/tools/mdbx_load.c | 2 +- src/tools/mdbx_stat.c | 18 ++-- 4 files changed, 106 insertions(+), 108 deletions(-) diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 3662f4eb..f6df930a 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -59,39 +59,39 @@ static void signal_handler(int sig) { struct { const char *dbi_names[MAX_DBI]; - size_t dbi_pages[MAX_DBI]; - size_t dbi_empty_pages[MAX_DBI]; - size_t dbi_payload_bytes[MAX_DBI]; - size_t dbi_lost_bytes[MAX_DBI]; + uint64_t dbi_pages[MAX_DBI]; + uint64_t dbi_empty_pages[MAX_DBI]; + uint64_t dbi_payload_bytes[MAX_DBI]; + uint64_t dbi_lost_bytes[MAX_DBI]; short *pagemap; - size_t total_payload_bytes; - size_t pgcount; + uint64_t total_payload_bytes; + uint64_t pgcount; } walk; static __attribute__((constructor)) void init_walk(void) { walk.dbi_names[0] = "@gc"; } -size_t total_unused_bytes; +uint64_t total_unused_bytes; int exclusive = 2; MDB_env *env; MDB_txn *txn, *locktxn; MDBX_envinfo envinfo; MDBX_stat envstat; -size_t maxkeysize, reclaimable_pages, freedb_pages, lastpgno; -size_t userdb_count, skipped_subdb; +size_t maxkeysize, userdb_count, skipped_subdb; +uint64_t reclaimable_pages, freedb_pages, lastpgno; unsigned verbose, quiet; const char *only_subdb; struct problem { struct problem *pr_next; - size_t count; + uint64_t count; const char *caption; }; struct problem *problems_list; -size_t total_problems; +uint64_t total_problems; static void __attribute__((format(printf, 1, 2))) print(const char *msg, ...) 
{ if (!quiet) { @@ -156,7 +156,7 @@ static int pagemap_lookup_dbi(const char *dbi) { return last = i; } -static void problem_add(const char *object, size_t entry_number, +static void problem_add(const char *object, uint64_t entry_number, const char *msg, const char *extra, ...) { total_problems++; @@ -178,7 +178,7 @@ static void problem_add(const char *object, size_t entry_number, p->count++; if (verbose > 1) { - print(" %s #%" PRIuPTR ": %s", object, entry_number, msg); + print(" %s #%" PRIu64 ": %s", object, entry_number, msg); if (extra) { va_list args; printf(" ("); @@ -200,8 +200,8 @@ static struct problem *problems_push() { return p; } -static size_t problems_pop(struct problem *list) { - size_t count = 0; +static uint64_t problems_pop(struct problem *list) { + uint64_t count = 0; if (problems_list) { int i; @@ -210,7 +210,7 @@ static size_t problems_pop(struct problem *list) { for (i = 0; problems_list; ++i) { struct problem *p = problems_list->pr_next; count += problems_list->count; - print("%s%s (%" PRIuPTR ")", i ? ", " : "", problems_list->caption, + print("%s%s (%" PRIu64 ")", i ? 
", " : "", problems_list->caption, problems_list->count); free(problems_list); problems_list = p; @@ -223,23 +223,23 @@ static size_t problems_pop(struct problem *list) { return count; } -static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, - const char *type, int nentries, int payload_bytes, - int header_bytes, int unused_bytes) { +static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, + const char *dbi, const char *type, int nentries, + int payload_bytes, int header_bytes, int unused_bytes) { (void)ctx; if (type) { - size_t page_bytes = payload_bytes + header_bytes + unused_bytes; - size_t page_size = pgnumber * envstat.ms_psize; + uint64_t page_bytes = payload_bytes + header_bytes + unused_bytes; + uint64_t page_size = pgnumber * envstat.ms_psize; int index = pagemap_lookup_dbi(dbi); if (index < 0) return ENOMEM; if (verbose > 2 && (!only_subdb || strcmp(only_subdb, dbi) == 0)) { if (pgnumber == 1) - print(" %s-page %" PRIuPTR "", type, pgno); + print(" %s-page %" PRIu64, type, pgno); else - print(" %s-span %" PRIuPTR "[%u]", type, pgno, pgnumber); + print(" %s-span %" PRIu64 "[%u]", type, pgno, pgnumber); print(" of %s: header %i, payload %i, unused %i\n", dbi, header_bytes, payload_bytes, unused_bytes); } @@ -247,9 +247,8 @@ static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, walk.pgcount += pgnumber; if (unused_bytes < 0 || (size_t)unused_bytes > page_size) - problem_add("page", pgno, "illegal unused-bytes", - "%" PRIuPTR " < %i < %" PRIuPTR "", 0, unused_bytes, - envstat.ms_psize); + problem_add("page", pgno, "illegal unused-bytes", "%u < %i < %u", 0, + unused_bytes, envstat.ms_psize); if (header_bytes < (int)sizeof(long) || (size_t)header_bytes >= envstat.ms_psize - sizeof(long)) @@ -273,7 +272,7 @@ static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, if (page_bytes != page_size) { problem_add("page", pgno, "misused", - "%" PRIuPTR " != %" PRIuPTR " (%ih + %ip + 
%iu)", page_size, + "%" PRIu64 " != %" PRIu64 " (%ih + %ip + %iu)", page_size, page_bytes, header_bytes, payload_bytes, unused_bytes); if (page_size > page_bytes) walk.dbi_lost_bytes[index] += page_size - page_bytes; @@ -286,7 +285,7 @@ static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, do { if (pgno >= lastpgno) problem_add("page", pgno, "wrong page-no", - "%" PRIuPTR " > %" PRIiPTR "", pgno, lastpgno); + "%" PRIu64 " > %" PRIu64 "", pgno, lastpgno); else if (walk.pagemap[pgno]) problem_add("page", pgno, "already used", "in %s", walk.dbi_names[walk.pagemap[pgno]]); @@ -302,11 +301,11 @@ static int pgvisitor(size_t pgno, unsigned pgnumber, void *ctx, const char *dbi, return gotsignal ? EINTR : MDB_SUCCESS; } -typedef int(visitor)(const size_t record_number, const MDB_val *key, +typedef int(visitor)(const uint64_t record_number, const MDB_val *key, const MDB_val *data); static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent); -static int handle_userdb(const size_t record_number, const MDB_val *key, +static int handle_userdb(const uint64_t record_number, const MDB_val *key, const MDB_val *data) { (void)record_number; (void)key; @@ -314,11 +313,11 @@ static int handle_userdb(const size_t record_number, const MDB_val *key, return MDB_SUCCESS; } -static int handle_freedb(const size_t record_number, const MDB_val *key, +static int handle_freedb(const uint64_t record_number, const MDB_val *key, const MDB_val *data) { char *bad = ""; pgno_t pg, prev; - ssize_t i, number, span = 0; + int i, number, span = 0; pgno_t *iptr = data->mv_data; txnid_t txnid = *(txnid_t *)key->mv_data; @@ -361,9 +360,8 @@ static int handle_freedb(const size_t record_number, const MDB_val *key, ; } if (verbose > 2 && !only_subdb) { - print(" transaction %" PRIaTXN ", %" PRIiPTR - " pages, maxspan %" PRIiPTR "%s\n", - txnid, number, span, bad); + print(" transaction %" PRIaTXN ", %u pages, maxspan %i%s\n", txnid, + number, span, bad); if (verbose > 
3) { int j = number - 1; while (j >= 0) { @@ -371,7 +369,7 @@ static int handle_freedb(const size_t record_number, const MDB_val *key, for (span = 1; --j >= 0 && iptr[j] == pg + span; span++) ; if (span > 1) - print(" %9" PRIaPGNO "[%" PRIiPTR "]\n", pg, span); + print(" %9" PRIaPGNO "[%i]\n", pg, span); else print(" %9" PRIaPGNO "\n", pg); } @@ -383,7 +381,7 @@ static int handle_freedb(const size_t record_number, const MDB_val *key, return MDB_SUCCESS; } -static int handle_maindb(const size_t record_number, const MDB_val *key, +static int handle_maindb(const uint64_t record_number, const MDB_val *key, const MDB_val *data) { char *name; int rc; @@ -416,10 +414,10 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { unsigned flags; int rc, i; struct problem *saved_list; - size_t problems_count; + uint64_t problems_count; - unsigned record_count = 0, dups = 0; - size_t key_bytes = 0, data_bytes = 0; + uint64_t record_count = 0, dups = 0; + uint64_t key_bytes = 0, data_bytes = 0; if (0 > (int)dbi) { rc = mdbx_dbi_open(txn, name, 0, &dbi); @@ -472,10 +470,10 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { } print(" (0x%02X)\n", flags); if (verbose > 1) { - print(" - page size %u, entries %" PRIuPTR "\n", ms.ms_psize, + print(" - page size %u, entries %" PRIu64 "\n", ms.ms_psize, ms.ms_entries); - print(" - b-tree depth %u, pages: branch %" PRIuPTR ", leaf %" PRIuPTR - ", overflow %" PRIuPTR "\n", + print(" - b-tree depth %u, pages: branch %" PRIu64 ", leaf %" PRIu64 + ", overflow %" PRIu64 "\n", ms.ms_depth, ms.ms_branch_pages, ms.ms_leaf_pages, ms.ms_overflow_pages); } @@ -501,17 +499,17 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { if (key.mv_size > maxkeysize) { problem_add("entry", record_count, "key length exceeds max-key-size", - "%" PRIuPTR " > %" PRIuPTR "", key.mv_size, maxkeysize); - } else if ((flags & MDB_INTEGERKEY) && key.mv_size != sizeof(size_t) && - 
key.mv_size != sizeof(int)) { + "%" PRIuPTR " > %u", key.mv_size, maxkeysize); + } else if ((flags & MDB_INTEGERKEY) && key.mv_size != sizeof(uint64_t) && + key.mv_size != sizeof(uint32_t)) { problem_add("entry", record_count, "wrong key length", - "%" PRIuPTR " != %" PRIuPTR "", key.mv_size, sizeof(size_t)); + "%" PRIuPTR " != 4or8", key.mv_size); } - if ((flags & MDB_INTEGERDUP) && data.mv_size != sizeof(size_t) && - data.mv_size != sizeof(int)) { + if ((flags & MDB_INTEGERDUP) && data.mv_size != sizeof(uint64_t) && + data.mv_size != sizeof(uint32_t)) { problem_add("entry", record_count, "wrong data length", - "%" PRIuPTR " != %" PRIuPTR "", data.mv_size, sizeof(size_t)); + "%" PRIuPTR " != 4or8", data.mv_size); } if (prev_key.mv_data) { @@ -567,9 +565,9 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { bailout: problems_count = problems_pop(saved_list); if (!silent && verbose) { - print(" - summary: %u records, %u dups, %" PRIuPTR " key's bytes, %" PRIuPTR - " data's " - "bytes, %" PRIuPTR " problems\n", + print(" - summary: %" PRIu64 " records, %" PRIu64 " dups, %" PRIu64 + " key's bytes, %" PRIu64 " data's " + "bytes, %" PRIu64 " problems\n", record_count, dups, key_bytes, data_bytes, problems_count); fflush(NULL); } @@ -593,7 +591,7 @@ static void usage(char *prog) { exit(EXIT_INTERRUPTED); } -const char *meta_synctype(size_t sign) { +const char *meta_synctype(uint64_t sign) { switch (sign) { case 0: return "no-sync/legacy"; @@ -604,8 +602,10 @@ const char *meta_synctype(size_t sign) { } } -int meta_lt(size_t txn1, size_t sign1, size_t txn2, size_t sign2) { - return ((sign1 > 1) == (sign2 > 1)) ? txn1 < txn2 : txn2 && sign2 > 1; +int meta_lt(txnid_t txn1, uint64_t sign1, txnid_t txn2, uint64_t sign2) { + return (SIGN_IS_STEADY(sign1) == SIGN_IS_STEADY(sign2)) + ? 
txn1 < txn2 + : txn2 && SIGN_IS_STEADY(sign2); } int main(int argc, char *argv[]) { @@ -615,7 +615,6 @@ int main(int argc, char *argv[]) { int envflags = MDB_RDONLY; int problems_maindb = 0, problems_freedb = 0, problems_meta = 0; int dont_traversal = 0; - size_t n; struct timespec timestamp_start, timestamp_finish; double elapsed; @@ -749,37 +748,37 @@ int main(int argc, char *argv[]) { "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ for (i = 0; sf[i + 1] && envinfo.me_mapsize / k > 1000.0; ++i) k *= 1024; - print(" - map size %" PRIuPTR " (%.2f %cb)\n", envinfo.me_mapsize, + print(" - map size %" PRIu64 " (%.2f %cb)\n", envinfo.me_mapsize, envinfo.me_mapsize / k, sf[i]); if (envinfo.me_mapaddr) print(" - mapaddr %p\n", envinfo.me_mapaddr); print(" - pagesize %u, max keysize %" PRIuPTR ", max readers %u\n", envstat.ms_psize, maxkeysize, envinfo.me_maxreaders); - print(" - transactions: last %" PRIuPTR ", bottom %" PRIuPTR - ", lag reading %" PRIiPTR "\n", + print(" - transactions: last %" PRIu64 ", bottom %" PRIu64 + ", lag reading %" PRIi64 "\n", envinfo.me_last_txnid, envinfo.me_tail_txnid, envinfo.me_last_txnid - envinfo.me_tail_txnid); - print(" - meta-1: %s %" PRIuPTR ", %s", - meta_synctype(envinfo.me_meta1_sign), envinfo.me_meta1_txnid, + print(" - meta-1: %s %" PRIu64 ", %s", meta_synctype(envinfo.me_meta1_sign), + envinfo.me_meta1_txnid, meta_lt(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, envinfo.me_meta2_txnid, envinfo.me_meta2_sign) ? 
"tail" : "head"); if (envinfo.me_meta1_txnid > envinfo.me_last_txnid) - print(", rolled-back %" PRIuPTR " (%" PRIuPTR " >>> %" PRIuPTR ")", + print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", envinfo.me_meta1_txnid - envinfo.me_last_txnid, envinfo.me_meta1_txnid, envinfo.me_last_txnid); print("\n"); - print(" - meta-2: %s %" PRIuPTR ", %s", - meta_synctype(envinfo.me_meta2_sign), envinfo.me_meta2_txnid, + print(" - meta-2: %s %" PRIu64 ", %s", meta_synctype(envinfo.me_meta2_sign), + envinfo.me_meta2_txnid, meta_lt(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, envinfo.me_meta1_txnid, envinfo.me_meta1_sign) ? "tail" : "head"); if (envinfo.me_meta2_txnid > envinfo.me_last_txnid) - print(", rolled-back %" PRIuPTR " (%" PRIuPTR " >>> %" PRIuPTR ")", + print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", envinfo.me_meta2_txnid - envinfo.me_last_txnid, envinfo.me_meta2_txnid, envinfo.me_last_txnid); print("\n"); @@ -792,7 +791,7 @@ int main(int argc, char *argv[]) { if (!meta_lt(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, envinfo.me_meta2_txnid, envinfo.me_meta2_sign) && envinfo.me_meta1_txnid != envinfo.me_last_txnid) { - print(" - meta-1 txn-id mismatch last-txn-id (%" PRIiPTR " != %" PRIiPTR + print(" - meta-1 txn-id mismatch last-txn-id (%" PRIi64 " != %" PRIi64 ")\n", envinfo.me_meta1_txnid, envinfo.me_last_txnid); ++problems_meta; @@ -801,7 +800,7 @@ int main(int argc, char *argv[]) { if (!meta_lt(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, envinfo.me_meta1_txnid, envinfo.me_meta1_sign) && envinfo.me_meta2_txnid != envinfo.me_last_txnid) { - print(" - meta-2 txn-id mismatch last-txn-id (%" PRIiPTR " != %" PRIiPTR + print(" - meta-2 txn-id mismatch last-txn-id (%" PRIi64 " != %" PRIi64 ")\n", envinfo.me_meta2_txnid, envinfo.me_last_txnid); ++problems_meta; @@ -810,12 +809,11 @@ int main(int argc, char *argv[]) { if (verbose) print(" - perform lite check last-txn-id with meta-pages (not a " "monopolistic mode)\n"); - size_t last = 
(envinfo.me_meta2_txnid > envinfo.me_meta1_txnid) - ? envinfo.me_meta2_txnid - : envinfo.me_meta1_txnid; + uint64_t last = (envinfo.me_meta2_txnid > envinfo.me_meta1_txnid) + ? envinfo.me_meta2_txnid + : envinfo.me_meta1_txnid; if (last != envinfo.me_last_txnid) { - print(" - last-meta mismatch last-txn-id (%" PRIiPTR " != %" PRIiPTR - ")\n", + print(" - last-meta mismatch last-txn-id (%" PRIi64 " != %" PRIi64 ")\n", last, envinfo.me_last_txnid); ++problems_meta; } @@ -852,6 +850,7 @@ int main(int argc, char *argv[]) { goto bailout; } + uint64_t n; for (n = 0; n < lastpgno; ++n) if (!walk.pagemap[n]) walk.dbi_pages[0] += 1; @@ -863,35 +862,34 @@ int main(int argc, char *argv[]) { } if (verbose) { - size_t total_page_bytes = walk.pgcount * envstat.ms_psize; - print(" - dbi pages: %" PRIuPTR " total", walk.pgcount); + uint64_t total_page_bytes = walk.pgcount * envstat.ms_psize; + print(" - dbi pages: %" PRIu64 " total", walk.pgcount); if (verbose > 1) for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) - print(", %s %" PRIuPTR "", walk.dbi_names[i], walk.dbi_pages[i]); - print(", %s %" PRIuPTR "\n", walk.dbi_names[0], walk.dbi_pages[0]); + print(", %s %" PRIu64 "", walk.dbi_names[i], walk.dbi_pages[i]); + print(", %s %" PRIu64 "\n", walk.dbi_names[0], walk.dbi_pages[0]); if (verbose > 1) { - print(" - space info: total %" PRIuPTR " bytes, payload %" PRIuPTR + print(" - space info: total %" PRIu64 " bytes, payload %" PRIu64 " (%.1f%%), unused " - "%" PRIuPTR " (%.1f%%)\n", + "%" PRIu64 " (%.1f%%)\n", total_page_bytes, walk.total_payload_bytes, walk.total_payload_bytes * 100.0 / total_page_bytes, total_page_bytes - walk.total_payload_bytes, (total_page_bytes - walk.total_payload_bytes) * 100.0 / total_page_bytes); for (i = 1; i < MAX_DBI && walk.dbi_names[i]; ++i) { - size_t dbi_bytes = walk.dbi_pages[i] * envstat.ms_psize; - print(" %s: subtotal %" PRIuPTR - " bytes (%.1f%%), payload %" PRIuPTR " (%.1f%%), " - "unused %" PRIuPTR " (%.1f%%)", + uint64_t dbi_bytes = 
walk.dbi_pages[i] * envstat.ms_psize; + print(" %s: subtotal %" PRIu64 " bytes (%.1f%%)," + " payload %" PRIu64 " (%.1f%%), unused %" PRIu64 " (%.1f%%)", walk.dbi_names[i], dbi_bytes, dbi_bytes * 100.0 / total_page_bytes, walk.dbi_payload_bytes[i], walk.dbi_payload_bytes[i] * 100.0 / dbi_bytes, dbi_bytes - walk.dbi_payload_bytes[i], (dbi_bytes - walk.dbi_payload_bytes[i]) * 100.0 / dbi_bytes); if (walk.dbi_empty_pages[i]) - print(", %" PRIuPTR " empty pages", walk.dbi_empty_pages[i]); + print(", %" PRIu64 " empty pages", walk.dbi_empty_pages[i]); if (walk.dbi_lost_bytes[i]) - print(", %" PRIuPTR " bytes lost", walk.dbi_lost_bytes[i]); + print(", %" PRIu64 " bytes lost", walk.dbi_lost_bytes[i]); print("\n"); } } @@ -914,40 +912,40 @@ int main(int argc, char *argv[]) { problems_freedb = process_db(0 /* FREE_DBI */, "free", handle_freedb, 0); if (verbose) { - size_t value = envinfo.me_mapsize / envstat.ms_psize; + uint64_t value = envinfo.me_mapsize / envstat.ms_psize; double percent = value / 100.0; - print(" - pages info: %" PRIuPTR " total", value); - print(", allocated %" PRIuPTR " (%.1f%%)", lastpgno, lastpgno / percent); + print(" - pages info: %" PRIu64 " total", value); + print(", allocated %" PRIu64 " (%.1f%%)", lastpgno, lastpgno / percent); if (verbose > 1) { value = envinfo.me_mapsize / envstat.ms_psize - lastpgno; - print(", remained %" PRIuPTR " (%.1f%%)", value, value / percent); + print(", remained %" PRIu64 " (%.1f%%)", value, value / percent); value = lastpgno - freedb_pages; - print(", used %" PRIuPTR " (%.1f%%)", value, value / percent); + print(", used %" PRIu64 " (%.1f%%)", value, value / percent); - print(", gc %" PRIuPTR " (%.1f%%)", freedb_pages, freedb_pages / percent); + print(", gc %" PRIu64 " (%.1f%%)", freedb_pages, freedb_pages / percent); value = freedb_pages - reclaimable_pages; - print(", detained %" PRIuPTR " (%.1f%%)", value, value / percent); + print(", detained %" PRIu64 " (%.1f%%)", value, value / percent); - print(", reclaimable 
%" PRIuPTR " (%.1f%%)", reclaimable_pages, + print(", reclaimable %" PRIu64 " (%.1f%%)", reclaimable_pages, reclaimable_pages / percent); } value = envinfo.me_mapsize / envstat.ms_psize - lastpgno + reclaimable_pages; - print(", available %" PRIuPTR " (%.1f%%)\n", value, value / percent); + print(", available %" PRIu64 " (%.1f%%)\n", value, value / percent); } if (problems_maindb == 0 && problems_freedb == 0) { if (!dont_traversal && (exclusive || locktxn)) { if (walk.pgcount != lastpgno - freedb_pages) { - error("used pages mismatch (%" PRIuPTR " != %" PRIuPTR ")\n", + error("used pages mismatch (%" PRIu64 " != %" PRIu64 ")\n", walk.pgcount, lastpgno - freedb_pages); } if (walk.dbi_pages[0] != freedb_pages) { - error("gc pages mismatch (%" PRIuPTR " != %" PRIuPTR ")\n", + error("gc pages mismatch (%" PRIu64 " != %" PRIu64 ")\n", walk.dbi_pages[0], freedb_pages); } } else if (verbose) { @@ -986,7 +984,7 @@ bailout: total_problems += problems_meta; if (total_problems || problems_maindb || problems_freedb) { - print("Total %" PRIuPTR " error(s) is detected, elapsed %.3f seconds.\n", + print("Total %" PRIu64 " error(s) is detected, elapsed %.3f seconds.\n", total_problems, elapsed); if (problems_meta || problems_maindb || problems_freedb) return EXIT_FAILURE_CHECK_MAJOR; diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index 92cf8905..ca4572a2 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -109,7 +109,7 @@ static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) { if (name) printf("database=%s\n", name); printf("type=btree\n"); - printf("mapsize=%" PRIuPTR "\n", info.me_mapsize); + printf("mapsize=%" PRIu64 "\n", info.me_mapsize); if (info.me_mapaddr) printf("mapaddr=%p\n", info.me_mapaddr); printf("maxreaders=%u\n", info.me_maxreaders); diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index f61db314..f942613b 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -119,7 +119,7 @@ static void readhdr(void) { ptr = 
memchr(dbuf.mv_data, '\n', dbuf.mv_size); if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data + STRLENOF("mapsize="), "%" PRIuPTR "", + i = sscanf((char *)dbuf.mv_data + STRLENOF("mapsize="), "%" PRIu64 "", &envinfo.me_mapsize); if (i != 1) { fprintf(stderr, "%s: line %" PRIiPTR ": invalid mapsize %s\n", prog, diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index 47102e1c..12375bf5 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -23,10 +23,10 @@ static void prstat(MDBX_stat *ms) { printf(" Page size: %u\n", ms->ms_psize); printf(" Tree depth: %u\n", ms->ms_depth); - printf(" Branch pages: %" PRIuPTR "\n", ms->ms_branch_pages); - printf(" Leaf pages: %" PRIuPTR "\n", ms->ms_leaf_pages); - printf(" Overflow pages: %" PRIuPTR "\n", ms->ms_overflow_pages); - printf(" Entries: %" PRIuPTR "\n", ms->ms_entries); + printf(" Branch pages: %" PRIu64 "\n", ms->ms_branch_pages); + printf(" Leaf pages: %" PRIu64 "\n", ms->ms_leaf_pages); + printf(" Overflow pages: %" PRIu64 "\n", ms->ms_overflow_pages); + printf(" Entries: %" PRIu64 "\n", ms->ms_entries); } static void usage(char *prog) { @@ -121,12 +121,12 @@ int main(int argc, char *argv[]) { (void)mdbx_env_info(env, &mei, sizeof(mei)); printf("Environment Info\n"); printf(" Map address: %p\n", mei.me_mapaddr); - printf(" Map size: %" PRIuPTR "\n", mei.me_mapsize); + printf(" Map size: %" PRIu64 "\n", mei.me_mapsize); printf(" Page size: %u\n", mst.ms_psize); - printf(" Max pages: %" PRIuPTR "\n", mei.me_mapsize / mst.ms_psize); - printf(" Number of pages used: %" PRIuPTR "\n", mei.me_last_pgno + 1); - printf(" Last transaction ID: %" PRIuPTR "\n", mei.me_last_txnid); - printf(" Tail transaction ID: %" PRIuPTR " (%" PRIiPTR ")\n", + printf(" Max pages: %" PRIu64 "\n", mei.me_mapsize / mst.ms_psize); + printf(" Number of pages used: %" PRIu64 "\n", mei.me_last_pgno + 1); + printf(" Last transaction ID: %" PRIu64 "\n", mei.me_last_txnid); + printf(" Tail transaction ID: %" PRIu64 " (%" PRIi64 
")\n", mei.me_tail_txnid, mei.me_tail_txnid - mei.me_last_txnid); printf(" Max readers: %u\n", mei.me_maxreaders); printf(" Number of readers used: %u\n", mei.me_numreaders); From 2f97939efdd1633d0507cf321ca93ada1310e42c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 15:42:14 +0300 Subject: [PATCH 142/303] mdbx: more cleanup mdbx_midl_sort(). --- src/mdbx.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index bf67a0e8..6c6941fa 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9411,11 +9411,11 @@ static void __hot mdbx_midl_xmerge(MDB_IDL idl, MDB_IDL merge) { /* Quicksort + Insertion sort for small arrays */ #define SMALL 8 #define MIDL_SWAP(a, b) \ - { \ - pgno_t itmp = (a); \ + do { \ + pgno_t tmp_pgno = (a); \ (a) = (b); \ - (b) = itmp; \ - } + (b) = tmp_pgno; \ + } while (0) static void __hot mdbx_midl_sort(MDB_IDL ids) { /* Max possible depth of int-indexed tree * 2 items/level */ @@ -9444,15 +9444,15 @@ static void __hot mdbx_midl_sort(MDB_IDL ids) { } else { k = (l + ir) >> 1; /* Choose median of left, center, right */ MIDL_SWAP(ids[k], ids[l + 1]); - if (ids[l] < ids[ir]) { + if (ids[l] < ids[ir]) MIDL_SWAP(ids[l], ids[ir]); - } - if (ids[l + 1] < ids[ir]) { + + if (ids[l + 1] < ids[ir]) MIDL_SWAP(ids[l + 1], ids[ir]); - } - if (ids[l] < ids[l + 1]) { + + if (ids[l] < ids[l + 1]) MIDL_SWAP(ids[l], ids[l + 1]); - } + i = l + 1; j = ir; a = ids[l + 1]; From 8828e90ff91ae7da38aae2d2d76c09f189ea5364 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 18:40:21 +0300 Subject: [PATCH 143/303] mdbx: mdbx_condmutex_t instead of mutex/condvar pair. 
--- src/lck-posix.c | 5 ++- src/mdbx.c | 27 +++++------- src/osal.c | 115 ++++++++++++++++++++++++++++++------------------ src/osal.h | 27 ++++++------ 4 files changed, 99 insertions(+), 75 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index 5aa818d8..f253c4f8 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -32,7 +32,7 @@ /*----------------------------------------------------------------------------*/ /* rthc */ -static mdbx_mutex_t mdbx_rthc_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t mdbx_rthc_mutex = PTHREAD_MUTEX_INITIALIZER; void mdbx_rthc_lock(void) { mdbx_ensure(NULL, pthread_mutex_lock(&mdbx_rthc_mutex) == 0); @@ -273,7 +273,8 @@ int mdbx_lck_seize(MDB_env *env) { #define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) #endif -static int __cold mdbx_mutex_failed(MDB_env *env, mdbx_mutex_t *mutex, int rc) { +static int __cold mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, + int rc) { #if MDB_USE_ROBUST if (rc == EOWNERDEAD) { /* We own the mutex. Clean up after dead previous owner. 
*/ diff --git a/src/mdbx.c b/src/mdbx.c index 6c6941fa..c06485c1 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -8137,8 +8137,7 @@ int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, typedef struct mdbx_copy { MDB_env *mc_env; MDB_txn *mc_txn; - mdbx_mutex_t mc_mutex; - mdbx_cond_t mc_cond; /* Condition variable for mc_new */ + mdbx_condmutex_t mc_condmutex; char *mc_wbuf[2]; char *mc_over[2]; int mc_wlen[2]; @@ -8158,10 +8157,10 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { char *ptr; int toggle = 0, wsize; - mdbx_mutex_lock(&my->mc_mutex); + mdbx_condmutex_lock(&my->mc_condmutex); while (!my->mc_error) { while (!my->mc_new) - mdbx_cond_wait(&my->mc_cond, &my->mc_mutex); + mdbx_condmutex_wait(&my->mc_condmutex); if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ break; wsize = my->mc_wlen[toggle]; @@ -8184,9 +8183,9 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { toggle ^= 1; /* Return the empty buffer to provider */ my->mc_new--; - mdbx_cond_signal(&my->mc_cond); + mdbx_condmutex_signal(&my->mc_condmutex); } - mdbx_mutex_unlock(&my->mc_mutex); + mdbx_condmutex_unlock(&my->mc_condmutex); return (THREAD_RESULT)0; } @@ -8195,12 +8194,12 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { * [in] my control structure. * [in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). 
*/ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { - mdbx_mutex_lock(&my->mc_mutex); + mdbx_condmutex_lock(&my->mc_condmutex); my->mc_new += adjust; - mdbx_cond_signal(&my->mc_cond); + mdbx_condmutex_signal(&my->mc_condmutex); while (my->mc_new & 2) /* both buffers in use */ - mdbx_cond_wait(&my->mc_cond, &my->mc_mutex); - mdbx_mutex_unlock(&my->mc_mutex); + mdbx_condmutex_wait(&my->mc_condmutex); + mdbx_condmutex_unlock(&my->mc_condmutex); my->mc_toggle ^= (adjust & 1); /* Both threads reset mc_wlen, to be safe from threading errors */ @@ -8376,10 +8375,8 @@ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { int rc; memset(&my, 0, sizeof(my)); - if ((rc = mdbx_mutex_init(&my.mc_mutex)) != 0) + if ((rc = mdbx_condmutex_init(&my.mc_condmutex)) != 0) return rc; - if ((rc = mdbx_cond_init(&my.mc_cond)) != 0) - goto done2; rc = mdbx_memalign_alloc(env->me_os_psize, MDB_WBUF * 2, (void **)&my.mc_wbuf[0]); if (rc != MDB_SUCCESS) @@ -8457,9 +8454,7 @@ finish: done: mdbx_memalign_free(my.mc_wbuf[0]); - mdbx_cond_destroy(&my.mc_cond); -done2: - mdbx_mutex_destroy(&my.mc_mutex); + mdbx_condmutex_destroy(&my.mc_condmutex); return rc ? rc : my.mc_error; } diff --git a/src/osal.c b/src/osal.c index 0e887dd4..5a0f4009 100644 --- a/src/osal.c +++ b/src/osal.c @@ -164,77 +164,104 @@ void mdbx_memalign_free(void *ptr) { /*----------------------------------------------------------------------------*/ -int mdbx_mutex_init(mdbx_mutex_t *mutex) { +int mdbx_condmutex_init(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) - *mutex = CreateMutex(NULL, FALSE, NULL); - return *mutex ? 
MDB_SUCCESS : mdbx_get_errno_checked(); + int rc = MDB_SUCCESS; + condmutex->event = NULL; + condmutex->mutex = CreateMutex(NULL, FALSE, NULL); + if (!condmutex->mutex) + return mdbx_get_errno_checked(); + + condmutex->event = CreateEvent(NULL, FALSE, FALSE, NULL); + if (!condmutex->event) { + rc = mdbx_get_errno_checked(); + (void)CloseHandle(condmutex->mutex); + condmutex->mutex = NULL; + } + return rc; #else - return pthread_mutex_init(mutex, NULL); + memset(condmutex, 0, sizeof(mdbx_condmutex_t)); + int rc = pthread_mutex_init(&condmutex->mutex, NULL); + if (rc == 0) { + rc = pthread_cond_init(&condmutex->cond, NULL); + if (rc != 0) + (void)pthread_mutex_destroy(&condmutex->mutex); + } + return rc; #endif } -int mdbx_mutex_destroy(mdbx_mutex_t *mutex) { -#if defined(_WIN32) || defined(_WIN64) - return CloseHandle(*mutex) ? MDB_SUCCESS : mdbx_get_errno_checked(); -#else - return pthread_mutex_destroy(mutex); -#endif +static bool is_allzeros(const void *ptr, size_t bytes) { + const uint8_t *u8 = ptr; + for (size_t i = 0; i < bytes; ++i) + if (u8[i] != 0) + return false; + return true; } -int mdbx_mutex_lock(mdbx_mutex_t *mutex) { +int mdbx_condmutex_destroy(mdbx_condmutex_t *condmutex) { + int rc = MDBX_EINVAL; #if defined(_WIN32) || defined(_WIN64) - DWORD code = WaitForSingleObject(*mutex, INFINITE); + if (condmutex->event) { + rc = CloseHandle(condmutex->event) ? MDB_SUCCESS : mdbx_get_errno_checked(); + if (rc == MDB_SUCCESS) + condmutex->event = NULL; + } + if (condmutex->mutex) { + rc = CloseHandle(condmutex->mutex) ? 
MDB_SUCCESS : mdbx_get_errno_checked(); + if (rc == MDB_SUCCESS) + condmutex->mutex = NULL; + } +#else + if (!is_allzeros(&condmutex->cond, sizeof(condmutex->cond))) { + rc = pthread_cond_destroy(&condmutex->cond); + if (rc == 0) + memset(&condmutex->cond, 0, sizeof(condmutex->cond)); + } + if (!is_allzeros(&condmutex->mutex, sizeof(condmutex->mutex))) { + rc = pthread_mutex_destroy(&condmutex->mutex); + if (rc == 0) + memset(&condmutex->mutex, 0, sizeof(condmutex->mutex)); + } +#endif + return rc; +} + +int mdbx_condmutex_lock(mdbx_condmutex_t *condmutex) { +#if defined(_WIN32) || defined(_WIN64) + DWORD code = WaitForSingleObject(condmutex->mutex, INFINITE); return waitstatus2errcode(code); #else - return pthread_mutex_lock(mutex); + return pthread_mutex_lock(&condmutex->mutex); #endif } -int mdbx_mutex_unlock(mdbx_mutex_t *mutex) { +int mdbx_condmutex_unlock(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) - return ReleaseMutex(*mutex) ? MDB_SUCCESS : mdbx_get_errno_checked(); + return ReleaseMutex(condmutex->mutex) ? MDB_SUCCESS + : mdbx_get_errno_checked(); #else - return pthread_mutex_unlock(mutex); + return pthread_mutex_unlock(&condmutex->mutex); #endif } -/*----------------------------------------------------------------------------*/ - -int mdbx_cond_init(mdbx_cond_t *cond) { +int mdbx_condmutex_signal(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) - *cond = CreateEvent(NULL, FALSE, FALSE, NULL); - return *cond ? MDB_SUCCESS : mdbx_get_errno_checked(); + return SetEvent(condmutex->event) ? MDB_SUCCESS : mdbx_get_errno_checked(); #else - return pthread_cond_init(cond, NULL); + return pthread_cond_signal(&condmutex->cond); #endif } -#ifndef mdbx_cond_destroy -int mdbx_cond_destroy(mdbx_cond_t *cond) { +int mdbx_condmutex_wait(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) - return CloseHandle(*cond) ? 
MDB_SUCCESS : mdbx_get_errno_checked(); -#else - return pthread_cond_destroy(cond); -#endif -} -#endif /* mdbx_cond_destroy */ - -int mdbx_cond_signal(mdbx_cond_t *cond) { -#if defined(_WIN32) || defined(_WIN64) - return SetEvent(*cond) ? MDB_SUCCESS : mdbx_get_errno_checked(); -#else - return pthread_cond_signal(cond); -#endif -} - -int mdbx_cond_wait(mdbx_cond_t *cond, mdbx_mutex_t *mutex) { -#if defined(_WIN32) || defined(_WIN64) - DWORD code = SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); + DWORD code = + SignalObjectAndWait(condmutex->mutex, condmutex->event, INFINITE, FALSE); if (code == WAIT_OBJECT_0) - code = WaitForSingleObject(*mutex, INFINITE); + code = WaitForSingleObject(condmutex->mutex, INFINITE); return waitstatus2errcode(code); #else - return pthread_cond_wait(cond, mutex); + return pthread_cond_wait(&condmutex->cond, &condmutex->mutex); #endif } diff --git a/src/osal.h b/src/osal.h index 95037d85..661337ca 100644 --- a/src/osal.h +++ b/src/osal.h @@ -57,8 +57,6 @@ #include #define HAVE_SYS_STAT_H #define HAVE_SYS_TYPES_H -typedef HANDLE mdbx_mutex_t; -typedef HANDLE mdbx_cond_t; typedef HANDLE mdbx_thread_t; typedef unsigned mdbx_thread_key_t; typedef SSIZE_T ssize_t; @@ -66,6 +64,10 @@ typedef SSIZE_T ssize_t; #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? 
((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI #define THREAD_RESULT DWORD +typedef struct { + HANDLE mutex; + HANDLE event; +} mdbx_condmutex_t; #else #include #include @@ -74,13 +76,15 @@ typedef SSIZE_T ssize_t; #include #include #include -typedef pthread_mutex_t mdbx_mutex_t; -typedef pthread_cond_t mdbx_cond_t; typedef pthread_t mdbx_thread_t; typedef pthread_key_t mdbx_thread_key_t; #define INVALID_HANDLE_VALUE (-1) #define THREAD_CALL #define THREAD_RESULT void * +typedef struct { + pthread_mutex_t mutex; + pthread_cond_t cond; +} mdbx_condmutex_t; #endif /* Platform */ #ifndef SSIZE_MAX @@ -384,15 +388,12 @@ static __inline int __mdbx_get_errno_checked(const char *file, unsigned line) { int mdbx_memalign_alloc(size_t alignment, size_t bytes, void **result); void mdbx_memalign_free(void *ptr); -int mdbx_mutex_init(mdbx_mutex_t *mutex); -int mdbx_mutex_destroy(mdbx_mutex_t *mutex); -int mdbx_mutex_lock(mdbx_mutex_t *mutex); -int mdbx_mutex_unlock(mdbx_mutex_t *mutex); - -int mdbx_cond_init(mdbx_cond_t *cond); -int mdbx_cond_destroy(mdbx_cond_t *cond); -int mdbx_cond_signal(mdbx_cond_t *cond); -int mdbx_cond_wait(mdbx_cond_t *cond, mdbx_mutex_t *mutex); +int mdbx_condmutex_init(mdbx_condmutex_t *condmutex); +int mdbx_condmutex_lock(mdbx_condmutex_t *condmutex); +int mdbx_condmutex_unlock(mdbx_condmutex_t *condmutex); +int mdbx_condmutex_signal(mdbx_condmutex_t *condmutex); +int mdbx_condmutex_wait(mdbx_condmutex_t *condmutex); +int mdbx_condmutex_destroy(mdbx_condmutex_t *condmutex); int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, off_t offset, size_t expected_written); From aa80ef7e71573ec199f4deb003e6328f5fa669f3 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 17 May 2017 20:10:56 +0300 Subject: [PATCH 144/303] test: add hill testcase. 
--- test/cases.cc | 60 +++++++++++----- test/config.cc | 16 ++++- test/config.h | 128 ++++++++++++++++++++++++++++++++- test/hill.cc | 191 ++++++++++++++++++++++++++++++++++++++++++++++++- test/keygen.cc | 181 +++++++++++++++++++++++++++++++++++++++++++--- test/keygen.h | 116 +++++++++++++++--------------- test/main.cc | 50 ++++++++++--- test/test.cc | 96 +++++++++++++++++++++---- test/test.h | 25 ++++++- test/utils.cc | 42 +++++++++-- test/utils.h | 21 ++++-- 11 files changed, 795 insertions(+), 131 deletions(-) diff --git a/test/cases.cc b/test/cases.cc index 09da2103..1311f12e 100644 --- a/test/cases.cc +++ b/test/cases.cc @@ -14,8 +14,8 @@ #include "test.h" -void configure_actor(unsigned &lastid, const actor_testcase testcase, - const char *id_cstr, const actor_params ¶ms) { +void configure_actor(unsigned &last_space_id, const actor_testcase testcase, + const char *space_id_cstr, const actor_params ¶ms) { unsigned wait4id = 0; if (params.waitfor_nops) { @@ -33,40 +33,64 @@ void configure_actor(unsigned &lastid, const actor_testcase testcase, failure("No previous waitable actor for %u-ops\n", params.waitfor_nops); } - unsigned id = 0; - if (!id_cstr || strcmp(id_cstr, "auto") == 0) - id = lastid + 1; + unsigned space_id = 0; + if (!space_id_cstr || strcmp(space_id_cstr, "auto") == 0) + space_id = last_space_id + 1; else { char *end = nullptr; errno = 0; - id = strtoul(id_cstr, &end, 0); + space_id = strtoul(space_id_cstr, &end, 0); if (errno) - failure_perror("Expects an integer value for actor-id\n", errno); + failure_perror("Expects an integer value for space-id\n", errno); if (end && *end) - failure("The '%s' is unexpected for actor-id\n", end); + failure("The '%s' is unexpected for space-id\n", end); } - if (id < 1 || id > ACTOR_ID_MAX) - failure("Invalid actor-id %u\n", id); - lastid = id; + if (space_id > ACTOR_ID_MAX) + failure("Invalid space-id %u\n", space_id); + last_space_id = space_id; - log_trace("configure_actor: %u for %s", id, 
testcase2str(testcase)); - global::actors.emplace_back(actor_config(testcase, params, id, wait4id)); + log_trace("configure_actor: space %u for %s", space_id, + testcase2str(testcase)); + global::actors.emplace_back( + actor_config(testcase, params, space_id, wait4id)); global::databases.insert(params.pathname_db); } void testcase_setup(const char *casename, actor_params ¶ms, - unsigned &lastid) { + unsigned &last_space_id) { if (strcmp(casename, "basic") == 0) { log_notice(">>> testcase_setup(%s)", casename); - configure_actor(lastid, ac_hill, nullptr, params); - configure_actor(lastid, ac_jitter, nullptr, params); - configure_actor(lastid, ac_jitter, nullptr, params); - configure_actor(lastid, ac_jitter, nullptr, params); + configure_actor(last_space_id, ac_jitter, nullptr, params); + configure_actor(last_space_id, ac_hill, nullptr, params); + configure_actor(last_space_id, ac_jitter, nullptr, params); + configure_actor(last_space_id, ac_hill, nullptr, params); + configure_actor(last_space_id, ac_jitter, nullptr, params); + configure_actor(last_space_id, ac_hill, nullptr, params); log_notice("<<< testcase_setup(%s): done", casename); } else { failure("unknown testcase `%s`", casename); } } +void keycase_setup(const char *casename, actor_params ¶ms) { + if (strcmp(casename, "random") == 0 || strcmp(casename, "prng") == 0) { + log_notice(">>> keycase_setup(%s)", casename); + params.keygen.keycase = kc_random; + // TODO + log_notice("<<< keycase_setup(%s): done", casename); + } else if (strcmp(casename, "dashes") == 0 || + strcmp(casename, "aside") == 0) { + log_notice(">>> keycase_setup(%s)", casename); + params.keygen.keycase = kc_dashes; + // TODO + log_notice("<<< keycase_setup(%s): done", casename); + } else if (strcmp(casename, "custom") == 0) { + log_notice("=== keycase_setup(%s): skip", casename); + params.keygen.keycase = kc_custom; + } else { + failure("unknown keycase `%s`", casename); + } +} + /* TODO */ diff --git a/test/config.cc b/test/config.cc index 
743e022a..d2e6dd12 100644 --- a/test/config.cc +++ b/test/config.cc @@ -176,6 +176,16 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, return true; } +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + uint8_t &value, const uint8_t minval, const uint8_t maxval) { + + uint64_t huge; + if (!parse_option(argc, argv, narg, option, huge, no_scale, minval, maxval)) + return false; + value = (uint8_t)huge; + return true; +} + bool parse_option(int argc, char *const argv[], int &narg, const char *option, bool &value) { const char *value_cstr = NULL; @@ -268,6 +278,8 @@ void dump(const char *title) { logging::local_suffix indent(title); for (auto i = global::actors.begin(); i != global::actors.end(); ++i) { + const std::string tableid = + i->space_id ? "MAINDB" : ("SUB#" + std::to_string(i->space_id)); log_info("#%u, testcase %s, space_id/table %u\n", i->actor_id, testcase2str(i->testcase), i->space_id); indent.push(); @@ -284,8 +296,6 @@ void dump(const char *title) { dump_verbs("mode", i->params.mode_flags, mode_bits); dump_verbs("table", i->params.table_flags, table_bits); - log_info("seed %u\n", i->params.seed); - if (i->params.test_nops) log_info("iterations/records %u\n", i->params.test_nops); else @@ -298,6 +308,8 @@ void dump(const char *title) { log_info("threads %u\n", i->params.nthreads); + log_info("keygen.case: %s\n", keygencase2str(i->params.keygen.keycase)); + log_info("keygen.seed: %u\n", i->params.keygen.seed); log_info("key: minlen %u, maxlen %u\n", i->params.keylen_min, i->params.keylen_max); log_info("data: minlen %u, maxlen %u\n", i->params.datalen_min, diff --git a/test/config.h b/test/config.h index c0a04f93..91ea4a24 100644 --- a/test/config.h +++ b/test/config.h @@ -34,6 +34,15 @@ enum actor_status { const char *testcase2str(const actor_testcase); const char *status2str(actor_status status); +enum keygen_case { + kc_random, /* [ 6.. 2.. 7.. 4.. 0.. 1.. 5.. 3.. 
] */ + kc_dashes, /* [ 0123.. 4567.. ] */ + kc_custom, + /* TODO: more cases */ +}; + +const char *keygencase2str(const keygen_case); + //----------------------------------------------------------------------------- namespace config { @@ -65,17 +74,129 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, unsigned &value, const scale_mode scale, const unsigned minval = 0, const unsigned maxval = INT32_MAX); +bool parse_option(int argc, char *const argv[], int &narg, const char *option, + uint8_t &value, const uint8_t minval = 0, + const uint8_t maxval = 255); + //----------------------------------------------------------------------------- #pragma pack(push, 1) +struct keygen_params_pod { + keygen_case keycase; + + /* Параметры генератора пар key-value. + * + * Ключи и значения генерируются по задаваемым параметрам на основе "плоской" + * исходной координаты. При этом, в общем случае, в процессе тестов исходная + * координата последовательно итерируется в заданном диапазоне, а необходимые + * паттерны/последовательности/узоры получаются за счет преобразования + * исходной координаты, согласно описанным ниже параметрам. + * + * Стоит отметить, что порядок описания параметров для удобства совпадает с + * порядком их использования, т.е. с порядком соответствующих преобразований. + * + * Второе важное замечание касается ограничений одновременной координированной + * генерации паттернов как для ключей, так и для значений. Суть в том, что + * такая возможность не нужна по следующим причинам: + * - libmdbx поддерживает два существенно различающихся вида таблиц, + * "уникальные" (без дубликатов и без multi-value), и так называемые + * "с дубликатами" (с multi-value). + * - Для таблиц "без дубликатов" только размер связанных с ключами значений + * (данных) оказывает влияние на работу движка, непосредственно содержимое + * данных не анализируется движком и не оказывает влияния на его работу. 
+ * - Для таблиц "с дубликатами", при наличии более одного значения для + * некоторого ключа, формируется дочернее btree-поддерево. Это дерево + * формируется в отдельном "кусте" страниц и обслуживается независимо + * от окружения родительского ключа. + * - Таким образом, паттерн генерации значений имеет смысл только для + * таблиц "с дубликатами" и только в контексте одного значения ключа. + * Иначе говоря, нет смысла в со-координации генерации паттернов для + * ключей и значений. Более того, генерацию значений всегда необходимо + * рассматривать в контексте связки с одним значением ключа. + * + * width: + * Большинство тестов предполагают создание или итерирование некоторого + * количества записей. При этом требуется итерирование или генерация + * значений и ключей из некоторого ограниченного пространства вариантов. + * + * Параметр width задает такую ширину пространства вариантов в битах. + * Таким образом мощность пространства вариантов (пока) всегда равна + * степени двойки. Это ограничение можно снять, но ценой увеличения + * вычислительной сложности, включая потерю простоты и прозрачности. + * + * С другой стороны, не-битовый width может быть полезен: + * - Позволит генерировать ключи/значения в точно задаваемом диапазоне. + * Например, перебрать в псевдо-случайном порядке 10001 значение. + * - Позволит поровну разделять заданное пространство (диапазон) + * ключей/значений между количеством потоков некратным степени двойки. + * + * mesh и seed: + * Позволяют получить псевдо-случайные последовательности ключей/значений. + * Параметр mesh задает сколько младших бит исходной плоской координаты + * будет "перемешано" (инъективно отображено), а параметр seed позволяет + * выбрать конкретный вариант "перемешивания". + * + * Перемешивание выполняется при ненулевом значении mesh. 
Перемешивание + * реализуется посредством применения двух инъективных функций для + * заданного количества бит: + * - применяется первая инъективная функция; + * - к результату добавляется salt, полученный из seed; + * - применяется вторая инъективная функция; + * + * Следует отметить, что mesh умышленно позволяет перемешать только младшую + * часть, что при ненулевом значении split (см. далее) не позволяет получать + * псевдо-случайные значения ключей без псевдо-случайности в значениях. + * + * Такое ограничение соответствует внутренней алгоритмике libmdbx. Проще + * говоря, мы можем проверить движок псевдо-случайной последовательностью + * ключей на таблицах без дубликатов (без multi-value), а затем проверить + * корректность работы псевдо-случайной последовательностью значений на + * таблицах с дубликатами (с multi-value), опционально добавляя + * псевдо-случайности к последовательности ключей. Однако, нет смысла + * генерировать псевдо-случайные ключи, одновременно с формированием + * какого-либо паттерна в значениях, так как содержимое в данных либо + * не будет иметь значения (для таблиц без дубликатов), либо будет + * обрабатываться в отдельных btree-поддеревьях. + * + * rotate и offset: + * Для проверки слияния и разделения страниц внутри движка требуется + * генерация ключей/значений в виде не-смежных последовательностей, как бы + * в виде "пунктира", который постепенно заполняет весь заданный диапазон. + * + * Параметры позволяют генерировать такой "пунктир". Соответственно rotate + * задает циклический сдвиг вправо, а offset задает смещение, точнее говоря + * сложение по модулю внутри диапазона, заданного посредством width. + * + * Например, при rotate равном 1 (циклический сдвиг вправо на 1 бит), + * четные и нечетные исходные значения сложатся в две линейные + * последовательности, которые постепенно закроют старшую и младшую + * половины диапазона. 
+ * + * split: + * Для таблиц без дубликатов (без multi-value ключей) фактически требуется + * генерация только ключей, а данные могут быть постоянным. Но для таблиц с + * дубликатами (с multi-value ключами) также требуется генерация значений. + * + * Ненулевое значение параметра split фактически включает генерацию значений, + * при этом значение split определяет сколько бит исходного абстрактного + * номера будет отрезано для генерации значения. + */ + + uint8_t width; + uint8_t mesh; + uint8_t rotate; + uint8_t split; + uint32_t seed; + uint64_t offset; +}; + struct actor_params_pod { unsigned loglevel; size_t mode_flags; size_t table_flags; uint64_t size; - unsigned seed; unsigned test_duration; unsigned test_nops; @@ -91,10 +212,11 @@ struct actor_params_pod { unsigned delaystart; unsigned waitfor_nops; - bool drop_table; - unsigned max_readers; unsigned max_tables; + keygen_params_pod keygen; + + bool drop_table; }; struct actor_config_pod { diff --git a/test/hill.cc b/test/hill.cc index 0a7d2fd7..daa6e04e 100644 --- a/test/hill.cc +++ b/test/hill.cc @@ -27,7 +27,196 @@ bool testcase_hill::setup() { bool testcase_hill::run() { db_open(); - /* TODO */ + + txn_begin(false); + MDB_dbi dbi = db_table_open(true); + txn_end(false); + + /* LY: тест "холмиком": + * - сначала наполняем таблицу циклическими CRUD-манипуляциями, + * которые в каждом цикле делают несколько операций, включая удаление, + * но в результате добавляют записи. + * - затем очищаем таблицу также CRUD-манипуляциями, но уже с другой + * пропорцией удалений. + * + * При этом очень многое зависит от порядка перебора ключей: + * - (псевдо)случайное распределение требуется лишь для полноты картины, + * но в целом не покрывает важных кейсов. + * - кроме (псевдо)случайного перебора требуется последовательное + * итерирование ключей интервалами различной ширины, с тем чтобы + * проверить различные варианты как разделения, так и слияния страниц + * внутри движка. 
+ * - при не-уникальных ключах (MDB_DUPSORT с подвариантами), для каждого + * повтора внутри движка формируется вложенное btree-дерево, + * соответственно требуется соблюдение аналогичных принципов + * итерирования для значений. + */ + + /* TODO: работа в несколько потоков */ + keyvalue_maker.setup(config.params, 0 /* thread_number */); + + keygen::buffer a_key = keygen::alloc(config.params.keylen_max); + keygen::buffer a_data_0 = keygen::alloc(config.params.datalen_max); + keygen::buffer a_data_1 = keygen::alloc(config.params.datalen_max); + keygen::buffer b_key = keygen::alloc(config.params.keylen_max); + keygen::buffer b_data = keygen::alloc(config.params.datalen_max); + + const unsigned insert_flags = (config.params.table_flags & MDB_DUPSORT) + ? MDB_NODUPDATA + : MDB_NODUPDATA | MDB_NOOVERWRITE; + const unsigned update_flags = MDB_CURRENT | MDB_NODUPDATA | MDB_NOOVERWRITE; + + uint64_t serial_count = 0; + unsigned txn_nops = 0; + if (!txn_guard) + txn_begin(false); + + while (should_continue()) { + const keygen::serial_t a_serial = serial_count; + if (unlikely(!keyvalue_maker.increment(serial_count, 1))) + failure("uphill: unexpected key-space overflow"); + + const keygen::serial_t b_serial = serial_count; + assert(b_serial > a_serial); + + // создаем первую запись из пары + const keygen::serial_t age_shift = UINT64_C(1) << (a_serial % 31); + log_trace("uphill: insert-a (age %" PRIu64 ") %" PRIu64, age_shift, + a_serial); + generate_pair(a_serial, a_key, a_data_1, age_shift); + int rc = mdbx_put(txn_guard.get(), dbi, &a_key->value, &a_data_1->value, + insert_flags); + if (unlikely(rc != MDB_SUCCESS)) + failure_perror("mdbx_put(insert-a.1)", rc); + + if (++txn_nops >= config.params.batch_write) { + txn_restart(false, false); + txn_nops = 0; + } + + // создаем вторую запись из пары + log_trace("uphill: insert-b %" PRIu64, b_serial); + generate_pair(b_serial, b_key, b_data, 0); + rc = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value, + insert_flags); 
+ if (unlikely(rc != MDB_SUCCESS)) + failure_perror("mdbx_put(insert-b)", rc); + + if (++txn_nops >= config.params.batch_write) { + txn_restart(false, false); + txn_nops = 0; + } + + // обновляем данные в первой записи + log_trace("uphill: update-a (age %" PRIu64 "->0) %" PRIu64, age_shift, + a_serial); + generate_pair(a_serial, a_key, a_data_0, 0); + rc = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_0->value, + &a_data_1->value, update_flags); + if (unlikely(rc != MDB_SUCCESS)) + failure_perror("mdbx_put(update-a: 1->0)", rc); + + if (++txn_nops >= config.params.batch_write) { + txn_restart(false, false); + txn_nops = 0; + } + + // удаляем вторую запись + log_trace("uphill: delete-b %" PRIu64, b_serial); + rc = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value); + if (unlikely(rc != MDB_SUCCESS)) + failure_perror("mdbx_del(b)", rc); + + if (++txn_nops >= config.params.batch_write) { + txn_restart(false, false); + txn_nops = 0; + } + + report(1); + if (!keyvalue_maker.increment(serial_count, 1)) { + // дошли до границы пространства ключей + serial_count = a_serial; + goto overflow; + } + } + + while (serial_count > 0) { + if (unlikely(!keyvalue_maker.increment(serial_count, -2))) + failure("downhill: unexpected key-space underflow"); + + overflow: + const keygen::serial_t a_serial = serial_count; + const keygen::serial_t b_serial = a_serial + 1; + assert(b_serial > a_serial); + + // обновляем первую запись из пары + const keygen::serial_t age_shift = UINT64_C(1) << (a_serial % 31); + log_trace("downhill: update-a (age 0->%" PRIu64 ") %" PRIu64, age_shift, + a_serial); + generate_pair(a_serial, a_key, a_data_0, 0); + generate_pair(a_serial, a_key, a_data_1, age_shift); + if (a_serial == 808) + log_trace("!!!"); + int rc = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_1->value, + &a_data_0->value, update_flags); + if (unlikely(rc != MDB_SUCCESS)) + failure_perror("mdbx_put(update-a: 0->1)", rc); + + if (++txn_nops >= 
config.params.batch_write) { + txn_restart(false, false); + txn_nops = 0; + } + + // создаем вторую запись из пары + log_trace("downhill: insert-b %" PRIu64, b_serial); + generate_pair(b_serial, b_key, b_data, 0); + rc = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value, + insert_flags); + if (unlikely(rc != MDB_SUCCESS)) + failure_perror("mdbx_put(insert-b)", rc); + + if (++txn_nops >= config.params.batch_write) { + txn_restart(false, false); + txn_nops = 0; + } + + // удаляем первую запись + log_trace("downhill: delete-a (age %" PRIu64 ") %" PRIu64, age_shift, + a_serial); + rc = mdbx_del(txn_guard.get(), dbi, &a_key->value, &a_data_1->value); + if (unlikely(rc != MDB_SUCCESS)) + failure_perror("mdbx_del(a)", rc); + + if (++txn_nops >= config.params.batch_write) { + txn_restart(false, false); + txn_nops = 0; + } + + // удаляем вторую запись + log_trace("downhill: delete-b %" PRIu64, b_serial); + rc = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value); + if (unlikely(rc != MDB_SUCCESS)) + failure_perror("mdbx_del(b)", rc); + + if (++txn_nops >= config.params.batch_write) { + txn_restart(false, false); + txn_nops = 0; + } + + report(1); + } + + if (txn_guard) + txn_end(false); + + if (dbi) { + if (config.params.drop_table && !mode_readonly()) { + txn_begin(false); + db_table_drop(dbi); + txn_end(false); + } else + db_table_close(dbi); + } return true; } diff --git a/test/keygen.cc b/test/keygen.cc index 20c80a2a..0d7c0409 100644 --- a/test/keygen.cc +++ b/test/keygen.cc @@ -16,18 +16,179 @@ namespace keygen { -size_t ffs_fallback(serial_t serial) { - size_t bit = sizeof(serial_t) * 8 - 1; - auto mask = (serial_t)1u << bit; - do { - if (serial & mask) - return bit; - --bit; - } while (mask >>= 1); - return 0; +static inline __pure_function serial_t mask(unsigned bits) { + assert(bits > 0 && bits <= serial_maxwith); + return serial_allones >> (serial_maxwith - bits); } -void __hot make(const serial_t serial, const params_t ¶ms, result_t &out) { 
+/* LY: https://en.wikipedia.org/wiki/Injective_function */ +serial_t injective(const serial_t serial, + const unsigned bits /* at least serial_minwith (8) */, + const serial_t salt) { + assert(bits > serial_minwith && bits <= serial_maxwith); + + /* LY: All these "magic" prime numbers were found + * and verified with a bit of brute force. */ + + static const uint64_t m[64 - serial_minwith] = { + /* 8 - 24 */ + 113, 157, 397, 653, 1753, 5641, 9697, 23873, 25693, 80833, 105953, 316937, + 309277, 834497, 1499933, 4373441, 10184137, + /* 25 - 64 */ + 10184137, 17279209, 33990377, 67295161, 284404553, 1075238767, 6346721573, + 6924051577, 19204053433, 45840188887, 53625693977, 73447827913, + 141638870249, 745683604649, 1283334050489, 1100828289853, 2201656586197, + 5871903036137, 11238507001417, 45264020802263, 105008404482889, + 81921776907059, 199987980256399, 307207457507641, 946769023178273, + 2420886491930041, 3601632139991929, 11984491914483833, 21805846439714153, + 23171543400565993, 53353226456762893, 155627817337932409, + 227827205384840249, 816509268558278821, 576933057762605689, + 2623957345935638441, 5048241705479929949, 4634245581946485653}; + static const uint8_t s[64 - serial_minwith] = { + /* 8 - 24 */ + 2, 3, 4, 4, 2, 4, 3, 3, 7, 3, 3, 4, 8, 3, 10, 3, 11, + /* 25 - 64 */ + 11, 9, 9, 9, 11, 10, 5, 14, 11, 16, 14, 12, 13, 16, 19, 10, 10, 21, 7, 20, + 10, 14, 22, 19, 3, 21, 18, 19, 26, 24, 2, 21, 25, 29, 24, 10, 11, 14}; + + serial_t result = serial * m[bits - 8]; + if (salt) { + const unsigned left = bits / 2; + const unsigned right = bits - left; + result = (result << left) | ((result & mask(bits)) >> right); + result = (result ^ salt) * m[bits - 8]; + } + + result ^= result << s[bits - 8]; + result &= mask(bits); + log_trace("keygen-injective: serial %" PRIu64 " into %" PRIu64, serial, + result); + return result; +} + +void __hot maker::pair(serial_t serial, const buffer &key, buffer &value, + serial_t value_age) { + assert(mapping.width >= 
serial_minwith && mapping.width <= serial_maxwith); + assert(mapping.split <= mapping.width); + assert(mapping.mesh <= mapping.width); + assert(mapping.rotate <= mapping.width); + assert(mapping.offset <= mask(mapping.width)); + assert(!(key_essentials.flags & (MDB_INTEGERDUP | MDB_REVERSEDUP))); + assert(!(value_essentials.flags & (MDB_INTEGERKEY | MDB_REVERSEKEY))); + + log_trace("keygen-pair: serial %" PRIu64 ", data-age %" PRIu64, serial, + value_age); + + if (mapping.mesh >= serial_minwith) { + serial = + (serial & ~mask(mapping.mesh)) | injective(serial, mapping.mesh, salt); + log_trace("keygen-pair: mesh %" PRIu64, serial); + } + + if (mapping.rotate) { + const unsigned right = mapping.rotate; + const unsigned left = mapping.width - right; + serial = (serial << left) | ((serial & mask(mapping.width)) >> right); + log_trace("keygen-pair: rotate %" PRIu64 ", 0x%" PRIx64, serial, serial); + } + + serial = (serial + mapping.offset) & mask(mapping.width); + log_trace("keygen-pair: offset %" PRIu64, serial); + serial += base; + + serial_t key_serial = serial; + serial_t value_serial = value_age; + if (mapping.split) { + key_serial = serial >> mapping.split; + value_serial = + (serial & mask(mapping.split)) | (value_age << mapping.split); + } + + log_trace("keygen-pair: key %" PRIu64 ", value %" PRIu64, key_serial, + value_serial); + + mk(key_serial, key_essentials, *key); + mk(value_serial, value_essentials, *value); + + if (log_enabled(logging::trace)) { + char dump_key[128], dump_value[128]; + log_trace("keygen-pair: key %s, value %s", + mdbx_dkey(&key->value, dump_key, sizeof(dump_key)), + mdbx_dkey(&value->value, dump_value, sizeof(dump_value))); + } +} + +void maker::setup(const config::actor_params_pod &actor, + unsigned thread_number) { + key_essentials.flags = actor.table_flags & (MDB_INTEGERKEY | MDB_REVERSEKEY); + key_essentials.minlen = actor.keylen_min; + key_essentials.maxlen = actor.keylen_max; + + value_essentials.flags = + actor.table_flags & 
(MDB_INTEGERDUP | MDB_REVERSEDUP); + value_essentials.minlen = actor.datalen_min; + value_essentials.maxlen = actor.datalen_max; + + assert(thread_number < 2); + (void)thread_number; + mapping = actor.keygen; + salt = actor.keygen.seed * UINT64_C(14653293970879851569); + + // FIXME: TODO + base = 0; +} + +bool maker::increment(serial_t &serial, int delta) { + if (serial > mask(mapping.width)) { + log_extra("keygen-increment: %" PRIu64 " > %" PRIu64 ", overflow", serial, + mask(mapping.width)); + return false; + } + + serial_t target = serial + (int64_t)delta; + if (target > mask(mapping.width)) { + log_extra("keygen-increment: %" PRIu64 "%-d => %" PRIu64 ", overflow", + serial, delta, target); + return false; + } + + log_extra("keygen-increment: %" PRIu64 "%-d => %" PRIu64 ", continue", serial, + delta, target); + serial = target; + return true; +} + +//----------------------------------------------------------------------------- + +size_t length(serial_t serial) { + size_t n = 0; + if (serial > UINT32_MAX) { + n = 4; + serial >>= 32; + } + if (serial > UINT16_MAX) { + n += 2; + serial >>= 16; + } + if (serial > UINT8_MAX) { + n += 1; + serial >>= 8; + } + return (serial > 0) ? 
n + 1 : n; +} + +buffer alloc(size_t limit) { + result *ptr = (result *)malloc(sizeof(result) + limit); + if (unlikely(ptr == nullptr)) + failure_perror("malloc(keyvalue_buffer)", errno); + ptr->value.iov_base = ptr->bytes; + ptr->value.iov_len = 0; + ptr->limit = limit; + return buffer(ptr); +} + +void __hot maker::mk(const serial_t serial, const essentials ¶ms, + result &out) { assert(out.limit >= params.maxlen); assert(params.maxlen >= params.minlen); assert(params.maxlen >= length(serial)); diff --git a/test/keygen.h b/test/keygen.h index 58db2633..e6eeb194 100644 --- a/test/keygen.h +++ b/test/keygen.h @@ -15,6 +15,7 @@ #pragma once #include "base.h" +#include "config.h" #include "log.h" #include "utils.h" @@ -42,25 +43,41 @@ namespace keygen { * - частотное распределение по алфавиту; * - абсолютное значение ключей или разность между отдельными значениями; * - * Соответственно, схема генерации следующая: - * - для ключей вводится плоская одномерная "координата" uint64_t; - * - все преобразования (назначение диапазонов, переупорядочивание, - * коррекция распределения) выполняются только над "координатой"; + * Соответственно, в общих чертах, схема генерации следующая: + * - вводится плоская одномерная "координата" uint64_t; + * - генерация специфических паттернов (последовательностей) + * реализуется посредством соответствующих преобразований "координат", при + * этом все подобные преобразования выполняются только над "координатой"; * - итоговая "координата" преобразуется в 8-байтное суррогатное значение - * ключа, при этом опционально суррогат может усекаться до ненулевых байт; - * - для получения ключей длиной более 8 байт суррогат дополняется - * фиксированной последовательностью; + * ключа; + * - для получения ключей длиной МЕНЕЕ 8 байт суррогат может усекаться + * до ненулевых байт, в том числе до нулевой длины; + * - для получения ключей длиной БОЛЕЕ 8 байт суррогат дополняется + * нулями или псевдослучайной последовательностью; + * + * Механизм генерации 
паттернов: + * - реализованный механизм является компромиссом между скоростью/простотой + * и гибкостью, необходимой для получения последовательностей, которых + * будет достаточно для проверки сценариев разделения и слияния страниц + * с данными внутри mdbx; + * - псевдо-случайные паттерны реализуются посредством набора инъективных + * отображающих функций; + * - не-псевдо-случайные паттерны реализуются посредством параметризируемого + * трех-этапного преобразования: + * 1) смещение (сложение) по модулю; + * 2) циклический сдвиг; + * 3) добавление абсолютного смещения (базы); */ typedef uint64_t serial_t; -struct params_t { - uint8_t minlen; - uint8_t flags; - uint16_t maxlen; +enum { + serial_minwith = 8, + serial_maxwith = sizeof(serial_t) * 8, + serial_allones = ~(serial_t)0 }; -struct result_t { +struct result { MDB_val value; size_t limit; union { @@ -70,54 +87,39 @@ struct result_t { }; }; -void make(const serial_t serial, const params_t &params, result_t &out); +//----------------------------------------------------------------------------- -static __inline void make(const serial_t serial, const params_t &params, - result_t &out, size_t limit) { - out.limit = limit; - make(serial, params, out); -} +struct buffer_deleter : public std::unary_function<void, result *> { + void operator()(result *buffer) const { free(buffer); } +}; -size_t ffs_fallback(serial_t serial); +typedef std::unique_ptr<result, buffer_deleter> buffer; -static __inline size_t ffs(serial_t serial) { - size_t rc; -#ifdef __GNUC__ - if (sizeof(serial) <= sizeof(int)) - rc = __builtin_ffs((int)serial); - else if (sizeof(serial) == sizeof(long)) - rc = __builtin_ffsl((long)serial); - else if (sizeof(serial) == sizeof(long long)) - rc = __builtin_ffsll((long long)serial); - else - return ffs_fallback(serial); -#elif defined(_MSC_VER) - unsigned long index; - if (sizeof(serial) <= sizeof(unsigned long)) - rc = _BitScanReverse(&index, (unsigned long)serial) ? 
index : 0; - else if (sizeof(serial) <= sizeof(unsigned __int64)) { -#if defined(_M_ARM64) || defined(_M_X64) - rc = _BitScanReverse64(&index, (unsigned __int64)serial) ? index : 0; -#else - size_t base = 0; - unsigned long value = (unsigned long)serial; - if ((unsigned __int64)serial > ULONG_MAX) { - base = 32; - value = (unsigned long)(serial >> 32); - } - rc = (_BitScanReverse(&index, value) ? index : 0) + base; -#endif /* _M_ARM64 || _M_X64 */ - } else - return ffs_fallback(serial); -#else - return ffs_fallback(serial); -#endif - assert(rc == ffs_fallback(serial)); - return rc; -} +buffer alloc(size_t limit); -static __inline size_t length(const serial_t serial) { - return (ffs(serial) + 7) >> 3; -} +class maker { + config::keygen_params_pod mapping; + serial_t base; + serial_t salt; + + struct essentials { + uint8_t minlen; + uint8_t flags; + uint16_t maxlen; + } key_essentials, value_essentials; + + static void mk(const serial_t serial, const essentials ¶ms, result &out); + +public: + maker() { memset(this, 0, sizeof(*this)); } + + void pair(serial_t serial, const buffer &key, buffer &value, + serial_t value_age); + void setup(const config::actor_params_pod &actor, unsigned thread_number); + + bool increment(serial_t &serial, int delta); +}; + +size_t length(serial_t serial); } /* namespace keygen */ diff --git a/test/main.cc b/test/main.cc index 14805366..8ef6f2f2 100644 --- a/test/main.cc +++ b/test/main.cc @@ -41,7 +41,14 @@ void actor_params::set_defaults(void) { MDB_NOMEMINIT | MDBX_COALESCE | MDBX_LIFORECLAIM; table_flags = MDB_DUPSORT; size = 1024 * 1024; - seed = 1; + + keygen.seed = 1; + keygen.keycase = kc_random; + keygen.width = 32; + keygen.mesh = 32; + keygen.split = keygen.width / 2; + keygen.rotate = 0; + keygen.offset = 0; test_duration = 0; test_nops = 1000; @@ -129,13 +136,13 @@ int main(int argc, char *const argv[]) { params.set_defaults(); global::config::dump_config = true; logging::setup((logging::loglevel)params.loglevel, "main"); - 
unsigned lastid = 0; + unsigned last_space_id = 0; for (int narg = 1; narg < argc; ++narg) { const char *value = nullptr; if (config::parse_option(argc, argv, narg, "case", &value)) { - testcase_setup(value, params, lastid); + testcase_setup(value, params, last_space_id); continue; } if (config::parse_option(argc, argv, narg, "pathname", params.pathname_db)) @@ -149,9 +156,30 @@ int main(int argc, char *const argv[]) { if (config::parse_option(argc, argv, narg, "size", params.size, config::binary, 4096 * 4)) continue; - if (config::parse_option(argc, argv, narg, "seed", params.seed, - config::no_scale)) + + if (config::parse_option(argc, argv, narg, "keygen.width", + params.keygen.width, 1, 64)) continue; + if (config::parse_option(argc, argv, narg, "keygen.mesh", + params.keygen.mesh, 1, 64)) + continue; + if (config::parse_option(argc, argv, narg, "keygen.seed", + params.keygen.seed, config::no_scale)) + continue; + if (config::parse_option(argc, argv, narg, "keygen.split", + params.keygen.split, 1, 64)) + continue; + if (config::parse_option(argc, argv, narg, "keygen.rotate", + params.keygen.rotate, 1, 64)) + continue; + if (config::parse_option(argc, argv, narg, "keygen.offset", + params.keygen.offset, config::binary)) + continue; + if (config::parse_option(argc, argv, narg, "keygen.case", &value)) { + keycase_setup(value, params); + continue; + } + if (config::parse_option(argc, argv, narg, "repeat", params.nrepeat, config::no_scale)) continue; @@ -225,20 +253,20 @@ int main(int argc, char *const argv[]) { params.test_duration = 0; continue; } - if (config::parse_option(argc, argv, narg, "hill", &value)) { - configure_actor(lastid, ac_hill, value, params); + if (config::parse_option(argc, argv, narg, "hill", &value, "auto")) { + configure_actor(last_space_id, ac_hill, value, params); continue; } if (config::parse_option(argc, argv, narg, "jitter", nullptr)) { - configure_actor(lastid, ac_jitter, value, params); + configure_actor(last_space_id, ac_jitter, 
value, params); continue; } if (config::parse_option(argc, argv, narg, "dead.reader", nullptr)) { - configure_actor(lastid, ac_deadread, value, params); + configure_actor(last_space_id, ac_deadread, value, params); continue; } if (config::parse_option(argc, argv, narg, "dead.writer", nullptr)) { - configure_actor(lastid, ac_deadwrite, value, params); + configure_actor(last_space_id, ac_deadwrite, value, params); continue; } if (config::parse_option(argc, argv, narg, "failfast", @@ -246,7 +274,7 @@ int main(int argc, char *const argv[]) { continue; if (*argv[narg] != '-') - testcase_setup(argv[narg], params, lastid); + testcase_setup(argv[narg], params, last_space_id); else failure("Unknown option '%s'\n", argv[narg]); } diff --git a/test/test.cc b/test/test.cc index 4e8052e7..ad82fd39 100644 --- a/test/test.cc +++ b/test/test.cc @@ -17,6 +17,7 @@ const char *testcase2str(const actor_testcase testcase) { switch (testcase) { default: + assert(false); return "?!"; case ac_none: return "none"; @@ -49,6 +50,20 @@ const char *status2str(actor_status status) { } } +const char *keygencase2str(const keygen_case keycase) { + switch (keycase) { + default: + assert(false); + return "?!"; + case kc_random: + return "random"; + case kc_dashes: + return "dashes"; + case kc_custom: + return "custom"; + } +} + //----------------------------------------------------------------------------- static void mdbx_debug_logger(int type, const char *function, int line, @@ -67,7 +82,9 @@ static void mdbx_debug_logger(int type, const char *function, int line, level = logging::failure; } - if (logging::output(level, "mdbx: %s: ", function)) + if (logging::output(level, strncmp(function, "mdbx_", 5) == 0 ? 
"%s: " + : "mdbx: %s: ", + function)) logging::feed(msg, args); if (type & MDBX_DBG_ASSERT) abort(); @@ -87,26 +104,26 @@ void testcase::db_prepare() { MDB_env *env = nullptr; rc = mdbx_env_create(&env); - if (rc != MDB_SUCCESS) + if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_env_create()", rc); assert(env != nullptr); db_guard.reset(env); rc = mdbx_env_set_userctx(env, this); - if (rc != MDB_SUCCESS) + if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_env_set_userctx()", rc); rc = mdbx_env_set_maxreaders(env, config.params.max_readers); - if (rc != MDB_SUCCESS) + if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_env_set_maxreaders()", rc); rc = mdbx_env_set_maxdbs(env, config.params.max_tables); - if (rc != MDB_SUCCESS) + if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_env_set_maxdbs()", rc); rc = mdbx_env_set_mapsize(env, (size_t)config.params.size); - if (rc != MDB_SUCCESS) + if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_env_set_mapsize()", rc); log_trace("<< db_prepare"); @@ -119,7 +136,7 @@ void testcase::db_open() { db_prepare(); int rc = mdbx_env_open(db_guard.get(), config.params.pathname_db.c_str(), (unsigned)config.params.mode_flags, 0640); - if (rc != MDB_SUCCESS) + if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_env_open()", rc); log_trace("<< db_open"); @@ -140,7 +157,7 @@ void testcase::txn_begin(bool readonly) { MDB_txn *txn = nullptr; int rc = mdbx_txn_begin(db_guard.get(), nullptr, readonly ? 
MDB_RDONLY : 0, &txn); - if (rc != MDB_SUCCESS) + if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_txn_begin()", rc); txn_guard.reset(txn); @@ -154,17 +171,23 @@ void testcase::txn_end(bool abort) { MDB_txn *txn = txn_guard.release(); if (abort) { int rc = mdbx_txn_abort(txn); - if (rc != MDB_SUCCESS) + if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_txn_abort()", rc); } else { int rc = mdbx_txn_commit(txn); - if (rc != MDB_SUCCESS) + if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_txn_commit()", rc); } log_trace("<< txn_end(%s)", abort ? "abort" : "commit"); } +void testcase::txn_restart(bool abort, bool readonly) { + if (txn_guard) + txn_end(abort); + txn_begin(readonly); +} + bool testcase::wait4start() { if (config.wait4id) { log_trace(">> wait4start(%u)", config.wait4id); @@ -257,7 +280,7 @@ void testcase::fetch_canary() { log_trace(">> fetch_canary"); int rc = mdbx_canary_get(txn_guard.get(), &canary_now); - if (rc != MDB_SUCCESS) + if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_canary_get()", rc); if (canary_now.v < last.canary.v) @@ -283,10 +306,57 @@ void testcase::update_canary(uint64_t increment) { canary_now.y += increment; int rc = mdbx_canary_put(txn_guard.get(), &canary_now); - if (rc != MDB_SUCCESS) + if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_canary_put()", rc); - log_trace(">> update_canary: sequence = %" PRIu64, canary_now.y); + log_trace("<< update_canary: sequence = %" PRIu64, canary_now.y); +} + +MDB_dbi testcase::db_table_open(bool create) { + log_trace(">> testcase::db_table_create"); + + char tablename_buf[16]; + const char *tablename = nullptr; + if (config.space_id) { + int rc = snprintf(tablename_buf, sizeof(tablename_buf), "TBL%04u", + config.space_id); + if (rc < 4 || rc >= (int)sizeof(tablename_buf) - 1) + failure("snprintf(tablename): %d", rc); + tablename = tablename_buf; + } + log_verbose("use %s table", tablename ? 
tablename : "MAINDB"); + + MDB_dbi handle = 0; + int rc = mdbx_dbi_open(txn_guard.get(), tablename, + (create ? MDB_CREATE : 0) | config.params.table_flags, + &handle); + if (unlikely(rc != MDB_SUCCESS)) + failure_perror("mdbx_dbi_open()", rc); + + log_trace("<< testcase::db_table_create, handle %u", handle); + return handle; +} + +void testcase::db_table_drop(MDB_dbi handle) { + log_trace(">> testcase::db_table_drop, handle %u", handle); + + if (config.params.drop_table) { + int rc = mdbx_drop(txn_guard.get(), handle, true); + if (unlikely(rc != MDB_SUCCESS)) + failure_perror("mdbx_drop()", rc); + log_trace("<< testcase::db_table_drop"); + } else { + log_trace("<< testcase::db_table_drop: not needed"); + } +} + +void testcase::db_table_close(MDB_dbi handle) { + log_trace(">> testcase::db_table_close, handle %u", handle); + assert(!txn_guard); + int rc = mdbx_dbi_close(db_guard.get(), handle); + if (unlikely(rc != MDB_SUCCESS)) + failure_perror("mdbx_dbi_close()", rc); + log_trace("<< testcase::db_table_close"); } //----------------------------------------------------------------------------- diff --git a/test/test.h b/test/test.h index f1a039f7..07e4a094 100644 --- a/test/test.h +++ b/test/test.h @@ -25,9 +25,10 @@ bool test_execute(const actor_config &config); std::string thunk_param(const actor_config &config); void testcase_setup(const char *casename, actor_params &params, - unsigned &lastid); -void configure_actor(unsigned &lastid, const actor_testcase testcase, - const char *id_cstr, const actor_params &params); + unsigned &last_space_id); +void configure_actor(unsigned &last_space_id, const actor_testcase testcase, + const char *space_id_cstr, const actor_params &params); +void keycase_setup(const char *casename, actor_params &params); namespace global { @@ -87,6 +88,9 @@ protected: size_t nops_completed; chrono::time start_timestamp; + keygen::buffer key; + keygen::buffer data; + keygen::maker keyvalue_maker; struct { mdbx_canary canary; @@ -97,14 +101,29 @@ protected: 
void db_close(); void txn_begin(bool readonly); void txn_end(bool abort); + void txn_restart(bool abort, bool readonly); void fetch_canary(); void update_canary(uint64_t increment); + MDB_dbi db_table_open(bool create); + void db_table_drop(MDB_dbi handle); + void db_table_close(MDB_dbi handle); + bool wait4start(); void report(size_t nops_done); void signal(); bool should_continue() const; + void generate_pair(const keygen::serial_t serial, keygen::buffer &key, + keygen::buffer &value, keygen::serial_t data_age = 0) { + keyvalue_maker.pair(serial, key, value, data_age); + } + + void generate_pair(const keygen::serial_t serial, + keygen::serial_t data_age = 0) { + generate_pair(serial, key, data, data_age); + } + bool mode_readonly() const { return (config.params.mode_flags & MDB_RDONLY) ? true : false; } diff --git a/test/utils.cc b/test/utils.cc index 2c7e6d0b..ae58311f 100644 --- a/test/utils.cc +++ b/test/utils.cc @@ -14,7 +14,7 @@ #include "test.h" #include -#ifndef _MSC_VER +#ifdef HAVE_IEEE754_H #include #endif @@ -190,14 +190,11 @@ uint64_t entropy_ticks(void) { //----------------------------------------------------------------------------- static __inline uint64_t bleach64(uint64_t dirty) { - dirty = mul_64x64_high(bswap64(dirty), UINT64_C(17048867929148541611)); - return dirty; + return mul_64x64_high(bswap64(dirty), UINT64_C(17048867929148541611)); } static __inline uint32_t bleach32(uint32_t dirty) { - return (uint32_t)( - (bswap32(dirty) * UINT64_C(/*3080105489, 4267077937 */ 2175734609)) >> - 32); + return (uint32_t)((bswap32(dirty) * UINT64_C(2175734609)) >> 32); } uint64_t prng64_careless(uint64_t &state) { @@ -214,6 +211,39 @@ uint32_t prng32(uint64_t &state) { return (uint32_t)(prng64_careless(state) >> 32); } +void prng_fill(uint64_t &state, void *ptr, size_t bytes) { + while (bytes >= 4) { + *((uint32_t *)ptr) = prng32(state); + ptr = (uint32_t *)ptr + 1; + bytes -= 4; + } + + switch (bytes & 3) { + case 3: { + uint32_t u32 = prng32(state); + 
memcpy(ptr, &u32, 3); + } break; + case 2: + *((uint16_t *)ptr) = (uint16_t)prng32(state); + break; + case 1: + *((uint8_t *)ptr) = (uint8_t)prng32(state); + break; + case 0: + break; + } +} + +static __thread uint64_t prng_state; + +void prng_seed(uint64_t seed) { prng_state = bleach64(seed); } + +uint32_t prng32(void) { return prng32(prng_state); } + +uint64_t prng64(void) { return prng64_white(prng_state); } + +void prng_fill(void *ptr, size_t bytes) { prng_fill(prng_state, ptr, bytes); } + uint64_t entropy_white() { return bleach64(entropy_ticks()); } double double_from_lower(uint64_t salt) { diff --git a/test/utils.h b/test/utils.h index b4c88834..624a204c 100644 --- a/test/utils.h +++ b/test/utils.h @@ -101,12 +101,12 @@ #define bswap64(v) __bswap_64(v) #else static __inline uint64_t bswap64(uint64_t v) { - return v << 56 | v >> 56 | ((v << 40) & 0x00ff000000000000ull) | - ((v << 24) & 0x0000ff0000000000ull) | - ((v << 8) & 0x000000ff00000000ull) | - ((v >> 8) & 0x00000000ff000000ull) | - ((v >> 24) & 0x0000000000ff0000ull) | - ((v >> 40) & 0x000000000000ff00ull); + return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | + ((v << 24) & UINT64_C(0x0000ff0000000000)) | + ((v << 8) & UINT64_C(0x000000ff00000000)) | + ((v >> 8) & UINT64_C(0x00000000ff000000)) | + ((v >> 24) & UINT64_C(0x0000000000ff0000)) | + ((v >> 40) & UINT64_C(0x000000000000ff00)); } #endif #endif /* bswap64 */ @@ -116,7 +116,8 @@ static __inline uint64_t bswap64(uint64_t v) { #define bswap32(v) __bswap_32(v) #else static __inline uint32_t bswap32(uint32_t v) { - return v << 24 | v >> 24 | ((v << 8) & 0x00ff0000) | ((v >> 8) & 0x0000ff00); + return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | + ((v >> 8) & UINT32_C(0x0000ff00)); } #endif #endif /* bswap32 */ @@ -360,6 +361,12 @@ uint64_t entropy_white(void); uint64_t prng64_careless(uint64_t &state); uint64_t prng64_white(uint64_t &state); uint32_t prng32(uint64_t &state); +void prng_fill(uint64_t &state, void 
*ptr, size_t bytes); + +void prng_seed(uint64_t seed); +uint32_t prng32(void); +uint64_t prng64(void); +void prng_fill(void *ptr, size_t bytes); bool flipcoin(); bool jitter(unsigned probability_percent); From 8f2c21e2ba50e8337b7c2e6b8b66adbc743d4792 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 21:02:39 +0300 Subject: [PATCH 145/303] mdbx: rework MDBX_node. --- src/bits.h | 4 +- src/mdbx.c | 215 ++++++++++++++++++++++++++++++++--------------------- 2 files changed, 132 insertions(+), 87 deletions(-) diff --git a/src/bits.h b/src/bits.h index 6b176d83..4a18f5ce 100644 --- a/src/bits.h +++ b/src/bits.h @@ -268,7 +268,7 @@ typedef struct MDB_meta { /* Common header for all page types. The page type depends on mp_flags. * - * P_BRANCH and P_LEAF pages have unsorted 'MDB_node's at the end, with + * P_BRANCH and P_LEAF pages have unsorted 'MDBX_node's at the end, with * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages * omit mp_ptrs and pack sorted MDB_DUPFIXED values after the page header. * @@ -519,7 +519,7 @@ typedef struct MDB_xcursor { #define XCURSOR_REFRESH(mc, mp, ki) \ do { \ MDB_page *xr_pg = (mp); \ - MDB_node *xr_node = NODEPTR(xr_pg, ki); \ + MDBX_node *xr_node = NODEPTR(xr_pg, ki); \ if ((xr_node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) \ (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ } while (0) diff --git a/src/mdbx.c b/src/mdbx.c index c06485c1..3af10d38 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -420,7 +420,7 @@ txnid_t mdbx_debug_edge; /* Header for a single key/data pair within a page. * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. - * We guarantee 2-byte alignment for 'MDB_node's. + * We guarantee 2-byte alignment for 'MDBX_node's. * * mn_lo and mn_hi are used for data size on leaf nodes, and for child * pgno on branch nodes. 
On 64 bit platforms, mn_flags is also used @@ -431,13 +431,32 @@ txnid_t mdbx_debug_edge; * data part is the page number of an overflow page with actual data. * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in * a sub-page/sub-database, and named databases (just F_SUBDATA). */ -typedef struct MDB_node { -/* part of data size or pgno */ +typedef struct MDBX_node { + union { + struct { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - uint16_t mn_lo, mn_hi; + union { + struct { + uint16_t mn_lo, mn_hi; /* part of data size or pgno */ + }; + uint32_t mn_dsize; + }; + uint16_t mn_flags; /* see mdbx_node */ + uint16_t mn_ksize; /* key size */ #else - uint16_t mn_hi, mn_lo; + uint16_t mn_ksize; /* key size */ + uint16_t mn_flags; /* see mdbx_node */ + union { + struct { + uint16_t mn_hi, mn_lo; /* part of data size or pgno */ + }; + uint32_t mn_dsize; + }; #endif + }; + pgno_t mn_ksize_and_pgno; + }; + /* mdbx_node Flags */ #define F_BIGDATA 0x01 /* data put on overflow page */ #define F_SUBDATA 0x02 /* data is a sub-database */ @@ -445,14 +464,11 @@ typedef struct MDB_node { /* valid flags for mdbx_node_add() */ #define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDB_RESERVE | MDB_APPEND) - - uint16_t mn_flags; /* see mdbx_node */ - uint16_t mn_ksize; /* key size */ uint8_t mn_data[1]; /* key and data are appended here */ -} MDB_node; +} MDBX_node; /* Size of the node header, excluding dynamic data at the end */ -#define NODESIZE offsetof(MDB_node, mn_data) +#define NODESIZE offsetof(MDBX_node, mn_data) /* Bit position of top word in page number, for shifting mn_flags */ #define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 
32 : 0) @@ -466,9 +482,9 @@ typedef struct MDB_node { #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) /* Address of node i in page p */ -static __inline MDB_node *NODEPTR(MDB_page *p, unsigned i) { +static __inline MDBX_node *NODEPTR(MDB_page *p, unsigned i) { assert(NUMKEYS(p) > (unsigned)(i)); - return (MDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); + return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); } /* Address of the key for the node */ @@ -478,28 +494,57 @@ static __inline MDB_node *NODEPTR(MDB_page *p, unsigned i) { #define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) /* Get the page number pointed to by a branch node */ -#define NODEPGNO(node) \ - ((node)->mn_lo | ((pgno_t)(node)->mn_hi << 16) | \ - (PGNO_TOPWORD ? ((pgno_t)(node)->mn_flags << PGNO_TOPWORD) : 0)) +static __inline pgno_t NODEPGNO(const MDBX_node *node) { + pgno_t pgno; + if (UNALIGNED_OK) { + pgno = node->mn_ksize_and_pgno; + if (sizeof(pgno_t) > 4) + pgno &= UINT64_C(0xffffFFFFffff); + } else { + pgno = node->mn_lo | ((pgno_t)node->mn_hi << 16); + if (sizeof(pgno_t) > 4) + pgno |= ((uint64_t)node->mn_flags) << 32; + } + return pgno; +} /* Set the page number in a branch node */ -#define SETPGNO(node, pgno) \ - do { \ - (node)->mn_lo = (uint16_t)(pgno); \ - (node)->mn_hi = (uint16_t)((pgno) >> 16); \ - if (PGNO_TOPWORD) \ - (node)->mn_flags = (uint16_t)((pgno) >> PGNO_TOPWORD); \ - } while (0) +static __inline void SETPGNO(MDBX_node *node, pgno_t pgno) { + if (sizeof(pgno_t) > 4) + assert(pgno <= UINT64_C(0xffffFFFFffff)); + + if (UNALIGNED_OK) { + if (sizeof(pgno_t) > 4) + pgno |= ((uint64_t)node->mn_ksize) << 48; + node->mn_ksize_and_pgno = pgno; + } else { + node->mn_lo = (uint16_t)pgno; + node->mn_hi = (uint16_t)(pgno >> 16); + if (sizeof(pgno_t) > 4) + node->mn_flags = (uint16_t)((uint64_t)pgno >> 32); + } +} /* Get the size of the data in a leaf node */ -#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16)) 
+static __inline size_t NODEDSZ(const MDBX_node *node) { + size_t size; + if (UNALIGNED_OK) { + size = node->mn_dsize; + } else { + size = node->mn_lo | ((size_t)node->mn_hi << 16); + } + return size; +} /* Set the size of the data for a leaf node */ -#define SETDSZ(node, size) \ - do { \ - (node)->mn_lo = (uint16_t)(size); \ - (node)->mn_hi = (uint16_t)((size) >> 16); \ - } while (0) +static __inline void SETDSZ(MDBX_node *node, unsigned size) { + if (UNALIGNED_OK) { + node->mn_dsize = size; + } else { + node->mn_lo = (uint16_t)size; + node->mn_hi = (uint16_t)(size >> 16); + } +} /* The size of a key in a node */ #define NODEKSZ(node) ((node)->mn_ksize) @@ -614,13 +659,13 @@ static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, MDB_meta *pending); static void mdbx_env_close0(MDB_env *env); -static MDB_node *mdbx_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); +static MDBX_node *mdbx_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, MDB_val *data, pgno_t pgno, unsigned flags); static void mdbx_node_del(MDB_cursor *mc, int ksize); static void mdbx_node_shrink(MDB_page *mp, indx_t indx); static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); -static int mdbx_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data); +static int mdbx_node_read(MDB_cursor *mc, MDBX_node *leaf, MDB_val *data); static size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); static size_t mdbx_branch_size(MDB_env *env, MDB_val *key); @@ -646,7 +691,7 @@ static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); static void mdbx_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); static void mdbx_xcursor_init0(MDB_cursor *mc); -static void mdbx_xcursor_init1(MDB_cursor *mc, MDB_node *node); +static void mdbx_xcursor_init1(MDB_cursor *mc, MDBX_node *node); static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); 
static int mdbx_drop0(MDB_cursor *mc, int subs); @@ -835,7 +880,7 @@ char *mdbx_dkey(const MDB_val *key, char *const buf, const size_t bufsize) { } #if 0 /* LY: debug stuff */ -static const char *mdbx_leafnode_type(MDB_node *n) { +static const char *mdbx_leafnode_type(MDBX_node *n) { static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" : tp[F_ISSET(n->mn_flags, F_DUPDATA)] @@ -846,7 +891,7 @@ static const char *mdbx_leafnode_type(MDB_node *n) { static void mdbx_page_list(MDB_page *mp) { pgno_t pgno = mdbx_dbg_pgno(mp); const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : ""; - MDB_node *node; + MDBX_node *node; unsigned i, nkeys, nsize, total = 0; MDB_val key; DKBUF; @@ -919,7 +964,7 @@ static void mdbx_page_list(MDB_page *mp) { static void mdbx_cursor_chk(MDB_cursor *mc) { unsigned i; - MDB_node *node; + MDBX_node *node; MDB_page *mp; if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) @@ -975,7 +1020,7 @@ static void mdbx_audit(MDB_txn *txn) { MDB_page *mp; mp = mc.mc_pg[mc.mc_top]; for (j = 0; j < NUMKEYS(mp); j++) { - MDB_node *leaf = NODEPTR(mp, j); + MDBX_node *leaf = NODEPTR(mp, j); if (leaf->mn_flags & F_SUBDATA) { MDB_db db; memcpy(&db, NODEDATA(leaf), sizeof(db)); @@ -1164,7 +1209,7 @@ static int mdbx_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { MDB_cursor *m3, *m0 = mc; MDB_xcursor *mx; MDB_page *dp, *mp; - MDB_node *leaf; + MDBX_node *leaf; unsigned i, j; int rc = MDB_SUCCESS, level; @@ -1483,7 +1528,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { for (op = MDB_FIRST;; op = (flags & MDBX_LIFORECLAIM) ? MDB_PREV : MDB_NEXT) { MDB_val key, data; - MDB_node *leaf; + MDBX_node *leaf; pgno_t *idl; /* Seek a big enough contiguous page range. 
Prefer @@ -1865,7 +1910,7 @@ static int mdbx_page_touch(MDB_cursor *mc) { /* Update the parent page, if any, to point to the new page */ if (mc->mc_top) { MDB_page *parent = mc->mc_pg[mc->mc_top - 1]; - MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top - 1]); + MDBX_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top - 1]); SETPGNO(node, pgno); } else { mc->mc_db->md_root = pgno; @@ -4365,13 +4410,13 @@ static int __hot mdbx_cmp_memnr(const MDB_val *a, const MDB_val *b) { * in *exactp (1 or 0). * Updates the cursor index with the index of the found entry. * If no entry larger or equal to the key is found, returns NULL. */ -static MDB_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, - int *exactp) { +static MDBX_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, + int *exactp) { unsigned i = 0, nkeys; int low, high; int rc = 0; MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_node *node = NULL; + MDBX_node *node = NULL; MDB_val nodekey; MDB_cmp_func *cmp; DKBUF; @@ -4560,7 +4605,7 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { DKBUF; while (IS_BRANCH(mp)) { - MDB_node *node; + MDBX_node *node; indx_t i; mdbx_debug("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno, @@ -4639,7 +4684,7 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { * be underfilled. 
*/ static int mdbx_page_search_lowest(MDB_cursor *mc) { MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_node *node = NODEPTR(mp, 0); + MDBX_node *node = NODEPTR(mp, 0); int rc; if (unlikely((rc = mdbx_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) @@ -4686,7 +4731,7 @@ static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { { MDB_val data; int exact = 0; - MDB_node *leaf = mdbx_node_search(&mc2, &mc->mc_dbx->md_name, &exact); + MDBX_node *leaf = mdbx_node_search(&mc2, &mc->mc_dbx->md_name, &exact); if (!exact) return MDB_NOTFOUND; if (unlikely((leaf->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) @@ -4814,7 +4859,7 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDB_page *mp) { * [out] data Updated to point to the node's data. * * Returns 0 on success, non-zero on failure. */ -static __inline int mdbx_node_read(MDB_cursor *mc, MDB_node *leaf, +static __inline int mdbx_node_read(MDB_cursor *mc, MDBX_node *leaf, MDB_val *data) { MDB_page *omp; /* overflow page */ pgno_t pgno; @@ -4873,7 +4918,7 @@ int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { * Returns 0 on success, non-zero on failure. 
*/ static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { int rc; - MDB_node *indx; + MDBX_node *indx; MDB_page *mp; if (unlikely(mc->mc_snum < 2)) { @@ -4923,7 +4968,7 @@ static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) { MDB_page *mp; - MDB_node *leaf; + MDBX_node *leaf; int rc; if ((mc->mc_flags & C_DEL) && op == MDB_NEXT_DUP) @@ -5012,7 +5057,7 @@ skip: static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) { MDB_page *mp; - MDB_node *leaf; + MDBX_node *leaf; int rc; if ((mc->mc_flags & C_DEL) && op == MDB_PREV_DUP) @@ -5100,7 +5145,7 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, int *exactp) { int rc; MDB_page *mp; - MDB_node *leaf = NULL; + MDBX_node *leaf = NULL; DKBUF; if ((mc->mc_db->md_flags & MDB_INTEGERKEY) && @@ -5291,7 +5336,7 @@ set1: /* Move the cursor to the first item in the database. */ static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) { int rc; - MDB_node *leaf; + MDBX_node *leaf; if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); @@ -5333,7 +5378,7 @@ static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) { /* Move the cursor to the last item in the database. 
*/ static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) { int rc; - MDB_node *leaf; + MDBX_node *leaf; if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); @@ -5406,7 +5451,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, key->mv_size = mc->mc_db->md_xsize; key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); } else { - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDBX_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); MDB_GET_KEY(leaf, key); if (data) { if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5511,7 +5556,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (unlikely(mc->mc_xcursor == NULL)) return MDB_INCOMPATIBLE; { - MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { MDB_GET_KEY(leaf, key); rc = mdbx_node_read(mc, leaf, data); @@ -5652,7 +5697,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return MDBX_EKEYMISMATCH; if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { - MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdbx_cassert(mc, mc->mc_xcursor != NULL && @@ -5788,7 +5833,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, } more:; - MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); olddata.mv_size = NODEDSZ(leaf); olddata.mv_data = NODEDATA(leaf); @@ -6057,7 +6102,7 @@ new_sub: put_sub: xdata.mv_size = 0; xdata.mv_data = ""; - MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (flags & MDB_CURRENT) { xflags = (flags & MDB_NODUPDATA) ? 
MDB_CURRENT | MDB_NOOVERWRITE | MDB_NOSPILL @@ -6143,7 +6188,7 @@ new_sub: } int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { - MDB_node *leaf; + MDBX_node *leaf; MDB_page *mp; int rc; @@ -6206,7 +6251,7 @@ int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { if (!(m2->mc_flags & C_INITIALIZED)) continue; if (m2->mc_pg[mc->mc_top] == mp) { - MDB_node *n2 = leaf; + MDBX_node *n2 = leaf; if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) { n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); if (n2->mn_flags & F_SUBDATA) @@ -6298,7 +6343,7 @@ static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, * is too large it will be put onto an overflow page and the node * size will only include the key and not the data. Sizes are always * rounded up to an even number of bytes, to guarantee 2-byte alignment - * of the MDB_node headers. + * of the MDBX_node headers. * * [in] env The environment handle. * [in] key The key for the node. @@ -6322,9 +6367,9 @@ static __inline size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, * * The size should depend on the environment's page size but since * we currently don't support spilling large keys onto overflow - * pages, it's simply the size of the MDB_node header plus the + * pages, it's simply the size of the MDBX_node header plus the * size of the key. Sizes are always rounded up to an even number - * of bytes, to guarantee 2-byte alignment of the MDB_node headers. + * of bytes, to guarantee 2-byte alignment of the MDBX_node headers. * * [in] env The environment handle. * [in] key The key for the node. 
@@ -6367,7 +6412,7 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, size_t node_size = NODESIZE; ssize_t room; unsigned ofs; - MDB_node *node; + MDBX_node *node; MDB_page *mp = mc->mc_pg[mc->mc_top]; MDB_page *ofp = NULL; /* overflow page */ void *ndata; @@ -6494,7 +6539,7 @@ static void mdbx_node_del(MDB_cursor *mc, int ksize) { indx_t indx = mc->mc_ki[mc->mc_top]; unsigned sz; indx_t i, j, numkeys, ptr; - MDB_node *node; + MDBX_node *node; char *base; mdbx_debug("delete node %u on %s page %" PRIaPGNO "", indx, @@ -6543,7 +6588,7 @@ static void mdbx_node_del(MDB_cursor *mc, int ksize) { * [in] mp The main page to operate on. * [in] indx The index of the subpage on the main page. */ static void mdbx_node_shrink(MDB_page *mp, indx_t indx) { - MDB_node *node; + MDBX_node *node; MDB_page *sp, *xp; char *base; unsigned nsize, delta, len, ptr; @@ -6614,7 +6659,7 @@ static void mdbx_xcursor_init0(MDB_cursor *mc) { * [in] mc The main cursor whose sorted-dups cursor is to be initialized. * [in] node The data containing the MDB_db record for the sorted-dup database. */ -static void mdbx_xcursor_init1(MDB_cursor *mc, MDB_node *node) { +static void mdbx_xcursor_init1(MDB_cursor *mc, MDBX_node *node) { MDB_xcursor *mx = mc->mc_xcursor; if (node->mn_flags & F_SUBDATA) { @@ -6808,7 +6853,7 @@ int mdbx_cursor_count(MDB_cursor *mc, uint64_t *countp) { *countp = 1; if (mc->mc_xcursor != NULL) { - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + MDBX_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); @@ -6862,7 +6907,7 @@ MDB_dbi mdbx_cursor_dbi(MDB_cursor *mc) { * Returns 0 on success, non-zero on failure. 
*/ static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { MDB_page *mp; - MDB_node *node; + MDBX_node *node; char *base; size_t len; int delta, ksize, oksize; @@ -6944,7 +6989,7 @@ static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); /* Move a node from csrc to cdst. */ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { - MDB_node *srcnode; + MDBX_node *srcnode; MDB_val key, data; pgno_t srcpg; MDB_cursor mn; @@ -6973,7 +7018,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { unsigned snum = csrc->mc_snum; - MDB_node *s2; + MDBX_node *s2; /* must find the lowest key below src */ rc = mdbx_page_search_lowest(csrc); if (unlikely(rc)) @@ -6998,7 +7043,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { mn.mc_xcursor = NULL; if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { unsigned snum = cdst->mc_snum; - MDB_node *s2; + MDBX_node *s2; MDB_val bkey; /* must find the lowest key below dst */ mdbx_cursor_copy(cdst, &mn); @@ -7170,7 +7215,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { * Returns 0 on success, non-zero on failure. */ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { MDB_page *psrc, *pdst; - MDB_node *srcnode; + MDBX_node *srcnode; MDB_val key, data; unsigned nkeys; int rc; @@ -7208,7 +7253,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { srcnode = NODEPTR(psrc, i); if (i == 0 && IS_BRANCH(psrc)) { MDB_cursor mn; - MDB_node *s2; + MDBX_node *s2; mdbx_cursor_copy(csrc, &mn); mn.mc_xcursor = NULL; /* must find the lowest key below src */ @@ -7329,7 +7374,7 @@ static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) { * [in] mc Cursor pointing to the page where rebalancing should begin. * Returns 0 on success, non-zero on failure. 
*/ static int mdbx_rebalance(MDB_cursor *mc) { - MDB_node *node; + MDBX_node *node; int rc, fromleft; unsigned ptop, minkeys, thresh; MDB_cursor mn; @@ -7576,7 +7621,7 @@ static int mdbx_cursor_del0(MDB_cursor *mc) { } } if (mc->mc_db->md_flags & MDB_DUPSORT) { - MDB_node *node = + MDBX_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); /* If this node has dupdata, it may need to be reinited * because its data has moved. @@ -7674,7 +7719,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t pgno = 0; int i, j, split_indx, nkeys, pmax; MDB_env *env = mc->mc_txn->mt_env; - MDB_node *node; + MDBX_node *node; MDB_val sepkey, rkey, xdata, *rdata = &xdata; MDB_page *copy = NULL; MDB_page *mp, *rp, *pp; @@ -7845,7 +7890,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, psize += nsize; node = NULL; } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); if (IS_LEAF(mp)) { if (F_ISSET(node->mn_flags, F_BIGDATA)) @@ -7865,7 +7910,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, sepkey.mv_size = newkey->mv_size; sepkey.mv_data = newkey->mv_data; } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); + node = (MDBX_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); sepkey.mv_size = node->mn_ksize; sepkey.mv_data = NODEKEY(node); } @@ -7942,7 +7987,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, /* Update index for the new key. 
*/ mc->mc_ki[mc->mc_top] = j; } else { - node = (MDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); rkey.mv_data = NODEKEY(node); rkey.mv_size = node->mn_ksize; if (IS_LEAF(mp)) { @@ -8112,7 +8157,7 @@ int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, if (likely(rc == MDB_SUCCESS) && (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)) { /* LY: allows update (explicit overwrite) only for unique keys */ - MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + MDBX_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdbx_tassert(txn, XCURSOR_INITED(&mc) && mc.mc_xcursor->mx_db.md_entries > 1); @@ -8213,7 +8258,7 @@ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { * [in] flags includes F_DUPDATA if it is a sorted-duplicate sub-DB. */ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { MDB_cursor mc; - MDB_node *ni; + MDBX_node *ni; MDB_page *mo, *mp, *leaf; char *buf, *ptr; int rc, toggle; @@ -8808,7 +8853,7 @@ int mdbx_dbi_open_ex(MDB_txn *txn, const char *table_name, unsigned user_flags, return rc; } else { /* make sure this is actually a table */ - MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + MDBX_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if (unlikely((node->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) return MDB_INCOMPATIBLE; } @@ -8941,7 +8986,7 @@ static int mdbx_drop0(MDB_cursor *mc, int subs) { rc = mdbx_page_search(mc, NULL, MDB_PS_FIRST); if (likely(rc == MDB_SUCCESS)) { MDB_txn *txn = mc->mc_txn; - MDB_node *ni; + MDBX_node *ni; MDB_cursor mx; unsigned i; @@ -9718,7 +9763,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, for (align_bytes = i = 0; i < nkeys; align_bytes += ((payload_size + align_bytes) & 1), i++) { - MDB_node *node; + MDBX_node *node; if (IS_LEAF2(mp)) { /* LEAF2 pages have no 
mp_ptrs[] or node headers */ @@ -10015,7 +10060,7 @@ int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, if (flags & MDB_CURRENT) { /* для не-уникальных ключей позволяем update/delete только если ключ * один */ - MDB_node *leaf = NODEPTR(page, mc.mc_ki[mc.mc_top]); + MDBX_node *leaf = NODEPTR(page, mc.mc_ki[mc.mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdbx_tassert(txn, XCURSOR_INITED(&mc) && mc.mc_xcursor->mx_db.md_entries > 1); @@ -10105,7 +10150,7 @@ int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, if (values_count) { *values_count = 1; if (mc.mc_xcursor != NULL) { - MDB_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); + MDBX_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdbx_tassert(txn, mc.mc_xcursor == &mx && (mx.mx_cursor.mc_flags & C_INITIALIZED)); From a3ed42b9997c0d7d5879b6128076654611258af8 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 21:04:23 +0300 Subject: [PATCH 146/303] mdbx: rework MDBX_page, drop COPY_PGNO. --- src/bits.h | 61 ++++++------ src/mdbx.c | 269 ++++++++++++++++++++++++----------------------------- 2 files changed, 148 insertions(+), 182 deletions(-) diff --git a/src/bits.h b/src/bits.h index 4a18f5ce..bd710361 100644 --- a/src/bits.h +++ b/src/bits.h @@ -283,13 +283,11 @@ typedef struct MDB_meta { * * Each non-metapage up to MDB_meta.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a freeDB record. 
*/ -typedef struct MDB_page { -#define mp_pgno mp_p.p_pgno -#define mp_next mp_p.p_next +typedef struct MDBX_page { union { - pgno_t p_pgno; /* page number */ - struct MDB_page *p_next; /* for in-memory list of freed pages */ - } mp_p; + pgno_t mp_pgno; /* page number */ + struct MDBX_page *mp_next; /* for in-memory list of freed pages */ + }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */ #define P_LEAF 0x02 /* leaf page */ @@ -301,28 +299,25 @@ typedef struct MDB_page { #define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ #define P_KEEP 0x8000 /* leave this page alone during spill */ uint16_t mp_flags; -#define mp_lower mp_pb.pb.pb_lower -#define mp_upper mp_pb.pb.pb_upper -#define mp_pages mp_pb.pb_pages union { struct { - indx_t pb_lower; /* lower bound of free space */ - indx_t pb_upper; /* upper bound of free space */ - } pb; - uint32_t pb_pages; /* number of overflow pages */ - } mp_pb; + indx_t mp_lower; /* lower bound of free space */ + indx_t mp_upper; /* upper bound of free space */ + }; + uint32_t mp_pages; /* number of overflow pages */ + }; indx_t mp_ptrs[1]; /* dynamic size */ -} MDB_page; +} MDBX_page; /* Size of the page header, excluding dynamic data at the end */ -#define PAGEHDRSZ ((unsigned)offsetof(MDB_page, mp_ptrs)) +#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) /* Buffer for a stack-allocated meta page. * The members define size and alignment, and silence type * aliasing warnings. They are not used directly; that could * mean incorrectly using several union members in parallel. */ typedef union MDB_metabuf { - MDB_page mb_page; + MDBX_page mb_page; struct { char mm_pad[PAGEHDRSZ]; MDB_meta mm_meta; @@ -384,7 +379,7 @@ struct MDB_txn { MDB_IDL mt_free_pgs; /* The list of loose pages that became unused and may be reused * in this transaction, linked through NEXT_LOOSE_PAGE(page). 
*/ - MDB_page *mt_loose_pgs; + MDBX_page *mt_loose_pgs; /* Number of loose pages (mt_loose_pgs) */ unsigned mt_loose_count; /* The sorted list of dirty pages we temporarily wrote to disk @@ -480,18 +475,18 @@ struct MDB_cursor { MDB_dbx *mc_dbx; /* The mt_dbflag for this database */ uint8_t *mc_dbflag; - uint16_t mc_snum; /* number of pushed pages */ - uint16_t mc_top; /* index of top page, normally mc_snum-1 */ - /* Cursor state flags. */ -#define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ -#define C_EOF 0x02 /* No more data */ -#define C_SUB 0x04 /* Cursor is a sub-cursor */ -#define C_DEL 0x08 /* last op was a cursor_del */ -#define C_UNTRACK 0x40 /* Un-track cursor when closing */ -#define C_RECLAIMING 0x80 /* FreeDB lookup is prohibited */ - unsigned mc_flags; /* see mdbx_cursor */ - MDB_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ - indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ + uint16_t mc_snum; /* number of pushed pages */ + uint16_t mc_top; /* index of top page, normally mc_snum-1 */ + /* Cursor state flags. */ +#define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ +#define C_EOF 0x02 /* No more data */ +#define C_SUB 0x04 /* Cursor is a sub-cursor */ +#define C_DEL 0x08 /* last op was a cursor_del */ +#define C_UNTRACK 0x40 /* Un-track cursor when closing */ +#define C_RECLAIMING 0x80 /* FreeDB lookup is prohibited */ + unsigned mc_flags; /* see mdbx_cursor */ + MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ + indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ }; /* Context for sorted-dup records. @@ -518,7 +513,7 @@ typedef struct MDB_xcursor { * Called with mp = mc->mc_pg[mc->mc_top], ki = mc->mc_ki[mc->mc_top]. 
*/ #define XCURSOR_REFRESH(mc, mp, ki) \ do { \ - MDB_page *xr_pg = (mp); \ + MDBX_page *xr_pg = (mp); \ MDBX_node *xr_node = NODEPTR(xr_pg, ki); \ if ((xr_node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) \ (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ @@ -577,8 +572,8 @@ struct MDB_env { MDB_pgstate me_pgstate; /* state of old pages from freeDB */ #define me_pglast me_pgstate.mf_pglast #define me_pghead me_pgstate.mf_pghead - MDB_page *me_dpages; /* list of malloc'd blocks for re-use */ - /* IDL of pages that became unused in a write txn */ + MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ + /* IDL of pages that became unused in a write txn */ MDB_IDL me_free_pgs; /* ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ MDB_ID2L me_dirty_list; diff --git a/src/mdbx.c b/src/mdbx.c index 3af10d38..01bb8a70 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -270,7 +270,7 @@ txnid_t mdbx_debug_edge; /* The maximum size of a database page. * * It is 32k or 64k, since value-PAGEBASE must fit in - * MDB_page.mp_upper. + * MDBX_page.mp_upper. * * LMDB will use database pages < OS pages if needed. * That causes more I/O in write transactions: The OS must @@ -416,7 +416,7 @@ txnid_t mdbx_debug_edge; /* Link in MDB_txn.mt_loose_pgs list. * Kept outside the page header, which is needed when reusing the page. */ -#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) +#define NEXT_LOOSE_PAGE(p) (*(MDBX_page **)((p) + 2)) /* Header for a single key/data pair within a page. * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. 
@@ -482,7 +482,7 @@ typedef struct MDBX_node { #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) /* Address of node i in page p */ -static __inline MDBX_node *NODEPTR(MDB_page *p, unsigned i) { +static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { assert(NUMKEYS(p) > (unsigned)(i)); return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); } @@ -549,27 +549,6 @@ static __inline void SETDSZ(MDBX_node *node, unsigned size) { /* The size of a key in a node */ #define NODEKSZ(node) ((node)->mn_ksize) -/* Copy a page number from src to dst */ -#if UNALIGNED_OK -#define COPY_PGNO(dst, src) (dst) = (src) -#elif defined(__GNUC__) || __has_builtin(__built_memcmp) -#define COPY_PGNO(dst, src) __built_memcmp(&(dst), &(src), sizeof(pgno_t)); -#else -#define COPY_PGNO(dst, src) \ - do { \ - uint16_t *s, *d; \ - s = (uint16_t *)&(src); \ - d = (uint16_t *)&(dst); \ - if (sizeof(pgno_t) > 6) \ - *d++ = *s++; \ - if (sizeof(pgno_t) > 4) \ - *d++ = *s++; \ - if (sizeof(pgno_t) > 2) \ - *d++ = *s++; \ - *d = *s; \ - } while (0) -#endif /* UNALIGNED_OK */ - /* The address of a key in a LEAF2 page. * LEAF2 pages are used for MDB_DUPFIXED sorted-duplicate sub-DBs. * There are no node headers, keys are stored contiguously. 
*/ @@ -613,9 +592,9 @@ static __inline void SETDSZ(MDBX_node *node, unsigned size) { #define TXN_DBI_CHANGED(txn, dbi) \ ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) -static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags); +static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags); static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, - MDB_page **mp); + MDBX_page **mp); static int mdbx_page_touch(MDB_cursor *mc); static int mdbx_cursor_touch(MDB_cursor *mc); @@ -641,7 +620,7 @@ enum { #define MDB_END_SLOT 0x80 /* release any reader slot if MDB_NOTLS */ static int mdbx_txn_end(MDB_txn *txn, unsigned mode); -static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); +static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDBX_page **mp, int *lvl); static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); #define MDB_PS_MODIFY 1 #define MDB_PS_ROOTONLY 2 @@ -663,7 +642,7 @@ static MDBX_node *mdbx_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, MDB_val *data, pgno_t pgno, unsigned flags); static void mdbx_node_del(MDB_cursor *mc, int ksize); -static void mdbx_node_shrink(MDB_page *mp, indx_t indx); +static void mdbx_node_shrink(MDBX_page *mp, indx_t indx); static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); static int mdbx_node_read(MDB_cursor *mc, MDBX_node *leaf, MDB_val *data); static size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); @@ -673,7 +652,7 @@ static int mdbx_rebalance(MDB_cursor *mc); static int mdbx_update_key(MDB_cursor *mc, MDB_val *key); static void mdbx_cursor_pop(MDB_cursor *mc); -static int mdbx_cursor_push(MDB_cursor *mc, MDB_page *mp); +static int mdbx_cursor_push(MDB_cursor *mc, MDBX_page *mp); static int mdbx_cursor_del0(MDB_cursor *mc); static int mdbx_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, @@ 
-830,13 +809,6 @@ void __cold mdbx_debug_log(int type, const char *function, int line, va_end(args); } -/* Return the page number of mp which may be sub-page, for debug output */ -static __inline pgno_t mdbx_dbg_pgno(MDB_page *mp) { - pgno_t ret; - COPY_PGNO(ret, mp->mp_pgno); - return ret; -} - /* Dump a key in ascii or hexadecimal. */ char *mdbx_dkey(const MDB_val *key, char *const buf, const size_t bufsize) { if (!key) @@ -888,8 +860,8 @@ static const char *mdbx_leafnode_type(MDBX_node *n) { } /* Display all the keys in the page. */ -static void mdbx_page_list(MDB_page *mp) { - pgno_t pgno = mdbx_dbg_pgno(mp); +static void mdbx_page_list(MDBX_page *mp) { + pgno_t pgno = mp->mp_pgno; const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : ""; MDBX_node *node; unsigned i, nkeys, nsize, total = 0; @@ -965,7 +937,7 @@ static void mdbx_page_list(MDB_page *mp) { static void mdbx_cursor_chk(MDB_cursor *mc) { unsigned i; MDBX_node *node; - MDB_page *mp; + MDBX_page *mp; if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; @@ -1017,7 +989,7 @@ static void mdbx_audit(MDB_txn *txn) { rc = mdbx_page_search(&mc, NULL, MDB_PS_FIRST); for (; rc == MDB_SUCCESS; rc = mdbx_cursor_sibling(&mc, 1)) { unsigned j; - MDB_page *mp; + MDBX_page *mp; mp = mc.mc_pg[mc.mc_top]; for (j = 0; j < NUMKEYS(mp); j++) { MDBX_node *leaf = NODEPTR(mp, j); @@ -1053,10 +1025,10 @@ int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { /* Allocate memory for a page. * Re-use old malloc'd pages first for singletons, otherwise just malloc. * Set MDB_TXN_ERROR on failure. 
*/ -static MDB_page *mdbx_page_malloc(MDB_txn *txn, unsigned num) { +static MDBX_page *mdbx_page_malloc(MDB_txn *txn, unsigned num) { MDB_env *env = txn->mt_env; size_t size = env->me_psize; - MDB_page *np = env->me_dpages; + MDBX_page *np = env->me_dpages; if (likely(num == 1 && np)) { ASAN_UNPOISON_MEMORY_REGION(np, size); VALGRIND_MEMPOOL_ALLOC(env, np, size); @@ -1090,14 +1062,14 @@ static MDB_page *mdbx_page_malloc(MDB_txn *txn, unsigned num) { /* Free a single page. * Saves single pages to a list, for future reuse. * (This is not used for multi-page overflow pages.) */ -static __inline void mdbx_page_free(MDB_env *env, MDB_page *mp) { +static __inline void mdbx_page_free(MDB_env *env, MDBX_page *mp) { mp->mp_next = env->me_dpages; VALGRIND_MEMPOOL_FREE(env, mp); env->me_dpages = mp; } /* Free a dirty page */ -static void mdbx_dpage_free(MDB_env *env, MDB_page *dp) { +static void mdbx_dpage_free(MDB_env *env, MDBX_page *dp) { if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { mdbx_page_free(env, dp); } else { @@ -1121,13 +1093,13 @@ static void mdbx_dlist_free(MDB_txn *txn) { static void __cold mdbx_kill_page(MDB_env *env, pgno_t pgno) { const size_t offs = env->me_psize * pgno; - const size_t shift = offsetof(MDB_page, mp_pb); + const size_t shift = offsetof(MDBX_page, mp_pages); if (env->me_flags & MDB_WRITEMAP) { - MDB_page *mp = (MDB_page *)(env->me_map + offs); - memset(&mp->mp_pb, 0x6F /* 'o', 111 */, env->me_psize - shift); - VALGRIND_MAKE_MEM_NOACCESS(&mp->mp_pb, env->me_psize - shift); - ASAN_POISON_MEMORY_REGION(&mp->mp_pb, env->me_psize - shift); + MDBX_page *mp = (MDBX_page *)(env->me_map + offs); + memset(&mp->mp_pages, 0x6F /* 'o', 111 */, env->me_psize - shift); + VALGRIND_MAKE_MEM_NOACCESS(&mp->mp_pages, env->me_psize - shift); + ASAN_POISON_MEMORY_REGION(&mp->mp_pages, env->me_psize - shift); } else { ssize_t len = env->me_psize - shift; void *buf = alloca(len); @@ -1146,7 +1118,7 @@ static void __cold mdbx_kill_page(MDB_env *env, pgno_t pgno) { * * 
If the page wasn't dirtied in this txn, just add it * to this txn's free list. */ -static int mdbx_page_loose(MDB_cursor *mc, MDB_page *mp) { +static int mdbx_page_loose(MDB_cursor *mc, MDBX_page *mp) { int loose = 0; pgno_t pgno = mp->mp_pgno; MDB_txn *txn = mc->mc_txn; @@ -1175,11 +1147,11 @@ static int mdbx_page_loose(MDB_cursor *mc, MDB_page *mp) { } if (loose) { mdbx_debug("loosen db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); - MDB_page **link = &NEXT_LOOSE_PAGE(mp); + MDBX_page **link = &NEXT_LOOSE_PAGE(mp); if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) { mdbx_kill_page(txn->mt_env, pgno); - VALGRIND_MAKE_MEM_UNDEFINED(link, sizeof(MDB_page *)); - ASAN_UNPOISON_MEMORY_REGION(link, sizeof(MDB_page *)); + VALGRIND_MAKE_MEM_UNDEFINED(link, sizeof(MDBX_page *)); + ASAN_UNPOISON_MEMORY_REGION(link, sizeof(MDBX_page *)); } *link = txn->mt_loose_pgs; txn->mt_loose_pgs = mp; @@ -1208,7 +1180,7 @@ static int mdbx_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { MDB_txn *txn = mc->mc_txn; MDB_cursor *m3, *m0 = mc; MDB_xcursor *mx; - MDB_page *dp, *mp; + MDBX_page *dp, *mp; MDBX_node *leaf; unsigned i, j; int rc = MDB_SUCCESS, level; @@ -1298,7 +1270,7 @@ static int mdbx_page_flush(MDB_txn *txn, int keep); * Returns 0 on success, non-zero on failure. 
*/ static int mdbx_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) { MDB_txn *txn = m0->mc_txn; - MDB_page *dp; + MDBX_page *dp; MDB_ID2L dl = txn->mt_u.dirty_list; unsigned i, j, need; int rc; @@ -1442,7 +1414,7 @@ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { } /* Add a page to the txn's dirty list */ -static void mdbx_page_dirty(MDB_txn *txn, MDB_page *mp) { +static void mdbx_page_dirty(MDB_txn *txn, MDBX_page *mp) { MDB_ID2 mid; int rc, (*insert)(MDB_ID2L, MDB_ID2 *); @@ -1482,13 +1454,13 @@ static void mdbx_page_dirty(MDB_txn *txn, MDB_page *mp) { #define MDBX_ALLOC_ALL \ (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW | MDBX_ALLOC_KICK) -static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { +static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { int rc; MDB_txn *txn = mc->mc_txn; MDB_env *env = txn->mt_env; pgno_t pgno, *mop = env->me_pghead; unsigned i = 0, j, mop_len = mop ? mop[0] : 0, n2 = num - 1; - MDB_page *np; + MDBX_page *np; txnid_t oldest = 0, last = 0; MDB_cursor_op op; MDB_cursor m2; @@ -1764,7 +1736,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) { done: assert(mp && num); if (env->me_flags & MDB_WRITEMAP) { - np = (MDB_page *)(env->me_map + env->me_psize * pgno); + np = (MDBX_page *)(env->me_map + env->me_psize * pgno); /* LY: reset no-access flag from mdbx_kill_page() */ VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize * num); @@ -1801,7 +1773,7 @@ done: * [in] dst page to copy into * [in] src page to copy from * [in] psize size of a page */ -static void mdbx_page_copy(MDB_page *dst, MDB_page *src, unsigned psize) { +static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, unsigned psize) { enum { Align = sizeof(pgno_t) }; indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; @@ -1826,7 +1798,7 @@ static void mdbx_page_copy(MDB_page *dst, MDB_page *src, 
unsigned psize) { * [in] mp the page being referenced. It must not be dirty. * [out] ret the writable page, if any. * ret is unchanged if mp wasn't spilled. */ -static int mdbx_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) { +static int mdbx_page_unspill(MDB_txn *txn, MDBX_page *mp, MDBX_page **ret) { MDB_env *env = txn->mt_env; const MDB_txn *tx2; unsigned x; @@ -1837,7 +1809,7 @@ static int mdbx_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) { continue; x = mdbx_midl_search(tx2->mt_spill_pgs, pn); if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { - MDB_page *np; + MDBX_page *np; int num; if (txn->mt_dirty_room == 0) return MDB_TXN_FULL; @@ -1884,7 +1856,7 @@ static int mdbx_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) { * * Returns 0 on success, non-zero on failure. */ static int mdbx_page_touch(MDB_cursor *mc) { - MDB_page *mp = mc->mc_pg[mc->mc_top], *np; + MDBX_page *mp = mc->mc_pg[mc->mc_top], *np; MDB_txn *txn = mc->mc_txn; MDB_cursor *m2, *m3; pgno_t pgno; @@ -1909,7 +1881,7 @@ static int mdbx_page_touch(MDB_cursor *mc) { mdbx_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); /* Update the parent page, if any, to point to the new page */ if (mc->mc_top) { - MDB_page *parent = mc->mc_pg[mc->mc_top - 1]; + MDBX_page *parent = mc->mc_pg[mc->mc_top - 1]; MDBX_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top - 1]); SETPGNO(node, pgno); } else { @@ -2707,7 +2679,7 @@ again: if (unlikely(!env->me_pghead) && txn->mt_loose_pgs) { /* Put loose page numbers in mt_free_pgs, since * we may be unable to return them to me_pghead. */ - MDB_page *mp = txn->mt_loose_pgs; + MDBX_page *mp = txn->mt_loose_pgs; if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)) return rc; @@ -2844,7 +2816,7 @@ again: /* Return loose page numbers to me_pghead, though usually none are * left at this point. The pages themselves remain in dirty_list. 
*/ if (txn->mt_loose_pgs) { - MDB_page *mp = txn->mt_loose_pgs; + MDBX_page *mp = txn->mt_loose_pgs; unsigned count = txn->mt_loose_count; MDB_IDL loose; /* Room for loose pages + temp IDL with same */ @@ -2959,7 +2931,7 @@ static int mdbx_page_flush(MDB_txn *txn, int keep) { int i, pagecount = dl[0].mid, rc; size_t size = 0, pos = 0; pgno_t pgno = 0; - MDB_page *dp = NULL; + MDBX_page *dp = NULL; struct iovec iov[MDB_COMMIT_PAGES]; ssize_t wpos = 0, wsize = 0; size_t next_pos = 1; /* impossible pos, so pos != next_pos */ @@ -3084,7 +3056,7 @@ int mdbx_txn_commit(MDB_txn *txn) { if (txn->mt_parent) { MDB_txn *parent = txn->mt_parent; - MDB_page **lp; + MDBX_page **lp; MDB_ID2L dst, src; MDB_IDL pspill; unsigned i, x, y, len, ps_len; @@ -3320,7 +3292,7 @@ static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { return rc; } - MDB_page *p = (MDB_page *)&buf; + MDBX_page *p = (MDBX_page *)&buf; if (!F_ISSET(p->mp_flags, P_META)) { mdbx_debug("page %" PRIaPGNO " not a meta-page", p->mp_pgno); return MDB_INVALID; @@ -3390,14 +3362,14 @@ static int __cold mdbx_env_init_metas(const MDB_env *env, MDB_meta *model) { assert(offsetof(MDB_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); unsigned page_size = env->me_psize; - MDB_page *first = calloc(NUM_METAS, page_size); + MDBX_page *first = calloc(NUM_METAS, page_size); if (!first) return MDBX_ENOMEM; first->mp_pgno = 0; first->mp_flags = P_META; MDB_meta *first_meta = (MDB_meta *)PAGEDATA(first); - MDB_page *second = (MDB_page *)((char *)first + page_size); + MDBX_page *second = (MDBX_page *)((char *)first + page_size); second->mp_pgno = 1; second->mp_flags = P_META; MDB_meta *second_meta = (MDB_meta *)PAGEDATA(second); @@ -4226,7 +4198,7 @@ static void __cold mdbx_env_close0(MDB_env *env) { } int __cold mdbx_env_close_ex(MDB_env *env, int dont_sync) { - MDB_page *dp; + MDBX_page *dp; int rc = MDB_SUCCESS; if (unlikely(!env)) @@ -4415,7 +4387,7 @@ static MDBX_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, 
unsigned i = 0, nkeys; int low, high; int rc = 0; - MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDBX_page *mp = mc->mc_pg[mc->mc_top]; MDBX_node *node = NULL; MDB_val nodekey; MDB_cmp_func *cmp; @@ -4425,7 +4397,7 @@ static MDBX_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, mdbx_debug("searching %u keys in %s %spage %" PRIaPGNO "", nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - mdbx_dbg_pgno(mp)); + mp->mp_pgno); low = IS_LEAF(mp) ? 0 : 1; high = nkeys - 1; @@ -4519,7 +4491,7 @@ static void mdbx_cursor_pop(MDB_cursor *mc) { /* Push a page onto the top of the cursor's stack. * Set MDB_TXN_ERROR on failure. */ -static int mdbx_cursor_push(MDB_cursor *mc, MDB_page *mp) { +static int mdbx_cursor_push(MDB_cursor *mc, MDBX_page *mp) { mdbx_debug("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, DDBI(mc), (void *)mc); @@ -4546,11 +4518,11 @@ static int mdbx_cursor_push(MDB_cursor *mc, MDB_page *mp) { * 0=mapped page. * * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, +static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDBX_page **ret, int *lvl) { MDB_txn *txn = mc->mc_txn; MDB_env *env = txn->mt_env; - MDB_page *p = NULL; + MDBX_page *p = NULL; int level; if (!(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_WRITEMAP))) { @@ -4588,7 +4560,7 @@ static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, level = 0; mapped: - p = (MDB_page *)(env->me_map + env->me_psize * pgno); + p = (MDBX_page *)(env->me_map + env->me_psize * pgno); done: *ret = p; @@ -4600,7 +4572,7 @@ done: /* Finish mdbx_page_search() / mdbx_page_search_lowest(). * The cursor is at the root page, set up the rest of it. 
*/ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { - MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDBX_page *mp = mc->mc_pg[mc->mc_top]; int rc; DKBUF; @@ -4683,7 +4655,7 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { * are all in situations where the current page is known to * be underfilled. */ static int mdbx_page_search_lowest(MDB_cursor *mc) { - MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDBX_page *mp = mc->mc_pg[mc->mc_top]; MDBX_node *node = NODEPTR(mp, 0); int rc; @@ -4781,7 +4753,7 @@ static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { return mdbx_page_search_root(mc, key, flags); } -static int mdbx_ovpage_free(MDB_cursor *mc, MDB_page *mp) { +static int mdbx_ovpage_free(MDB_cursor *mc, MDBX_page *mp) { MDB_txn *txn = mc->mc_txn; pgno_t pg = mp->mp_pgno; unsigned x = 0, ovpages = mp->mp_pages; @@ -4861,7 +4833,7 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDB_page *mp) { * Returns 0 on success, non-zero on failure. */ static __inline int mdbx_node_read(MDB_cursor *mc, MDBX_node *leaf, MDB_val *data) { - MDB_page *omp; /* overflow page */ + MDBX_page *omp; /* overflow page */ pgno_t pgno; int rc; @@ -4919,7 +4891,7 @@ int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { int rc; MDBX_node *indx; - MDB_page *mp; + MDBX_page *mp; if (unlikely(mc->mc_snum < 2)) { return MDB_NOTFOUND; /* root has no siblings */ @@ -4967,7 +4939,7 @@ static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { /* Move the cursor to the next data item. 
*/ static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) { - MDB_page *mp; + MDBX_page *mp; MDBX_node *leaf; int rc; @@ -5002,8 +4974,8 @@ static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } - mdbx_debug("cursor_next: top page is %" PRIaPGNO " in cursor %p", - mdbx_dbg_pgno(mp), (void *)mc); + mdbx_debug("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, + (void *)mc); if (mc->mc_flags & C_DEL) { mc->mc_flags ^= C_DEL; goto skip; @@ -5024,7 +4996,7 @@ static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, skip: mdbx_debug("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", - mdbx_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); + mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mp)) { key->mv_size = mc->mc_db->md_xsize; @@ -5056,7 +5028,7 @@ skip: /* Move the cursor to the previous data item. */ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) { - MDB_page *mp; + MDBX_page *mp; MDBX_node *leaf; int rc; @@ -5092,8 +5064,8 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } - mdbx_debug("cursor_prev: top page is %" PRIaPGNO " in cursor %p", - mdbx_dbg_pgno(mp), (void *)mc); + mdbx_debug("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, + (void *)mc); mc->mc_flags &= ~(C_EOF | C_DEL); @@ -5111,7 +5083,7 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, mdbx_debug("==> cursor points to page %" PRIaPGNO " with %u keys, key index %u", - mdbx_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]); + mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mp)) { key->mv_size = mc->mc_db->md_xsize; @@ -5144,7 +5116,7 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, int *exactp) { int rc; - MDB_page *mp; + MDBX_page 
*mp; MDBX_node *leaf = NULL; DKBUF; @@ -5438,7 +5410,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, case MDB_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return MDBX_EINVAL; - MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDBX_page *mp = mc->mc_pg[mc->mc_top]; unsigned nkeys = NUMKEYS(mp); if (mc->mc_ki[mc->mc_top] >= nkeys) { mc->mc_ki[mc->mc_top] = nkeys; @@ -5616,7 +5588,7 @@ static int mdbx_cursor_touch(MDB_cursor *mc) { int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, unsigned flags) { MDB_env *env; - MDB_page *fp, *sub_root = NULL; + MDBX_page *fp, *sub_root = NULL; uint16_t fp_flags; MDB_val xdata, *rdata, dkey, olddata; MDB_db dummy; @@ -5765,7 +5737,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, } if (rc == MDB_NO_ROOT) { - MDB_page *np; + MDBX_page *np; /* new database, write a root leaf page */ mdbx_debug("allocating new root leaf page"); if (unlikely(rc2 = mdbx_page_new(mc, P_LEAF, 1, &np))) { @@ -5844,7 +5816,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, * mp: new (sub-)page. offset: growth in page size. * xdata: node data with new page or DB. 
*/ unsigned i, offset = 0; - MDB_page *mp = fp = xdata.mv_data = env->me_pbuf; + MDBX_page *mp = fp = xdata.mv_data = env->me_pbuf; mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; /* Was a single item before, must convert now */ @@ -5903,7 +5875,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, case MDB_CURRENT | MDB_NODUPDATA: case MDB_CURRENT: fp->mp_flags |= P_DIRTY; - COPY_PGNO(fp->mp_pgno, mp->mp_pgno); + fp->mp_pgno = mp->mp_pgno; mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; flags |= F_DUPDATA; goto put_sub; @@ -5968,7 +5940,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return MDB_INCOMPATIBLE; /* overflow page overwrites need special handling */ if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; + MDBX_page *omp; pgno_t pg; int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); @@ -5993,7 +5965,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, * is smaller than the overflow threshold. */ if (unlikely(level > 1)) { /* It is writable only in a parent txn */ - MDB_page *np = mdbx_page_malloc(mc->mc_txn, ovpages); + MDBX_page *np = mdbx_page_malloc(mc->mc_txn, ovpages); MDB_ID2 id2; if (unlikely(!np)) return MDBX_ENOMEM; @@ -6073,7 +6045,7 @@ new_sub: MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; unsigned i = mc->mc_top; - MDB_page *mp = mc->mc_pg[i]; + MDBX_page *mp = mc->mc_pg[i]; for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { if (mc->mc_flags & C_SUB) @@ -6127,7 +6099,7 @@ new_sub: MDB_cursor *m2; MDB_xcursor *mx = mc->mc_xcursor; unsigned i = mc->mc_top; - MDB_page *mp = mc->mc_pg[i]; + MDBX_page *mp = mc->mc_pg[i]; int nkeys = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { @@ -6189,7 +6161,7 @@ new_sub: int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { MDBX_node *leaf; - MDB_page *mp; + MDBX_page *mp; int rc; if (unlikely(!mc)) @@ -6284,7 +6256,7 @@ int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { /* add overflow pages 
to free list */ if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { - MDB_page *omp; + MDBX_page *omp; pgno_t pg; memcpy(&pg, NODEDATA(leaf), sizeof(pg)); @@ -6312,8 +6284,8 @@ fail: * * Returns 0 on success, non-zero on failure. */ static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, - MDB_page **mp) { - MDB_page *np; + MDBX_page **mp) { + MDBX_page *np; int rc; if (unlikely((rc = mdbx_page_alloc(mc, num, &np, MDBX_ALLOC_ALL)))) @@ -6413,8 +6385,8 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, ssize_t room; unsigned ofs; MDBX_node *node; - MDB_page *mp = mc->mc_pg[mc->mc_top]; - MDB_page *ofp = NULL; /* overflow page */ + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + MDBX_page *ofp = NULL; /* overflow page */ void *ndata; DKBUF; @@ -6423,7 +6395,7 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, mdbx_debug("add to %s %spage %" PRIaPGNO " index %i, data size %" PRIuPTR " key size %" PRIuPTR " [%s]", IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - mdbx_dbg_pgno(mp), indx, data ? data->mv_size : 0, + mp->mp_pgno, indx, data ? data->mv_size : 0, key ? key->mv_size : 0, DKEY(key)); if (IS_LEAF2(mp)) { @@ -6521,8 +6493,8 @@ update: return MDB_SUCCESS; full: - mdbx_debug("not enough room in page %" PRIaPGNO ", got %u ptrs", - mdbx_dbg_pgno(mp), NUMKEYS(mp)); + mdbx_debug("not enough room in page %" PRIaPGNO ", got %u ptrs", mp->mp_pgno, + NUMKEYS(mp)); mdbx_debug("upper-lower = %u - %u = %" PRIiPTR "", mp->mp_upper, mp->mp_lower, room); mdbx_debug("node size = %" PRIuPTR "", node_size); @@ -6535,7 +6507,7 @@ full: * [in] ksize The size of a node. Only used if the page is * part of a MDB_DUPFIXED database. 
*/ static void mdbx_node_del(MDB_cursor *mc, int ksize) { - MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDBX_page *mp = mc->mc_pg[mc->mc_top]; indx_t indx = mc->mc_ki[mc->mc_top]; unsigned sz; indx_t i, j, numkeys, ptr; @@ -6543,7 +6515,7 @@ static void mdbx_node_del(MDB_cursor *mc, int ksize) { char *base; mdbx_debug("delete node %u on %s page %" PRIaPGNO "", indx, - IS_LEAF(mp) ? "leaf" : "branch", mdbx_dbg_pgno(mp)); + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno); numkeys = NUMKEYS(mp); mdbx_cassert(mc, indx < numkeys); @@ -6587,15 +6559,15 @@ static void mdbx_node_del(MDB_cursor *mc, int ksize) { /* Compact the main page after deleting a node on a subpage. * [in] mp The main page to operate on. * [in] indx The index of the subpage on the main page. */ -static void mdbx_node_shrink(MDB_page *mp, indx_t indx) { +static void mdbx_node_shrink(MDBX_page *mp, indx_t indx) { MDBX_node *node; - MDB_page *sp, *xp; + MDBX_page *sp, *xp; char *base; unsigned nsize, delta, len, ptr; int i; node = NODEPTR(mp, indx); - sp = (MDB_page *)NODEDATA(node); + sp = (MDBX_page *)NODEDATA(node); delta = SIZELEFT(sp); nsize = NODEDSZ(node) - delta; @@ -6605,13 +6577,13 @@ static void mdbx_node_shrink(MDB_page *mp, indx_t indx) { if (nsize & 1) return; /* do not make the node uneven-sized */ } else { - xp = (MDB_page *)((char *)sp + delta); /* destination subpage */ + xp = (MDBX_page *)((char *)sp + delta); /* destination subpage */ for (i = NUMKEYS(sp); --i >= 0;) xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; len = PAGEHDRSZ; } sp->mp_upper = sp->mp_lower; - COPY_PGNO(sp->mp_pgno, mp->mp_pgno); + sp->mp_pgno = mp->mp_pgno; SETDSZ(node, nsize); /* Shift upward */ @@ -6669,7 +6641,7 @@ static void mdbx_xcursor_init1(MDB_cursor *mc, MDBX_node *node) { mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags = C_SUB; } else { - MDB_page *fp = NODEDATA(node); + MDBX_page *fp = NODEDATA(node); mx->mx_db.md_xsize = 0; mx->mx_db.md_flags = 0; mx->mx_db.md_depth = 1; @@ -6677,7 +6649,7 @@ static void 
mdbx_xcursor_init1(MDB_cursor *mc, MDBX_node *node) { mx->mx_db.md_leaf_pages = 1; mx->mx_db.md_overflow_pages = 0; mx->mx_db.md_entries = NUMKEYS(fp); - COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); + mx->mx_db.md_root = fp->mp_pgno; mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags = C_INITIALIZED | C_SUB; @@ -6845,7 +6817,7 @@ int mdbx_cursor_count(MDB_cursor *mc, uint64_t *countp) { return MDB_NOTFOUND; } - MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDBX_page *mp = mc->mc_pg[mc->mc_top]; if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)) { *countp = 0; return MDB_NOTFOUND; @@ -6906,7 +6878,7 @@ MDB_dbi mdbx_cursor_dbi(MDB_cursor *mc) { * [in] key The new key to use. * Returns 0 on success, non-zero on failure. */ static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { - MDB_page *mp; + MDBX_page *mp; MDBX_node *node; char *base; size_t len; @@ -7086,7 +7058,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = csrc->mc_dbi; - MDB_page *mpd, *mps; + MDBX_page *mpd, *mps; mps = csrc->mc_pg[csrc->mc_top]; /* If we're adding on the left, bump others up */ @@ -7214,7 +7186,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { * * Returns 0 on success, non-zero on failure. */ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { - MDB_page *psrc, *pdst; + MDBX_page *psrc, *pdst; MDBX_node *srcnode; MDB_val key, data; unsigned nkeys; @@ -7389,19 +7361,18 @@ static int mdbx_rebalance(MDB_cursor *mc) { } mdbx_debug("rebalancing %s page %" PRIaPGNO " (has %u keys, %.1f%% full)", IS_LEAF(mc->mc_pg[mc->mc_top]) ? 
"leaf" : "branch", - mdbx_dbg_pgno(mc->mc_pg[mc->mc_top]), - NUMKEYS(mc->mc_pg[mc->mc_top]), + mc->mc_pg[mc->mc_top]->mp_pgno, NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10); if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { mdbx_debug("no need to rebalance page %" PRIaPGNO ", above fill threshold", - mdbx_dbg_pgno(mc->mc_pg[mc->mc_top])); + mc->mc_pg[mc->mc_top]->mp_pgno); return MDB_SUCCESS; } if (mc->mc_snum < 2) { - MDB_page *mp = mc->mc_pg[0]; + MDBX_page *mp = mc->mc_pg[0]; unsigned nkeys = NUMKEYS(mp); if (IS_SUBP(mp)) { mdbx_debug("Can't rebalance a subpage, ignoring"); @@ -7554,7 +7525,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { /* Complete a delete operation started by mdbx_cursor_del(). */ static int mdbx_cursor_del0(MDB_cursor *mc) { int rc; - MDB_page *mp; + MDBX_page *mp; indx_t ki; unsigned nkeys; MDB_cursor *m2, *m3; @@ -7721,8 +7692,8 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, MDB_env *env = mc->mc_txn->mt_env; MDBX_node *node; MDB_val sepkey, rkey, xdata, *rdata = &xdata; - MDB_page *copy = NULL; - MDB_page *mp, *rp, *pp; + MDBX_page *copy = NULL; + MDBX_page *mp, *rp, *pp; int ptop; MDB_cursor mn; DKBUF; @@ -8259,7 +8230,7 @@ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { MDB_cursor mc; MDBX_node *ni; - MDB_page *mo, *mp, *leaf; + MDBX_page *mo, *mp, *leaf; char *buf, *ptr; int rc, toggle; unsigned i; @@ -8285,13 +8256,13 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { return MDBX_ENOMEM; for (i = 0; i < mc.mc_top; i++) { - mdbx_page_copy((MDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize); - mc.mc_pg[i] = (MDB_page *)ptr; + mdbx_page_copy((MDBX_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize); + mc.mc_pg[i] = (MDBX_page *)ptr; ptr += my->mc_env->me_psize; } /* This is 
writable space for a leaf page. Usually not needed. */ - leaf = (MDB_page *)ptr; + leaf = (MDBX_page *)ptr; toggle = my->mc_toggle; while (mc.mc_snum > 0) { @@ -8304,7 +8275,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { for (i = 0; i < n; i++) { ni = NODEPTR(mp, i); if (ni->mn_flags & F_BIGDATA) { - MDB_page *omp; + MDBX_page *omp; /* Need writable leaf */ if (mp != leaf) { @@ -8326,7 +8297,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { goto done; toggle = my->mc_toggle; } - mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); memcpy(mo, omp, my->mc_env->me_psize); mo->mp_pgno = my->mc_next_pgno; my->mc_next_pgno += omp->mp_pages; @@ -8389,7 +8360,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { goto done; toggle = my->mc_toggle; } - mo = (MDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); + mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); mdbx_page_copy(mo, mp, my->mc_env->me_psize); mo->mp_pgno = my->mc_next_pgno++; my->mc_wlen[toggle] += my->mc_env->me_psize; @@ -8412,7 +8383,7 @@ done: /* Copy environment with compaction. 
*/ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { MDB_meta *mm; - MDB_page *mp; + MDBX_page *mp; mdbx_copy my; MDB_txn *txn = NULL; mdbx_thread_t thr; @@ -8440,14 +8411,14 @@ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { if (rc) goto finish; - mp = (MDB_page *)my.mc_wbuf[0]; + mp = (MDBX_page *)my.mc_wbuf[0]; memset(mp, 0, NUM_METAS * env->me_psize); mp->mp_pgno = 0; mp->mp_flags = P_META; mm = (MDB_meta *)PAGEDATA(mp); mdbx_meta_model(env, mm); - mp = (MDB_page *)(my.mc_wbuf[0] + env->me_psize); + mp = (MDBX_page *)(my.mc_wbuf[0] + env->me_psize); mp->mp_pgno = 1; mp->mp_flags = P_META; *(MDB_meta *)PAGEDATA(mp) = *mm; @@ -8999,13 +8970,13 @@ static int mdbx_drop0(MDB_cursor *mc, int subs) { mdbx_cursor_copy(mc, &mx); while (mc->mc_snum > 0) { - MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDBX_page *mp = mc->mc_pg[mc->mc_top]; unsigned n = NUMKEYS(mp); if (IS_LEAF(mp)) { for (i = 0; i < n; i++) { ni = NODEPTR(mp, i); if (ni->mn_flags & F_BIGDATA) { - MDB_page *omp; + MDBX_page *omp; pgno_t pg; memcpy(&pg, NODEDATA(ni), sizeof(pg)); rc = mdbx_page_get(mc, pg, &omp, NULL); @@ -9710,7 +9681,7 @@ typedef struct mdbx_walk_ctx { /* Depth-first tree traversal. 
*/ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, pgno_t pg, int deep) { - MDB_page *mp; + MDBX_page *mp; int rc, i, nkeys; unsigned header_size, unused_size, payload_size, align_bytes; const char *type; @@ -9726,7 +9697,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, rc = mdbx_page_get(&mc, pg, &mp, NULL); if (rc) return rc; - if (pg != mp->mp_p.p_pgno) + if (pg != mp->mp_pgno) return MDB_CORRUPTED; nkeys = NUMKEYS(mp); @@ -9783,7 +9754,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, assert(IS_LEAF(mp)); if (node->mn_flags & F_BIGDATA) { - MDB_page *omp; + MDBX_page *omp; pgno_t *opg; size_t over_header, over_payload, over_unused; @@ -9792,7 +9763,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, rc = mdbx_page_get(&mc, *opg, &omp, NULL); if (rc) return rc; - if (*opg != omp->mp_p.p_pgno) + if (*opg != omp->mp_pgno) return MDB_CORRUPTED; /* LY: Don't use mask here, e.g bitwise * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). 
@@ -9831,7 +9802,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, } } - return ctx->mw_visitor(mp->mp_p.p_pgno, 1, ctx->mw_user, dbi, type, nkeys, + return ctx->mw_visitor(mp->mp_pgno, 1, ctx->mw_user, dbi, type, nkeys, payload_size, header_size, unused_size + align_bytes); } @@ -10055,7 +10026,7 @@ int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, *old_data = present_data; goto bailout; } else { - MDB_page *page = mc.mc_pg[mc.mc_top]; + MDBX_page *page = mc.mc_pg[mc.mc_top]; if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { if (flags & MDB_CURRENT) { /* для не-уникальных ключей позволяем update/delete только если ключ @@ -10195,7 +10166,7 @@ int mdbx_is_dirty(const MDB_txn *txn, const void *ptr) { const MDB_env *env = txn->mt_env; const uintptr_t mask = ~(uintptr_t)(env->me_psize - 1); - const MDB_page *page = (const MDB_page *)((uintptr_t)ptr & mask); + const MDBX_page *page = (const MDBX_page *)((uintptr_t)ptr & mask); /* LY: Тут не всё хорошо с абсолютной достоверностью результата, * так как флажок P_DIRTY в LMDB может означать не совсем то, From 34213c554a60eaa016056ea6ebdc1a4d15bb51f4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 21:16:59 +0300 Subject: [PATCH 147/303] mdbx: rework MDBX_reader. --- src/bits.h | 56 +++++++++++++++++++++++++----------------------------- src/mdbx.c | 32 +++++++++++++++---------------- 2 files changed, 42 insertions(+), 46 deletions(-) diff --git a/src/bits.h b/src/bits.h index bd710361..87bfd040 100644 --- a/src/bits.h +++ b/src/bits.h @@ -189,40 +189,35 @@ typedef uint16_t indx_t; #pragma pack(push, 1) -/* The information we store in a single slot of the reader table. - * In addition to a transaction ID, we also record the process and - * thread ID that owns a slot, so that we can detect stale information, - * e.g. threads or processes that went away without cleaning up. - * NOTE: We currently don't check for stale records. 
We simply re-init - * the table when we know that we're the only process opening the - * lock file. */ -typedef struct MDB_rxbody { +/* The actual reader record, with cacheline padding. */ +typedef struct MDBX_reader { /* Current Transaction ID when this transaction began, or (txnid_t)-1. * Multiple readers that start at the same time will probably have the * same ID here. Again, it's not important to exclude them from * anything; all we need to know is which version of the DB they * started from so we can avoid overwriting any data used in that * particular version. */ - volatile txnid_t mrb_txnid; - /* The process ID of the process owning this reader txn. */ - volatile mdbx_pid_t mrb_pid; - /* The thread ID of the thread owning this txn. */ - volatile mdbx_tid_t mrb_tid; -} MDB_rxbody; + volatile txnid_t mr_txnid; -/* The actual reader record, with cacheline padding. */ -typedef struct MDB_reader { - union { - MDB_rxbody mrx; -/* shorthand for mrb_txnid */ -#define mr_txnid mru.mrx.mrb_txnid -#define mr_pid mru.mrx.mrb_pid -#define mr_tid mru.mrx.mrb_tid - /* cache line alignment */ - char pad[(sizeof(MDB_rxbody) + MDBX_CACHELINE_SIZE - 1) & - ~(MDBX_CACHELINE_SIZE - 1)]; - } mru; -} MDB_reader; + /* The information we store in a single slot of the reader table. + * In addition to a transaction ID, we also record the process and + * thread ID that owns a slot, so that we can detect stale information, + * e.g. threads or processes that went away without cleaning up. + * + * NOTE: We currently don't check for stale records. + * We simply re-init the table when we know that we're the only process + * opening the lock file. */ + + /* The process ID of the process owning this reader txn. */ + volatile mdbx_pid_t mr_pid; + /* The thread ID of the thread owning this txn. 
*/ + volatile mdbx_tid_t mr_tid; + + /* cache line alignment */ + uint8_t pad[~(MDBX_CACHELINE_SIZE - 1) & + (sizeof(txnid_t) + sizeof(mdbx_pid_t) + sizeof(mdbx_tid_t) + + MDBX_CACHELINE_SIZE - 1)]; +} MDBX_reader; /* Information about a single database in the environment. */ typedef struct MDB_db { @@ -345,7 +340,7 @@ typedef struct MDBX_lockinfo { /* Mutex protecting access to this table. */ MDBX_OSAL_LOCK mti_rmutex; #endif - MDB_reader mti_readers[1]; + MDBX_reader mti_readers[1]; } MDBX_lockinfo; #pragma pack(pop) @@ -390,7 +385,7 @@ struct MDB_txn { /* For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ MDB_ID2L dirty_list; /* For read txns: This thread/txn's reader table slot, or NULL. */ - MDB_reader *reader; + MDBX_reader *reader; } mt_u; /* Array of records for each DB known in the environment. */ MDB_dbx *mt_dbxs; @@ -773,7 +768,8 @@ static __inline MDB_meta *mdbx_meta_head(MDB_env *env) { void mdbx_rthc_dtor(void *rthc); void mdbx_rthc_lock(void); void mdbx_rthc_unlock(void); -int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDB_reader *begin, MDB_reader *end); +int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end); void mdbx_rthc_remove(mdbx_thread_key_t key); void mdbx_rthc_cleanup(void); diff --git a/src/mdbx.c b/src/mdbx.c index 01bb8a70..275038a9 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -42,8 +42,8 @@ /* rthc (tls keys and destructors) */ typedef struct rthc_entry_t { - MDB_reader *begin; - MDB_reader *end; + MDBX_reader *begin; + MDBX_reader *end; mdbx_thread_key_t key; } rthc_entry_t; @@ -59,7 +59,7 @@ static rthc_entry_t rthc_table_static[RTHC_INITIAL_LIMIT]; static rthc_entry_t *rthc_table = rthc_table_static; __cold void mdbx_rthc_dtor(void *ptr) { - MDB_reader *rthc = (MDB_reader *)ptr; + MDBX_reader *rthc = (MDBX_reader *)ptr; mdbx_rthc_lock(); const mdbx_pid_t self_pid = mdbx_getpid(); @@ -80,7 +80,7 @@ __cold void mdbx_rthc_cleanup(void) { const mdbx_pid_t self_pid = mdbx_getpid(); for (unsigned i = 
0; i < rthc_count; ++i) { mdbx_thread_key_t key = rthc_table[i].key; - MDB_reader *rthc = mdbx_thread_rthc_get(key); + MDBX_reader *rthc = mdbx_thread_rthc_get(key); if (rthc) { mdbx_thread_rthc_set(key, NULL); if (rthc->mr_pid == self_pid) { @@ -92,8 +92,8 @@ __cold void mdbx_rthc_cleanup(void) { mdbx_rthc_unlock(); } -__cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDB_reader *begin, - MDB_reader *end) { +__cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end) { #ifndef NDEBUG *key = (mdbx_thread_key_t)0xBADBADBAD; #endif /* NDEBUG */ @@ -136,7 +136,7 @@ __cold void mdbx_rthc_remove(mdbx_thread_key_t key) { for (unsigned i = 0; i < rthc_count; ++i) { if (key == rthc_table[i].key) { const mdbx_pid_t self_pid = mdbx_getpid(); - for (MDB_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; + for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) if (rthc->mr_pid == self_pid) rthc->mr_pid = 0; @@ -1396,7 +1396,7 @@ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { txnid_t oldest = mdbx_meta_lt(a, b) ? 
b->mm_txnid : a->mm_txnid; int i, reader; - const MDB_reader *const r = env->me_lck->mti_readers; + const MDBX_reader *const r = env->me_lck->mti_readers; for (reader = -1, i = env->me_lck->mti_numreaders; --i >= 0;) { if (r[i].mr_pid) { mdbx_jitter4testing(true); @@ -2115,7 +2115,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { if (flags & MDB_TXN_RDONLY) { txn->mt_flags = MDB_TXN_RDONLY; - MDB_reader *r = txn->mt_u.reader; + MDBX_reader *r = txn->mt_u.reader; if (likely(env->me_flags & MDB_ENV_TXKEY)) { mdbx_assert(env, !(env->me_flags & MDB_NOTLS)); r = mdbx_thread_rthc_get(env->me_txkey); @@ -3890,7 +3890,7 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { return err; if (rc == MDBX_RESULT_TRUE) { - off_t wanna = roundup2((env->me_maxreaders - 1) * sizeof(MDB_reader) + + off_t wanna = roundup2((env->me_maxreaders - 1) * sizeof(MDBX_reader) + sizeof(MDBX_lockinfo), env->me_os_psize); #ifndef NDEBUG @@ -3907,7 +3907,7 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { size = wanna; } } - env->me_maxreaders = (size - sizeof(MDBX_lockinfo)) / sizeof(MDB_reader) + 1; + env->me_maxreaders = (size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader) + 1; void *addr = NULL; err = mdbx_mmap(&addr, size, true, env->me_lfd); @@ -4185,7 +4185,7 @@ static void __cold mdbx_env_close0(MDB_env *env) { } mdbx_munmap((void *)env->me_lck, - (env->me_maxreaders - 1) * sizeof(MDB_reader) + + (env->me_maxreaders - 1) * sizeof(MDBX_reader) + sizeof(MDBX_lockinfo)); env->me_lck = NULL; env->me_pid = 0; @@ -8666,7 +8666,7 @@ int __cold mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) { return MDBX_EINVAL; MDB_meta *m1, *m2; - MDB_reader *r; + MDBX_reader *r; unsigned i; m1 = METAPAGE_1(env); @@ -9137,7 +9137,7 @@ int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { return MDBX_EBADSIGN; unsigned snap_nreaders = env->me_lck->mti_numreaders; - MDB_reader *mr = 
env->me_lck->mti_readers; + MDBX_reader *mr = env->me_lck->mti_readers; for (unsigned i = 0; i < snap_nreaders; i++) { if (mr[i].mr_pid) { txnid_t txnid = mr[i].mr_txnid; @@ -9221,7 +9221,7 @@ int __cold mdbx_reader_check0(MDB_env *env, int rdt_locked, int *dead) { pids[0] = 0; int rc = MDBX_RESULT_FALSE, count = 0; - MDB_reader *mr = env->me_lck->mti_readers; + MDBX_reader *mr = env->me_lck->mti_readers; for (unsigned i = 0; i < snap_nreaders; i++) { const mdbx_pid_t pid = mr[i].mr_pid; @@ -9586,7 +9586,7 @@ static txnid_t __cold mdbx_oomkick(MDB_env *env, txnid_t oldest) { return snap; } - MDB_reader *r; + MDBX_reader *r; mdbx_tid_t tid; mdbx_pid_t pid; int rc; From ac8e987346cb8f43a0682e3dbe5ab59f9d515f77 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 21:36:09 +0300 Subject: [PATCH 148/303] mdbx: rework MDBX_txn. --- mdbx.h | 78 ++++---- src/bits.h | 38 ++-- src/mdbx.c | 385 ++++++++++++++++++++-------------------- src/tools/mdbx_chk.c | 2 +- src/tools/mdbx_dump.c | 4 +- src/tools/mdbx_load.c | 2 +- src/tools/mdbx_stat.c | 2 +- test/test.cc | 4 +- test/test.h | 6 +- tutorial/sample-mdb.txt | 2 +- 10 files changed, 262 insertions(+), 261 deletions(-) diff --git a/mdbx.h b/mdbx.h index 67c0ee89..ab38a5f3 100644 --- a/mdbx.h +++ b/mdbx.h @@ -111,7 +111,7 @@ typedef struct MDB_env MDB_env; * * All database operations require a transaction handle. Transactions may be * read-only or read-write. */ -typedef struct MDB_txn MDB_txn; +typedef struct MDBX_txn MDBX_txn; /* A handle for an individual database in the DB environment. 
*/ typedef uint32_t MDB_dbi; @@ -161,7 +161,7 @@ typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); #define MDB_WRITEMAP 0x80000u /* use asynchronous msync when MDB_WRITEMAP is used */ #define MDB_MAPASYNC 0x100000u -/* tie reader locktable slots to MDB_txn objects instead of to threads */ +/* tie reader locktable slots to MDBX_txn objects instead of to threads */ #define MDB_NOTLS 0x200000u /* don't do any locking, caller must manage their own locks * WARNING: libmdbx don't support this mode. */ @@ -464,8 +464,8 @@ LIBMDBX_API int mdbx_env_create(MDB_env **penv); * * - MDB_NOTLS * Don't use Thread-Local Storage. Tie reader locktable slots to - * MDB_txn objects instead of to threads. I.e. mdbx_txn_reset() keeps - * the slot reseved for the MDB_txn object. A thread may use parallel + * MDBX_txn objects instead of to threads. I.e. mdbx_txn_reset() keeps + * the slot reseved for the MDBX_txn object. A thread may use parallel * read-only transactions. A read-only transaction may span threads if * the user synchronizes its use. Applications that multiplex many * user threads over individual OS threads need this option. Such an @@ -722,7 +722,7 @@ LIBMDBX_API int mdbx_env_set_mapsize(MDB_env *env, size_t size); * Starting a read-only transaction normally ties a lock table slot to the * current thread until the environment closes or the thread exits. If * MDB_NOTLS is in use, mdbx_txn_begin() instead ties the slot to the - * MDB_txn object until it or the MDB_env object is destroyed. + * MDBX_txn object until it or the MDB_env object is destroyed. * This function may only be called after mdbx_env_create() and before * mdbx_env_open(). * @@ -830,7 +830,7 @@ LIBMDBX_API int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); * - MDB_RDONLY * This transaction will not perform any write operations. 
* - * [out] txn Address where the new MDB_txn handle will be stored + * [out] txn Address where the new MDBX_txn handle will be stored * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: @@ -842,14 +842,14 @@ LIBMDBX_API int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); * - MDB_READERS_FULL - a read-only transaction was requested and the reader * lock table is full. See mdbx_env_set_maxreaders(). * - MDBX_ENOMEM - out of memory. */ -LIBMDBX_API int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, - MDB_txn **txn); +LIBMDBX_API int mdbx_txn_begin(MDB_env *env, MDBX_txn *parent, unsigned flags, + MDBX_txn **txn); /* Returns the transaction's MDB_env * * [in] txn A transaction handle returned by mdbx_txn_begin() */ -LIBMDBX_API MDB_env *mdbx_txn_env(MDB_txn *txn); +LIBMDBX_API MDB_env *mdbx_txn_env(MDBX_txn *txn); /* Return the transaction's ID. * @@ -860,7 +860,7 @@ LIBMDBX_API MDB_env *mdbx_txn_env(MDB_txn *txn); * [in] txn A transaction handle returned by mdbx_txn_begin() * * Returns A transaction ID, valid if input is an active transaction. */ -LIBMDBX_API uint64_t mdbx_txn_id(MDB_txn *txn); +LIBMDBX_API uint64_t mdbx_txn_id(MDBX_txn *txn); /* Commit all the operations of a transaction into the database. * @@ -879,7 +879,7 @@ LIBMDBX_API uint64_t mdbx_txn_id(MDB_txn *txn); * - MDBX_ENOSPC - no more disk space. * - MDBX_EIO - a low-level I/O error occurred while writing. * - MDBX_ENOMEM - out of memory. */ -LIBMDBX_API int mdbx_txn_commit(MDB_txn *txn); +LIBMDBX_API int mdbx_txn_commit(MDBX_txn *txn); /* Abandon all the operations of the transaction instead of saving them. * @@ -890,7 +890,7 @@ LIBMDBX_API int mdbx_txn_commit(MDB_txn *txn); * ends. It can be reused with mdbx_cursor_renew() before finally closing it. * * [in] txn A transaction handle returned by mdbx_txn_begin(). 
*/ -LIBMDBX_API int mdbx_txn_abort(MDB_txn *txn); +LIBMDBX_API int mdbx_txn_abort(MDBX_txn *txn); /* Reset a read-only transaction. * @@ -899,7 +899,7 @@ LIBMDBX_API int mdbx_txn_abort(MDB_txn *txn); * allocation overhead if the process will start a new read-only transaction * soon, and also locking overhead if MDB_NOTLS is in use. The reader table * lock is released, but the table slot stays tied to its thread or - * MDB_txn. Use mdbx_txn_abort() to discard a reset handle, and to free + * MDBX_txn. Use mdbx_txn_abort() to discard a reset handle, and to free * its lock table slot if MDB_NOTLS is in use. * * Cursors opened within the transaction must not be used @@ -911,7 +911,7 @@ LIBMDBX_API int mdbx_txn_abort(MDB_txn *txn); * the database size may grow much more rapidly than otherwise. * * [in] txn A transaction handle returned by mdbx_txn_begin() */ -LIBMDBX_API int mdbx_txn_reset(MDB_txn *txn); +LIBMDBX_API int mdbx_txn_reset(MDBX_txn *txn); /* Renew a read-only transaction. * @@ -926,7 +926,7 @@ LIBMDBX_API int mdbx_txn_reset(MDB_txn *txn); * - MDB_PANIC - a fatal error occurred earlier and the environment * must be shut down. * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_txn_renew(MDB_txn *txn); +LIBMDBX_API int mdbx_txn_renew(MDBX_txn *txn); /* Open a table in the environment. * @@ -994,10 +994,10 @@ LIBMDBX_API int mdbx_txn_renew(MDB_txn *txn); * environment and MDB_CREATE was not specified. * - MDB_DBS_FULL - too many databases have been opened. * See mdbx_env_set_maxdbs(). 
*/ -LIBMDBX_API int mdbx_dbi_open_ex(MDB_txn *txn, const char *name, unsigned flags, - MDB_dbi *dbi, MDB_cmp_func *keycmp, - MDB_cmp_func *datacmp); -LIBMDBX_API int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, +LIBMDBX_API int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, + unsigned flags, MDB_dbi *dbi, + MDB_cmp_func *keycmp, MDB_cmp_func *datacmp); +LIBMDBX_API int mdbx_dbi_open(MDBX_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi); /* Retrieve statistics for a database. @@ -1010,7 +1010,7 @@ LIBMDBX_API int mdbx_dbi_open(MDB_txn *txn, const char *name, unsigned flags, * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, +LIBMDBX_API int mdbx_dbi_stat(MDBX_txn *txn, MDB_dbi dbi, MDBX_stat *stat, size_t bytes); /* Retrieve the DB flags for a database handle. @@ -1020,7 +1020,7 @@ LIBMDBX_API int mdbx_dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *stat, * [out] flags Address where the flags will be returned. * * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags); +LIBMDBX_API int mdbx_dbi_flags(MDBX_txn *txn, MDB_dbi dbi, unsigned *flags); /* Close a database handle. Normally unnecessary. * @@ -1051,7 +1051,7 @@ LIBMDBX_API int mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); * and close the DB handle. * * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); +LIBMDBX_API int mdbx_drop(MDBX_txn *txn, MDB_dbi dbi, int del); /* Get items from a database. * @@ -1079,7 +1079,7 @@ LIBMDBX_API int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del); * possible errors are: * - MDB_NOTFOUND - the key was not in the database. * - MDBX_EINVAL - an invalid parameter was specified. 
*/ -LIBMDBX_API int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, +LIBMDBX_API int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); /* Store items into a database. @@ -1139,8 +1139,8 @@ LIBMDBX_API int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, * - MDB_TXN_FULL - the transaction has too many dirty pages. * - MDBX_EACCES - an attempt was made to write in a read-only transaction. * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, - unsigned flags); +LIBMDBX_API int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, + MDB_val *data, unsigned flags); /* Delete items from a database. * @@ -1162,7 +1162,7 @@ LIBMDBX_API int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, * possible errors are: * - MDBX_EACCES - an attempt was made to write in a read-only transaction. * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, +LIBMDBX_API int mdbx_del(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); /* Create a cursor handle. @@ -1183,7 +1183,7 @@ LIBMDBX_API int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, +LIBMDBX_API int mdbx_cursor_open(MDBX_txn *txn, MDB_dbi dbi, MDB_cursor **cursor); /* Close a cursor handle. @@ -1209,12 +1209,12 @@ LIBMDBX_API void mdbx_cursor_close(MDB_cursor *cursor); * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *cursor); +LIBMDBX_API int mdbx_cursor_renew(MDBX_txn *txn, MDB_cursor *cursor); /* Return the cursor's transaction handle. 
* * [in] cursor A cursor handle returned by mdbx_cursor_open() */ -LIBMDBX_API MDB_txn *mdbx_cursor_txn(MDB_cursor *cursor); +LIBMDBX_API MDBX_txn *mdbx_cursor_txn(MDB_cursor *cursor); /* Return the cursor's database handle. * @@ -1353,7 +1353,7 @@ LIBMDBX_API int mdbx_cursor_count(MDB_cursor *cursor, uint64_t *countp); * [in] b The second item to compare * * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ -LIBMDBX_API int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, +LIBMDBX_API int mdbx_cmp(MDBX_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); /* Compare two data items according to a particular database. @@ -1367,7 +1367,7 @@ LIBMDBX_API int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, * [in] b The second item to compare * * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ -LIBMDBX_API int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, +LIBMDBX_API int mdbx_dcmp(MDBX_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); /* A callback function used to print a message from the library. @@ -1429,7 +1429,7 @@ LIBMDBX_API int mdbx_env_set_syncbytes(MDB_env *env, size_t bytes); * * Returns Number of transactions committed after the given was started for * read, or -1 on failure. */ -LIBMDBX_API int mdbx_txn_straggler(MDB_txn *txn, int *percent); +LIBMDBX_API int mdbx_txn_straggler(MDBX_txn *txn, int *percent); /* A callback function for killing a laggard readers, * but also could waiting ones. Called in case of MDB_MAP_FULL error. 
@@ -1487,13 +1487,13 @@ typedef int MDBX_pgvisitor_func(uint64_t pgno, unsigned pgnumber, void *ctx, const char *dbi, const char *type, int nentries, int payload_bytes, int header_bytes, int unused_bytes); -LIBMDBX_API int mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, +LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, void *ctx); typedef struct mdbx_canary { uint64_t x, y, z, v; } mdbx_canary; -LIBMDBX_API int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary); -LIBMDBX_API int mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary); +LIBMDBX_API int mdbx_canary_put(MDBX_txn *txn, const mdbx_canary *canary); +LIBMDBX_API int mdbx_canary_get(MDBX_txn *txn, mdbx_canary *canary); /* Returns: * - MDBX_RESULT_TRUE @@ -1509,19 +1509,19 @@ LIBMDBX_API int mdbx_cursor_on_first(MDB_cursor *mc); /* Returns: MDBX_RESULT_TRUE, MDBX_RESULT_FALSE or Error code. */ LIBMDBX_API int mdbx_cursor_on_last(MDB_cursor *mc); -LIBMDBX_API int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, +LIBMDBX_API int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, MDB_val *old_data, unsigned flags); /* Same as mdbx_get(), but: * 1) if values_count is not NULL, then returns the count * of multi-values/duplicates for a given key. * 2) updates the key for pointing to the actual key's data inside DB. 
*/ -LIBMDBX_API int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, +LIBMDBX_API int mdbx_get_ex(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, int *values_count); -LIBMDBX_API int mdbx_is_dirty(const MDB_txn *txn, const void *ptr); +LIBMDBX_API int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr); -LIBMDBX_API int mdbx_dbi_sequence(MDB_txn *txn, MDB_dbi dbi, uint64_t *result, +LIBMDBX_API int mdbx_dbi_sequence(MDBX_txn *txn, MDB_dbi dbi, uint64_t *result, uint64_t increment); #ifdef __cplusplus diff --git a/src/bits.h b/src/bits.h index 87bfd040..1e97869f 100644 --- a/src/bits.h +++ b/src/bits.h @@ -356,12 +356,12 @@ typedef struct MDB_dbx { /* A database transaction. * Every operation requires a transaction handle. */ -struct MDB_txn { +struct MDBX_txn { #define MDBX_MT_SIGNATURE (0x93D53A31) unsigned mt_signature; - MDB_txn *mt_parent; /* parent of a nested txn */ + MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDB_TXN_HAS_CHILD */ - MDB_txn *mt_child; + MDBX_txn *mt_child; pgno_t mt_next_pgno; /* next unallocated page */ /* The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction @@ -371,22 +371,22 @@ struct MDB_txn { /* The list of reclaimed txns from freeDB */ MDB_IDL mt_lifo_reclaimed; /* The list of pages that became unused during this transaction. */ - MDB_IDL mt_free_pgs; + MDB_IDL mt_free_pages; /* The list of loose pages that became unused and may be reused * in this transaction, linked through NEXT_LOOSE_PAGE(page). */ - MDBX_page *mt_loose_pgs; - /* Number of loose pages (mt_loose_pgs) */ + MDBX_page *mt_loose_pages; + /* Number of loose pages (mt_loose_pages) */ unsigned mt_loose_count; /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. 
*/ - MDB_IDL mt_spill_pgs; + MDB_IDL mt_spill_pages; union { /* For write txns: Modified pages. Sorted when not MDB_WRITEMAP. */ - MDB_ID2L dirty_list; + MDB_ID2L mt_rw_dirtylist; /* For read txns: This thread/txn's reader table slot, or NULL. */ - MDBX_reader *reader; - } mt_u; + MDBX_reader *mt_ro_reader; + }; /* Array of records for each DB known in the environment. */ MDB_dbx *mt_dbxs; /* Array of MDB_db records for each known DB */ @@ -423,15 +423,15 @@ struct MDB_txn { #define MDB_TXN_ERROR 0x02 /* txn is unusable after an error */ #define MDB_TXN_DIRTY 0x04 /* must write, even if dirty list is empty */ #define MDB_TXN_SPILLS 0x08 /* txn or a parent has spilled pages */ -#define MDB_TXN_HAS_CHILD 0x10 /* txn has an MDB_txn.mt_child */ +#define MDB_TXN_HAS_CHILD 0x10 /* txn has an MDBX_txn.mt_child */ /* most operations on the txn are currently illegal */ #define MDB_TXN_BLOCKED (MDB_TXN_FINISHED | MDB_TXN_ERROR | MDB_TXN_HAS_CHILD) unsigned mt_flags; - /* dirty_list room: Array size - dirty pages visible to this txn. + /* dirtylist room: Array size - dirty pages visible to this txn. * Includes ancestor txns' dirty pages not hidden by other txns' * dirty/spilled pages. Thus commit(nested txn) has room to merge - * dirty_list into mt_parent after freeing hidden mt_parent pages. */ - unsigned mt_dirty_room; + * dirtylist into mt_parent after freeing hidden mt_parent pages. 
*/ + unsigned mt_dirtyroom; mdbx_canary mt_canary; }; @@ -461,7 +461,7 @@ struct MDB_cursor { /* Context used for databases with MDB_DUPSORT, otherwise NULL */ struct MDB_xcursor *mc_xcursor; /* The transaction that owns this cursor */ - MDB_txn *mc_txn; + MDBX_txn *mc_txn; /* The database handle this cursor operates on */ MDB_dbi mc_dbi; /* The database record for this cursor */ @@ -555,8 +555,8 @@ struct MDB_env { char *me_map; /* the memory map of the data file */ MDBX_lockinfo *me_lck; /* the memory map of the lock file, never NULL */ void *me_pbuf; /* scratch area for DUPSORT put() */ - MDB_txn *me_txn; /* current write transaction */ - MDB_txn *me_txn0; /* prealloc'd write transaction */ + MDBX_txn *me_txn; /* current write transaction */ + MDBX_txn *me_txn0; /* prealloc'd write transaction */ size_t me_mapsize; /* size of the data memory map */ pgno_t me_maxpg; /* me_mapsize / me_psize */ MDB_dbx *me_dbxs; /* array of static DB info */ @@ -571,7 +571,7 @@ struct MDB_env { /* IDL of pages that became unused in a write txn */ MDB_IDL me_free_pgs; /* ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ - MDB_ID2L me_dirty_list; + MDB_ID2L me_dirtylist; /* Max number of freelist items that can fit in a single overflow page */ unsigned me_maxfree_1pg; /* Max size of a node on a page */ @@ -593,7 +593,7 @@ struct MDB_env { /* Nested transaction */ typedef struct MDB_ntxn { - MDB_txn mnt_txn; /* the transaction */ + MDBX_txn mnt_txn; /* the transaction */ MDB_pgstate mnt_pgstate; /* parent transaction's saved freestate */ } MDB_ntxn; diff --git a/src/mdbx.c b/src/mdbx.c index 275038a9..e7819616 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -414,7 +414,7 @@ txnid_t mdbx_debug_edge; /* The number of overflow pages needed to store the given size. */ #define OVPAGES(size, psize) ((PAGEHDRSZ - 1 + (size)) / (psize) + 1) -/* Link in MDB_txn.mt_loose_pgs list. +/* Link in MDBX_txn.mt_loose_pages list. 
* Kept outside the page header, which is needed when reusing the page. */ #define NEXT_LOOSE_PAGE(p) (*(MDBX_page **)((p) + 2)) @@ -618,7 +618,7 @@ enum { #define MDB_END_FREE 0x20 /* free txn unless it is MDB_env.me_txn0 */ #define MDB_END_EOTDONE 0x40 /* txn's cursors already closed */ #define MDB_END_SLOT 0x80 /* release any reader slot if MDB_NOTLS */ -static int mdbx_txn_end(MDB_txn *txn, unsigned mode); +static int mdbx_txn_end(MDBX_txn *txn, unsigned mode); static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDBX_page **mp, int *lvl); static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); @@ -655,7 +655,7 @@ static void mdbx_cursor_pop(MDB_cursor *mc); static int mdbx_cursor_push(MDB_cursor *mc, MDBX_page *mp); static int mdbx_cursor_del0(MDB_cursor *mc); -static int mdbx_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, +static int mdbx_del0(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags); static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right); static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, @@ -667,7 +667,7 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); -static void mdbx_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, +static void mdbx_cursor_init(MDB_cursor *mc, MDBX_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); static void mdbx_xcursor_init0(MDB_cursor *mc); static void mdbx_xcursor_init1(MDB_cursor *mc, MDBX_node *node); @@ -962,7 +962,7 @@ static void mdbx_cursor_chk(MDB_cursor *mc) { /* Count all the pages in each DB and in the freelist and make sure * it matches the actual number of pages being used. * All named DBs must be open for a correct count. 
*/ -static void mdbx_audit(MDB_txn *txn) { +static void mdbx_audit(MDBX_txn *txn) { MDB_cursor mc; MDB_val key, data; pgno_t freecount, count; @@ -1012,12 +1012,12 @@ static void mdbx_audit(MDB_txn *txn) { } } -int mdbx_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { +int mdbx_cmp(MDBX_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { mdbx_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_cmp(a, b); } -int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { +int mdbx_dcmp(MDBX_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { mdbx_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_dcmp(a, b); } @@ -1025,7 +1025,7 @@ int mdbx_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { /* Allocate memory for a page. * Re-use old malloc'd pages first for singletons, otherwise just malloc. * Set MDB_TXN_ERROR on failure. */ -static MDBX_page *mdbx_page_malloc(MDB_txn *txn, unsigned num) { +static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { MDB_env *env = txn->mt_env; size_t size = env->me_psize; MDBX_page *np = env->me_dpages; @@ -1080,9 +1080,9 @@ static void mdbx_dpage_free(MDB_env *env, MDBX_page *dp) { } /* Return all dirty pages to dpage list */ -static void mdbx_dlist_free(MDB_txn *txn) { +static void mdbx_dlist_free(MDBX_txn *txn) { MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_u.dirty_list; + MDB_ID2L dl = txn->mt_rw_dirtylist; size_t i, n = dl[0].mid; for (i = 1; i <= n; i++) { @@ -1121,11 +1121,11 @@ static void __cold mdbx_kill_page(MDB_env *env, pgno_t pgno) { static int mdbx_page_loose(MDB_cursor *mc, MDBX_page *mp) { int loose = 0; pgno_t pgno = mp->mp_pgno; - MDB_txn *txn = mc->mc_txn; + MDBX_txn *txn = mc->mc_txn; if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { if (txn->mt_parent) { - MDB_ID2 *dl = txn->mt_u.dirty_list; + MDB_ID2 *dl = txn->mt_rw_dirtylist; /* If txn has a parent, * 
make sure the page is in our dirty list. */ if (dl[0].mid) { @@ -1153,12 +1153,12 @@ static int mdbx_page_loose(MDB_cursor *mc, MDBX_page *mp) { VALGRIND_MAKE_MEM_UNDEFINED(link, sizeof(MDBX_page *)); ASAN_UNPOISON_MEMORY_REGION(link, sizeof(MDBX_page *)); } - *link = txn->mt_loose_pgs; - txn->mt_loose_pgs = mp; + *link = txn->mt_loose_pages; + txn->mt_loose_pages = mp; txn->mt_loose_count++; mp->mp_flags |= P_LOOSE; } else { - int rc = mdbx_midl_append(&txn->mt_free_pgs, pgno); + int rc = mdbx_midl_append(&txn->mt_free_pages, pgno); if (unlikely(rc)) return rc; } @@ -1177,7 +1177,7 @@ static int mdbx_page_loose(MDB_cursor *mc, MDBX_page *mp) { * Returns 0 on success, non-zero on failure. */ static int mdbx_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { const unsigned Mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP; - MDB_txn *txn = mc->mc_txn; + MDBX_txn *txn = mc->mc_txn; MDB_cursor *m3, *m0 = mc; MDB_xcursor *mx; MDBX_page *dp, *mp; @@ -1232,7 +1232,7 @@ mark_done: return rc; } -static int mdbx_page_flush(MDB_txn *txn, int keep); +static int mdbx_page_flush(MDBX_txn *txn, int keep); /* Spill pages from the dirty list back to disk. * This is intended to prevent running into MDB_TXN_FULL situations, @@ -1244,7 +1244,7 @@ static int mdbx_page_flush(MDB_txn *txn, int keep); * 2) child txns may run out of space if their parents dirtied a * lot of pages and never spilled them. TODO: we probably should do * a preemptive spill during mdbx_txn_begin() of a child txn, if - * the parent's dirty_room is below a given threshold. + * the parent's dirtyroom is below a given threshold. * * Otherwise, if not using nested txns, it is expected that apps will * not run into MDB_TXN_FULL any more. The pages are flushed to disk @@ -1269,9 +1269,9 @@ static int mdbx_page_flush(MDB_txn *txn, int keep); * * Returns 0 on success, non-zero on failure. 
*/ static int mdbx_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) { - MDB_txn *txn = m0->mc_txn; + MDBX_txn *txn = m0->mc_txn; MDBX_page *dp; - MDB_ID2L dl = txn->mt_u.dirty_list; + MDB_ID2L dl = txn->mt_rw_dirtylist; unsigned i, j, need; int rc; @@ -1289,16 +1289,16 @@ static int mdbx_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) { i += i; /* double it for good measure */ need = i; - if (txn->mt_dirty_room > i) + if (txn->mt_dirtyroom > i) return MDB_SUCCESS; - if (!txn->mt_spill_pgs) { - txn->mt_spill_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX); - if (unlikely(!txn->mt_spill_pgs)) + if (!txn->mt_spill_pages) { + txn->mt_spill_pages = mdbx_midl_alloc(MDB_IDL_UM_MAX); + if (unlikely(!txn->mt_spill_pages)) return MDBX_ENOMEM; } else { /* purge deleted slots */ - MDB_IDL sl = txn->mt_spill_pgs; + MDB_IDL sl = txn->mt_spill_pages; unsigned num = sl[0]; j = 0; for (i = 1; i <= num; i++) { @@ -1332,11 +1332,11 @@ static int mdbx_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) { /* Can't spill twice, * make sure it's not already in a parent's spill list. 
*/ if (txn->mt_parent) { - MDB_txn *tx2; + MDBX_txn *tx2; for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { - if (tx2->mt_spill_pgs) { - j = mdbx_midl_search(tx2->mt_spill_pgs, pn); - if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { + if (tx2->mt_spill_pages) { + j = mdbx_midl_search(tx2->mt_spill_pages, pn); + if (j <= tx2->mt_spill_pages[0] && tx2->mt_spill_pages[j] == pn) { dp->mp_flags |= P_KEEP; break; } @@ -1345,12 +1345,12 @@ static int mdbx_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) { if (tx2) continue; } - rc = mdbx_midl_append(&txn->mt_spill_pgs, pn); + rc = mdbx_midl_append(&txn->mt_spill_pages, pn); if (unlikely(rc != MDB_SUCCESS)) goto bailout; need--; } - mdbx_midl_sort(txn->mt_spill_pgs); + mdbx_midl_sort(txn->mt_spill_pages); /* Flush the spilled part of dirty list */ rc = mdbx_page_flush(txn, i); @@ -1414,7 +1414,7 @@ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { } /* Add a page to the txn's dirty list */ -static void mdbx_page_dirty(MDB_txn *txn, MDBX_page *mp) { +static void mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { MDB_ID2 mid; int rc, (*insert)(MDB_ID2L, MDB_ID2 *); @@ -1425,9 +1425,9 @@ static void mdbx_page_dirty(MDB_txn *txn, MDBX_page *mp) { } mid.mid = mp->mp_pgno; mid.mptr = mp; - rc = insert(txn->mt_u.dirty_list, &mid); + rc = insert(txn->mt_rw_dirtylist, &mid); mdbx_tassert(txn, rc == 0); - txn->mt_dirty_room--; + txn->mt_dirtyroom--; } /* Allocate page numbers and memory for writing. Maintain me_pglast, @@ -1456,7 +1456,7 @@ static void mdbx_page_dirty(MDB_txn *txn, MDBX_page *mp) { static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { int rc; - MDB_txn *txn = mc->mc_txn; + MDBX_txn *txn = mc->mc_txn; MDB_env *env = txn->mt_env; pgno_t pgno, *mop = env->me_pghead; unsigned i = 0, j, mop_len = mop ? 
mop[0] : 0, n2 = num - 1; @@ -1479,9 +1479,9 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { if (likely(flags & MDBX_ALLOC_CACHE)) { /* If there are any loose pages, just use them */ assert(mp && num); - if (likely(num == 1 && txn->mt_loose_pgs)) { - np = txn->mt_loose_pgs; - txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); + if (likely(num == 1 && txn->mt_loose_pages)) { + np = txn->mt_loose_pages; + txn->mt_loose_pages = NEXT_LOOSE_PAGE(np); txn->mt_loose_count--; mdbx_debug("db %d use loose page %" PRIaPGNO, DDBI(mc), np->mp_pgno); ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize); @@ -1491,7 +1491,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { } /* If our dirty list is already full, we can't do anything */ - if (unlikely(txn->mt_dirty_room == 0)) { + if (unlikely(txn->mt_dirtyroom == 0)) { rc = MDB_TXN_FULL; goto fail; } @@ -1798,20 +1798,20 @@ static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, unsigned psize) { * [in] mp the page being referenced. It must not be dirty. * [out] ret the writable page, if any. * ret is unchanged if mp wasn't spilled. 
*/ -static int mdbx_page_unspill(MDB_txn *txn, MDBX_page *mp, MDBX_page **ret) { +static int mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, MDBX_page **ret) { MDB_env *env = txn->mt_env; - const MDB_txn *tx2; + const MDBX_txn *tx2; unsigned x; pgno_t pgno = mp->mp_pgno, pn = pgno << 1; for (tx2 = txn; tx2; tx2 = tx2->mt_parent) { - if (!tx2->mt_spill_pgs) + if (!tx2->mt_spill_pages) continue; - x = mdbx_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { + x = mdbx_midl_search(tx2->mt_spill_pages, pn); + if (x <= tx2->mt_spill_pages[0] && tx2->mt_spill_pages[x] == pn) { MDBX_page *np; int num; - if (txn->mt_dirty_room == 0) + if (txn->mt_dirtyroom == 0) return MDB_TXN_FULL; if (IS_OVERFLOW(mp)) num = mp->mp_pages; @@ -1832,10 +1832,10 @@ static int mdbx_page_unspill(MDB_txn *txn, MDBX_page *mp, MDBX_page **ret) { /* If in current txn, this page is no longer spilled. * If it happens to be the last page, truncate the spill list. * Otherwise mark it as deleted by setting the LSB. */ - if (x == txn->mt_spill_pgs[0]) - txn->mt_spill_pgs[0]--; + if (x == txn->mt_spill_pages[0]) + txn->mt_spill_pages[0]--; else - txn->mt_spill_pgs[x] |= 1; + txn->mt_spill_pages[x] |= 1; } /* otherwise, if belonging to a parent txn, the * page remains spilled until child commits */ @@ -1857,7 +1857,7 @@ static int mdbx_page_unspill(MDB_txn *txn, MDBX_page *mp, MDBX_page **ret) { * Returns 0 on success, non-zero on failure. 
*/ static int mdbx_page_touch(MDB_cursor *mc) { MDBX_page *mp = mc->mc_pg[mc->mc_top], *np; - MDB_txn *txn = mc->mc_txn; + MDBX_txn *txn = mc->mc_txn; MDB_cursor *m2, *m3; pgno_t pgno; int rc; @@ -1871,14 +1871,14 @@ static int mdbx_page_touch(MDB_cursor *mc) { if (likely(np)) goto done; } - if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pgs, 1)) || + if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pages, 1)) || (rc = mdbx_page_alloc(mc, 1, &np, MDBX_ALLOC_ALL)))) goto fail; pgno = np->mp_pgno; mdbx_debug("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), mp->mp_pgno, pgno); mdbx_cassert(mc, mp->mp_pgno != pgno); - mdbx_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); + mdbx_midl_xappend(txn->mt_free_pages, mp->mp_pgno); /* Update the parent page, if any, to point to the new page */ if (mc->mc_top) { MDBX_page *parent = mc->mc_pg[mc->mc_top - 1]; @@ -1888,7 +1888,7 @@ static int mdbx_page_touch(MDB_cursor *mc) { mc->mc_db->md_root = pgno; } } else if (txn->mt_parent && !IS_SUBP(mp)) { - MDB_ID2 mid, *dl = txn->mt_u.dirty_list; + MDB_ID2 mid, *dl = txn->mt_rw_dirtylist; pgno = mp->mp_pgno; /* If txn has a parent, make sure the page is in our * dirty list. */ @@ -2019,7 +2019,7 @@ int mdbx_env_sync(MDB_env *env, int force) { } /* Back up parent txn's cursors, then grab the originals for tracking */ -static int mdbx_cursor_shadow(MDB_txn *src, MDB_txn *dst) { +static int mdbx_cursor_shadow(MDBX_txn *src, MDBX_txn *dst) { MDB_cursor *mc, *bk; MDB_xcursor *mx; size_t size; @@ -2060,7 +2060,7 @@ static int mdbx_cursor_shadow(MDB_txn *src, MDB_txn *dst) { * [in] merge true to keep changes to parent cursors, false to revert. * * Returns 0 on success, non-zero on failure. 
*/ -static void mdbx_cursors_eot(MDB_txn *txn, unsigned merge) { +static void mdbx_cursors_eot(MDBX_txn *txn, unsigned merge) { MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; MDB_xcursor *mx; int i; @@ -2103,7 +2103,7 @@ static void mdbx_cursors_eot(MDB_txn *txn, unsigned merge) { } /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */ -static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { +static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { MDB_env *env = txn->mt_env; unsigned i, nr; int rc; @@ -2115,7 +2115,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { if (flags & MDB_TXN_RDONLY) { txn->mt_flags = MDB_TXN_RDONLY; - MDBX_reader *r = txn->mt_u.reader; + MDBX_reader *r = txn->mt_ro_reader; if (likely(env->me_flags & MDB_ENV_TXKEY)) { mdbx_assert(env, !(env->me_flags & MDB_NOTLS)); r = mdbx_thread_rthc_get(env->me_txkey); @@ -2210,7 +2210,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { break; } - txn->mt_u.reader = r; + txn->mt_ro_reader = r; txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ } else { /* Not yet touching txn == env->me_txn0, it may be active */ @@ -2241,14 +2241,14 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) { txn->mt_flags = flags; txn->mt_child = NULL; - txn->mt_loose_pgs = NULL; + txn->mt_loose_pages = NULL; txn->mt_loose_count = 0; - txn->mt_dirty_room = MDB_IDL_UM_MAX; - txn->mt_u.dirty_list = env->me_dirty_list; - txn->mt_u.dirty_list[0].mid = 0; - txn->mt_free_pgs = env->me_free_pgs; - txn->mt_free_pgs[0] = 0; - txn->mt_spill_pgs = NULL; + txn->mt_dirtyroom = MDB_IDL_UM_MAX; + txn->mt_rw_dirtylist = env->me_dirtylist; + txn->mt_rw_dirtylist[0].mid = 0; + txn->mt_free_pages = env->me_free_pgs; + txn->mt_free_pages[0] = 0; + txn->mt_spill_pages = NULL; if (txn->mt_lifo_reclaimed) txn->mt_lifo_reclaimed[0] = 0; env->me_txn = txn; @@ -2284,7 +2284,7 @@ bailout: return rc; } -int mdbx_txn_renew(MDB_txn *txn) { +int mdbx_txn_renew(MDBX_txn *txn) { int rc; if 
(unlikely(!txn)) @@ -2305,9 +2305,9 @@ int mdbx_txn_renew(MDB_txn *txn) { return rc; } -int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, - MDB_txn **ret) { - MDB_txn *txn; +int mdbx_txn_begin(MDB_env *env, MDBX_txn *parent, unsigned flags, + MDBX_txn **ret) { + MDBX_txn *txn; MDB_ntxn *ntxn; int rc, size, tsize; @@ -2343,7 +2343,7 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, size += tsize = sizeof(MDB_ntxn); } else if (flags & MDB_RDONLY) { size = env->me_maxdbs * (sizeof(MDB_db) + 1); - size += tsize = sizeof(MDB_txn); + size += tsize = sizeof(MDBX_txn); } else { /* Reuse preallocated write txn. However, do not touch it until * mdbx_txn_renew0() succeeds, since it currently may be active. */ @@ -2364,17 +2364,17 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, unsigned i; txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); txn->mt_dbiseqs = parent->mt_dbiseqs; - txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2) * MDB_IDL_UM_SIZE); - if (!txn->mt_u.dirty_list || - !(txn->mt_free_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX))) { - free(txn->mt_u.dirty_list); + txn->mt_rw_dirtylist = malloc(sizeof(MDB_ID2) * MDB_IDL_UM_SIZE); + if (!txn->mt_rw_dirtylist || + !(txn->mt_free_pages = mdbx_midl_alloc(MDB_IDL_UM_MAX))) { + free(txn->mt_rw_dirtylist); free(txn); return MDBX_ENOMEM; } txn->mt_txnid = parent->mt_txnid; - txn->mt_dirty_room = parent->mt_dirty_room; - txn->mt_u.dirty_list[0].mid = 0; - txn->mt_spill_pgs = NULL; + txn->mt_dirtyroom = parent->mt_dirtyroom; + txn->mt_rw_dirtylist[0].mid = 0; + txn->mt_spill_pages = NULL; txn->mt_next_pgno = parent->mt_next_pgno; parent->mt_flags |= MDB_TXN_HAS_CHILD; parent->mt_child = txn; @@ -2419,20 +2419,20 @@ int mdbx_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, return rc; } -MDB_env *mdbx_txn_env(MDB_txn *txn) { +MDB_env *mdbx_txn_env(MDBX_txn *txn) { if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) return NULL; return txn->mt_env; } 
-uint64_t mdbx_txn_id(MDB_txn *txn) { +uint64_t mdbx_txn_id(MDBX_txn *txn) { if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) return ~(txnid_t)0; return txn->mt_txnid; } /* Export or close DBI handles opened in this txn. */ -static void mdbx_dbis_update(MDB_txn *txn, int keep) { +static void mdbx_dbis_update(MDBX_txn *txn, int keep) { MDB_dbi n = txn->mt_numdbs; MDB_env *env = txn->mt_env; uint8_t *tdbflags = txn->mt_dbflags; @@ -2461,7 +2461,7 @@ static void mdbx_dbis_update(MDB_txn *txn, int keep) { * May be called twice for readonly txns: First reset it, then abort. * [in] txn the transaction handle to end * [in] mode why and how to end the transaction */ -static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { +static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { MDB_env *env = txn->mt_env; static const char *const names[] = MDB_END_NAMES; @@ -2479,12 +2479,12 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { (void *)env, txn->mt_dbs[MAIN_DBI].md_root); if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { - if (txn->mt_u.reader) { - txn->mt_u.reader->mr_txnid = ~(txnid_t)0; + if (txn->mt_ro_reader) { + txn->mt_ro_reader->mr_txnid = ~(txnid_t)0; if (mode & MDB_END_SLOT) { if ((env->me_flags & MDB_ENV_TXKEY) == 0) - txn->mt_u.reader->mr_pid = 0; - txn->mt_u.reader = NULL; + txn->mt_ro_reader->mr_pid = 0; + txn->mt_ro_reader = NULL; } } mdbx_coherent_barrier(); @@ -2511,8 +2511,8 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { txn->mt_flags = MDB_TXN_FINISHED; if (!txn->mt_parent) { - mdbx_midl_shrink(&txn->mt_free_pgs); - env->me_free_pgs = txn->mt_free_pgs; + mdbx_midl_shrink(&txn->mt_free_pages); + env->me_free_pgs = txn->mt_free_pages; /* me_pgstate: */ env->me_pghead = NULL; env->me_pglast = 0; @@ -2526,9 +2526,9 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { txn->mt_parent->mt_child = NULL; txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; - mdbx_midl_free(txn->mt_free_pgs); 
- mdbx_midl_free(txn->mt_spill_pgs); - free(txn->mt_u.dirty_list); + mdbx_midl_free(txn->mt_free_pages); + mdbx_midl_free(txn->mt_spill_pages); + free(txn->mt_rw_dirtylist); } mdbx_midl_free(pghead); @@ -2542,7 +2542,7 @@ static int mdbx_txn_end(MDB_txn *txn, unsigned mode) { return MDB_SUCCESS; } -int mdbx_txn_reset(MDB_txn *txn) { +int mdbx_txn_reset(MDBX_txn *txn) { if (unlikely(!txn)) return MDBX_EINVAL; @@ -2557,7 +2557,7 @@ int mdbx_txn_reset(MDB_txn *txn) { return mdbx_txn_end(txn, MDB_END_RESET | MDB_END_UPDATE); } -int mdbx_txn_abort(MDB_txn *txn) { +int mdbx_txn_abort(MDBX_txn *txn) { if (unlikely(!txn)) return MDBX_EINVAL; @@ -2575,7 +2575,7 @@ int mdbx_txn_abort(MDB_txn *txn) { return mdbx_txn_end(txn, MDB_END_ABORT | MDB_END_SLOT | MDB_END_FREE); } -static __inline int mdbx_backlog_size(MDB_txn *txn) { +static __inline int mdbx_backlog_size(MDBX_txn *txn) { int reclaimed = txn->mt_env->me_pghead ? txn->mt_env->me_pghead[0] : 0; return reclaimed + txn->mt_loose_count; } @@ -2583,7 +2583,7 @@ static __inline int mdbx_backlog_size(MDB_txn *txn) { /* LY: Prepare a backlog of pages to modify FreeDB itself, * while reclaiming is prohibited. It should be enough to prevent search * in mdbx_page_alloc() during a deleting, when freeDB tree is unbalanced. */ -static int mdbx_prep_backlog(MDB_txn *txn, MDB_cursor *mc) { +static int mdbx_prep_backlog(MDBX_txn *txn, MDB_cursor *mc) { /* LY: extra page(s) for b-tree rebalancing */ const int extra = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) ? 2 : 1; @@ -2608,10 +2608,10 @@ static int mdbx_prep_backlog(MDB_txn *txn, MDB_cursor *mc) { /* Save the freelist as of this transaction to the freeDB. * This changes the freelist. Keep trying until it stabilizes. */ -static int mdbx_freelist_save(MDB_txn *txn) { +static int mdbx_freelist_save(MDBX_txn *txn) { /* env->me_pghead[] can grow and shrink during this call. - * env->me_pglast and txn->mt_free_pgs[] can only grow. 
- * Page numbers cannot disappear from txn->mt_free_pgs[]. */ + * env->me_pglast and txn->mt_free_pages[] can only grow. + * Page numbers cannot disappear from txn->mt_free_pages[]. */ MDB_cursor mc; MDB_env *env = txn->mt_env; int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; @@ -2676,28 +2676,28 @@ again: } } - if (unlikely(!env->me_pghead) && txn->mt_loose_pgs) { - /* Put loose page numbers in mt_free_pgs, since + if (unlikely(!env->me_pghead) && txn->mt_loose_pages) { + /* Put loose page numbers in mt_free_pages, since * we may be unable to return them to me_pghead. */ - MDBX_page *mp = txn->mt_loose_pgs; - if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pgs, + MDBX_page *mp = txn->mt_loose_pages; + if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pages, txn->mt_loose_count)) != 0)) return rc; for (; mp; mp = NEXT_LOOSE_PAGE(mp)) - mdbx_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); - txn->mt_loose_pgs = NULL; + mdbx_midl_xappend(txn->mt_free_pages, mp->mp_pgno); + txn->mt_loose_pages = NULL; txn->mt_loose_count = 0; } /* Save the IDL of pages freed by this txn, to a single record */ - if (freecnt < txn->mt_free_pgs[0]) { + if (freecnt < txn->mt_free_pages[0]) { if (unlikely(!freecnt)) { /* Make sure last page of freeDB is touched and on freelist */ rc = mdbx_page_search(&mc, NULL, MDB_PS_LAST | MDB_PS_MODIFY); if (unlikely(rc && rc != MDB_NOTFOUND)) goto bailout; } - free_pgs = txn->mt_free_pgs; + free_pgs = txn->mt_free_pages; /* Write to last page of freeDB */ key.mv_size = sizeof(txn->mt_txnid); key.mv_data = &txn->mt_txnid; @@ -2707,8 +2707,8 @@ again: rc = mdbx_cursor_put(&mc, &key, &data, MDB_RESERVE); if (unlikely(rc)) goto bailout; - /* Retry if mt_free_pgs[] grew during the Put() */ - free_pgs = txn->mt_free_pgs; + /* Retry if mt_free_pages[] grew during the Put() */ + free_pgs = txn->mt_free_pages; } while (freecnt < free_pgs[0]); mdbx_midl_sort(free_pgs); @@ -2814,9 +2814,9 @@ again: (txn->mt_lifo_reclaimed ? 
txn->mt_lifo_reclaimed[0] : 0)); /* Return loose page numbers to me_pghead, though usually none are - * left at this point. The pages themselves remain in dirty_list. */ - if (txn->mt_loose_pgs) { - MDBX_page *mp = txn->mt_loose_pgs; + * left at this point. The pages themselves remain in dirtylist. */ + if (txn->mt_loose_pages) { + MDBX_page *mp = txn->mt_loose_pages; unsigned count = txn->mt_loose_count; MDB_IDL loose; /* Room for loose pages + temp IDL with same */ @@ -2829,7 +2829,7 @@ again: loose[0] = count; mdbx_midl_sort(loose); mdbx_midl_xmerge(mop, loose); - txn->mt_loose_pgs = NULL; + txn->mt_loose_pages = NULL; txn->mt_loose_count = 0; mop_len = mop[0]; } @@ -2922,11 +2922,11 @@ bailout: /* Flush (some) dirty pages to the map, after clearing their dirty flag. * [in] txn the transaction that's being committed - * [in] keep number of initial pages in dirty_list to keep dirty. + * [in] keep number of initial pages in dirtylist to keep dirty. * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_page_flush(MDB_txn *txn, int keep) { +static int mdbx_page_flush(MDBX_txn *txn, int keep) { MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_u.dirty_list; + MDB_ID2L dl = txn->mt_rw_dirtylist; unsigned psize = env->me_psize, j; int i, pagecount = dl[0].mid, rc; size_t size = 0, pos = 0; @@ -3013,12 +3013,12 @@ static int mdbx_page_flush(MDB_txn *txn, int keep) { done: i--; - txn->mt_dirty_room += i - j; + txn->mt_dirtyroom += i - j; dl[0].mid = j; return MDB_SUCCESS; } -int mdbx_txn_commit(MDB_txn *txn) { +int mdbx_txn_commit(MDBX_txn *txn) { int rc; if (unlikely(txn == NULL)) @@ -3055,7 +3055,7 @@ int mdbx_txn_commit(MDB_txn *txn) { } if (txn->mt_parent) { - MDB_txn *parent = txn->mt_parent; + MDBX_txn *parent = txn->mt_parent; MDBX_page **lp; MDB_ID2L dst, src; MDB_IDL pspill; @@ -3075,10 +3075,10 @@ int mdbx_txn_commit(MDB_txn *txn) { } /* Append our free list to parent's */ - rc = mdbx_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); + rc = mdbx_midl_append_list(&parent->mt_free_pages, txn->mt_free_pages); if (unlikely(rc != MDB_SUCCESS)) goto fail; - mdbx_midl_free(txn->mt_free_pgs); + mdbx_midl_free(txn->mt_free_pages); /* Failures after this must either undo the changes * to the parent or set MDB_TXN_ERROR in the parent. 
*/ @@ -3099,10 +3099,10 @@ int mdbx_txn_commit(MDB_txn *txn) { parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; } - dst = parent->mt_u.dirty_list; - src = txn->mt_u.dirty_list; + dst = parent->mt_rw_dirtylist; + src = txn->mt_rw_dirtylist; /* Remove anything in our dirty list from parent's spill list */ - if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { + if ((pspill = parent->mt_spill_pages) && (ps_len = pspill[0])) { x = y = ps_len; pspill[0] = (pgno_t)-1; /* Mark our dirty pages as deleted in parent spill list */ @@ -3123,9 +3123,9 @@ int mdbx_txn_commit(MDB_txn *txn) { } /* Remove anything in our spill list from parent's dirty list */ - if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) { - for (i = 1; i <= txn->mt_spill_pgs[0]; i++) { - pgno_t pn = txn->mt_spill_pgs[i]; + if (txn->mt_spill_pages && txn->mt_spill_pages[0]) { + for (i = 1; i <= txn->mt_spill_pages[0]; i++) { + pgno_t pn = txn->mt_spill_pages[i]; if (pn & 1) continue; /* deleted spillpg */ pn >>= 1; @@ -3157,7 +3157,7 @@ int mdbx_txn_commit(MDB_txn *txn) { } } } else { /* Simplify the above for single-ancestor case */ - len = MDB_IDL_UM_MAX - txn->mt_dirty_room; + len = MDB_IDL_UM_MAX - txn->mt_dirtyroom; } /* Merge our dirty list with parent's */ y = src[0].mid; @@ -3170,25 +3170,26 @@ int mdbx_txn_commit(MDB_txn *txn) { } mdbx_tassert(txn, i == x); dst[0].mid = len; - free(txn->mt_u.dirty_list); - parent->mt_dirty_room = txn->mt_dirty_room; - if (txn->mt_spill_pgs) { - if (parent->mt_spill_pgs) { + free(txn->mt_rw_dirtylist); + parent->mt_dirtyroom = txn->mt_dirtyroom; + if (txn->mt_spill_pages) { + if (parent->mt_spill_pages) { /* TODO: Prevent failure here, so parent does not fail */ - rc = mdbx_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); + rc = + mdbx_midl_append_list(&parent->mt_spill_pages, txn->mt_spill_pages); if (unlikely(rc != MDB_SUCCESS)) parent->mt_flags |= MDB_TXN_ERROR; - mdbx_midl_free(txn->mt_spill_pgs); - mdbx_midl_sort(parent->mt_spill_pgs); + 
mdbx_midl_free(txn->mt_spill_pages); + mdbx_midl_sort(parent->mt_spill_pages); } else { - parent->mt_spill_pgs = txn->mt_spill_pgs; + parent->mt_spill_pages = txn->mt_spill_pages; } } /* Append our loose page list to parent's */ - for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) + for (lp = &parent->mt_loose_pages; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) ; - *lp = txn->mt_loose_pgs; + *lp = txn->mt_loose_pages; parent->mt_loose_count += txn->mt_loose_count; parent->mt_child = NULL; @@ -3207,7 +3208,7 @@ int mdbx_txn_commit(MDB_txn *txn) { mdbx_cursors_eot(txn, 0); end_mode |= MDB_END_EOTDONE; - if (!txn->mt_u.dirty_list[0].mid && + if (!txn->mt_rw_dirtylist[0].mid && !(txn->mt_flags & (MDB_TXN_DIRTY | MDB_TXN_SPILLS))) goto done; @@ -3243,7 +3244,7 @@ int mdbx_txn_commit(MDB_txn *txn) { mdbx_midl_free(env->me_pghead); env->me_pghead = NULL; - mdbx_midl_shrink(&txn->mt_free_pgs); + mdbx_midl_shrink(&txn->mt_free_pages); if (mdbx_audit_enabled()) mdbx_audit(txn); @@ -4022,7 +4023,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, MDBX_COALESCE | MDBX_LIFORECLAIM | MDB_NOMEMINIT); } else { if (!((env->me_free_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX)) && - (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) + (env->me_dirtylist = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) rc = MDBX_ENOMEM; } env->me_flags = flags |= MDB_ENV_ACTIVE; @@ -4095,8 +4096,8 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, } if ((flags & MDB_RDONLY) == 0) { - MDB_txn *txn; - int tsize = sizeof(MDB_txn), + MDBX_txn *txn; + int tsize = sizeof(MDBX_txn), size = tsize + env->me_maxdbs * (sizeof(MDB_db) + sizeof(MDB_cursor *) + sizeof(unsigned) + 1); @@ -4161,7 +4162,7 @@ static void __cold mdbx_env_close0(MDB_env *env) { free(env->me_dbiseqs); free(env->me_dbflags); free(env->me_path); - free(env->me_dirty_list); + free(env->me_dirtylist); if (env->me_txn0) mdbx_midl_free(env->me_txn0->mt_lifo_reclaimed); 
free(env->me_txn0); @@ -4514,31 +4515,31 @@ static int mdbx_cursor_push(MDB_cursor *mc, MDBX_page *mp) { * [in] pgno the page number for the page to retrieve. * [out] ret address of a pointer where the page's address will be * stored. - * [out] lvl dirty_list inheritance level of found page. 1=current txn, + * [out] lvl dirtylist inheritance level of found page. 1=current txn, * 0=mapped page. * * Returns 0 on success, non-zero on failure. */ static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDBX_page **ret, int *lvl) { - MDB_txn *txn = mc->mc_txn; + MDBX_txn *txn = mc->mc_txn; MDB_env *env = txn->mt_env; MDBX_page *p = NULL; int level; if (!(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_WRITEMAP))) { - MDB_txn *tx2 = txn; + MDBX_txn *tx2 = txn; level = 1; do { - MDB_ID2L dl = tx2->mt_u.dirty_list; + MDB_ID2L dl = tx2->mt_rw_dirtylist; unsigned x; /* Spilled pages were dirtied in this txn and flushed * because the dirty list got full. Bring this page * back in from the map (but don't unspill it here, * leave that unless page_touch happens again). 
*/ - if (tx2->mt_spill_pgs) { + if (tx2->mt_spill_pages) { pgno_t pn = pgno << 1; - x = mdbx_midl_search(tx2->mt_spill_pgs, pn); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) + x = mdbx_midl_search(tx2->mt_spill_pages, pn); + if (x <= tx2->mt_spill_pages[0] && tx2->mt_spill_pages[x] == pn) goto mapped; } if (dl[0].mid) { @@ -4754,11 +4755,11 @@ static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { } static int mdbx_ovpage_free(MDB_cursor *mc, MDBX_page *mp) { - MDB_txn *txn = mc->mc_txn; + MDBX_txn *txn = mc->mc_txn; pgno_t pg = mp->mp_pgno; unsigned x = 0, ovpages = mp->mp_pages; MDB_env *env = txn->mt_env; - MDB_IDL sl = txn->mt_spill_pgs; + MDB_IDL sl = txn->mt_spill_pages; pgno_t pn = pg << 1; int rc; @@ -4788,7 +4789,7 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDBX_page *mp) { goto release; } /* Remove from dirty list */ - dl = txn->mt_u.dirty_list; + dl = txn->mt_rw_dirtylist; x = dl[0].mid--; for (ix = dl[x]; ix.mptr != mp; ix = iy) { if (likely(x > 1)) { @@ -4803,7 +4804,7 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDBX_page *mp) { return MDB_PROBLEM; } } - txn->mt_dirty_room++; + txn->mt_dirtyroom++; if (!(env->me_flags & MDB_WRITEMAP)) mdbx_dpage_free(env, mp); release: @@ -4816,7 +4817,7 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDBX_page *mp) { mop[j--] = pg++; mop[0] += ovpages; } else { - rc = mdbx_midl_append_range(&txn->mt_free_pgs, pg, ovpages); + rc = mdbx_midl_append_range(&txn->mt_free_pages, pg, ovpages); if (unlikely(rc)) return rc; } @@ -4855,7 +4856,7 @@ static __inline int mdbx_node_read(MDB_cursor *mc, MDBX_node *leaf, return MDB_SUCCESS; } -int mdbx_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { +int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { MDB_cursor mc; MDB_xcursor mx; int exact = 0; @@ -5971,8 +5972,8 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return MDBX_ENOMEM; id2.mid = pg; id2.mptr = np; - /* Note - this page is 
already counted in parent's dirty_room */ - rc2 = mdbx_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); + /* Note - this page is already counted in parent's dirtyroom */ + rc2 = mdbx_mid2l_insert(mc->mc_txn->mt_rw_dirtylist, &id2); mdbx_cassert(mc, rc2 == 0); /* Currently we make the page look as with put() in the @@ -6700,7 +6701,7 @@ static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, } /* Initialize a cursor for a given transaction and database. */ -static void mdbx_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, +static void mdbx_cursor_init(MDB_cursor *mc, MDBX_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) { mc->mc_signature = MDBX_MC_SIGNATURE; mc->mc_next = NULL; @@ -6727,7 +6728,7 @@ static void mdbx_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, } } -int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { +int mdbx_cursor_open(MDBX_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { MDB_cursor *mc; size_t size = sizeof(MDB_cursor); @@ -6765,7 +6766,7 @@ int mdbx_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { return MDB_SUCCESS; } -int mdbx_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { +int mdbx_cursor_renew(MDBX_txn *txn, MDB_cursor *mc) { if (unlikely(!mc || !txn)) return MDBX_EINVAL; @@ -6860,7 +6861,7 @@ void mdbx_cursor_close(MDB_cursor *mc) { } } -MDB_txn *mdbx_cursor_txn(MDB_cursor *mc) { +MDBX_txn *mdbx_cursor_txn(MDB_cursor *mc) { if (unlikely(!mc || mc->mc_signature != MDBX_MC_SIGNATURE)) return NULL; return mc->mc_txn; @@ -7383,7 +7384,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { mc->mc_db->md_root = P_INVALID; mc->mc_db->md_depth = 0; mc->mc_db->md_leaf_pages = 0; - rc = mdbx_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + rc = mdbx_midl_append(&mc->mc_txn->mt_free_pages, mp->mp_pgno); if (unlikely(rc)) return rc; /* Adjust cursors pointing to mp */ @@ -7411,7 +7412,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { int i; mdbx_debug("collapsing root 
page!"); - rc = mdbx_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + rc = mdbx_midl_append(&mc->mc_txn->mt_free_pages, mp->mp_pgno); if (unlikely(rc)) return rc; mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); @@ -7617,7 +7618,7 @@ static int mdbx_cursor_del0(MDB_cursor *mc) { return rc; } -int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { +int mdbx_del(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { if (unlikely(!key || !txn)) return MDBX_EINVAL; @@ -7633,7 +7634,7 @@ int mdbx_del(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { return mdbx_del0(txn, dbi, key, data, 0); } -static int mdbx_del0(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, +static int mdbx_del0(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags) { MDB_cursor mc; MDB_xcursor mx; @@ -8095,7 +8096,7 @@ done: return rc; } -int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, +int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned flags) { MDB_cursor mc; MDB_xcursor mx; @@ -8152,7 +8153,7 @@ int mdbx_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, /* State needed for a double-buffering compacting copy. */ typedef struct mdbx_copy { MDB_env *mc_env; - MDB_txn *mc_txn; + MDBX_txn *mc_txn; mdbx_condmutex_t mc_condmutex; char *mc_wbuf[2]; char *mc_over[2]; @@ -8385,7 +8386,7 @@ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { MDB_meta *mm; MDBX_page *mp; mdbx_copy my; - MDB_txn *txn = NULL; + MDBX_txn *txn = NULL; mdbx_thread_t thr; pgno_t root, new_root; int rc; @@ -8476,7 +8477,7 @@ done: /* Copy environment as-is. 
*/ static int __cold mdbx_env_copy_asis(MDB_env *env, mdbx_filehandle_t fd) { - MDB_txn *txn = NULL; + MDBX_txn *txn = NULL; int rc; /* Do the lock/unlock of the reader mutex before starting the @@ -8717,7 +8718,7 @@ static MDB_cmp_func *mdbx_default_datacmp(unsigned flags) { : mdbx_cmp_memn)); } -static int mdbx_dbi_bind(MDB_txn *txn, const MDB_dbi dbi, unsigned user_flags, +static int mdbx_dbi_bind(MDBX_txn *txn, const MDB_dbi dbi, unsigned user_flags, MDB_cmp_func *keycmp, MDB_cmp_func *datacmp) { /* LY: so, accepting only three cases for the table's flags: * 1) user_flags and both comparators are zero @@ -8762,7 +8763,7 @@ static int mdbx_dbi_bind(MDB_txn *txn, const MDB_dbi dbi, unsigned user_flags, return MDB_SUCCESS; } -int mdbx_dbi_open_ex(MDB_txn *txn, const char *table_name, unsigned user_flags, +int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, MDB_dbi *dbi, MDB_cmp_func *keycmp, MDB_cmp_func *datacmp) { if (unlikely(!txn || !dbi || (user_flags & ~VALID_FLAGS) != 0)) @@ -8883,12 +8884,12 @@ int mdbx_dbi_open_ex(MDB_txn *txn, const char *table_name, unsigned user_flags, return rc; } -int mdbx_dbi_open(MDB_txn *txn, const char *table_name, unsigned table_flags, +int mdbx_dbi_open(MDBX_txn *txn, const char *table_name, unsigned table_flags, MDB_dbi *dbi) { return mdbx_dbi_open_ex(txn, table_name, table_flags, dbi, nullptr, nullptr); } -int __cold mdbx_dbi_stat(MDB_txn *txn, MDB_dbi dbi, MDBX_stat *arg, +int __cold mdbx_dbi_stat(MDBX_txn *txn, MDB_dbi dbi, MDBX_stat *arg, size_t bytes) { if (unlikely(!arg || !txn)) return MDBX_EINVAL; @@ -8933,7 +8934,7 @@ int mdbx_dbi_close(MDB_env *env, MDB_dbi dbi) { return MDB_SUCCESS; } -int mdbx_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned *flags) { +int mdbx_dbi_flags(MDBX_txn *txn, MDB_dbi dbi, unsigned *flags) { if (unlikely(!txn || !flags)) return MDBX_EINVAL; @@ -8956,7 +8957,7 @@ static int mdbx_drop0(MDB_cursor *mc, int subs) { rc = mdbx_page_search(mc, NULL, MDB_PS_FIRST); if 
(likely(rc == MDB_SUCCESS)) { - MDB_txn *txn = mc->mc_txn; + MDBX_txn *txn = mc->mc_txn; MDBX_node *ni; MDB_cursor mx; unsigned i; @@ -8983,7 +8984,7 @@ static int mdbx_drop0(MDB_cursor *mc, int subs) { if (unlikely(rc)) goto done; mdbx_cassert(mc, IS_OVERFLOW(omp)); - rc = mdbx_midl_append_range(&txn->mt_free_pgs, pg, omp->mp_pages); + rc = mdbx_midl_append_range(&txn->mt_free_pages, pg, omp->mp_pages); if (unlikely(rc)) goto done; mc->mc_db->md_overflow_pages -= omp->mp_pages; @@ -8999,14 +9000,14 @@ static int mdbx_drop0(MDB_cursor *mc, int subs) { if (!subs && !mc->mc_db->md_overflow_pages) goto pop; } else { - if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pgs, n)) != 0)) + if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pages, n)) != 0)) goto done; for (i = 0; i < n; i++) { pgno_t pg; ni = NODEPTR(mp, i); pg = NODEPGNO(ni); /* free it */ - mdbx_midl_xappend(txn->mt_free_pgs, pg); + mdbx_midl_xappend(txn->mt_free_pages, pg); } } if (!mc->mc_top) @@ -9028,7 +9029,7 @@ static int mdbx_drop0(MDB_cursor *mc, int subs) { } } /* free it */ - rc = mdbx_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); + rc = mdbx_midl_append(&txn->mt_free_pages, mc->mc_db->md_root); done: if (unlikely(rc)) txn->mt_flags |= MDB_TXN_ERROR; @@ -9039,7 +9040,7 @@ static int mdbx_drop0(MDB_cursor *mc, int subs) { return rc; } -int mdbx_drop(MDB_txn *txn, MDB_dbi dbi, int del) { +int mdbx_drop(MDBX_txn *txn, MDB_dbi dbi, int del) { MDB_cursor *mc, *m2; int rc; @@ -9098,7 +9099,7 @@ leave: return rc; } -int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { +int mdbx_set_compare(MDBX_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { if (unlikely(!txn)) return MDBX_EINVAL; @@ -9112,7 +9113,7 @@ int mdbx_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { return MDB_SUCCESS; } -int mdbx_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { +int mdbx_set_dupsort(MDBX_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { if (unlikely(!txn)) return MDBX_EINVAL; @@ -9648,7 +9649,7 
@@ MDBX_oom_func *__cold mdbx_env_get_oomfunc(MDB_env *env) { /* LY: avoid tsan-trap by me_txn, mm_last_pg and mt_next_pgno */ __attribute__((no_sanitize_thread, noinline)) #endif -int mdbx_txn_straggler(MDB_txn *txn, int *percent) +int mdbx_txn_straggler(MDBX_txn *txn, int *percent) { if (unlikely(!txn)) return -MDBX_EINVAL; @@ -9656,7 +9657,7 @@ int mdbx_txn_straggler(MDB_txn *txn, int *percent) if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(!txn->mt_u.reader)) + if (unlikely(!txn->mt_ro_reader)) return -1; MDB_env *env = txn->mt_env; @@ -9668,12 +9669,12 @@ int mdbx_txn_straggler(MDB_txn *txn, int *percent) last = env->me_txn0->mt_next_pgno; *percent = (last * 100ull + maxpg / 2) / maxpg; } - txnid_t lag = meta->mm_txnid - txn->mt_u.reader->mr_txnid; + txnid_t lag = meta->mm_txnid - txn->mt_ro_reader->mr_txnid; return (lag > INT_MAX) ? INT_MAX : (int)lag; } typedef struct mdbx_walk_ctx { - MDB_txn *mw_txn; + MDBX_txn *mw_txn; void *mw_user; MDBX_pgvisitor_func *mw_visitor; } mdbx_walk_ctx_t; @@ -9806,7 +9807,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, payload_size, header_size, unused_size + align_bytes); } -int __cold mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, +int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, void *user) { if (unlikely(!txn)) return MDB_BAD_TXN; @@ -9830,7 +9831,7 @@ int __cold mdbx_env_pgwalk(MDB_txn *txn, MDBX_pgvisitor_func *visitor, return rc; } -int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary) { +int mdbx_canary_put(MDBX_txn *txn, const mdbx_canary *canary) { if (unlikely(!txn)) return MDBX_EINVAL; @@ -9862,7 +9863,7 @@ int mdbx_canary_put(MDB_txn *txn, const mdbx_canary *canary) { return MDB_SUCCESS; } -int mdbx_canary_get(MDB_txn *txn, mdbx_canary *canary) { +int mdbx_canary_get(MDBX_txn *txn, mdbx_canary *canary) { if (unlikely(txn == NULL || canary == NULL)) return MDBX_EINVAL; if (unlikely(txn->mt_signature 
!= MDBX_MT_SIGNATURE)) @@ -9957,7 +9958,7 @@ static int mdbx_is_samedata(const MDB_val *a, const MDB_val *b) { * - внешняя аллокация курсоров, в том числе на стеке (без malloc). * - получения статуса страницы по адресу (знать о P_DIRTY). */ -int mdbx_replace(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, +int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, MDB_val *old_data, unsigned flags) { if (unlikely(!key || !old_data || !txn || old_data == new_data)) return MDBX_EINVAL; @@ -10089,7 +10090,7 @@ bailout: return rc; } -int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, +int mdbx_get_ex(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, int *values_count) { DKBUF; mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); @@ -10154,7 +10155,7 @@ int mdbx_get_ex(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, * так гарантируется что актуальный заголовок страницы будет физически * расположен в той-же странице памяти, в том числе для многостраничных * P_OVERFLOW страниц с длинными данными. 
*/ -int mdbx_is_dirty(const MDB_txn *txn, const void *ptr) { +int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { if (unlikely(!txn)) return MDBX_EINVAL; @@ -10206,7 +10207,7 @@ int mdbx_is_dirty(const MDB_txn *txn, const void *ptr) { return MDBX_RESULT_TRUE; } -int mdbx_dbi_sequence(MDB_txn *txn, MDB_dbi dbi, uint64_t *result, +int mdbx_dbi_sequence(MDBX_txn *txn, MDB_dbi dbi, uint64_t *result, uint64_t increment) { if (unlikely(!txn)) return MDBX_EINVAL; diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index f6df930a..998d2773 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -76,7 +76,7 @@ uint64_t total_unused_bytes; int exclusive = 2; MDB_env *env; -MDB_txn *txn, *locktxn; +MDBX_txn *txn, *locktxn; MDBX_envinfo envinfo; MDBX_stat envstat; size_t maxkeysize, userdb_count, skipped_subdb; diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index ca4572a2..51dd89ce 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -84,7 +84,7 @@ static void byte(MDB_val *v) { } /* Dump in BDB-compatible format */ -static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) { +static int dumpit(MDBX_txn *txn, MDB_dbi dbi, char *name) { MDB_cursor *mc; MDBX_stat ms; MDB_val key, data; @@ -155,7 +155,7 @@ static void usage(char *prog) { int main(int argc, char *argv[]) { int i, rc; MDB_env *env; - MDB_txn *txn; + MDBX_txn *txn; MDB_dbi dbi; char *prog = argv[0]; char *envname; diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index f942613b..6a06af84 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -294,7 +294,7 @@ static void usage(void) { int main(int argc, char *argv[]) { int i, rc; MDB_env *env; - MDB_txn *txn; + MDBX_txn *txn; MDB_cursor *mc; MDB_dbi dbi; char *envname; diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index 12375bf5..b9db4172 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -39,7 +39,7 @@ static void usage(char *prog) { int main(int argc, char *argv[]) { int i, 
rc; MDB_env *env; - MDB_txn *txn; + MDBX_txn *txn; MDB_dbi dbi; MDBX_stat mst; MDBX_envinfo mei; diff --git a/test/test.cc b/test/test.cc index ad82fd39..5fde326d 100644 --- a/test/test.cc +++ b/test/test.cc @@ -154,7 +154,7 @@ void testcase::txn_begin(bool readonly) { log_trace(">> txn_begin(%s)", readonly ? "read-only" : "read-write"); assert(!txn_guard); - MDB_txn *txn = nullptr; + MDBX_txn *txn = nullptr; int rc = mdbx_txn_begin(db_guard.get(), nullptr, readonly ? MDB_RDONLY : 0, &txn); if (unlikely(rc != MDB_SUCCESS)) @@ -168,7 +168,7 @@ void testcase::txn_end(bool abort) { log_trace(">> txn_end(%s)", abort ? "abort" : "commit"); assert(txn_guard); - MDB_txn *txn = txn_guard.release(); + MDBX_txn *txn = txn_guard.release(); if (abort) { int rc = mdbx_txn_abort(txn); if (unlikely(rc != MDB_SUCCESS)) diff --git a/test/test.h b/test/test.h index 07e4a094..43674d3e 100644 --- a/test/test.h +++ b/test/test.h @@ -58,8 +58,8 @@ struct db_deleter : public std::unary_function { void operator()(MDB_env *env) const { mdbx_env_close(env); } }; -struct txn_deleter : public std::unary_function { - void operator()(MDB_txn *txn) const { +struct txn_deleter : public std::unary_function { + void operator()(MDBX_txn *txn) const { int rc = mdbx_txn_abort(txn); if (rc) log_trouble(__func__, "mdbx_txn_abort()", rc); @@ -71,7 +71,7 @@ struct cursor_deleter : public std::unary_function { }; typedef std::unique_ptr scoped_db_guard; -typedef std::unique_ptr scoped_txn_guard; +typedef std::unique_ptr scoped_txn_guard; typedef std::unique_ptr scoped_cursor_guard; //----------------------------------------------------------------------------- diff --git a/tutorial/sample-mdb.txt b/tutorial/sample-mdb.txt index 194afdcc..2dcf87c9 100644 --- a/tutorial/sample-mdb.txt +++ b/tutorial/sample-mdb.txt @@ -27,7 +27,7 @@ int main(int argc,char * argv[]) MDB_env *env; MDB_dbi dbi; MDB_val key, data; - MDB_txn *txn; + MDBX_txn *txn; MDB_cursor *cursor; char sval[32]; From 
8c18622592820ee93edee6520f3daa1aaea5b8c2 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 21:05:54 +0300 Subject: [PATCH 149/303] mdbx: rework MDBX_val. --- mdbx.h | 54 ++- src/bits.h | 2 +- src/mdbx.c | 717 ++++++++++++++++++++-------------------- src/tools/mdbx_chk.c | 82 ++--- src/tools/mdbx_dump.c | 24 +- src/tools/mdbx_load.c | 105 +++--- src/tools/mdbx_stat.c | 18 +- test/keygen.cc | 32 +- test/keygen.h | 2 +- tutorial/sample-mdb.txt | 14 +- 10 files changed, 525 insertions(+), 525 deletions(-) diff --git a/mdbx.h b/mdbx.h index ab38a5f3..9727f23a 100644 --- a/mdbx.h +++ b/mdbx.h @@ -137,16 +137,14 @@ struct iovec { #define HAVE_STRUCT_IOVEC #endif /* HAVE_STRUCT_IOVEC */ -typedef struct iovec MDB_val; -#define mv_size iov_len -#define mv_data iov_base +typedef struct iovec MDBX_val; /* The maximum size of a data item. * MDBX only store a 32 bit value for node sizes. */ #define MDBX_MAXDATASIZE INT32_MAX /* A callback function used to compare two keys in a database */ -typedef int(MDB_cmp_func)(const MDB_val *a, const MDB_val *b); +typedef int(MDB_cmp_func)(const MDBX_val *a, const MDBX_val *b); /* Environment Flags */ /* no environment directory */ @@ -1079,8 +1077,8 @@ LIBMDBX_API int mdbx_drop(MDBX_txn *txn, MDB_dbi dbi, int del); * possible errors are: * - MDB_NOTFOUND - the key was not in the database. * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, - MDB_val *data); +LIBMDBX_API int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, + MDBX_val *data); /* Store items into a database. * @@ -1139,8 +1137,8 @@ LIBMDBX_API int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, * - MDB_TXN_FULL - the transaction has too many dirty pages. * - MDBX_EACCES - an attempt was made to write in a read-only transaction. * - MDBX_EINVAL - an invalid parameter was specified. 
*/ -LIBMDBX_API int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, - MDB_val *data, unsigned flags); +LIBMDBX_API int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, + MDBX_val *data, unsigned flags); /* Delete items from a database. * @@ -1162,8 +1160,8 @@ LIBMDBX_API int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, * possible errors are: * - MDBX_EACCES - an attempt was made to write in a read-only transaction. * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_del(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, - MDB_val *data); +LIBMDBX_API int mdbx_del(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, + MDBX_val *data); /* Create a cursor handle. * @@ -1238,8 +1236,8 @@ LIBMDBX_API MDB_dbi mdbx_cursor_dbi(MDB_cursor *cursor); * possible errors are: * - MDB_NOTFOUND - no matching key found. * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, - MDB_cursor_op op); +LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDBX_val *key, + MDBX_val *data, MDB_cursor_op op); /* Store by cursor. * @@ -1292,13 +1290,13 @@ LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * - MDB_MULTIPLE * Store multiple contiguous data elements in a single request. This flag * may only be specified if the database was opened with MDB_DUPFIXED. - * The data argument must be an array of two MDB_vals. The mv_size of the - * first MDB_val must be the size of a single data element. The mv_data - * of the first MDB_val must point to the beginning of the array of - * contiguous data elements. The mv_size of the second MDB_val must be + * The data argument must be an array of two MDBX_vals. The iov_len of the + * first MDBX_val must be the size of a single data element. The iov_base + * of the first MDBX_val must point to the beginning of the array of + * contiguous data elements. 
The iov_len of the second MDBX_val must be * the count of the number of data elements to store. On return this * field will be set to the count of the number of elements actually - * written. The mv_data of the second MDB_val is unused. + * written. The iov_base of the second MDBX_val is unused. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: @@ -1307,8 +1305,8 @@ LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * - MDB_TXN_FULL - the transaction has too many dirty pages. * - MDBX_EACCES - an attempt was made to write in a read-only transaction. * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, - unsigned flags); +LIBMDBX_API int mdbx_cursor_put(MDB_cursor *cursor, MDBX_val *key, + MDBX_val *data, unsigned flags); /* Delete current key/data pair * @@ -1353,8 +1351,8 @@ LIBMDBX_API int mdbx_cursor_count(MDB_cursor *cursor, uint64_t *countp); * [in] b The second item to compare * * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ -LIBMDBX_API int mdbx_cmp(MDBX_txn *txn, MDB_dbi dbi, const MDB_val *a, - const MDB_val *b); +LIBMDBX_API int mdbx_cmp(MDBX_txn *txn, MDB_dbi dbi, const MDBX_val *a, + const MDBX_val *b); /* Compare two data items according to a particular database. * @@ -1367,8 +1365,8 @@ LIBMDBX_API int mdbx_cmp(MDBX_txn *txn, MDB_dbi dbi, const MDB_val *a, * [in] b The second item to compare * * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ -LIBMDBX_API int mdbx_dcmp(MDBX_txn *txn, MDB_dbi dbi, const MDB_val *a, - const MDB_val *b); +LIBMDBX_API int mdbx_dcmp(MDBX_txn *txn, MDB_dbi dbi, const MDBX_val *a, + const MDBX_val *b); /* A callback function used to print a message from the library. * @@ -1395,7 +1393,7 @@ LIBMDBX_API int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); * Returns 0 on success, non-zero on failure. 
*/ LIBMDBX_API int mdbx_reader_check(MDB_env *env, int *dead); -LIBMDBX_API char *mdbx_dkey(const MDB_val *key, char *const buf, +LIBMDBX_API char *mdbx_dkey(const MDBX_val *key, char *const buf, const size_t bufsize); LIBMDBX_API int mdbx_env_close_ex(MDB_env *env, int dont_sync); @@ -1509,15 +1507,15 @@ LIBMDBX_API int mdbx_cursor_on_first(MDB_cursor *mc); /* Returns: MDBX_RESULT_TRUE, MDBX_RESULT_FALSE or Error code. */ LIBMDBX_API int mdbx_cursor_on_last(MDB_cursor *mc); -LIBMDBX_API int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, - MDB_val *new_data, MDB_val *old_data, +LIBMDBX_API int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, + MDBX_val *new_data, MDBX_val *old_data, unsigned flags); /* Same as mdbx_get(), but: * 1) if values_count is not NULL, then returns the count * of multi-values/duplicates for a given key. * 2) updates the key for pointing to the actual key's data inside DB. */ -LIBMDBX_API int mdbx_get_ex(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, - MDB_val *data, int *values_count); +LIBMDBX_API int mdbx_get_ex(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, + MDBX_val *data, int *values_count); LIBMDBX_API int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr); diff --git a/src/bits.h b/src/bits.h index 1e97869f..14c1336c 100644 --- a/src/bits.h +++ b/src/bits.h @@ -349,7 +349,7 @@ typedef struct MDBX_lockinfo { * The information here is mostly static/read-only. There is * only a single copy of this record in the environment. */ typedef struct MDB_dbx { - MDB_val md_name; /* name of the database */ + MDBX_val md_name; /* name of the database */ MDB_cmp_func *md_cmp; /* function for comparing keys */ MDB_cmp_func *md_dcmp; /* function for comparing data items */ } MDB_dbx; diff --git a/src/mdbx.c b/src/mdbx.c index e7819616..7081dd20 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -475,11 +475,11 @@ typedef struct MDBX_node { /* Size of a node in a branch page with a given key. 
* This is just the node header plus the key, there is no data. */ -#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size)) +#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->iov_len)) /* Size of a node in a leaf page with a given key and data. * This is node header plus key plus data size. */ -#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size) +#define LEAFSIZE(k, d) (NODESIZE + (k)->iov_len + (d)->iov_len) /* Address of node i in page p */ static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { @@ -556,19 +556,19 @@ static __inline void SETDSZ(MDBX_node *node, unsigned size) { /* Set the node's key into keyptr, if requested. */ #define MDB_GET_KEY(node, keyptr) \ - { \ + do { \ if ((keyptr) != NULL) { \ - (keyptr)->mv_size = NODEKSZ(node); \ - (keyptr)->mv_data = NODEKEY(node); \ + (keyptr)->iov_len = NODEKSZ(node); \ + (keyptr)->iov_base = NODEKEY(node); \ } \ - } + } while (0) /* Set the node's key into key. */ #define MDB_GET_KEY2(node, key) \ - { \ - key.mv_size = NODEKSZ(node); \ - key.mv_data = NODEKEY(node); \ - } + do { \ + key.iov_len = NODEKSZ(node); \ + key.iov_base = NODEKEY(node); \ + } while (0) #define MDB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) @@ -621,16 +621,16 @@ enum { static int mdbx_txn_end(MDBX_txn *txn, unsigned mode); static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDBX_page **mp, int *lvl); -static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); +static int mdbx_page_search_root(MDB_cursor *mc, MDBX_val *key, int modify); #define MDB_PS_MODIFY 1 #define MDB_PS_ROOTONLY 2 #define MDB_PS_FIRST 4 #define MDB_PS_LAST 8 -static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags); +static int mdbx_page_search(MDB_cursor *mc, MDBX_val *key, int flags); static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); #define MDB_SPLIT_REPLACE MDB_APPENDDUP /* newkey is not new */ -static int 
mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, +static int mdbx_page_split(MDB_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, pgno_t newpgno, unsigned nflags); static int mdbx_read_header(MDB_env *env, MDB_meta *meta); @@ -638,34 +638,34 @@ static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, MDB_meta *pending); static void mdbx_env_close0(MDB_env *env); -static MDBX_node *mdbx_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); -static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, - MDB_val *data, pgno_t pgno, unsigned flags); +static MDBX_node *mdbx_node_search(MDB_cursor *mc, MDBX_val *key, int *exactp); +static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDBX_val *key, + MDBX_val *data, pgno_t pgno, unsigned flags); static void mdbx_node_del(MDB_cursor *mc, int ksize); static void mdbx_node_shrink(MDBX_page *mp, indx_t indx); static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); -static int mdbx_node_read(MDB_cursor *mc, MDBX_node *leaf, MDB_val *data); -static size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); -static size_t mdbx_branch_size(MDB_env *env, MDB_val *key); +static int mdbx_node_read(MDB_cursor *mc, MDBX_node *leaf, MDBX_val *data); +static size_t mdbx_leaf_size(MDB_env *env, MDBX_val *key, MDBX_val *data); +static size_t mdbx_branch_size(MDB_env *env, MDBX_val *key); static int mdbx_rebalance(MDB_cursor *mc); -static int mdbx_update_key(MDB_cursor *mc, MDB_val *key); +static int mdbx_update_key(MDB_cursor *mc, MDBX_val *key); static void mdbx_cursor_pop(MDB_cursor *mc); static int mdbx_cursor_push(MDB_cursor *mc, MDBX_page *mp); static int mdbx_cursor_del0(MDB_cursor *mc); -static int mdbx_del0(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, +static int mdbx_del0(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, unsigned flags); static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right); -static int mdbx_cursor_next(MDB_cursor *mc, 
MDB_val *key, MDB_val *data, +static int mdbx_cursor_next(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, MDB_cursor_op op); -static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, +static int mdbx_cursor_prev(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, MDB_cursor_op op); -static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, +static int mdbx_cursor_set(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, MDB_cursor_op op, int *exactp); -static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data); -static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data); +static int mdbx_cursor_first(MDB_cursor *mc, MDBX_val *key, MDBX_val *data); +static int mdbx_cursor_last(MDB_cursor *mc, MDBX_val *key, MDBX_val *data); static void mdbx_cursor_init(MDB_cursor *mc, MDBX_txn *txn, MDB_dbi dbi, MDB_xcursor *mx); @@ -810,7 +810,7 @@ void __cold mdbx_debug_log(int type, const char *function, int line, } /* Dump a key in ascii or hexadecimal. */ -char *mdbx_dkey(const MDB_val *key, char *const buf, const size_t bufsize) { +char *mdbx_dkey(const MDBX_val *key, char *const buf, const size_t bufsize) { if (!key) return ""; if (!buf || bufsize < 4) @@ -818,24 +818,24 @@ char *mdbx_dkey(const MDB_val *key, char *const buf, const size_t bufsize) { if (!key->iov_len) return ""; - const uint8_t *const data = key->mv_data; + const uint8_t *const data = key->iov_base; bool is_ascii = true; unsigned i; - for (i = 0; is_ascii && i < key->mv_size; i++) + for (i = 0; is_ascii && i < key->iov_len; i++) if (data[i] < ' ' || data[i] > 127) is_ascii = false; if (is_ascii) { int len = snprintf(buf, bufsize, "%.*s", - (key->mv_size > INT_MAX) ? INT_MAX : (int)key->mv_size, data); + (key->iov_len > INT_MAX) ? 
INT_MAX : (int)key->iov_len, data); assert(len > 0 && (unsigned)len < bufsize); (void)len; } else { char *const detent = buf + bufsize - 2; char *ptr = buf; *ptr++ = '<'; - for (i = 0; i < key->mv_size; i++) { + for (i = 0; i < key->iov_len; i++) { const ptrdiff_t left = detent - ptr; assert(left > 0); int len = snprintf(ptr, left, "%02x", data[i]); @@ -865,7 +865,7 @@ static void mdbx_page_list(MDBX_page *mp) { const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : ""; MDBX_node *node; unsigned i, nkeys, nsize, total = 0; - MDB_val key; + MDBX_val key; DKBUF; switch (mp->mp_flags & @@ -903,16 +903,16 @@ static void mdbx_page_list(MDBX_page *mp) { for (i = 0; i < nkeys; i++) { if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ - key.mv_size = nsize = mp->mp_leaf2_ksize; - key.mv_data = LEAF2KEY(mp, i, nsize); + key.iov_len = nsize = mp->mp_leaf2_ksize; + key.iov_base = LEAF2KEY(mp, i, nsize); total += nsize; mdbx_print("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); continue; } node = NODEPTR(mp, i); - key.mv_size = node->mn_ksize; - key.mv_data = node->mn_data; - nsize = NODESIZE + key.mv_size; + key.iov_len = node->mn_ksize; + key.iov_base = node->mn_data; + nsize = NODESIZE + key.iov_len; if (IS_BRANCH(mp)) { mdbx_print("key %u: page %" PRIu64 ", %s\n", i, NODEPGNO(node), DKEY(&key)); @@ -964,7 +964,7 @@ static void mdbx_cursor_chk(MDB_cursor *mc) { * All named DBs must be open for a correct count. 
*/ static void mdbx_audit(MDBX_txn *txn) { MDB_cursor mc; - MDB_val key, data; + MDBX_val key, data; pgno_t freecount, count; MDB_dbi i; int rc; @@ -972,7 +972,7 @@ static void mdbx_audit(MDBX_txn *txn) { freecount = 0; mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdbx_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) - freecount += *(pgno_t *)data.mv_data; + freecount += *(pgno_t *)data.iov_base; mdbx_tassert(txn, rc == MDB_NOTFOUND); count = 0; @@ -1012,12 +1012,13 @@ static void mdbx_audit(MDBX_txn *txn) { } } -int mdbx_cmp(MDBX_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { +int mdbx_cmp(MDBX_txn *txn, MDB_dbi dbi, const MDBX_val *a, const MDBX_val *b) { mdbx_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_cmp(a, b); } -int mdbx_dcmp(MDBX_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { +int mdbx_dcmp(MDBX_txn *txn, MDB_dbi dbi, const MDBX_val *a, + const MDBX_val *b) { mdbx_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_dcmp(a, b); } @@ -1268,7 +1269,7 @@ static int mdbx_page_flush(MDBX_txn *txn, int keep); * [in] data For a put operation, the data being stored. * * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) { +static int mdbx_page_spill(MDB_cursor *m0, MDBX_val *key, MDBX_val *data) { MDBX_txn *txn = m0->mc_txn; MDBX_page *dp; MDB_ID2L dl = txn->mt_rw_dirtylist; @@ -1499,7 +1500,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { for (;;) { /* oom-kick retry loop */ for (op = MDB_FIRST;; op = (flags & MDBX_LIFORECLAIM) ? 
MDB_PREV : MDB_NEXT) { - MDB_val key, data; + MDBX_val key, data; MDBX_node *leaf; pgno_t *idl; @@ -1538,8 +1539,8 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { op = MDB_SET_RANGE; } - key.mv_data = &last; - key.mv_size = sizeof(last); + key.iov_base = &last; + key.iov_len = sizeof(last); } if (!(flags & MDBX_LIFORECLAIM)) { @@ -1562,8 +1563,8 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { if (oldest < mdbx_find_oldest(env, NULL)) { oldest = env->me_pgoldest; last = oldest - 1; - key.mv_data = &last; - key.mv_size = sizeof(last); + key.iov_base = &last; + key.iov_len = sizeof(last); op = MDB_SET_RANGE; rc = mdbx_cursor_get(&m2, &key, NULL, op); } @@ -1574,7 +1575,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { goto fail; } - last = *(txnid_t *)key.mv_data; + last = *(txnid_t *)key.iov_base; if (oldest <= last) { if (!found_oldest) { oldest = mdbx_find_oldest(env, NULL); @@ -1610,9 +1611,9 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { } } - idl = (pgno_t *)data.mv_data; + idl = (pgno_t *)data.iov_base; mdbx_tassert(txn, idl[0] == 0 || - data.mv_size == (idl[0] + 1) * sizeof(pgno_t)); + data.iov_len == (idl[0] + 1) * sizeof(pgno_t)); i = idl[0]; if (!mop) { if (unlikely(!(env->me_pghead = mop = mdbx_midl_alloc(i)))) { @@ -2442,10 +2443,10 @@ static void mdbx_dbis_update(MDBX_txn *txn, int keep) { if (keep) { env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; } else { - char *ptr = env->me_dbxs[i].md_name.mv_data; + char *ptr = env->me_dbxs[i].md_name.iov_base; if (ptr) { - env->me_dbxs[i].md_name.mv_data = NULL; - env->me_dbxs[i].md_name.mv_size = 0; + env->me_dbxs[i].md_name.iov_base = NULL; + env->me_dbxs[i].md_name.iov_len = 0; env->me_dbflags[i] = 0; env->me_dbiseqs[i]++; free(ptr); @@ -2630,7 +2631,7 @@ static int mdbx_freelist_save(MDBX_txn *txn) { again: for (;;) { /* Come back here after each Put() in 
case freelist changed */ - MDB_val key, data; + MDBX_val key, data; pgno_t *pgs; ssize_t j; @@ -2644,7 +2645,7 @@ again: rc = mdbx_prep_backlog(txn, &mc); if (unlikely(rc)) goto bailout; - pglast = head_id = *(txnid_t *)key.mv_data; + pglast = head_id = *(txnid_t *)key.iov_base; total_room = head_room = 0; more = 1; mdbx_tassert(txn, pglast <= env->me_pglast); @@ -2658,8 +2659,8 @@ again: /* LY: cleanup reclaimed records. */ while (cleanup_idx < txn->mt_lifo_reclaimed[0]) { pglast = txn->mt_lifo_reclaimed[++cleanup_idx]; - key.mv_data = &pglast; - key.mv_size = sizeof(pglast); + key.iov_base = &pglast; + key.iov_len = sizeof(pglast); rc = mdbx_cursor_get(&mc, &key, NULL, MDB_SET); if (likely(rc != MDB_NOTFOUND)) { if (unlikely(rc)) @@ -2699,11 +2700,11 @@ again: } free_pgs = txn->mt_free_pages; /* Write to last page of freeDB */ - key.mv_size = sizeof(txn->mt_txnid); - key.mv_data = &txn->mt_txnid; + key.iov_len = sizeof(txn->mt_txnid); + key.iov_base = &txn->mt_txnid; do { freecnt = free_pgs[0]; - data.mv_size = MDB_IDL_SIZEOF(free_pgs); + data.iov_len = MDB_IDL_SIZEOF(free_pgs); rc = mdbx_cursor_put(&mc, &key, &data, MDB_RESERVE); if (unlikely(rc)) goto bailout; @@ -2712,7 +2713,7 @@ again: } while (freecnt < free_pgs[0]); mdbx_midl_sort(free_pgs); - memcpy(data.mv_data, free_pgs, data.mv_size); + memcpy(data.iov_base, free_pgs, data.iov_len); if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { unsigned i = free_pgs[0]; @@ -2794,14 +2795,14 @@ again: head_room = 0; continue; } - key.mv_size = sizeof(head_id); - key.mv_data = &head_id; - data.mv_size = (head_room + 1) * sizeof(pgno_t); + key.iov_len = sizeof(head_id); + key.iov_base = &head_id; + data.iov_len = (head_room + 1) * sizeof(pgno_t); rc = mdbx_cursor_put(&mc, &key, &data, MDB_RESERVE); if (unlikely(rc)) goto bailout; /* IDL is initially empty, zero out at least the length */ - pgs = (pgno_t *)data.mv_data; + pgs = (pgno_t *)data.iov_base; j = head_room > clean_limit ? 
head_room : 0; do { pgs[j] = 0; @@ -2837,9 +2838,9 @@ again: /* Fill in the reserved me_pghead records */ rc = MDB_SUCCESS; if (mop_len) { - MDB_val key, data; - key.mv_size = data.mv_size = 0; /* avoid MSVC warning */ - key.mv_data = data.mv_data = NULL; + MDBX_val key, data; + key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ + key.iov_base = data.iov_base = NULL; mop += mop_len; if (!lifo) { @@ -2854,14 +2855,14 @@ again: pgno_t save; if (!lifo) { - id = *(txnid_t *)key.mv_data; + id = *(txnid_t *)key.iov_base; mdbx_tassert(txn, id <= env->me_pglast); } else { mdbx_tassert(txn, refill_idx > 0 && refill_idx <= txn->mt_lifo_reclaimed[0]); id = txn->mt_lifo_reclaimed[refill_idx--]; - key.mv_data = &id; - key.mv_size = sizeof(id); + key.iov_base = &id; + key.iov_len = sizeof(id); rc = mdbx_cursor_get(&mc, &key, &data, MDB_SET); if (unlikely(rc)) goto bailout; @@ -2870,14 +2871,14 @@ again: txn, cleanup_idx == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); - len = (ssize_t)(data.mv_size / sizeof(pgno_t)) - 1; + len = (ssize_t)(data.iov_len / sizeof(pgno_t)) - 1; mdbx_tassert(txn, len >= 0); if (len > mop_len) len = mop_len; - data.mv_size = (len + 1) * sizeof(pgno_t); - key.mv_data = &id; - key.mv_size = sizeof(id); - data.mv_data = mop -= len; + data.iov_len = (len + 1) * sizeof(pgno_t); + key.iov_base = &id; + key.iov_len = sizeof(id); + data.iov_base = mop -= len; save = mop[0]; mop[0] = len; @@ -3220,8 +3221,8 @@ int mdbx_txn_commit(MDBX_txn *txn) { if (txn->mt_numdbs > CORE_DBS) { MDB_cursor mc; MDB_dbi i; - MDB_val data; - data.mv_size = sizeof(MDB_db); + MDBX_val data; + data.iov_len = sizeof(MDB_db); mdbx_cursor_init(&mc, txn, MAIN_DBI, NULL); for (i = CORE_DBS; i < txn->mt_numdbs; i++) { @@ -3230,7 +3231,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { rc = MDB_BAD_DBI; goto fail; } - data.mv_data = &txn->mt_dbs[i]; + data.iov_base = &txn->mt_dbs[i]; rc = mdbx_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, F_SUBDATA); if (unlikely(rc != 
MDB_SUCCESS)) goto fail; @@ -4154,7 +4155,7 @@ static void __cold mdbx_env_close0(MDB_env *env) { /* Doing this here since me_dbxs may not exist during mdbx_env_close */ if (env->me_dbxs) { for (unsigned i = env->me_maxdbs; --i >= CORE_DBS;) - free(env->me_dbxs[i].md_name.mv_data); + free(env->me_dbxs[i].md_name.iov_base); free(env->me_dbxs); } @@ -4242,15 +4243,15 @@ void __cold mdbx_env_close(MDB_env *env) { mdbx_env_close_ex(env, 0); } #endif /* Compare two items pointing at aligned unsigned int's. */ -static int __hot mdbx_cmp_int_ai(const MDB_val *a, const MDB_val *b) { - mdbx_assert(NULL, a->mv_size == b->mv_size); - mdbx_assert(NULL, 0 == (uintptr_t)a->mv_data % sizeof(int) && - 0 == (uintptr_t)b->mv_data % sizeof(int)); - switch (a->mv_size) { +static int __hot mdbx_cmp_int_ai(const MDBX_val *a, const MDBX_val *b) { + mdbx_assert(NULL, a->iov_len == b->iov_len); + mdbx_assert(NULL, 0 == (uintptr_t)a->iov_base % sizeof(int) && + 0 == (uintptr_t)b->iov_base % sizeof(int)); + switch (a->iov_len) { case 4: - return mdbx_cmp2int(*(uint32_t *)a->mv_data, *(uint32_t *)b->mv_data); + return mdbx_cmp2int(*(uint32_t *)a->iov_base, *(uint32_t *)b->iov_base); case 8: - return mdbx_cmp2int(*(uint64_t *)a->mv_data, *(uint64_t *)b->mv_data); + return mdbx_cmp2int(*(uint64_t *)a->iov_base, *(uint64_t *)b->iov_base); default: mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", mdbx_func_, __LINE__); @@ -4259,37 +4260,37 @@ static int __hot mdbx_cmp_int_ai(const MDB_val *a, const MDB_val *b) { } /* Compare two items pointing at 2-byte aligned unsigned int's. 
*/ -static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { - mdbx_assert(NULL, a->mv_size == b->mv_size); - mdbx_assert(NULL, 0 == (uintptr_t)a->mv_data % sizeof(uint16_t) && - 0 == (uintptr_t)b->mv_data % sizeof(uint16_t)); +static int __hot mdbx_cmp_int_a2(const MDBX_val *a, const MDBX_val *b) { + mdbx_assert(NULL, a->iov_len == b->iov_len); + mdbx_assert(NULL, 0 == (uintptr_t)a->iov_base % sizeof(uint16_t) && + 0 == (uintptr_t)b->iov_base % sizeof(uint16_t)); #if UNALIGNED_OK - switch (a->mv_size) { + switch (a->iov_len) { case 4: - return mdbx_cmp2int(*(uint32_t *)a->mv_data, *(uint32_t *)b->mv_data); + return mdbx_cmp2int(*(uint32_t *)a->iov_base, *(uint32_t *)b->iov_base); case 8: - return mdbx_cmp2int(*(uint64_t *)a->mv_data, *(uint64_t *)b->mv_data); + return mdbx_cmp2int(*(uint64_t *)a->iov_base, *(uint64_t *)b->iov_base); default: mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", mdbx_func_, __LINE__); return 0; } #else - mdbx_assert(NULL, 0 == a->mv_size % sizeof(uint16_t)); + mdbx_assert(NULL, 0 == a->iov_len % sizeof(uint16_t)); { int diff; const uint16_t *pa, *pb, *end; #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - end = (const uint16_t *)a->mv_data; - pa = (const uint16_t *)((char *)a->mv_data + a->mv_size); - pb = (const uint16_t *)((char *)b->mv_data + a->mv_size); + end = (const uint16_t *)a->iov_base; + pa = (const uint16_t *)((char *)a->iov_base + a->iov_len); + pb = (const uint16_t *)((char *)b->iov_base + a->iov_len); do { diff = *--pa - *--pb; #else /* __BYTE_ORDER__ */ - end = (const uint16_t *)((char *)a->mv_data + a->mv_size); - pa = (const uint16_t *)a->mv_data; - pb = (const uint16_t *)b->mv_data; + end = (const uint16_t *)((char *)a->iov_base + a->iov_len); + pa = (const uint16_t *)a->iov_base; + pb = (const uint16_t *)b->iov_base; do { diff = *pa++ - *pb++; #endif /* __BYTE_ORDER__ */ @@ -4304,69 +4305,69 @@ static int __hot mdbx_cmp_int_a2(const MDB_val *a, const MDB_val *b) { /* Compare two items 
pointing at unsigneds of unknown alignment. * * This is also set as MDB_INTEGERDUP|MDB_DUPFIXED's MDB_dbx.md_dcmp. */ -static int __hot mdbx_cmp_int_ua(const MDB_val *a, const MDB_val *b) { - mdbx_assert(NULL, a->mv_size == b->mv_size); +static int __hot mdbx_cmp_int_ua(const MDBX_val *a, const MDBX_val *b) { + mdbx_assert(NULL, a->iov_len == b->iov_len); #if UNALIGNED_OK - switch (a->mv_size) { + switch (a->iov_len) { case 4: - return mdbx_cmp2int(*(uint32_t *)a->mv_data, *(uint32_t *)b->mv_data); + return mdbx_cmp2int(*(uint32_t *)a->iov_base, *(uint32_t *)b->iov_base); case 8: - return mdbx_cmp2int(*(uint64_t *)a->mv_data, *(uint64_t *)b->mv_data); + return mdbx_cmp2int(*(uint64_t *)a->iov_base, *(uint64_t *)b->iov_base); default: mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", mdbx_func_, __LINE__); return 0; } #else - mdbx_assert(NULL, a->mv_size == sizeof(int) || a->mv_size == sizeof(size_t)); + mdbx_assert(NULL, a->iov_len == sizeof(int) || a->iov_len == sizeof(size_t)); #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ { int diff; const uint8_t *pa, *pb; - pa = (const uint8_t *)a->mv_data + a->mv_size; - pb = (const uint8_t *)b->mv_data + a->mv_size; + pa = (const uint8_t *)a->iov_base + a->iov_len; + pb = (const uint8_t *)b->iov_base + a->iov_len; do { diff = *--pa - *--pb; if (likely(diff != 0)) break; - } while (pa != a->mv_data); + } while (pa != a->iov_base); return diff; } -#else /* __BYTE_ORDER__ */ - return memcmp(a->mv_data, b->mv_data, a->mv_size); +#else /* __BYTE_ORDER__ */ + return memcmp(a->iov_base, b->iov_base, a->iov_len); #endif /* __BYTE_ORDER__ */ #endif /* UNALIGNED_OK */ } /* Compare two items lexically */ -static int __hot mdbx_cmp_memn(const MDB_val *a, const MDB_val *b) { +static int __hot mdbx_cmp_memn(const MDBX_val *a, const MDBX_val *b) { /* LY: assumes that length of keys are NOT equal for most cases, * if no then branch-prediction should mitigate the problem */ #if 0 /* LY: without branch instructions on x86, * 
but isn't best for equal length of keys */ - int diff_len = mdbx_cmp2int(a->mv_size, b->mv_size); + int diff_len = mdbx_cmp2int(a->iov_len, b->iov_len); #else /* LY: best when length of keys are equal, * but got a branch-penalty otherwise */ - if (likely(a->mv_size == b->mv_size)) - return memcmp(a->mv_data, b->mv_data, a->mv_size); - int diff_len = (a->mv_size < b->mv_size) ? -1 : 1; + if (likely(a->iov_len == b->iov_len)) + return memcmp(a->iov_base, b->iov_base, a->iov_len); + int diff_len = (a->iov_len < b->iov_len) ? -1 : 1; #endif - size_t shortest = (a->mv_size < b->mv_size) ? a->mv_size : b->mv_size; - int diff_data = memcmp(a->mv_data, b->mv_data, shortest); + size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; + int diff_data = memcmp(a->iov_base, b->iov_base, shortest); return likely(diff_data) ? diff_data : diff_len; } /* Compare two items in reverse byte order */ -static int __hot mdbx_cmp_memnr(const MDB_val *a, const MDB_val *b) { +static int __hot mdbx_cmp_memnr(const MDBX_val *a, const MDBX_val *b) { const uint8_t *pa, *pb, *end; - pa = (const uint8_t *)a->mv_data + a->mv_size; - pb = (const uint8_t *)b->mv_data + b->mv_size; - size_t minlen = (a->mv_size < b->mv_size) ? a->mv_size : b->mv_size; + pa = (const uint8_t *)a->iov_base + a->iov_len; + pb = (const uint8_t *)b->iov_base + b->iov_len; + size_t minlen = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; end = pa - minlen; while (pa != end) { @@ -4374,7 +4375,7 @@ static int __hot mdbx_cmp_memnr(const MDB_val *a, const MDB_val *b) { if (likely(diff)) return diff; } - return mdbx_cmp2int(a->mv_size, b->mv_size); + return mdbx_cmp2int(a->iov_len, b->iov_len); } /* Search for key within a page, using binary search. @@ -4383,14 +4384,14 @@ static int __hot mdbx_cmp_memnr(const MDB_val *a, const MDB_val *b) { * in *exactp (1 or 0). * Updates the cursor index with the index of the found entry. * If no entry larger or equal to the key is found, returns NULL. 
*/ -static MDBX_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, +static MDBX_node *__hot mdbx_node_search(MDB_cursor *mc, MDBX_val *key, int *exactp) { unsigned i = 0, nkeys; int low, high; int rc = 0; MDBX_page *mp = mc->mc_pg[mc->mc_top]; MDBX_node *node = NULL; - MDB_val nodekey; + MDBX_val nodekey; MDB_cmp_func *cmp; DKBUF; @@ -4411,11 +4412,11 @@ static MDBX_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, cmp = mdbx_cmp_int_ai; if (IS_LEAF2(mp)) { - nodekey.mv_size = mc->mc_db->md_xsize; + nodekey.iov_len = mc->mc_db->md_xsize; node = NODEPTR(mp, 0); /* fake */ while (low <= high) { i = (low + high) >> 1; - nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size); + nodekey.iov_base = LEAF2KEY(mp, i, nodekey.iov_len); rc = cmp(key, &nodekey); mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY(&nodekey), rc); if (rc == 0) @@ -4430,8 +4431,8 @@ static MDBX_node *__hot mdbx_node_search(MDB_cursor *mc, MDB_val *key, i = (low + high) >> 1; node = NODEPTR(mp, i); - nodekey.mv_size = NODEKSZ(node); - nodekey.mv_data = NODEKEY(node); + nodekey.iov_len = NODEKSZ(node); + nodekey.iov_base = NODEKEY(node); rc = cmp(key, &nodekey); if (IS_LEAF(mp)) @@ -4572,7 +4573,7 @@ done: /* Finish mdbx_page_search() / mdbx_page_search_lowest(). * The cursor is at the root page, set up the rest of it. */ -static int mdbx_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { +static int mdbx_page_search_root(MDB_cursor *mc, MDBX_val *key, int flags) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; int rc; DKBUF; @@ -4682,7 +4683,7 @@ static int mdbx_page_search_lowest(MDB_cursor *mc) { * lookups. * * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { +static int mdbx_page_search(MDB_cursor *mc, MDBX_val *key, int flags) { int rc; pgno_t root; @@ -4702,7 +4703,7 @@ static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { if (rc) return rc; { - MDB_val data; + MDBX_val data; int exact = 0; MDBX_node *leaf = mdbx_node_search(&mc2, &mc->mc_dbx->md_name, &exact); if (!exact) @@ -4714,13 +4715,13 @@ static int mdbx_page_search(MDB_cursor *mc, MDB_val *key, int flags) { return rc; uint16_t md_flags; - memcpy(&md_flags, ((char *)data.mv_data + offsetof(MDB_db, md_flags)), + memcpy(&md_flags, ((char *)data.iov_base + offsetof(MDB_db, md_flags)), sizeof(uint16_t)); /* The txn may not know this DBI, or another process may * have dropped and recreated the DB with other flags. */ if (unlikely((mc->mc_db->md_flags & PERSISTENT_FLAGS) != md_flags)) return MDB_INCOMPATIBLE; - memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); + memcpy(mc->mc_db, data.iov_base, sizeof(MDB_db)); } *mc->mc_dbflag &= ~DB_STALE; } @@ -4833,30 +4834,30 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDBX_page *mp) { * * Returns 0 on success, non-zero on failure. */ static __inline int mdbx_node_read(MDB_cursor *mc, MDBX_node *leaf, - MDB_val *data) { + MDBX_val *data) { MDBX_page *omp; /* overflow page */ pgno_t pgno; int rc; if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { - data->mv_size = NODEDSZ(leaf); - data->mv_data = NODEDATA(leaf); + data->iov_len = NODEDSZ(leaf); + data->iov_base = NODEDATA(leaf); return MDB_SUCCESS; } /* Read overflow data. 
*/ - data->mv_size = NODEDSZ(leaf); + data->iov_len = NODEDSZ(leaf); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); if (unlikely((rc = mdbx_page_get(mc, pgno, &omp, NULL)) != 0)) { mdbx_debug("read overflow page %" PRIaPGNO " failed", pgno); return rc; } - data->mv_data = PAGEDATA(omp); + data->iov_base = PAGEDATA(omp); return MDB_SUCCESS; } -int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { +int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data) { MDB_cursor mc; MDB_xcursor mx; int exact = 0; @@ -4938,7 +4939,7 @@ static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { } /* Move the cursor to the next data item. */ -static int mdbx_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, +static int mdbx_cursor_next(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, MDB_cursor_op op) { MDBX_page *mp; MDBX_node *leaf; @@ -5000,8 +5001,8 @@ skip: mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->iov_len); return MDB_SUCCESS; } @@ -5027,7 +5028,7 @@ skip: } /* Move the cursor to the previous data item. 
*/ -static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, +static int mdbx_cursor_prev(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, MDB_cursor_op op) { MDBX_page *mp; MDBX_node *leaf; @@ -5087,8 +5088,8 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mp)) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->iov_len); return MDB_SUCCESS; } @@ -5114,7 +5115,7 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, } /* Set the cursor on a specific data item. */ -static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, +static int mdbx_cursor_set(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, MDB_cursor_op op, int *exactp) { int rc; MDBX_page *mp; @@ -5122,8 +5123,8 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, DKBUF; if ((mc->mc_db->md_flags & MDB_INTEGERKEY) && - unlikely(key->mv_size != sizeof(uint32_t) && - key->mv_size != sizeof(uint64_t))) { + unlikely(key->iov_len != sizeof(uint32_t) && + key->iov_len != sizeof(uint64_t))) { mdbx_cassert(mc, !"key-size is invalid for MDB_INTEGERKEY"); return MDB_BAD_VALSIZE; } @@ -5133,7 +5134,7 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, /* See if we're already on the right page */ if (mc->mc_flags & C_INITIALIZED) { - MDB_val nodekey; + MDBX_val nodekey; mp = mc->mc_pg[mc->mc_top]; if (!NUMKEYS(mp)) { @@ -5141,8 +5142,8 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, return MDB_NOTFOUND; } if (mp->mp_flags & P_LEAF2) { - nodekey.mv_size = mc->mc_db->md_xsize; - nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size); + nodekey.iov_len = mc->mc_db->md_xsize; + nodekey.iov_base = LEAF2KEY(mp, 0, nodekey.iov_len); } else { leaf = NODEPTR(mp, 0); 
MDB_GET_KEY2(leaf, nodekey); @@ -5162,7 +5163,7 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, unsigned nkeys = NUMKEYS(mp); if (nkeys > 1) { if (mp->mp_flags & P_LEAF2) { - nodekey.mv_data = LEAF2KEY(mp, nkeys - 1, nodekey.mv_size); + nodekey.iov_base = LEAF2KEY(mp, nkeys - 1, nodekey.iov_len); } else { leaf = NODEPTR(mp, nkeys - 1); MDB_GET_KEY2(leaf, nodekey); @@ -5179,8 +5180,8 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { /* This is definitely the right page, skip search_page */ if (mp->mp_flags & P_LEAF2) { - nodekey.mv_data = - LEAF2KEY(mp, mc->mc_ki[mc->mc_top], nodekey.mv_size); + nodekey.iov_base = + LEAF2KEY(mp, mc->mc_ki[mc->mc_top], nodekey.iov_len); } else { leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); MDB_GET_KEY2(leaf, nodekey); @@ -5253,8 +5254,8 @@ set1: if (IS_LEAF2(mp)) { if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->iov_len); } return MDB_SUCCESS; } @@ -5280,7 +5281,7 @@ set1: return rc; } } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { - MDB_val olddata; + MDBX_val olddata; if (unlikely((rc = mdbx_node_read(mc, leaf, &olddata)) != MDB_SUCCESS)) return rc; rc = mc->mc_dbx->md_dcmp(data, &olddata); @@ -5307,7 +5308,7 @@ set1: } /* Move the cursor to the first item in the database. 
*/ -static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) { +static int mdbx_cursor_first(MDB_cursor *mc, MDBX_val *key, MDBX_val *data) { int rc; MDBX_node *leaf; @@ -5328,8 +5329,8 @@ static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) { mc->mc_ki[mc->mc_top] = 0; if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size); + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->iov_len); return MDB_SUCCESS; } @@ -5349,7 +5350,7 @@ static int mdbx_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) { } /* Move the cursor to the last item in the database. */ -static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) { +static int mdbx_cursor_last(MDB_cursor *mc, MDBX_val *key, MDBX_val *data) { int rc; MDBX_node *leaf; @@ -5370,9 +5371,9 @@ static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) { leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->mv_size = mc->mc_db->md_xsize; - key->mv_data = - LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size); + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = + LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->iov_len); return MDB_SUCCESS; } @@ -5392,11 +5393,11 @@ static int mdbx_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) { return MDB_SUCCESS; } -int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, +int mdbx_cursor_get(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, MDB_cursor_op op) { int rc; int exact = 0; - int (*mfunc)(MDB_cursor * mc, MDB_val * key, MDB_val * data); + int (*mfunc)(MDB_cursor * mc, MDBX_val * key, MDBX_val * data); if (unlikely(mc == NULL)) return MDBX_EINVAL; @@ -5421,8 +5422,8 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, rc = MDB_SUCCESS; if (IS_LEAF2(mp)) { - 
key->mv_size = mc->mc_db->md_xsize; - key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size); + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->iov_len); } else { MDBX_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); MDB_GET_KEY(leaf, key); @@ -5481,8 +5482,8 @@ int mdbx_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor *mx; fetchm: mx = &mc->mc_xcursor->mx_cursor; - data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) * mx->mc_db->md_xsize; - data->mv_data = PAGEDATA(mx->mc_pg[mx->mc_top]); + data->iov_len = NUMKEYS(mx->mc_pg[mx->mc_top]) * mx->mc_db->md_xsize; + data->iov_base = PAGEDATA(mx->mc_pg[mx->mc_top]); mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top]) - 1; } else { rc = MDB_NOTFOUND; @@ -5586,12 +5587,12 @@ static int mdbx_cursor_touch(MDB_cursor *mc) { /* Do not spill pages to disk if txn is getting full, may fail instead */ #define MDB_NOSPILL 0x8000 -int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, +int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, unsigned flags) { MDB_env *env; MDBX_page *fp, *sub_root = NULL; uint16_t fp_flags; - MDB_val xdata, *rdata, dkey, olddata; + MDBX_val xdata, *rdata, dkey, olddata; MDB_db dummy; int do_sub = 0, insert_key, insert_data; unsigned mcount = 0, dcount = 0, nospill; @@ -5610,8 +5611,8 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, /* Check this first so counter will always be zero on any early failures. 
*/ if (flags & MDB_MULTIPLE) { - dcount = data[1].mv_size; - data[1].mv_size = 0; + dcount = data[1].iov_len; + data[1].iov_len = 0; if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED))) return MDB_INCOMPATIBLE; } @@ -5619,7 +5620,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (flags & MDB_RESERVE) { if (unlikely(mc->mc_db->md_flags & (MDB_DUPSORT | MDB_REVERSEDUP))) return MDB_INCOMPATIBLE; - data->mv_data = nullptr; + data->iov_base = nullptr; } nospill = flags & MDB_NOSPILL; @@ -5628,32 +5629,32 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? MDBX_EACCESS : MDB_BAD_TXN; - if (unlikely(key->mv_size > env->me_maxkey_limit)) + if (unlikely(key->iov_len > env->me_maxkey_limit)) return MDB_BAD_VALSIZE; - if (unlikely(data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) + if (unlikely(data->iov_len > ((mc->mc_db->md_flags & MDB_DUPSORT) ? env->me_maxkey_limit : MDBX_MAXDATASIZE))) return MDB_BAD_VALSIZE; if ((mc->mc_db->md_flags & MDB_INTEGERKEY) && - unlikely(key->mv_size != sizeof(uint32_t) && - key->mv_size != sizeof(uint64_t))) { + unlikely(key->iov_len != sizeof(uint32_t) && + key->iov_len != sizeof(uint64_t))) { mdbx_cassert(mc, !"key-size is invalid for MDB_INTEGERKEY"); return MDB_BAD_VALSIZE; } if ((mc->mc_db->md_flags & MDB_INTEGERDUP) && - unlikely(data->mv_size != sizeof(uint32_t) && - data->mv_size != sizeof(uint64_t))) { + unlikely(data->iov_len != sizeof(uint32_t) && + data->iov_len != sizeof(uint64_t))) { mdbx_cassert(mc, !"data-size is invalid MDB_INTEGERDUP"); return MDB_BAD_VALSIZE; } mdbx_debug("==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, - DDBI(mc), DKEY(key), key ? key->mv_size : 0, - DVAL((flags & MDB_RESERVE) ? nullptr : data), data->mv_size); + DDBI(mc), DKEY(key), key ? key->iov_len : 0, + DVAL((flags & MDB_RESERVE) ? 
nullptr : data), data->iov_len); int dupdata_flag = 0; if (flags & MDB_CURRENT) { @@ -5662,7 +5663,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, * со значением в текущей позиции курсора. * Здесь проще вызвать mdbx_cursor_get(), так как для обслуживания таблиц * с MDB_DUPSORT также требуется текущий размер данных. */ - MDB_val current_key, current_data; + MDBX_val current_key, current_data; rc = mdbx_cursor_get(mc, &current_key, &current_data, MDB_GET_CURRENT); if (unlikely(rc != MDB_SUCCESS)) return rc; @@ -5679,7 +5680,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, * отличается, то вместо inplace обновления требуется удаление и * последующая вставка. */ if (mc->mc_xcursor->mx_db.md_entries > 1 || - current_data.mv_size != data->mv_size) { + current_data.iov_len != data->iov_len) { rc = mdbx_cursor_del(mc, 0); if (rc != MDB_SUCCESS) return rc; @@ -5697,9 +5698,9 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, rc = MDB_NO_ROOT; } else if ((flags & MDB_CURRENT) == 0) { int exact = 0; - MDB_val d2; + MDBX_val d2; if (flags & MDB_APPEND) { - MDB_val k2; + MDBX_val k2; rc = mdbx_cursor_last(mc, &k2, &d2); if (rc == 0) { rc = mc->mc_dbx->md_cmp(key, &k2); @@ -5729,7 +5730,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (!nospill) { if (flags & MDB_MULTIPLE) { rdata = &xdata; - xdata.mv_size = data->mv_size * dcount; + xdata.iov_len = data->iov_len * dcount; } else { rdata = data; } @@ -5769,9 +5770,9 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, * "old sub-page" for prep_subDB to expand to a full page.
*/ fp_flags = P_LEAF | P_DIRTY; fp = env->me_pbuf; - fp->mp_leaf2_ksize = (uint16_t)data->mv_size; /* used if MDB_DUPFIXED */ + fp->mp_leaf2_ksize = (uint16_t)data->iov_len; /* used if MDB_DUPFIXED */ fp->mp_lower = fp->mp_upper = (PAGEHDRSZ - PAGEBASE); - olddata.mv_size = PAGEHDRSZ; + olddata.iov_len = PAGEHDRSZ; goto prep_subDB; } } else { @@ -5779,10 +5780,10 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { char *ptr; unsigned ksize = mc->mc_db->md_xsize; - if (key->mv_size != ksize) + if (key->iov_len != ksize) return MDB_BAD_VALSIZE; ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); - memcpy(ptr, key->mv_data, ksize); + memcpy(ptr, key->iov_base, ksize); fix_parent: /* if overwriting slot 0 of leaf, need to * update branch key if there is a parent page */ @@ -5807,8 +5808,8 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, more:; MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - olddata.mv_size = NODEDSZ(leaf); - olddata.mv_data = NODEDATA(leaf); + olddata.iov_len = NODEDSZ(leaf); + olddata.iov_base = NODEDATA(leaf); /* DB has dups? */ if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { @@ -5817,7 +5818,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, * mp: new (sub-)page. offset: growth in page size. * xdata: node data with new page or DB. 
*/ unsigned i, offset = 0; - MDBX_page *mp = fp = xdata.mv_data = env->me_pbuf; + MDBX_page *mp = fp = xdata.iov_base = env->me_pbuf; mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; /* Was a single item before, must convert now */ @@ -5837,34 +5838,34 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, /* Back up original data item */ dupdata_flag = 1; - dkey.mv_size = olddata.mv_size; - dkey.mv_data = memcpy(fp + 1, olddata.mv_data, olddata.mv_size); + dkey.iov_len = olddata.iov_len; + dkey.iov_base = memcpy(fp + 1, olddata.iov_base, olddata.iov_len); /* Make sub-page header for the dup items, with dummy body */ fp->mp_flags = P_LEAF | P_DIRTY | P_SUBP; fp->mp_lower = (PAGEHDRSZ - PAGEBASE); - xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; + xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len; if (mc->mc_db->md_flags & MDB_DUPFIXED) { fp->mp_flags |= P_LEAF2; - fp->mp_leaf2_ksize = (uint16_t)data->mv_size; - xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ + fp->mp_leaf2_ksize = (uint16_t)data->iov_len; + xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */ } else { - xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + - (dkey.mv_size & 1) + (data->mv_size & 1); + xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + + (dkey.iov_len & 1) + (data->iov_len & 1); } - fp->mp_upper = (uint16_t)(xdata.mv_size - PAGEBASE); - olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */ + fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEBASE); + olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ } else if (leaf->mn_flags & F_SUBDATA) { /* Data is on sub-DB, just store it */ flags |= F_DUPDATA | F_SUBDATA; goto put_sub; } else { /* Data is on sub-page */ - fp = olddata.mv_data; + fp = olddata.iov_base; switch (flags) { default: if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { - offset = EVEN(NODESIZE + sizeof(indx_t) + data->mv_size); + offset = EVEN(NODESIZE + sizeof(indx_t) + data->iov_len); break; } offset = 
fp->mp_leaf2_ksize; @@ -5881,11 +5882,11 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, flags |= F_DUPDATA; goto put_sub; } - xdata.mv_size = olddata.mv_size + offset; + xdata.iov_len = olddata.iov_len + offset; } fp_flags = fp->mp_flags; - if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) { + if (NODESIZE + NODEKSZ(leaf) + xdata.iov_len > env->me_nodemax) { /* Too big for a sub-page, convert to sub-DB */ fp_flags &= ~P_SUBP; prep_subDB: @@ -5903,11 +5904,11 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, dummy.md_leaf_pages = 1; dummy.md_overflow_pages = 0; dummy.md_entries = NUMKEYS(fp); - xdata.mv_size = sizeof(MDB_db); - xdata.mv_data = &dummy; + xdata.iov_len = sizeof(MDB_db); + xdata.iov_base = &dummy; if ((rc = mdbx_page_alloc(mc, 1, &mp, MDBX_ALLOC_ALL))) return rc; - offset = env->me_psize - olddata.mv_size; + offset = env->me_psize - olddata.iov_len; flags |= F_DUPDATA | F_SUBDATA; dummy.md_root = mp->mp_pgno; sub_root = mp; @@ -5922,7 +5923,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, } else { memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, - olddata.mv_size - fp->mp_upper - PAGEBASE); + olddata.iov_len - fp->mp_upper - PAGEBASE); for (i = 0; i < NUMKEYS(fp); i++) mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; } @@ -5943,9 +5944,9 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { MDBX_page *omp; pgno_t pg; - int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); + int level, ovpages, dpages = OVPAGES(data->iov_len, env->me_psize); - memcpy(&pg, olddata.mv_data, sizeof(pg)); + memcpy(&pg, olddata.iov_base, sizeof(pg)); if (unlikely((rc2 = mdbx_page_get(mc, pg, &omp, &level)) != 0)) return rc2; ovpages = omp->mp_pages; @@ -5983,31 +5984,31 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, /* Skip the part where LMDB will put *data. 
* Copy end of page, adjusting alignment so * compiler may copy words instead of bytes. */ - size_t off = (PAGEHDRSZ + data->mv_size) & -(ssize_t)sizeof(size_t); + size_t off = (PAGEHDRSZ + data->iov_len) & -(ssize_t)sizeof(size_t); memcpy((size_t *)((char *)np + off), (size_t *)((char *)omp + off), whole - off); memcpy(np, omp, PAGEHDRSZ); /* Copy header of page */ omp = np; } - SETDSZ(leaf, data->mv_size); + SETDSZ(leaf, data->iov_len); if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = PAGEDATA(omp); + data->iov_base = PAGEDATA(omp); else - memcpy(PAGEDATA(omp), data->mv_data, data->mv_size); + memcpy(PAGEDATA(omp), data->iov_base, data->iov_len); return MDB_SUCCESS; } } if ((rc2 = mdbx_ovpage_free(mc, omp)) != MDB_SUCCESS) return rc2; - } else if (data->mv_size == olddata.mv_size) { - assert(EVEN(key->mv_size) == EVEN(leaf->mn_ksize)); + } else if (data->iov_len == olddata.iov_len) { + assert(EVEN(key->iov_len) == EVEN(leaf->mn_ksize)); /* same size, just replace it. Note that we could * also reuse this node if the new data is smaller, * but instead we opt to shrink the node in that case. 
*/ if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = olddata.mv_data; + data->iov_base = olddata.iov_base; else if (!(mc->mc_flags & C_SUB)) - memcpy(olddata.mv_data, data->mv_data, data->mv_size); + memcpy(olddata.iov_base, data->iov_base, data->iov_len); else { assert(NUMKEYS(mc->mc_pg[mc->mc_top]) == 1); assert(mc->mc_pg[mc->mc_top]->mp_upper == @@ -6016,7 +6017,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, !IS_LEAF2(mc->mc_pg[mc->mc_top])); assert(NODEDSZ(leaf) == 0); assert(leaf->mn_flags == 0); - memcpy(NODEKEY(leaf), key->mv_data, leaf->mn_ksize = key->mv_size); + memcpy(NODEKEY(leaf), key->iov_base, leaf->mn_ksize = key->iov_len); assert((char *)NODEDATA(leaf) + NODEDSZ(leaf) < (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); goto fix_parent; @@ -6030,7 +6031,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, new_sub: nflags = flags & NODE_ADD_FLAGS; - nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size + nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? 
key->iov_len : mdbx_leaf_size(env, key, rdata); if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { if ((flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) @@ -6073,8 +6074,8 @@ new_sub: int xflags; size_t ecount; put_sub: - xdata.mv_size = 0; - xdata.mv_data = ""; + xdata.iov_len = 0; + xdata.iov_base = ""; MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (flags & MDB_CURRENT) { xflags = (flags & MDB_NODUPDATA) @@ -6093,7 +6094,7 @@ new_sub: if (unlikely(rc)) goto bad_sub; /* we've done our job */ - dkey.mv_size = 0; + dkey.iov_len = 0; } if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { /* Adjust other cursors pointing to mp */ @@ -6142,9 +6143,9 @@ new_sub: if (!rc) { mcount++; /* let caller know how many succeeded, if any */ - data[1].mv_size = mcount; + data[1].iov_len = mcount; if (mcount < dcount) { - data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; + data[0].iov_base = (char *)data[0].iov_base + data[0].iov_len; insert_key = insert_data = 0; goto more; } @@ -6323,14 +6324,14 @@ static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, * [in] data The data for the node. * * Returns The number of bytes needed to store the node. */ -static __inline size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, - MDB_val *data) { +static __inline size_t mdbx_leaf_size(MDB_env *env, MDBX_val *key, + MDBX_val *data) { size_t sz; sz = LEAFSIZE(key, data); if (sz > env->me_nodemax) { /* put on overflow page */ - sz -= data->mv_size - sizeof(pgno_t); + sz -= data->iov_len - sizeof(pgno_t); } return EVEN(sz + sizeof(indx_t)); @@ -6348,7 +6349,7 @@ static __inline size_t mdbx_leaf_size(MDB_env *env, MDB_val *key, * [in] key The key for the node. * * Returns The number of bytes needed to store the node. 
*/ -static __inline size_t mdbx_branch_size(MDB_env *env, MDB_val *key) { +static __inline size_t mdbx_branch_size(MDB_env *env, MDBX_val *key) { size_t sz; sz = INDXSIZE(key); @@ -6357,7 +6358,7 @@ static __inline size_t mdbx_branch_size(MDB_env *env, MDB_val *key) { /* not implemented */ mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __FUNCTION__, __LINE__); - sz -= key->mv_size - sizeof(pgno_t); + sz -= key->iov_len - sizeof(pgno_t); } return sz + sizeof(indx_t); @@ -6379,8 +6380,8 @@ static __inline size_t mdbx_branch_size(MDB_env *env, MDB_val *key) { * MDB_PAGE_FULL - there is insufficient room in the page. This error * should never happen since all callers already calculate * the page's free space before calling this function. */ -static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, - MDB_val *data, pgno_t pgno, unsigned flags) { +static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDBX_val *key, + MDBX_val *data, pgno_t pgno, unsigned flags) { unsigned i; size_t node_size = NODESIZE; ssize_t room; @@ -6396,8 +6397,8 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, mdbx_debug("add to %s %spage %" PRIaPGNO " index %i, data size %" PRIuPTR " key size %" PRIuPTR " [%s]", IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - mp->mp_pgno, indx, data ? data->mv_size : 0, - key ? key->mv_size : 0, DKEY(key)); + mp->mp_pgno, indx, data ? data->iov_len : 0, + key ? 
key->iov_len : 0, DKEY(key)); if (IS_LEAF2(mp)) { mdbx_cassert(mc, key); @@ -6408,7 +6409,7 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, if (dif > 0) memmove(ptr + ksize, ptr, dif * ksize); /* insert new key */ - memcpy(ptr, key->mv_data, ksize); + memcpy(ptr, key->iov_base, ksize); /* Just using these for counting */ mp->mp_lower += sizeof(indx_t); @@ -6418,20 +6419,20 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); if (key != NULL) - node_size += key->mv_size; + node_size += key->iov_len; if (IS_LEAF(mp)) { mdbx_cassert(mc, key && data); if (unlikely(F_ISSET(flags, F_BIGDATA))) { /* Data already on overflow page. */ node_size += sizeof(pgno_t); - } else if (unlikely(node_size + data->mv_size > + } else if (unlikely(node_size + data->iov_len > mc->mc_txn->mt_env->me_nodemax)) { - int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); + int ovpages = OVPAGES(data->iov_len, mc->mc_txn->mt_env->me_psize); int rc; /* Put data on overflow page. */ mdbx_debug("data size is %" PRIuPTR ", node would be %" PRIuPTR ", put data on overflow page", - data->mv_size, node_size + data->mv_size); + data->iov_len, node_size + data->iov_len); node_size = EVEN(node_size + sizeof(pgno_t)); if ((ssize_t)node_size > room) goto full; @@ -6441,7 +6442,7 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, flags |= F_BIGDATA; goto update; } else { - node_size += data->mv_size; + node_size += data->iov_len; } } node_size = EVEN(node_size); @@ -6462,32 +6463,32 @@ update: /* Write the node data. */ node = NODEPTR(mp, indx); - node->mn_ksize = (key == NULL) ? 0 : (uint16_t)key->mv_size; + node->mn_ksize = (key == NULL) ? 
0 : (uint16_t)key->iov_len; node->mn_flags = flags; if (IS_LEAF(mp)) - SETDSZ(node, data->mv_size); + SETDSZ(node, data->iov_len); else SETPGNO(node, pgno); if (key) - memcpy(NODEKEY(node), key->mv_data, key->mv_size); + memcpy(NODEKEY(node), key->iov_base, key->iov_len); if (IS_LEAF(mp)) { ndata = NODEDATA(node); if (unlikely(ofp == NULL)) { if (unlikely(F_ISSET(flags, F_BIGDATA))) - memcpy(ndata, data->mv_data, sizeof(pgno_t)); + memcpy(ndata, data->iov_base, sizeof(pgno_t)); else if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = ndata; - else if (likely(ndata != data->mv_data)) - memcpy(ndata, data->mv_data, data->mv_size); + data->iov_base = ndata; + else if (likely(ndata != data->iov_base)) + memcpy(ndata, data->iov_base, data->iov_len); } else { memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); ndata = PAGEDATA(ofp); if (F_ISSET(flags, MDB_RESERVE)) - data->mv_data = ndata; - else if (likely(ndata != data->mv_data)) - memcpy(ndata, data->mv_data, data->mv_size); + data->iov_base = ndata; + else if (likely(ndata != data->iov_base)) + memcpy(ndata, data->iov_base, data->iov_len); } } @@ -6621,8 +6622,8 @@ static void mdbx_xcursor_init0(MDB_cursor *mc) { mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags = C_SUB; - mx->mx_dbx.md_name.mv_size = 0; - mx->mx_dbx.md_name.mv_data = NULL; + mx->mx_dbx.md_name.iov_len = 0; + mx->mx_dbx.md_name.iov_base = NULL; mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; mx->mx_dbx.md_dcmp = NULL; } @@ -6878,7 +6879,7 @@ MDB_dbi mdbx_cursor_dbi(MDB_cursor *mc) { * [in] mc Cursor pointing to the node to operate on. * [in] key The new key to use. * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { +static int mdbx_update_key(MDB_cursor *mc, MDBX_val *key) { MDBX_page *mp; MDBX_node *node; char *base; @@ -6892,17 +6893,17 @@ static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { node = NODEPTR(mp, indx); ptr = mp->mp_ptrs[indx]; if (MDB_DEBUG) { - MDB_val k2; + MDBX_val k2; char kbuf2[DKBUF_MAXKEYSIZE * 2 + 1]; - k2.mv_data = NODEKEY(node); - k2.mv_size = node->mn_ksize; + k2.iov_base = NODEKEY(node); + k2.iov_len = node->mn_ksize; mdbx_debug("update key %u (ofs %u) [%s] to [%s] on page %" PRIaPGNO "", indx, ptr, mdbx_dkey(&k2, kbuf2, sizeof(kbuf2)), DKEY(key), mp->mp_pgno); } /* Sizes must be 2-byte aligned. */ - ksize = EVEN(key->mv_size); + ksize = EVEN(key->iov_len); oksize = EVEN(node->mn_ksize); delta = ksize - oksize; @@ -6932,11 +6933,11 @@ static int mdbx_update_key(MDB_cursor *mc, MDB_val *key) { } /* But even if no shift was needed, update ksize */ - if (node->mn_ksize != key->mv_size) - node->mn_ksize = (uint16_t)key->mv_size; + if (node->mn_ksize != key->iov_len) + node->mn_ksize = (uint16_t)key->iov_len; - if (key->mv_size) - memcpy(NODEKEY(node), key->mv_data, key->mv_size); + if (key->iov_len) + memcpy(NODEKEY(node), key->iov_base, key->iov_len); return MDB_SUCCESS; } @@ -6963,7 +6964,7 @@ static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); /* Move a node from csrc to cdst. 
*/ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { MDBX_node *srcnode; - MDB_val key, data; + MDBX_val key, data; pgno_t srcpg; MDB_cursor mn; int rc; @@ -6976,11 +6977,11 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { return rc; if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_xsize; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], - key.mv_size); - data.mv_size = 0; - data.mv_data = NULL; + key.iov_len = csrc->mc_db->md_xsize; + key.iov_base = LEAF2KEY(csrc->mc_pg[csrc->mc_top], + csrc->mc_ki[csrc->mc_top], key.iov_len); + data.iov_len = 0; + data.iov_base = NULL; srcpg = 0; flags = 0; } else { @@ -6997,39 +6998,39 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { if (unlikely(rc)) return rc; if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_size = csrc->mc_db->md_xsize; - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + key.iov_len = csrc->mc_db->md_xsize; + key.iov_base = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.iov_len); } else { s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(s2); - key.mv_data = NODEKEY(s2); + key.iov_len = NODEKSZ(s2); + key.iov_base = NODEKEY(s2); } csrc->mc_snum = snum--; csrc->mc_top = snum; } else { - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); + key.iov_len = NODEKSZ(srcnode); + key.iov_base = NODEKEY(srcnode); } - data.mv_size = NODEDSZ(srcnode); - data.mv_data = NODEDATA(srcnode); + data.iov_len = NODEDSZ(srcnode); + data.iov_base = NODEDATA(srcnode); } mn.mc_xcursor = NULL; if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) { unsigned snum = cdst->mc_snum; MDBX_node *s2; - MDB_val bkey; + MDBX_val bkey; /* must find the lowest key below dst */ mdbx_cursor_copy(cdst, &mn); rc = mdbx_page_search_lowest(&mn); if (unlikely(rc)) return rc; if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { - bkey.mv_size = 
mn.mc_db->md_xsize; - bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size); + bkey.iov_len = mn.mc_db->md_xsize; + bkey.iov_base = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.iov_len); } else { s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); - bkey.mv_size = NODEKSZ(s2); - bkey.mv_data = NODEKEY(s2); + bkey.iov_len = NODEKSZ(s2); + bkey.iov_base = NODEKEY(s2); } mn.mc_snum = snum--; mn.mc_top = snum; @@ -7053,7 +7054,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { return rc; /* Delete the node from the source page. */ - mdbx_node_del(csrc, key.mv_size); + mdbx_node_del(csrc, key.iov_len); { /* Adjust other cursors pointing to mp */ @@ -7117,11 +7118,11 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { if (csrc->mc_ki[csrc->mc_top] == 0) { if (csrc->mc_ki[csrc->mc_top - 1] != 0) { if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); + key.iov_base = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.iov_len); } else { srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); + key.iov_len = NODEKSZ(srcnode); + key.iov_base = NODEKEY(srcnode); } mdbx_debug("update separator for source page %" PRIaPGNO " to [%s]", csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)); @@ -7134,9 +7135,9 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { return rc; } if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { - MDB_val nullkey; + MDBX_val nullkey; indx_t ix = csrc->mc_ki[csrc->mc_top]; - nullkey.mv_size = 0; + nullkey.iov_len = 0; csrc->mc_ki[csrc->mc_top] = 0; rc = mdbx_update_key(csrc, &nullkey); csrc->mc_ki[csrc->mc_top] = ix; @@ -7147,11 +7148,11 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { if (cdst->mc_ki[cdst->mc_top] == 0) { if (cdst->mc_ki[cdst->mc_top - 1] != 0) { if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { - key.mv_data = 
LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); + key.iov_base = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.iov_len); } else { srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0); - key.mv_size = NODEKSZ(srcnode); - key.mv_data = NODEKEY(srcnode); + key.iov_len = NODEKSZ(srcnode); + key.iov_base = NODEKEY(srcnode); } mdbx_debug("update separator for destination page %" PRIaPGNO " to [%s]", cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)); @@ -7164,9 +7165,9 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { return rc; } if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { - MDB_val nullkey; + MDBX_val nullkey; indx_t ix = cdst->mc_ki[cdst->mc_top]; - nullkey.mv_size = 0; + nullkey.iov_len = 0; cdst->mc_ki[cdst->mc_top] = 0; rc = mdbx_update_key(cdst, &nullkey); cdst->mc_ki[cdst->mc_top] = ix; @@ -7189,7 +7190,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { MDBX_page *psrc, *pdst; MDBX_node *srcnode; - MDB_val key, data; + MDBX_val key, data; unsigned nkeys; int rc; indx_t i, j; @@ -7213,13 +7214,13 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { /* Move all nodes from src to dst. 
*/ j = nkeys = NUMKEYS(pdst); if (IS_LEAF2(psrc)) { - key.mv_size = csrc->mc_db->md_xsize; - key.mv_data = PAGEDATA(psrc); + key.iov_len = csrc->mc_db->md_xsize; + key.iov_base = PAGEDATA(psrc); for (i = 0; i < NUMKEYS(psrc); i++, j++) { rc = mdbx_node_add(cdst, j, &key, NULL, 0, 0); if (unlikely(rc != MDB_SUCCESS)) return rc; - key.mv_data = (char *)key.mv_data + key.mv_size; + key.iov_base = (char *)key.iov_base + key.iov_len; } } else { for (i = 0; i < NUMKEYS(psrc); i++, j++) { @@ -7234,20 +7235,20 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { if (unlikely(rc)) return rc; if (IS_LEAF2(mn.mc_pg[mn.mc_top])) { - key.mv_size = mn.mc_db->md_xsize; - key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size); + key.iov_len = mn.mc_db->md_xsize; + key.iov_base = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.iov_len); } else { s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0); - key.mv_size = NODEKSZ(s2); - key.mv_data = NODEKEY(s2); + key.iov_len = NODEKSZ(s2); + key.iov_base = NODEKEY(s2); } } else { - key.mv_size = srcnode->mn_ksize; - key.mv_data = NODEKEY(srcnode); + key.iov_len = srcnode->mn_ksize; + key.iov_base = NODEKEY(srcnode); } - data.mv_size = NODEDSZ(srcnode); - data.mv_data = NODEDATA(srcnode); + data.iov_len = NODEDSZ(srcnode); + data.iov_base = NODEDATA(srcnode); rc = mdbx_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); if (unlikely(rc != MDB_SUCCESS)) @@ -7263,7 +7264,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { csrc->mc_top--; mdbx_node_del(csrc, 0); if (csrc->mc_ki[csrc->mc_top] == 0) { - key.mv_size = 0; + key.iov_len = 0; rc = mdbx_update_key(csrc, &key); if (unlikely(rc)) { csrc->mc_top++; @@ -7618,7 +7619,7 @@ static int mdbx_cursor_del0(MDB_cursor *mc) { return rc; } -int mdbx_del(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { +int mdbx_del(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data) { if (unlikely(!key || !txn)) return MDBX_EINVAL; @@ -7634,12 +7635,12 @@ int 
mdbx_del(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data) { return mdbx_del0(txn, dbi, key, data, 0); } -static int mdbx_del0(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, +static int mdbx_del0(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, unsigned flags) { MDB_cursor mc; MDB_xcursor mx; MDB_cursor_op op; - MDB_val rdata; + MDBX_val rdata; int rc, exact = 0; DKBUF; @@ -7683,7 +7684,7 @@ static int mdbx_del0(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, * [in] newpgno The page number, if the new node is a branch node. * [in] nflags The NODE_ADD_FLAGS for the new node. * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, +static int mdbx_page_split(MDB_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, pgno_t newpgno, unsigned nflags) { unsigned flags; int rc = MDB_SUCCESS, new_root = 0, did_split = 0; @@ -7692,7 +7693,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, int i, j, split_indx, nkeys, pmax; MDB_env *env = mc->mc_txn->mt_env; MDBX_node *node; - MDB_val sepkey, rkey, xdata, *rdata = &xdata; + MDBX_val sepkey, rkey, xdata, *rdata = &xdata; MDBX_page *copy = NULL; MDBX_page *mp, *rp, *pp; int ptop; @@ -7777,25 +7778,25 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, rp->mp_lower += lsize; mp->mp_upper += rsize - lsize; rp->mp_upper -= rsize - lsize; - sepkey.mv_size = ksize; + sepkey.iov_len = ksize; if (newindx == split_indx) { - sepkey.mv_data = newkey->mv_data; + sepkey.iov_base = newkey->iov_base; } else { - sepkey.mv_data = split; + sepkey.iov_base = split; } if (x < 0) { ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); memcpy(rp->mp_ptrs, split, rsize); - sepkey.mv_data = rp->mp_ptrs; + sepkey.iov_base = rp->mp_ptrs; memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); - memcpy(ins, newkey->mv_data, ksize); + memcpy(ins, newkey->iov_base, ksize); 
mp->mp_lower += sizeof(indx_t); mp->mp_upper -= ksize - sizeof(indx_t); } else { if (x) memcpy(rp->mp_ptrs, split, x * ksize); ins = LEAF2KEY(rp, x, ksize); - memcpy(ins, newkey->mv_data, ksize); + memcpy(ins, newkey->iov_base, ksize); memcpy(ins + ksize, split + x * ksize, rsize - x * ksize); rp->mp_lower += sizeof(indx_t); rp->mp_upper -= ksize - sizeof(indx_t); @@ -7879,12 +7880,12 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, } } if (split_indx == newindx) { - sepkey.mv_size = newkey->mv_size; - sepkey.mv_data = newkey->mv_data; + sepkey.iov_len = newkey->iov_len; + sepkey.iov_base = newkey->iov_base; } else { node = (MDBX_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); - sepkey.mv_size = node->mn_ksize; - sepkey.mv_data = NODEKEY(node); + sepkey.iov_len = node->mn_ksize; + sepkey.iov_base = NODEKEY(node); } } } @@ -7949,8 +7950,8 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, j = 0; do { if (i == newindx) { - rkey.mv_data = newkey->mv_data; - rkey.mv_size = newkey->mv_size; + rkey.iov_base = newkey->iov_base; + rkey.iov_len = newkey->iov_len; if (IS_LEAF(mp)) { rdata = newdata; } else @@ -7960,11 +7961,11 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, mc->mc_ki[mc->mc_top] = j; } else { node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); - rkey.mv_data = NODEKEY(node); - rkey.mv_size = node->mn_ksize; + rkey.iov_base = NODEKEY(node); + rkey.iov_len = node->mn_ksize; if (IS_LEAF(mp)) { - xdata.mv_data = NODEDATA(node); - xdata.mv_size = NODEDSZ(node); + xdata.iov_base = NODEDATA(node); + xdata.iov_len = NODEDSZ(node); rdata = &xdata; } else pgno = NODEPGNO(node); @@ -7973,7 +7974,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, if (!IS_LEAF(mp) && j == 0) { /* First branch index doesn't need key data. 
*/ - rkey.mv_size = 0; + rkey.iov_len = 0; } rc = mdbx_node_add(mc, j, &rkey, rdata, pgno, flags); @@ -8015,7 +8016,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, if (nflags & MDB_RESERVE) { node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!(node->mn_flags & F_BIGDATA)) - newdata->mv_data = NODEDATA(node); + newdata->iov_base = NODEDATA(node); } } else { if (newindx >= split_indx) { @@ -8096,7 +8097,7 @@ done: return rc; } -int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, +int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, unsigned flags) { MDB_cursor mc; MDB_xcursor mx; @@ -8432,10 +8433,10 @@ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { * to find the new last_pg, which also becomes the new root. */ pgno_t freecount = 0; MDB_cursor mc; - MDB_val key, data; + MDBX_val key, data; mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdbx_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) - freecount += *(pgno_t *)data.mv_data; + freecount += *(pgno_t *)data.iov_base; if (rc != MDB_NOTFOUND) goto finish; freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + @@ -8792,13 +8793,13 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, size_t len = strlen(table_name); MDB_dbi scan, slot = txn->mt_numdbs; for (scan = txn->mt_numdbs; --scan >= CORE_DBS;) { - if (!txn->mt_dbxs[scan].md_name.mv_size) { + if (!txn->mt_dbxs[scan].md_name.iov_len) { /* Remember this free slot */ slot = scan; continue; } - if (len == txn->mt_dbxs[scan].md_name.mv_size && - !strncmp(table_name, txn->mt_dbxs[scan].md_name.mv_data, len)) { + if (len == txn->mt_dbxs[scan].md_name.iov_len && + !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { *dbi = scan; return mdbx_dbi_bind(txn, scan, user_flags, keycmp, datacmp); } @@ -8814,9 +8815,9 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, /* Find the DB info 
*/ int exact = 0; - MDB_val key, data; - key.mv_size = len; - key.mv_data = (void *)table_name; + MDBX_val key, data; + key.iov_len = len; + key.iov_base = (void *)table_name; MDB_cursor mc; mdbx_cursor_init(&mc, txn, MAIN_DBI, NULL); int rc = mdbx_cursor_set(&mc, &key, &data, MDB_SET, &exact); @@ -8845,8 +8846,8 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, memset(&db_dummy, 0, sizeof(db_dummy)); db_dummy.md_root = P_INVALID; db_dummy.md_flags = user_flags & PERSISTENT_FLAGS; - data.mv_size = sizeof(db_dummy); - data.mv_data = &db_dummy; + data.iov_len = sizeof(db_dummy); + data.iov_base = &db_dummy; WITH_CURSOR_TRACKING(mc, rc = mdbx_cursor_put(&mc, &key, &data, F_SUBDATA | MDB_NOOVERWRITE)); @@ -8857,20 +8858,20 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, } /* Got info, register DBI in this txn */ - txn->mt_dbxs[slot].md_name.mv_data = namedup; - txn->mt_dbxs[slot].md_name.mv_size = len; + txn->mt_dbxs[slot].md_name.iov_base = namedup; + txn->mt_dbxs[slot].md_name.iov_len = len; txn->mt_dbxs[slot].md_cmp = nullptr; txn->mt_dbxs[slot].md_dcmp = nullptr; txn->mt_dbflags[slot] = dbflag; txn->mt_dbiseqs[slot] = (txn->mt_env->me_dbiseqs[slot] += 1); - txn->mt_dbs[slot] = *(MDB_db *)data.mv_data; + txn->mt_dbs[slot] = *(MDB_db *)data.iov_base; rc = mdbx_dbi_bind(txn, slot, user_flags, keycmp, datacmp); if (unlikely(rc != MDB_SUCCESS)) { assert((dbflag & DB_DIRTY) == 0); /* cleanup slot */ - txn->mt_dbxs[slot].md_name.mv_data = NULL; - txn->mt_dbxs[slot].md_name.mv_size = 0; + txn->mt_dbxs[slot].md_name.iov_base = NULL; + txn->mt_dbxs[slot].md_name.iov_len = 0; txn->mt_dbflags[slot] = 0; bailout: free(namedup); @@ -8921,13 +8922,13 @@ int mdbx_dbi_close(MDB_env *env, MDB_dbi dbi) { return MDBX_EINVAL; /* FIXME: locking to avoid races ? 
*/ - ptr = env->me_dbxs[dbi].md_name.mv_data; + ptr = env->me_dbxs[dbi].md_name.iov_base; /* If there was no name, this was already closed */ if (unlikely(!ptr)) return MDB_BAD_DBI; - env->me_dbxs[dbi].md_name.mv_data = NULL; - env->me_dbxs[dbi].md_name.mv_size = 0; + env->me_dbxs[dbi].md_name.iov_base = NULL; + env->me_dbxs[dbi].md_name.iov_len = 0; env->me_dbflags[dbi] = 0; env->me_dbiseqs[dbi]++; free(ptr); @@ -9930,7 +9931,7 @@ int mdbx_cursor_eof(MDB_cursor *mc) { return MDBX_RESULT_FALSE; } -static int mdbx_is_samedata(const MDB_val *a, const MDB_val *b) { +static int mdbx_is_samedata(const MDBX_val *a, const MDBX_val *b) { return a->iov_len == b->iov_len && memcmp(a->iov_base, b->iov_base, a->iov_len) == 0; } @@ -9958,8 +9959,8 @@ static int mdbx_is_samedata(const MDB_val *a, const MDB_val *b) { * - внешняя аллокация курсоров, в том числе на стеке (без malloc). * - получения статуса страницы по адресу (знать о P_DIRTY). */ -int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, - MDB_val *old_data, unsigned flags) { +int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *new_data, + MDBX_val *old_data, unsigned flags) { if (unlikely(!key || !old_data || !txn || old_data == new_data)) return MDBX_EINVAL; @@ -9990,7 +9991,7 @@ int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, txn->mt_cursors[dbi] = &mc; int rc; - MDB_val present_key = *key; + MDBX_val present_key = *key; if (F_ISSET(flags, MDB_CURRENT | MDB_NOOVERWRITE)) { /* в old_data значение для выбора конкретного дубликата */ if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDB_DUPSORT))) { @@ -10015,7 +10016,7 @@ int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *new_data, /* в old_data буфер для сохранения предыдущего значения */ if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) return MDBX_EINVAL; - MDB_val present_data; + MDBX_val present_data; rc = mdbx_cursor_get(&mc, &present_key, &present_data, 
MDB_SET_KEY); if (unlikely(rc != MDB_SUCCESS)) { old_data->iov_base = NULL; @@ -10090,7 +10091,7 @@ bailout: return rc; } -int mdbx_get_ex(MDBX_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, +int mdbx_get_ex(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, int *values_count) { DKBUF; mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 998d2773..ff696a17 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -301,44 +301,44 @@ static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, return gotsignal ? EINTR : MDB_SUCCESS; } -typedef int(visitor)(const uint64_t record_number, const MDB_val *key, - const MDB_val *data); +typedef int(visitor)(const uint64_t record_number, const MDBX_val *key, + const MDBX_val *data); static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent); -static int handle_userdb(const uint64_t record_number, const MDB_val *key, - const MDB_val *data) { +static int handle_userdb(const uint64_t record_number, const MDBX_val *key, + const MDBX_val *data) { (void)record_number; (void)key; (void)data; return MDB_SUCCESS; } -static int handle_freedb(const uint64_t record_number, const MDB_val *key, - const MDB_val *data) { +static int handle_freedb(const uint64_t record_number, const MDBX_val *key, + const MDBX_val *data) { char *bad = ""; pgno_t pg, prev; int i, number, span = 0; - pgno_t *iptr = data->mv_data; - txnid_t txnid = *(txnid_t *)key->mv_data; + pgno_t *iptr = data->iov_base; + txnid_t txnid = *(txnid_t *)key->iov_base; - if (key->mv_size != sizeof(txnid_t)) + if (key->iov_len != sizeof(txnid_t)) problem_add("entry", record_number, "wrong txn-id size", - "key-size %" PRIiPTR "", key->mv_size); + "key-size %" PRIiPTR "", key->iov_len); else if (txnid < 1 || txnid > envinfo.me_last_txnid) problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN "", txnid); - if (data->mv_size < sizeof(pgno_t) || data->mv_size % 
sizeof(pgno_t)) + if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t)) problem_add("entry", record_number, "wrong idl size", "%" PRIuPTR "", - data->mv_size); + data->iov_len); else { number = *iptr++; if (number >= MDB_IDL_UM_MAX) problem_add("entry", record_number, "wrong idl length", "%" PRIiPTR "", number); - else if ((number + 1) * sizeof(pgno_t) != data->mv_size) + else if ((number + 1) * sizeof(pgno_t) != data->iov_len) problem_add("entry", record_number, "mismatch idl length", "%" PRIiPTR " != %" PRIuPTR "", (number + 1) * sizeof(pgno_t), - data->mv_size); + data->iov_len); else { freedb_pages += number; if (envinfo.me_tail_txnid > txnid) @@ -381,21 +381,21 @@ static int handle_freedb(const uint64_t record_number, const MDB_val *key, return MDB_SUCCESS; } -static int handle_maindb(const uint64_t record_number, const MDB_val *key, - const MDB_val *data) { +static int handle_maindb(const uint64_t record_number, const MDBX_val *key, + const MDBX_val *data) { char *name; int rc; size_t i; - name = key->mv_data; - for (i = 0; i < key->mv_size; ++i) { + name = key->iov_base; + for (i = 0; i < key->iov_len; ++i) { if (name[i] < ' ') return handle_userdb(record_number, key, data); } - name = malloc(key->mv_size + 1); - memcpy(name, key->mv_data, key->mv_size); - name[key->mv_size] = '\0'; + name = malloc(key->iov_len + 1); + memcpy(name, key->iov_base, key->iov_len); + name[key->iov_len] = '\0'; userdb_count++; rc = process_db(-1, name, handle_userdb, 0); @@ -409,8 +409,8 @@ static int handle_maindb(const uint64_t record_number, const MDB_val *key, static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { MDB_cursor *mc; MDBX_stat ms; - MDB_val key, data; - MDB_val prev_key, prev_data; + MDBX_val key, data; + MDBX_val prev_key, prev_data; unsigned flags; int rc, i; struct problem *saved_list; @@ -486,8 +486,8 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { } saved_list = problems_push(); - 
prev_key.mv_data = NULL; - prev_data.mv_size = 0; + prev_key.iov_base = NULL; + prev_data.iov_len = 0; rc = mdbx_cursor_get(mc, &key, &data, MDB_FIRST); while (rc == MDB_SUCCESS) { if (gotsignal) { @@ -497,26 +497,26 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { goto bailout; } - if (key.mv_size > maxkeysize) { + if (key.iov_len > maxkeysize) { problem_add("entry", record_count, "key length exceeds max-key-size", - "%" PRIuPTR " > %u", key.mv_size, maxkeysize); - } else if ((flags & MDB_INTEGERKEY) && key.mv_size != sizeof(uint64_t) && - key.mv_size != sizeof(uint32_t)) { + "%" PRIuPTR " > %u", key.iov_len, maxkeysize); + } else if ((flags & MDB_INTEGERKEY) && key.iov_len != sizeof(uint64_t) && + key.iov_len != sizeof(uint32_t)) { problem_add("entry", record_count, "wrong key length", - "%" PRIuPTR " != 4or8", key.mv_size); + "%" PRIuPTR " != 4or8", key.iov_len); } - if ((flags & MDB_INTEGERDUP) && data.mv_size != sizeof(uint64_t) && - data.mv_size != sizeof(uint32_t)) { + if ((flags & MDB_INTEGERDUP) && data.iov_len != sizeof(uint64_t) && + data.iov_len != sizeof(uint32_t)) { problem_add("entry", record_count, "wrong data length", - "%" PRIuPTR " != 4or8", data.mv_size); + "%" PRIuPTR " != 4or8", data.iov_len); } - if (prev_key.mv_data) { - if ((flags & MDB_DUPFIXED) && prev_data.mv_size != data.mv_size) { + if (prev_key.iov_base) { + if ((flags & MDB_DUPFIXED) && prev_data.iov_len != data.iov_len) { problem_add("entry", record_count, "different data length", - "%" PRIuPTR " != %" PRIuPTR "", prev_data.mv_size, - data.mv_size); + "%" PRIuPTR " != %" PRIuPTR "", prev_data.iov_len, + data.iov_len); } int cmp = mdbx_cmp(txn, dbi, &prev_key, &key); @@ -535,9 +535,9 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { } } else if (verbose) { if (flags & MDB_INTEGERKEY) - print(" - fixed key-size %" PRIuPTR "\n", key.mv_size); + print(" - fixed key-size %" PRIuPTR "\n", key.iov_len); if (flags & 
(MDB_INTEGERDUP | MDB_DUPFIXED)) - print(" - fixed data-size %" PRIuPTR "\n", data.mv_size); + print(" - fixed data-size %" PRIuPTR "\n", data.iov_len); } if (handler) { @@ -547,8 +547,8 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { } record_count++; - key_bytes += key.mv_size; - data_bytes += data.mv_size; + key_bytes += key.iov_len; + data_bytes += data.iov_len; prev_key = key; prev_data = data; diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index 51dd89ce..6c63db56 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -53,12 +53,12 @@ static void hex(unsigned char c) { putchar(hexc[c & 0xf]); } -static void text(MDB_val *v) { +static void text(MDBX_val *v) { unsigned char *c, *end; putchar(' '); - c = v->mv_data; - end = c + v->mv_size; + c = v->iov_base; + end = c + v->iov_len; while (c < end) { if (isprint(*c)) { putchar(*c); @@ -71,12 +71,12 @@ static void text(MDB_val *v) { putchar('\n'); } -static void byte(MDB_val *v) { +static void byte(MDBX_val *v) { unsigned char *c, *end; putchar(' '); - c = v->mv_data; - end = c + v->mv_size; + c = v->iov_base; + end = c + v->iov_len; while (c < end) { hex(*c++); } @@ -87,7 +87,7 @@ static void byte(MDB_val *v) { static int dumpit(MDBX_txn *txn, MDB_dbi dbi, char *name) { MDB_cursor *mc; MDBX_stat ms; - MDB_val key, data; + MDBX_val key, data; MDBX_envinfo info; unsigned int flags; int rc, i; @@ -256,7 +256,7 @@ int main(int argc, char *argv[]) { if (alldbs) { MDB_cursor *cursor; - MDB_val key; + MDBX_val key; int count = 0; rc = mdbx_cursor_open(txn, dbi, &cursor); @@ -268,12 +268,12 @@ int main(int argc, char *argv[]) { while ((rc = mdbx_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { char *str; MDB_dbi db2; - if (memchr(key.mv_data, '\0', key.mv_size)) + if (memchr(key.iov_base, '\0', key.iov_len)) continue; count++; - str = malloc(key.mv_size + 1); - memcpy(str, key.mv_data, key.mv_size); - str[key.mv_size] = '\0'; + str = malloc(key.iov_len + 1); 
+ memcpy(str, key.iov_base, key.iov_len); + str[key.iov_len] = '\0'; rc = mdbx_dbi_open(txn, str, 0, &db2); if (rc == MDB_SUCCESS) { if (list) { diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index 6a06af84..4a277c4c 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -39,7 +39,7 @@ static int Eof; static MDBX_envinfo envinfo; -static MDB_val kbuf, dbuf; +static MDBX_val kbuf, dbuf; #define STRLENOF(s) (sizeof(s) - 1) @@ -63,93 +63,94 @@ static void readhdr(void) { char *ptr; dbi_flags = 0; - while (fgets(dbuf.mv_data, dbuf.mv_size, stdin) != NULL) { + while (fgets(dbuf.iov_base, dbuf.iov_len, stdin) != NULL) { lineno++; - if (!strncmp(dbuf.mv_data, "db_pagesize=", STRLENOF("db_pagesize=")) || - !strncmp(dbuf.mv_data, "duplicates=", STRLENOF("duplicates="))) { + if (!strncmp(dbuf.iov_base, "db_pagesize=", STRLENOF("db_pagesize=")) || + !strncmp(dbuf.iov_base, "duplicates=", STRLENOF("duplicates="))) { /* LY: silently ignore information fields. */ continue; - } else if (!strncmp(dbuf.mv_data, "VERSION=", STRLENOF("VERSION="))) { - version = atoi((char *)dbuf.mv_data + STRLENOF("VERSION=")); + } else if (!strncmp(dbuf.iov_base, "VERSION=", STRLENOF("VERSION="))) { + version = atoi((char *)dbuf.iov_base + STRLENOF("VERSION=")); if (version > 3) { fprintf(stderr, "%s: line %" PRIiPTR ": unsupported VERSION %d\n", prog, lineno, version); exit(EXIT_FAILURE); } - } else if (!strncmp(dbuf.mv_data, "HEADER=END", STRLENOF("HEADER=END"))) { + } else if (!strncmp(dbuf.iov_base, "HEADER=END", STRLENOF("HEADER=END"))) { break; - } else if (!strncmp(dbuf.mv_data, "format=", STRLENOF("format="))) { - if (!strncmp((char *)dbuf.mv_data + STRLENOF("FORMAT="), "print", + } else if (!strncmp(dbuf.iov_base, "format=", STRLENOF("format="))) { + if (!strncmp((char *)dbuf.iov_base + STRLENOF("FORMAT="), "print", STRLENOF("print"))) mode |= PRINT; - else if (strncmp((char *)dbuf.mv_data + STRLENOF("FORMAT="), "bytevalue", + else if (strncmp((char *)dbuf.iov_base + 
STRLENOF("FORMAT="), "bytevalue", STRLENOF("bytevalue"))) { fprintf(stderr, "%s: line %" PRIiPTR ": unsupported FORMAT %s\n", prog, - lineno, (char *)dbuf.mv_data + STRLENOF("FORMAT=")); + lineno, (char *)dbuf.iov_base + STRLENOF("FORMAT=")); exit(EXIT_FAILURE); } - } else if (!strncmp(dbuf.mv_data, "database=", STRLENOF("database="))) { - ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); + } else if (!strncmp(dbuf.iov_base, "database=", STRLENOF("database="))) { + ptr = memchr(dbuf.iov_base, '\n', dbuf.iov_len); if (ptr) *ptr = '\0'; if (subname) free(subname); - subname = strdup((char *)dbuf.mv_data + STRLENOF("database=")); - } else if (!strncmp(dbuf.mv_data, "type=", STRLENOF("type="))) { - if (strncmp((char *)dbuf.mv_data + STRLENOF("type="), "btree", + subname = strdup((char *)dbuf.iov_base + STRLENOF("database=")); + } else if (!strncmp(dbuf.iov_base, "type=", STRLENOF("type="))) { + if (strncmp((char *)dbuf.iov_base + STRLENOF("type="), "btree", STRLENOF("btree"))) { fprintf(stderr, "%s: line %" PRIiPTR ": unsupported type %s\n", prog, - lineno, (char *)dbuf.mv_data + STRLENOF("type=")); + lineno, (char *)dbuf.iov_base + STRLENOF("type=")); exit(EXIT_FAILURE); } - } else if (!strncmp(dbuf.mv_data, "mapaddr=", STRLENOF("mapaddr="))) { + } else if (!strncmp(dbuf.iov_base, "mapaddr=", STRLENOF("mapaddr="))) { int i; - ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); + ptr = memchr(dbuf.iov_base, '\n', dbuf.iov_len); if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data + STRLENOF("mapaddr="), "%p", + i = sscanf((char *)dbuf.iov_base + STRLENOF("mapaddr="), "%p", &envinfo.me_mapaddr); if (i != 1) { fprintf(stderr, "%s: line %" PRIiPTR ": invalid mapaddr %s\n", prog, - lineno, (char *)dbuf.mv_data + STRLENOF("mapaddr=")); + lineno, (char *)dbuf.iov_base + STRLENOF("mapaddr=")); exit(EXIT_FAILURE); } - } else if (!strncmp(dbuf.mv_data, "mapsize=", STRLENOF("mapsize="))) { + } else if (!strncmp(dbuf.iov_base, "mapsize=", STRLENOF("mapsize="))) { int i; - ptr = 
memchr(dbuf.mv_data, '\n', dbuf.mv_size); + ptr = memchr(dbuf.iov_base, '\n', dbuf.iov_len); if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data + STRLENOF("mapsize="), "%" PRIu64 "", + i = sscanf((char *)dbuf.iov_base + STRLENOF("mapsize="), "%" PRIu64 "", &envinfo.me_mapsize); if (i != 1) { fprintf(stderr, "%s: line %" PRIiPTR ": invalid mapsize %s\n", prog, - lineno, (char *)dbuf.mv_data + STRLENOF("mapsize=")); + lineno, (char *)dbuf.iov_base + STRLENOF("mapsize=")); exit(EXIT_FAILURE); } - } else if (!strncmp(dbuf.mv_data, "maxreaders=", STRLENOF("maxreaders="))) { + } else if (!strncmp(dbuf.iov_base, "maxreaders=", + STRLENOF("maxreaders="))) { int i; - ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); + ptr = memchr(dbuf.iov_base, '\n', dbuf.iov_len); if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data + STRLENOF("maxreaders="), "%u", + i = sscanf((char *)dbuf.iov_base + STRLENOF("maxreaders="), "%u", &envinfo.me_maxreaders); if (i != 1) { fprintf(stderr, "%s: line %" PRIiPTR ": invalid maxreaders %s\n", prog, - lineno, (char *)dbuf.mv_data + STRLENOF("maxreaders=")); + lineno, (char *)dbuf.iov_base + STRLENOF("maxreaders=")); exit(EXIT_FAILURE); } } else { int i; for (i = 0; dbflags[i].bit; i++) { - if (!strncmp(dbuf.mv_data, dbflags[i].name, dbflags[i].len) && - ((char *)dbuf.mv_data)[dbflags[i].len] == '=') { - if (((char *)dbuf.mv_data)[dbflags[i].len + 1] == '1') + if (!strncmp(dbuf.iov_base, dbflags[i].name, dbflags[i].len) && + ((char *)dbuf.iov_base)[dbflags[i].len] == '=') { + if (((char *)dbuf.iov_base)[dbflags[i].len + 1] == '1') dbi_flags |= dbflags[i].bit; break; } } if (!dbflags[i].bit) { - ptr = memchr(dbuf.mv_data, '=', dbuf.mv_size); + ptr = memchr(dbuf.iov_base, '=', dbuf.iov_len); if (!ptr) { fprintf(stderr, "%s: line %" PRIiPTR ": unexpected format\n", prog, lineno); @@ -158,7 +159,7 @@ static void readhdr(void) { *ptr = '\0'; fprintf(stderr, "%s: line %" PRIiPTR ": unrecognized keyword ignored: %s\n", - prog, lineno, (char 
*)dbuf.mv_data); + prog, lineno, (char *)dbuf.iov_base); } } } @@ -183,7 +184,7 @@ static int unhex(unsigned char *c2) { return c; } -static int readline(MDB_val *out, MDB_val *buf) { +static int readline(MDBX_val *out, MDBX_val *buf) { unsigned char *c1, *c2, *end; size_t len, l2; int c; @@ -196,48 +197,48 @@ static int readline(MDB_val *out, MDB_val *buf) { } if (c != ' ') { lineno++; - if (fgets(buf->mv_data, buf->mv_size, stdin) == NULL) { + if (fgets(buf->iov_base, buf->iov_len, stdin) == NULL) { badend: Eof = 1; badend(); return EOF; } - if (c == 'D' && !strncmp(buf->mv_data, "ATA=END", STRLENOF("ATA=END"))) + if (c == 'D' && !strncmp(buf->iov_base, "ATA=END", STRLENOF("ATA=END"))) return EOF; goto badend; } } - if (fgets(buf->mv_data, buf->mv_size, stdin) == NULL) { + if (fgets(buf->iov_base, buf->iov_len, stdin) == NULL) { Eof = 1; return EOF; } lineno++; - c1 = buf->mv_data; + c1 = buf->iov_base; len = strlen((char *)c1); l2 = len; /* Is buffer too short? */ while (c1[len - 1] != '\n') { - buf->mv_data = realloc(buf->mv_data, buf->mv_size * 2); - if (!buf->mv_data) { + buf->iov_base = realloc(buf->iov_base, buf->iov_len * 2); + if (!buf->iov_base) { Eof = 1; fprintf(stderr, "%s: line %" PRIiPTR ": out of memory, line too long\n", prog, lineno); return EOF; } - c1 = buf->mv_data; + c1 = buf->iov_base; c1 += l2; - if (fgets((char *)c1, buf->mv_size + 1, stdin) == NULL) { + if (fgets((char *)c1, buf->iov_len + 1, stdin) == NULL) { Eof = 1; badend(); return EOF; } - buf->mv_size *= 2; + buf->iov_len *= 2; len = strlen((char *)c1); l2 += len; } - c1 = c2 = buf->mv_data; + c1 = c2 = buf->iov_base; len = l2; c1[--len] = '\0'; end = c1 + len; @@ -279,8 +280,8 @@ static int readline(MDB_val *out, MDB_val *buf) { c2 += 2; } } - c2 = out->mv_data = buf->mv_data; - out->mv_size = c1 - c2; + c2 = out->iov_base = buf->iov_base; + out->iov_len = c1 - c2; return 0; } @@ -345,8 +346,8 @@ int main(int argc, char *argv[]) { if (optind != argc - 1) usage(); - dbuf.mv_size = 
4096; - dbuf.mv_data = malloc(dbuf.mv_size); + dbuf.iov_len = 4096; + dbuf.iov_base = malloc(dbuf.iov_len); if (!(mode & NOHDR)) readhdr(); @@ -379,11 +380,11 @@ int main(int argc, char *argv[]) { goto env_close; } - kbuf.mv_size = mdbx_env_get_maxkeysize(env) * 2 + 2; - kbuf.mv_data = malloc(kbuf.mv_size); + kbuf.iov_len = mdbx_env_get_maxkeysize(env) * 2 + 2; + kbuf.iov_base = malloc(kbuf.iov_len); while (!Eof) { - MDB_val key, data; + MDBX_val key, data; int batch = 0; rc = mdbx_txn_begin(env, NULL, 0, &txn); diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index b9db4172..451f3c7c 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -158,7 +158,7 @@ int main(int argc, char *argv[]) { if (freinfo) { MDB_cursor *cursor; - MDB_val key, data; + MDBX_val key, data; size_t pages = 0, *iptr; size_t reclaimable = 0; @@ -178,9 +178,9 @@ int main(int argc, char *argv[]) { } prstat(&mst); while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { - iptr = data.mv_data; + iptr = data.iov_base; pages += *iptr; - if (envinfo && mei.me_tail_txnid > *(size_t *)key.mv_data) + if (envinfo && mei.me_tail_txnid > *(size_t *)key.iov_base) reclaimable += *iptr; if (freinfo > 1) { char *bad = ""; @@ -198,7 +198,7 @@ int main(int argc, char *argv[]) { } printf(" Transaction %" PRIuPTR ", %" PRIiPTR " pages, maxspan %" PRIiPTR "%s\n", - *(size_t *)key.mv_data, j, span, bad); + *(size_t *)key.iov_base, j, span, bad); if (freinfo > 2) { for (--j; j >= 0;) { pg = iptr[j]; @@ -262,7 +262,7 @@ int main(int argc, char *argv[]) { if (alldbs) { MDB_cursor *cursor; - MDB_val key; + MDBX_val key; rc = mdbx_cursor_open(txn, dbi, &cursor); if (rc) { @@ -273,11 +273,11 @@ int main(int argc, char *argv[]) { while ((rc = mdbx_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { char *str; MDB_dbi db2; - if (memchr(key.mv_data, '\0', key.mv_size)) + if (memchr(key.iov_base, '\0', key.iov_len)) continue; - str = malloc(key.mv_size + 1); - memcpy(str, key.mv_data, 
key.mv_size); - str[key.mv_size] = '\0'; + str = malloc(key.iov_len + 1); + memcpy(str, key.iov_base, key.iov_len); + str[key.iov_len] = '\0'; rc = mdbx_dbi_open(txn, str, 0, &db2); if (rc == MDB_SUCCESS) printf("Status of %s\n", str); diff --git a/test/keygen.cc b/test/keygen.cc index 0d7c0409..921b324e 100644 --- a/test/keygen.cc +++ b/test/keygen.cc @@ -193,8 +193,8 @@ void __hot maker::mk(const serial_t serial, const essentials ¶ms, assert(params.maxlen >= params.minlen); assert(params.maxlen >= length(serial)); - out.value.mv_data = out.bytes; - out.value.mv_size = params.minlen; + out.value.iov_base = out.bytes; + out.value.iov_len = params.minlen; if (params.flags & (MDB_INTEGERKEY | MDB_INTEGERDUP)) { assert(params.maxlen == params.minlen); @@ -204,29 +204,29 @@ void __hot maker::mk(const serial_t serial, const essentials ¶ms, else out.u32 = (uint32_t)serial; } else if (params.flags & (MDB_REVERSEKEY | MDB_REVERSEDUP)) { - if (out.value.mv_size > 8) { - memset(out.bytes, '\0', out.value.mv_size - 8); - unaligned::store(out.bytes + out.value.mv_size - 8, htobe64(serial)); + if (out.value.iov_len > 8) { + memset(out.bytes, '\0', out.value.iov_len - 8); + unaligned::store(out.bytes + out.value.iov_len - 8, htobe64(serial)); } else { out.u64 = htobe64(serial); - if (out.value.mv_size < 8) { - out.value.mv_size = std::max(length(serial), out.value.mv_size); - out.value.mv_data = out.bytes + 8 - out.value.mv_size; + if (out.value.iov_len < 8) { + out.value.iov_len = std::max(length(serial), out.value.iov_len); + out.value.iov_base = out.bytes + 8 - out.value.iov_len; } } } else { out.u64 = htole64(serial); - if (out.value.mv_size > 8) - memset(out.bytes + 8, '\0', out.value.mv_size - 8); + if (out.value.iov_len > 8) + memset(out.bytes + 8, '\0', out.value.iov_len - 8); else - out.value.mv_size = std::max(length(serial), out.value.mv_size); + out.value.iov_len = std::max(length(serial), out.value.iov_len); } - assert(out.value.mv_size >= params.minlen); - 
assert(out.value.mv_size <= params.maxlen); - assert(out.value.mv_size >= length(serial)); - assert(out.value.mv_data >= out.bytes); - assert((uint8_t *)out.value.mv_data + out.value.mv_size <= + assert(out.value.iov_len >= params.minlen); + assert(out.value.iov_len <= params.maxlen); + assert(out.value.iov_len >= length(serial)); + assert(out.value.iov_base >= out.bytes); + assert((uint8_t *)out.value.iov_base + out.value.iov_len <= out.bytes + out.limit); } diff --git a/test/keygen.h b/test/keygen.h index e6eeb194..f109bf40 100644 --- a/test/keygen.h +++ b/test/keygen.h @@ -78,7 +78,7 @@ enum { }; struct result { - MDB_val value; + MDBX_val value; size_t limit; union { uint8_t bytes[sizeof(uint64_t)]; diff --git a/tutorial/sample-mdb.txt b/tutorial/sample-mdb.txt index 2dcf87c9..5ba7ea12 100644 --- a/tutorial/sample-mdb.txt +++ b/tutorial/sample-mdb.txt @@ -26,7 +26,7 @@ int main(int argc,char * argv[]) int rc; MDB_env *env; MDB_dbi dbi; - MDB_val key, data; + MDBX_val key, data; MDBX_txn *txn; MDB_cursor *cursor; char sval[32]; @@ -38,10 +38,10 @@ int main(int argc,char * argv[]) rc = mdbx_txn_begin(env, NULL, 0, &txn); rc = mdbx_dbi_open(txn, NULL, 0, &dbi); - key.mv_size = sizeof(int); - key.mv_data = sval; - data.mv_size = sizeof(sval); - data.mv_data = sval; + key.iov_len = sizeof(int); + key.iov_base = sval; + data.iov_len = sizeof(sval); + data.iov_base = sval; sprintf(sval, "%03x %d foo bar", 32, 3141592); rc = mdbx_put(txn, dbi, &key, &data, 0); @@ -54,8 +54,8 @@ int main(int argc,char * argv[]) rc = mdbx_cursor_open(txn, dbi, &cursor); while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { printf("key: %p %.*s, data: %p %.*s\n", - key.mv_data, (int) key.mv_size, (char *) key.mv_data, - data.mv_data, (int) data.mv_size, (char *) data.mv_data); + key.iov_base, (int) key.iov_len, (char *) key.iov_base, + data.iov_base, (int) data.iov_len, (char *) data.iov_base); } mdbx_cursor_close(cursor); mdbx_txn_abort(txn); From 
132c9c994e603f29082c94d75c556efb379d33bc Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 21:54:06 +0300 Subject: [PATCH 150/303] mdbx: add mdbx_fastmutex_t. --- src/osal.c | 38 ++++++++++++++++++++++++++++++++++++++ src/osal.h | 7 +++++++ 2 files changed, 45 insertions(+) diff --git a/src/osal.c b/src/osal.c index 5a0f4009..b0e1d6b1 100644 --- a/src/osal.c +++ b/src/osal.c @@ -267,6 +267,44 @@ int mdbx_condmutex_wait(mdbx_condmutex_t *condmutex) { /*----------------------------------------------------------------------------*/ +int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { +#if defined(_WIN32) || defined(_WIN64) + InitializeCriticalSection(fastmutex); + return MDB_SUCCESS; +#else + return pthread_mutex_init(fastmutex, NULL); +#endif +} + +int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { +#if defined(_WIN32) || defined(_WIN64) + DeleteCriticalSection(fastmutex); + return MDB_SUCCESS; +#else + return pthread_mutex_destroy(fastmutex); +#endif +} + +int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { +#if defined(_WIN32) || defined(_WIN64) + EnterCriticalSection(fastmutex); + return MDB_SUCCESS; +#else + return pthread_mutex_lock(fastmutex); +#endif +} + +int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { +#if defined(_WIN32) || defined(_WIN64) + LeaveCriticalSection(fastmutex); + return MDB_SUCCESS; +#else + return pthread_mutex_unlock(fastmutex); +#endif +} + +/*----------------------------------------------------------------------------*/ + int mdbx_openfile(const char *pathname, int flags, mode_t mode, mdbx_filehandle_t *fd) { *fd = INVALID_HANDLE_VALUE; diff --git a/src/osal.h b/src/osal.h index 661337ca..8698fea9 100644 --- a/src/osal.h +++ b/src/osal.h @@ -68,6 +68,7 @@ typedef struct { HANDLE mutex; HANDLE event; } mdbx_condmutex_t; +typedef CRITICAL_SECTION mdbx_fastmutex_t; #else #include #include @@ -85,6 +86,7 @@ typedef struct { pthread_mutex_t mutex; pthread_cond_t cond; } mdbx_condmutex_t; +typedef 
pthread_mutex_t mdbx_fastmutex_t; #endif /* Platform */ #ifndef SSIZE_MAX @@ -395,6 +397,11 @@ int mdbx_condmutex_signal(mdbx_condmutex_t *condmutex); int mdbx_condmutex_wait(mdbx_condmutex_t *condmutex); int mdbx_condmutex_destroy(mdbx_condmutex_t *condmutex); +int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); +int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); +int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); +int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); + int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, off_t offset, size_t expected_written); int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, off_t offset); From 2c4399702632f510389b985d2c60a7a7f7783dd2 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 22:07:35 +0300 Subject: [PATCH 151/303] mdbx: change mdbx_env_set_oomfunc() API. --- mdbx.h | 6 ++++-- src/mdbx.c | 12 +++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/mdbx.h b/mdbx.h index 9727f23a..70b87a8e 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1452,8 +1452,10 @@ typedef int(MDBX_oom_func)(MDB_env *env, int pid, mdbx_tid_t tid, uint64_t txn, * a laggard readers to allowing reclaiming of freeDB. * * [in] env An environment handle returned by mdbx_env_create(). - * [in] oomfunc A MDBX_oom_func function or NULL to disable. */ -LIBMDBX_API void mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); + * [in] oomfunc A MDBX_oom_func function or NULL to disable. + * + * Returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); /* Get the current oom_func callback. * diff --git a/src/mdbx.c b/src/mdbx.c index 7081dd20..cebc1d59 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9635,9 +9635,15 @@ int __cold mdbx_env_set_syncbytes(MDB_env *env, size_t bytes) { return env->me_map ? 
mdbx_env_sync(env, 0) : MDB_SUCCESS; } -void __cold mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oomfunc) { - if (likely(env && env->me_signature == MDBX_ME_SIGNATURE)) - env->me_oom_func = oomfunc; +int __cold mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oomfunc) { + if (unlikely(!env)) + return MDBX_EINVAL; + + if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + + env->me_oom_func = oomfunc; + return MDB_SUCCESS; } MDBX_oom_func *__cold mdbx_env_get_oomfunc(MDB_env *env) { From 3fa09a99373881c559d19d8c17dbde87657e2e4e Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 22:08:18 +0300 Subject: [PATCH 152/303] test: setup oom-callback for retry. --- test/test.cc | 23 +++++++++++++++++++++++ test/test.h | 3 +++ 2 files changed, 26 insertions(+) diff --git a/test/test.cc b/test/test.cc index 5fde326d..ad5141f6 100644 --- a/test/test.cc +++ b/test/test.cc @@ -90,6 +90,25 @@ static void mdbx_debug_logger(int type, const char *function, int line, abort(); } +int testcase::oom_callback(MDB_env *env, int pid, mdbx_tid_t tid, uint64_t txn, + unsigned gap, int retry) { + + testcase *self = (testcase *)mdbx_env_get_userctx(env); + + if (retry == 0) + log_notice("oom_callback: waitfor pid %u, thread %" PRIuPTR + ", txn #%" PRIu64 ", gap %d", + pid, (size_t)tid, txn, gap); + + if (self->should_continue()) { + osal_yield(); + osal_udelay(retry * 100); + return 1 /* always retry */; + } + + return -1; +} + void testcase::db_prepare() { log_trace(">> db_prepare"); assert(!db_guard); @@ -122,6 +141,10 @@ void testcase::db_prepare() { if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_env_set_maxdbs()", rc); + rc = mdbx_env_set_oomfunc(env, testcase::oom_callback); + if (unlikely(rc != MDB_SUCCESS)) + failure_perror("mdbx_env_set_oomfunc()", rc); + rc = mdbx_env_set_mapsize(env, (size_t)config.params.size); if (unlikely(rc != MDB_SUCCESS)) failure_perror("mdbx_env_set_mapsize()", rc); diff --git a/test/test.h b/test/test.h index 
43674d3e..441bc72c 100644 --- a/test/test.h +++ b/test/test.h @@ -96,6 +96,9 @@ protected: mdbx_canary canary; } last; + static int oom_callback(MDB_env *env, int pid, mdbx_tid_t tid, uint64_t txn, + unsigned gap, int retry); + void db_prepare(); void db_open(); void db_close(); From 88ea2768f582ca52b930178ce84fea5650adc750 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 23 May 2017 22:31:07 +0300 Subject: [PATCH 153/303] mdbx: fix gcc 'comparison is always true' warning. --- src/mdbx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index cebc1d59..6be770ad 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -510,8 +510,7 @@ static __inline pgno_t NODEPGNO(const MDBX_node *node) { /* Set the page number in a branch node */ static __inline void SETPGNO(MDBX_node *node, pgno_t pgno) { - if (sizeof(pgno_t) > 4) - assert(pgno <= UINT64_C(0xffffFFFFffff)); + assert(pgno <= (pgno_t)UINT64_C(0xffffFFFFffff)); if (UNALIGNED_OK) { if (sizeof(pgno_t) > 4) From 62ebc5933090f56d3976e198770d9d772a925658 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 01:07:15 +0300 Subject: [PATCH 154/303] mdbx: rework version info (stub for now). --- COPYRIGHT | 2 +- Makefile | 16 +++------ libmdbx.files | 1 + mdbx.h | 75 ++++++++++++++++------------------------- mdbx_osal.h | 20 +++++++---- src/mdbx.c | 13 +------ src/tools/mdbx_chk.c | 3 +- src/tools/mdbx_copy.1 | 2 +- src/tools/mdbx_copy.c | 5 +-- src/tools/mdbx_dump.1 | 2 +- src/tools/mdbx_dump.c | 5 +-- src/tools/mdbx_load.1 | 2 +- src/tools/mdbx_load.c | 5 +-- src/tools/mdbx_stat.1 | 2 +- src/tools/mdbx_stat.c | 5 +-- src/version.c | 34 +++++++++++++++++++ tutorial/sample-bdb.txt | 2 +- tutorial/sample-mdb.txt | 2 +- 18 files changed, 103 insertions(+), 93 deletions(-) create mode 100644 src/version.c diff --git a/COPYRIGHT b/COPYRIGHT index f4a7607c..4c586215 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -1,5 +1,5 @@ Copyright 2015-2017 Leonid Yuriev . 
-Copyright 2011-2017 Howard Chu, Symas Corp. +Copyright 2011-2015 Howard Chu, Symas Corp. Copyright 2015,2016 Peter-Service R&D LLC. All rights reserved. diff --git a/Makefile b/Makefile index 82c3311d..0573e6b7 100644 --- a/Makefile +++ b/Makefile @@ -42,8 +42,6 @@ HEADERS := mdbx.h LIBRARIES := libmdbx.a libmdbx.so TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 - -MDBX_SRC := mdbx.h mdbx_osal.h $(addprefix src/, mdbx.c osal.c lck-posix.c defs.h bits.h osal.h midl.h) SHELL := /bin/bash .PHONY: mdbx all install clean check coverage @@ -70,19 +68,13 @@ clean: check: test/test rm -f $(TESTDB) && (set -o pipefail; test/test --pathname=$(TESTDB) --dont-cleanup-after basic | tee test.log | tail -n 42) && ./mdbx_chk -vn $(TESTDB) -mdbx.o: $(MDBX_SRC) Makefile - $(CC) $(CFLAGS) -c src/mdbx.c -o $@ +src/%.o: src/%.c mdbx.h mdbx_osal.h $(addprefix src/, defs.h bits.h osal.h midl.h) Makefile + $(CC) $(CFLAGS) -c $(filter %.c, $^) -o $@ -osal.o: $(MDBX_SRC) Makefile - $(CC) $(CFLAGS) -c src/osal.c -o $@ - -lck-posix.o: $(MDBX_SRC) Makefile - $(CC) $(CFLAGS) -c src/lck-posix.c -o $@ - -libmdbx.a: mdbx.o osal.o lck-posix.o +libmdbx.a: $(addprefix src/, mdbx.o osal.o lck-posix.o version.o) $(AR) rs $@ $? -libmdbx.so: mdbx.o osal.o lck-posix.o +libmdbx.so: libmdbx.a $(CC) $(CFLAGS) -save-temps $^ -pthread -shared $(LDFLAGS) -o $@ mdbx_%: src/tools/mdbx_%.c libmdbx.a diff --git a/libmdbx.files b/libmdbx.files index 2c209b29..63bee3c9 100644 --- a/libmdbx.files +++ b/libmdbx.files @@ -21,6 +21,7 @@ src/tools/mdbx_load.1 src/tools/mdbx_load.c src/tools/mdbx_stat.1 src/tools/mdbx_stat.c +src/version.c test/actor.cc test/base.h test/chrono.cc diff --git a/mdbx.h b/mdbx.h index 70b87a8e..1715070f 100644 --- a/mdbx.h +++ b/mdbx.h @@ -19,7 +19,7 @@ * * --- * - * Portions Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. + * Portions Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -46,53 +46,45 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #pragma once -/* *INDENT-OFF* */ -/* clang-format off */ +#ifndef LIBMDBX_H +#define LIBMDBX_H -#ifndef _MDBX_H_ -#define _MDBX_H_ - -#include "mdbx_osal.h" +#define MDBX_VERSION_MAJOR 0 +#define MDBX_VERSION_MINOR 0 #ifdef _MSC_VER -#pragma warning(pop) +#pragma warning(push) #endif -/* *INDENT-ON* */ -/* clang-format on */ +#include "mdbx_osal.h" #ifdef __cplusplus extern "C" { #endif -/* Library major version */ -#define MDBX_VERSION_MAJOR 0 -/* Library minor version */ -#define MDBX_VERSION_MINOR 2 -/* Library patch version */ -#define MDBX_VERSION_PATCH 0 +typedef struct mdbx_version_info { + uint8_t major; + uint8_t minor; + uint16_t release; + uint32_t revision; + struct { + const char *datetime; + const char *tree; + const char *commit; + const char *describe; + } git; +} mdbx_version_info; -/* Combine args a,b,c into a single integer for easy version comparisons */ -#define MDB_VERINT(a, b, c) (((a) << 24) | ((b) << 16) | (c)) +typedef struct mdbx_build_info { + const char *datetime; + const char *target; + const char *options; + const char *compiler; + const char *flags; +} mdbx_build_info; -/* The full library version as a single integer */ -#define MDBX_VERSION_FULL \ - MDB_VERINT(MDBX_VERSION_MAJOR, MDBX_VERSION_MINOR, MDBX_VERSION_PATCH) - -/* The release date of this library version */ -#define MDBX_VERSION_DATE "DEVEL" - -/* A stringifier for the version info */ -#define MDBX_VERSTR(a, b, c, d) \ - "MDBX " #a "." #b "." 
#c ": (" d ", https://github.com/ReOpen/libmdbx)" - -/* A helper for the stringifier macro */ -#define MDBX_VERFOO(a, b, c, d) MDBX_VERSTR(a, b, c, d) - -/* The full library version as a C string */ -#define MDBX_VERSION_STRING \ - MDBX_VERFOO(MDBX_VERSION_MAJOR, MDBX_VERSION_MINOR, MDBX_VERSION_PATCH, \ - MDBX_VERSION_DATE) +extern LIBMDBX_API const struct mdbx_version_info mdbx_version; +extern LIBMDBX_API const struct mdbx_build_info mdbx_build; /* The name of the lock file in the DB environment */ #define MDBX_LOCKNAME "/mdbx.lck" @@ -352,15 +344,6 @@ typedef struct MDBX_envinfo { uint64_t me_meta2_txnid, me_meta2_sign; } MDBX_envinfo; -/* Return the LMDB library version information. - * - * [out] major if non-NULL, the library major version number is copied here - * [out] minor if non-NULL, the library minor version number is copied here - * [out] patch if non-NULL, the library patch version number is copied here - * - * Returns "version string" The library version as a string */ -LIBMDBX_API const char *mdbx_version(int *major, int *minor, int *patch); - /* Return a string describing a given error code. 
* * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) @@ -1532,4 +1515,4 @@ LIBMDBX_API int mdbx_dbi_sequence(MDBX_txn *txn, MDB_dbi dbi, uint64_t *result, #pragma warning(pop) #endif -#endif /* _MDBX_H_ */ +#endif /* LIBMDBX_H */ diff --git a/mdbx_osal.h b/mdbx_osal.h index 438b1a95..10237c5f 100644 --- a/mdbx_osal.h +++ b/mdbx_osal.h @@ -50,13 +50,7 @@ #endif #endif /* __dll_import */ -#if defined(LIBMDBX_EXPORTS) -#define LIBMDBX_API __dll_export -#elif defined(LIBMDBX_IMPORTS) -#define LIBMDBX_API __dll_import -#else -#define LIBMDBX_API -#endif /* LIBMDBX_API */ +/*--------------------------------------------------------------------------*/ #ifdef _MSC_VER #pragma warning(push) @@ -122,4 +116,16 @@ typedef pthread_t mdbx_tid_t; #define MDBX_EIO EIO #endif +#ifdef _MSC_VER +#pragma warning(pop) +#endif + /*--------------------------------------------------------------------------*/ + +#if defined(LIBMDBX_EXPORTS) +#define LIBMDBX_API __dll_export +#elif defined(LIBMDBX_IMPORTS) +#define LIBMDBX_API __dll_import +#else +#define LIBMDBX_API +#endif /* LIBMDBX_API */ diff --git a/src/mdbx.c b/src/mdbx.c index 6be770ad..fd234506 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9,7 +9,7 @@ * * --- * - * Portions Copyright 2011-2017 Howard Chu, Symas Corp. All rights reserved. + * Portions Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -677,17 +677,6 @@ static int mdbx_drop0(MDB_cursor *mc, int subs); static MDB_cmp_func mdbx_cmp_memn, mdbx_cmp_memnr, mdbx_cmp_int_ai, mdbx_cmp_int_a2, mdbx_cmp_int_ua; -/* Return the library version info. 
*/ -const char *mdbx_version(int *major, int *minor, int *patch) { - if (major) - *major = MDBX_VERSION_MAJOR; - if (minor) - *minor = MDBX_VERSION_MINOR; - if (patch) - *patch = MDBX_VERSION_PATCH; - return MDBX_VERSION_STRING; -} - static const char *__mdbx_strerr(int errnum) { /* Table of descriptions for LMDB errors */ static const char *const tbl[] = { diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index ff696a17..3b614f8a 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -633,7 +633,8 @@ int main(int argc, char *argv[]) { while ((i = getopt(argc, argv, "Vvqnwcds:")) != EOF) { switch (i) { case 'V': - printf("%s\n", MDBX_VERSION_STRING); + printf("%s (%s, build %s)\n", mdbx_version.git.describe, + mdbx_version.git.datetime, mdbx_build.datetime); exit(EXIT_SUCCESS); break; case 'v': diff --git a/src/tools/mdbx_copy.1 b/src/tools/mdbx_copy.1 index 06a620fd..6c3fd6f8 100644 --- a/src/tools/mdbx_copy.1 +++ b/src/tools/mdbx_copy.1 @@ -1,5 +1,5 @@ .\" Copyright 2015-2017 Leonid Yuriev . -.\" Copyright 2012-2017 Howard Chu, Symas Corp. All Rights Reserved. +.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
.TH MDB_COPY 1 "2014/06/20" "LMDB 0.9.14" diff --git a/src/tools/mdbx_copy.c b/src/tools/mdbx_copy.c index 3b413e17..528d5edb 100644 --- a/src/tools/mdbx_copy.c +++ b/src/tools/mdbx_copy.c @@ -34,8 +34,9 @@ int main(int argc, char *argv[]) { else if (argv[1][1] == 'c' && argv[1][2] == '\0') cpflags |= MDB_CP_COMPACT; else if (argv[1][1] == 'V' && argv[1][2] == '\0') { - printf("%s\n", MDBX_VERSION_STRING); - exit(0); + printf("%s (%s, build %s)\n", mdbx_version.git.describe, + mdbx_version.git.datetime, mdbx_build.datetime); + exit(EXIT_SUCCESS); } else argc = 0; } diff --git a/src/tools/mdbx_dump.1 b/src/tools/mdbx_dump.1 index 80718bb0..12cb239e 100644 --- a/src/tools/mdbx_dump.1 +++ b/src/tools/mdbx_dump.1 @@ -1,5 +1,5 @@ .\" Copyright 2015-2017 Leonid Yuriev . -.\" Copyright 2014-2017 Howard Chu, Symas Corp. All Rights Reserved. +.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .TH MDB_DUMP 1 "2014/06/20" "LMDB 0.9.14" diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index 6c63db56..1f4fa78c 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -177,8 +177,9 @@ int main(int argc, char *argv[]) { while ((i = getopt(argc, argv, "af:lnps:V")) != EOF) { switch (i) { case 'V': - printf("%s\n", MDBX_VERSION_STRING); - exit(0); + printf("%s (%s, build %s)\n", mdbx_version.git.describe, + mdbx_version.git.datetime, mdbx_build.datetime); + exit(EXIT_SUCCESS); break; case 'l': list = 1; diff --git a/src/tools/mdbx_load.1 b/src/tools/mdbx_load.1 index 63b88f10..e326a523 100644 --- a/src/tools/mdbx_load.1 +++ b/src/tools/mdbx_load.1 @@ -1,5 +1,5 @@ .\" Copyright 2015-2017 Leonid Yuriev . -.\" Copyright 2014-2017 Howard Chu, Symas Corp. All Rights Reserved. +.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
.TH MDB_LOAD 1 "2014/06/20" "LMDB 0.9.14" diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index 4a277c4c..531c9632 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -317,8 +317,9 @@ int main(int argc, char *argv[]) { while ((i = getopt(argc, argv, "f:ns:NTV")) != EOF) { switch (i) { case 'V': - printf("%s\n", MDBX_VERSION_STRING); - exit(0); + printf("%s (%s, build %s)\n", mdbx_version.git.describe, + mdbx_version.git.datetime, mdbx_build.datetime); + exit(EXIT_SUCCESS); break; case 'f': if (freopen(optarg, "r", stdin) == NULL) { diff --git a/src/tools/mdbx_stat.1 b/src/tools/mdbx_stat.1 index 096fffc1..e62f288f 100644 --- a/src/tools/mdbx_stat.1 +++ b/src/tools/mdbx_stat.1 @@ -1,5 +1,5 @@ .\" Copyright 2015-2017 Leonid Yuriev . -.\" Copyright 2012-2017 Howard Chu, Symas Corp. All Rights Reserved. +.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .TH MDB_STAT 1 "2014/06/20" "LMDB 0.9.14" diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index 451f3c7c..1c163d5b 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -64,8 +64,9 @@ int main(int argc, char *argv[]) { while ((i = getopt(argc, argv, "Vaefnrs:")) != EOF) { switch (i) { case 'V': - printf("%s\n", MDBX_VERSION_STRING); - exit(0); + printf("%s (%s, build %s)\n", mdbx_version.git.describe, + mdbx_version.git.datetime, mdbx_build.datetime); + exit(EXIT_SUCCESS); break; case 'a': if (subname) diff --git a/src/version.c b/src/version.c new file mode 100644 index 00000000..131f62a5 --- /dev/null +++ b/src/version.c @@ -0,0 +1,34 @@ +/* + * Copyright 2015-2017 Leonid Yuriev + * and other libmdbx authors: please see AUTHORS file. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted only as authorized by the OpenLDAP + * Public License. 
+ * + * A copy of this license is available in the file LICENSE in the + * top-level directory of the distribution or, alternatively, at + * . + */ + +#include "./bits.h" + +#if MDBX_VERSION_MAJOR != 0 || MDBX_VERSION_MINOR != 0 +#error "API version mismatch!" +#endif + +#define MDBX_VERSION_RELEASE 0 +#define MDBX_VERSION_REVISION 0 + +const struct mdbx_version_info mdbx_version = { + MDBX_VERSION_MAJOR, + MDBX_VERSION_MINOR, + MDBX_VERSION_RELEASE, + MDBX_VERSION_REVISION, + {"@MDBX_GIT_TIMESTAMP@", "@MDBX_GIT_TREE@", "@MDBX_GIT_COMMIT@", + "@MDBX_GIT_DESCRIBE@"}}; + +const struct mdbx_build_info mdbx_build = { + "@MDBX_BUILD_TIMESTAMP@", "@MDBX_BUILD_TAGRET@", "@MDBX_BUILD_OPTIONS@", + "@MDBX_BUILD_COMPILER@", "@MDBX_BUILD_FLAGS@"}; diff --git a/tutorial/sample-bdb.txt b/tutorial/sample-bdb.txt index c4343e9e..6e3a739d 100644 --- a/tutorial/sample-bdb.txt +++ b/tutorial/sample-bdb.txt @@ -5,7 +5,7 @@ /* * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2012-2017 Howard Chu, Symas Corp. + * Copyright 2012-2015 Howard Chu, Symas Corp. * Copyright 2015,2016 Peter-Service R&D LLC. * All rights reserved. * diff --git a/tutorial/sample-mdb.txt b/tutorial/sample-mdb.txt index 5ba7ea12..bc7dc9eb 100644 --- a/tutorial/sample-mdb.txt +++ b/tutorial/sample-mdb.txt @@ -5,7 +5,7 @@ /* * Copyright 2015-2017 Leonid Yuriev . - * Copyright 2012-2017 Howard Chu, Symas Corp. + * Copyright 2012-2015 Howard Chu, Symas Corp. * Copyright 2015,2016 Peter-Service R&D LLC. * All rights reserved. * From 06de7a5155af2f3e50d41925506cc07f80c2dba9 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 01:42:10 +0300 Subject: [PATCH 155/303] mdbx: rename the rest MDBX_xyz. 
--- Makefile | 4 +- README.md | 8 +- dll.vcxproj | 4 +- mdbx.h | 548 ++++----- src/bits.h | 221 ++-- src/lck-posix.c | 64 +- src/lck-windows.c | 38 +- src/mdbx.c | 2557 ++++++++++++++++++++------------------- src/midl.h | 22 +- src/osal.c | 99 +- src/osal.h | 24 +- src/tools/mdbx_chk.c | 70 +- src/tools/mdbx_copy.1 | 6 +- src/tools/mdbx_copy.c | 12 +- src/tools/mdbx_dump.1 | 6 +- src/tools/mdbx_dump.c | 48 +- src/tools/mdbx_load.1 | 8 +- src/tools/mdbx_load.c | 30 +- src/tools/mdbx_stat.1 | 8 +- src/tools/mdbx_stat.c | 30 +- test/config.cc | 22 +- test/hill.cc | 29 +- test/keygen.cc | 13 +- test/log.cc | 4 +- test/main.cc | 6 +- test/osal-windows.cc | 2 +- test/test.cc | 46 +- test/test.h | 22 +- test/test.vcxproj | 4 +- tutorial/sample-mdb.txt | 12 +- 30 files changed, 1993 insertions(+), 1974 deletions(-) diff --git a/Makefile b/Makefile index 0573e6b7..e1fd41f4 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ suffix ?= CC ?= gcc CXX ?= g++ -XCFLAGS ?= -DNDEBUG=1 -DMDB_DEBUG=0 -DLIBMDBX_EXPORTS=1 +XCFLAGS ?= -DNDEBUG=1 -DMDBX_DEBUG=0 -DLIBMDBX_EXPORTS=1 CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden CFLAGS += -D_GNU_SOURCE=1 -std=gnu99 -pthread $(XCFLAGS) CXXFLAGS = -std=c++11 $(filter-out -std=gnu99,$(CFLAGS)) @@ -117,7 +117,7 @@ endif ci-rule = ( CC=$$(which $1); if [ -n "$$CC" ]; then \ echo -n "probe by $2 ($$(readlink -f $$(which $$CC))): " && \ $(MAKE) clean >$1.log 2>$1.err && \ - $(MAKE) CC=$$(readlink -f $$CC) XCFLAGS="-UNDEBUG -DMDB_DEBUG=2" all check 1>$1.log 2>$1.err && echo "OK" \ + $(MAKE) CC=$$(readlink -f $$CC) XCFLAGS="-UNDEBUG -DMDBX_DEBUG=2" all check 1>$1.log 2>$1.err && echo "OK" \ || ( echo "FAILED"; cat $1.err >&2; exit 1 ); \ else echo "no $2 ($1) for probe"; fi; ) ci: diff --git a/README.md b/README.md index a23da3b7..4c85905e 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ _libmdbx_ наследует все ключевые возможности и [MVCC](https://ru.wikipedia.org/wiki/MVCC) и 
[COW](https://ru.wikipedia.org/wiki/%D0%9A%D0%BE%D0%BF%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%BF%D1%80%D0%B8_%D0%B7%D0%B0%D0%BF%D0%B8%D1%81%D0%B8). Изменения строго последовательны и не блокируются чтением, -   конфликты между транзакциями не возможны. + конфликты между транзакциями не возможны. При этом гарантируется чтение только зафиксированных данных, см [relaxing serializability](https://en.wikipedia.org/wiki/Serializability). 4. Чтение и поиск [без блокировок](https://ru.wikipedia.org/wiki/%D0%9D%D0%B5%D0%B1%D0%BB%D0%BE%D0%BA%D0%B8%D1%80%D1%83%D1%8E%D1%89%D0%B0%D1%8F_%D1%81%D0%B8%D0%BD%D1%85%D1%80%D0%BE%D0%BD%D0%B8%D0%B7%D0%B0%D1%86%D0%B8%D1%8F), @@ -305,7 +305,7 @@ RECLAIM` в _libmdbx_. посредством `mdbx_cursor_eof()`. 10. Возможность явно запросить обновление существующей записи, без -создания новой посредством флажка `MDB_CURRENT` для `mdbx_put()`. +создания новой посредством флажка `MDBX_CURRENT` для `mdbx_put()`. 11. Возможность обновить или удалить запись с получением предыдущего значения данных посредством `mdbx_replace()`. @@ -328,7 +328,7 @@ RECLAIM` в _libmdbx_. который используется одним из читателей. 17. Функция `mdbx_del()` не игнорирует дополнительный (уточняющий) -аргумент `data` для таблиц без дубликатов (без флажка `MDB_DUPSORT`), а +аргумент `data` для таблиц без дубликатов (без флажка `MDBX_DUPSORT`), а при его ненулевом значении всегда использует его для сверки с удаляемой записью. @@ -342,7 +342,7 @@ RECLAIM` в _libmdbx_. изменениях, иначе они будут неизменны). 20. Корректное обновление текущей записи, в том числе сортированного -дубликата, при использовании режима `MDB_CURRENT` в `mdbx_cursor_put()`. +дубликата, при использовании режима `MDBX_CURRENT` в `mdbx_cursor_put()`. 21. 
Все курсоры, как в транзакциях только для чтения, так и в пишущих, могут быть переиспользованы посредством `mdbx_cursor_renew()` и ДОЛЖНЫ diff --git a/dll.vcxproj b/dll.vcxproj index 13cee8f4..8c179768 100644 --- a/dll.vcxproj +++ b/dll.vcxproj @@ -77,7 +77,7 @@ - WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions);MDB_DEBUG=1 + WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions);MDBX_DEBUG=1 MultiThreadedDebugDLL Level3 ProgramDatabase @@ -121,7 +121,7 @@ Level3 - WIN64;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions);MDB_DEBUG=1 + WIN64;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions);MDBX_DEBUG=1 MultiThreadedDebugDLL true diff --git a/mdbx.h b/mdbx.h index 1715070f..eb2f4cfd 100644 --- a/mdbx.h +++ b/mdbx.h @@ -97,7 +97,7 @@ extern LIBMDBX_API const struct mdbx_build_info mdbx_build; * * A DB environment supports multiple databases, all residing in the same * shared-memory map. */ -typedef struct MDB_env MDB_env; +typedef struct MDBX_env MDBX_env; /* Opaque structure for a transaction handle. * @@ -106,10 +106,10 @@ typedef struct MDB_env MDB_env; typedef struct MDBX_txn MDBX_txn; /* A handle for an individual database in the DB environment. */ -typedef uint32_t MDB_dbi; +typedef uint32_t MDBX_dbi; /* Opaque structure for navigating through a database */ -typedef struct MDB_cursor MDB_cursor; +typedef struct MDBX_cursor MDBX_cursor; /* Generic structure used for passing keys and data in and out * of the database. @@ -119,7 +119,7 @@ typedef struct MDB_cursor MDB_cursor; * free them, they commonly point into the database itself. * * Key sizes must be between 1 and mdbx_env_get_maxkeysize() inclusive. - * The same applies to data sizes in databases with the MDB_DUPSORT flag. + * The same applies to data sizes in databases with the MDBX_DUPSORT flag. * Other data items can in theory be from 0 to 0xffffffff bytes long. 
*/ #ifndef HAVE_STRUCT_IOVEC struct iovec { @@ -136,172 +136,172 @@ typedef struct iovec MDBX_val; #define MDBX_MAXDATASIZE INT32_MAX /* A callback function used to compare two keys in a database */ -typedef int(MDB_cmp_func)(const MDBX_val *a, const MDBX_val *b); +typedef int(MDBX_cmp_func)(const MDBX_val *a, const MDBX_val *b); /* Environment Flags */ /* no environment directory */ -#define MDB_NOSUBDIR 0x4000u +#define MDBX_NOSUBDIR 0x4000u /* don't fsync after commit */ -#define MDB_NOSYNC 0x10000u +#define MDBX_NOSYNC 0x10000u /* read only */ -#define MDB_RDONLY 0x20000u +#define MDBX_RDONLY 0x20000u /* don't fsync metapage after commit */ -#define MDB_NOMETASYNC 0x40000u +#define MDBX_NOMETASYNC 0x40000u /* use writable mmap */ -#define MDB_WRITEMAP 0x80000u -/* use asynchronous msync when MDB_WRITEMAP is used */ -#define MDB_MAPASYNC 0x100000u +#define MDBX_WRITEMAP 0x80000u +/* use asynchronous msync when MDBX_WRITEMAP is used */ +#define MDBX_MAPASYNC 0x100000u /* tie reader locktable slots to MDBX_txn objects instead of to threads */ -#define MDB_NOTLS 0x200000u +#define MDBX_NOTLS 0x200000u /* don't do any locking, caller must manage their own locks * WARNING: libmdbx don't support this mode. 
*/ -#define MDB_NOLOCK__UNSUPPORTED 0x400000u +#define MDBX_NOLOCK__UNSUPPORTED 0x400000u /* don't do readahead */ -#define MDB_NORDAHEAD 0x800000u +#define MDBX_NORDAHEAD 0x800000u /* don't initialize malloc'd memory before writing to datafile */ -#define MDB_NOMEMINIT 0x1000000u +#define MDBX_NOMEMINIT 0x1000000u /* aim to coalesce FreeDB records */ #define MDBX_COALESCE 0x2000000u /* LIFO policy for reclaiming FreeDB records */ #define MDBX_LIFORECLAIM 0x4000000u /* make a steady-sync only on close and explicit env-sync */ -#define MDBX_UTTERLY_NOSYNC (MDB_NOSYNC | MDB_MAPASYNC) +#define MDBX_UTTERLY_NOSYNC (MDBX_NOSYNC | MDBX_MAPASYNC) /* debuging option, fill/perturb released pages */ #define MDBX_PAGEPERTURB 0x8000000u /* Database Flags */ /* use reverse string keys */ -#define MDB_REVERSEKEY 0x02u +#define MDBX_REVERSEKEY 0x02u /* use sorted duplicates */ -#define MDB_DUPSORT 0x04u +#define MDBX_DUPSORT 0x04u /* numeric keys in native byte order, either uint32_t or uint64_t. * The keys must all be of the same size. */ -#define MDB_INTEGERKEY 0x08u -/* with MDB_DUPSORT, sorted dup items have fixed size */ -#define MDB_DUPFIXED 0x10u -/* with MDB_DUPSORT, dups are MDB_INTEGERKEY-style integers */ -#define MDB_INTEGERDUP 0x20u -/* with MDB_DUPSORT, use reverse string dups */ -#define MDB_REVERSEDUP 0x40u +#define MDBX_INTEGERKEY 0x08u +/* with MDBX_DUPSORT, sorted dup items have fixed size */ +#define MDBX_DUPFIXED 0x10u +/* with MDBX_DUPSORT, dups are MDBX_INTEGERKEY-style integers */ +#define MDBX_INTEGERDUP 0x20u +/* with MDBX_DUPSORT, use reverse string dups */ +#define MDBX_REVERSEDUP 0x40u /* create DB if not already existing */ -#define MDB_CREATE 0x40000u +#define MDBX_CREATE 0x40000u /* Write Flags */ /* For put: Don't write if the key already exists. */ -#define MDB_NOOVERWRITE 0x10u -/* Only for MDB_DUPSORT +#define MDBX_NOOVERWRITE 0x10u +/* Only for MDBX_DUPSORT * For put: don't write if the key and data pair already exist. 
* For mdbx_cursor_del: remove all duplicate data items. */ -#define MDB_NODUPDATA 0x20u +#define MDBX_NODUPDATA 0x20u /* For mdbx_cursor_put: overwrite the current key/data pair * MDBX allows this flag for mdbx_put() for explicit overwrite/update without * insertion. */ -#define MDB_CURRENT 0x40u +#define MDBX_CURRENT 0x40u /* For put: Just reserve space for data, don't copy it. Return a * pointer to the reserved space. */ -#define MDB_RESERVE 0x10000u +#define MDBX_RESERVE 0x10000u /* Data is being appended, don't split full pages. */ -#define MDB_APPEND 0x20000u +#define MDBX_APPEND 0x20000u /* Duplicate data is being appended, don't split full pages. */ -#define MDB_APPENDDUP 0x40000u -/* Store multiple data items in one call. Only for MDB_DUPFIXED. */ -#define MDB_MULTIPLE 0x80000u +#define MDBX_APPENDDUP 0x40000u +/* Store multiple data items in one call. Only for MDBX_DUPFIXED. */ +#define MDBX_MULTIPLE 0x80000u /* Copy Flags */ /* Compacting copy: Omit free space from copy, and renumber all * pages sequentially. */ -#define MDB_CP_COMPACT 1u +#define MDBX_CP_COMPACT 1u /* Cursor Get operations. * * This is the set of all operations for retrieving data * using a cursor. */ -typedef enum MDB_cursor_op { - MDB_FIRST, /* Position at first key/data item */ - MDB_FIRST_DUP, /* MDB_DUPSORT-only: Position at first data item +typedef enum MDBX_cursor_op { + MDBX_FIRST, /* Position at first key/data item */ + MDBX_FIRST_DUP, /* MDBX_DUPSORT-only: Position at first data item * of current key. */ - MDB_GET_BOTH, /* MDB_DUPSORT-only: Position at key/data pair. */ - MDB_GET_BOTH_RANGE, /* MDB_DUPSORT-only: position at key, nearest data. */ - MDB_GET_CURRENT, /* Return key/data at current cursor position */ - MDB_GET_MULTIPLE, /* MDB_DUPFIXED-only: Return key and up to a page of + MDBX_GET_BOTH, /* MDBX_DUPSORT-only: Position at key/data pair. */ + MDBX_GET_BOTH_RANGE, /* MDBX_DUPSORT-only: position at key, nearest data. 
*/ + MDBX_GET_CURRENT, /* Return key/data at current cursor position */ + MDBX_GET_MULTIPLE, /* MDBX_DUPFIXED-only: Return key and up to a page of * duplicate data items from current cursor position. - * Move cursor to prepare for MDB_NEXT_MULTIPLE.*/ - MDB_LAST, /* Position at last key/data item */ - MDB_LAST_DUP, /* MDB_DUPSORT-only: Position at last data item + * Move cursor to prepare for MDBX_NEXT_MULTIPLE.*/ + MDBX_LAST, /* Position at last key/data item */ + MDBX_LAST_DUP, /* MDBX_DUPSORT-only: Position at last data item * of current key. */ - MDB_NEXT, /* Position at next data item */ - MDB_NEXT_DUP, /* MDB_DUPSORT-only: Position at next data item + MDBX_NEXT, /* Position at next data item */ + MDBX_NEXT_DUP, /* MDBX_DUPSORT-only: Position at next data item * of current key. */ - MDB_NEXT_MULTIPLE, /* MDB_DUPFIXED-only: Return key and up to a page of + MDBX_NEXT_MULTIPLE, /* MDBX_DUPFIXED-only: Return key and up to a page of * duplicate data items from next cursor position. - * Move cursor to prepare for MDB_NEXT_MULTIPLE. */ - MDB_NEXT_NODUP, /* Position at first data item of next key */ - MDB_PREV, /* Position at previous data item */ - MDB_PREV_DUP, /* MDB_DUPSORT-only: Position at previous data item + * Move cursor to prepare for MDBX_NEXT_MULTIPLE. */ + MDBX_NEXT_NODUP, /* Position at first data item of next key */ + MDBX_PREV, /* Position at previous data item */ + MDBX_PREV_DUP, /* MDBX_DUPSORT-only: Position at previous data item * of current key. 
*/ - MDB_PREV_NODUP, /* Position at last data item of previous key */ - MDB_SET, /* Position at specified key */ - MDB_SET_KEY, /* Position at specified key, return both key and data */ - MDB_SET_RANGE, /* Position at first key greater than or equal to + MDBX_PREV_NODUP, /* Position at last data item of previous key */ + MDBX_SET, /* Position at specified key */ + MDBX_SET_KEY, /* Position at specified key, return both key and data */ + MDBX_SET_RANGE, /* Position at first key greater than or equal to * specified key. */ - MDB_PREV_MULTIPLE /* MDB_DUPFIXED-only: Position at previous page and + MDBX_PREV_MULTIPLE /* MDBX_DUPFIXED-only: Position at previous page and * return key and up to a page of duplicate data items. */ -} MDB_cursor_op; +} MDBX_cursor_op; /* Return Codes * BerkeleyDB uses -30800 to -30999, we'll go under them */ /* Successful result */ -#define MDB_SUCCESS 0 -#define MDBX_RESULT_FALSE MDB_SUCCESS +#define MDBX_SUCCESS 0 +#define MDBX_RESULT_FALSE MDBX_SUCCESS #define MDBX_RESULT_TRUE (-1) /* key/data pair already exists */ -#define MDB_KEYEXIST (-30799) +#define MDBX_KEYEXIST (-30799) /* key/data pair not found (EOF) */ -#define MDB_NOTFOUND (-30798) +#define MDBX_NOTFOUND (-30798) /* Requested page not found - this usually indicates corruption */ -#define MDB_PAGE_NOTFOUND (-30797) +#define MDBX_PAGE_NOTFOUND (-30797) /* Located page was wrong type */ -#define MDB_CORRUPTED (-30796) +#define MDBX_CORRUPTED (-30796) /* Update of meta page failed or environment had fatal error */ -#define MDB_PANIC (-30795) +#define MDBX_PANIC (-30795) /* DB file version mismatch with libmdbx */ -#define MDB_VERSION_MISMATCH (-30794) -/* File is not a valid LMDB file */ -#define MDB_INVALID (-30793) +#define MDBX_VERSION_MISMATCH (-30794) +/* File is not a valid MDBX file */ +#define MDBX_INVALID (-30793) /* Environment mapsize reached */ -#define MDB_MAP_FULL (-30792) +#define MDBX_MAP_FULL (-30792) /* Environment maxdbs reached */ -#define MDB_DBS_FULL (-30791) 
+#define MDBX_DBS_FULL (-30791) /* Environment maxreaders reached */ -#define MDB_READERS_FULL (-30790) +#define MDBX_READERS_FULL (-30790) /* Txn has too many dirty pages */ -#define MDB_TXN_FULL (-30788) +#define MDBX_TXN_FULL (-30788) /* Cursor stack too deep - internal error */ -#define MDB_CURSOR_FULL (-30787) +#define MDBX_CURSOR_FULL (-30787) /* Page has not enough space - internal error */ -#define MDB_PAGE_FULL (-30786) +#define MDBX_PAGE_FULL (-30786) /* Database contents grew beyond environment mapsize */ -#define MDB_MAP_RESIZED (-30785) +#define MDBX_MAP_RESIZED (-30785) /* Operation and DB incompatible, or DB type changed. This can mean: - * - The operation expects an MDB_DUPSORT / MDB_DUPFIXED database. - * - Opening a named DB when the unnamed DB has MDB_DUPSORT/MDB_INTEGERKEY. + * - The operation expects an MDBX_DUPSORT / MDBX_DUPFIXED database. + * - Opening a named DB when the unnamed DB has MDBX_DUPSORT/MDBX_INTEGERKEY. * - Accessing a data record as a database, or vice versa. * - The database was dropped and recreated with different flags. */ -#define MDB_INCOMPATIBLE (-30784) +#define MDBX_INCOMPATIBLE (-30784) /* Invalid reuse of reader locktable slot */ -#define MDB_BAD_RSLOT (-30783) +#define MDBX_BAD_RSLOT (-30783) /* Transaction must abort, has a child, or is invalid */ -#define MDB_BAD_TXN (-30782) +#define MDBX_BAD_TXN (-30782) /* Unsupported size of key/DB name/data, or wrong DUPFIXED size */ -#define MDB_BAD_VALSIZE (-30781) +#define MDBX_BAD_VALSIZE (-30781) /* The specified DBI was changed unexpectedly */ -#define MDB_BAD_DBI (-30780) +#define MDBX_BAD_DBI (-30780) /* Unexpected problem - txn should abort */ -#define MDB_PROBLEM (-30779) +#define MDBX_PROBLEM (-30779) /* The last defined error code */ -#define MDB_LAST_ERRCODE MDB_PROBLEM +#define MDBX_LAST_ERRCODE MDBX_PROBLEM /* The mdbx_put() or mdbx_replace() was called for key, that has more that one associated value. 
*/ @@ -317,7 +317,7 @@ typedef enum MDB_cursor_op { #define MDBX_WANNA_RECOVERY (-30419) /* The given key value is mismatched to the current cursor position, - * when mdbx_cursor_put() called with MDB_CURRENT option. */ + * when mdbx_cursor_put() called with MDBX_CURRENT option. */ #define MDBX_EKEYMISMATCH (-30418) /* Statistics for a database in the environment */ @@ -349,7 +349,7 @@ typedef struct MDBX_envinfo { * This function is a superset of the ANSI C X3.159-1989 (ANSI C) strerror(3) * function. If the error code is greater than or equal to 0, then the string * returned by the system function strerror(3) is returned. If the error code - * is less than 0, an error string corresponding to the LMDB library error is + * is less than 0, an error string corresponding to the MDBX library error is * returned. See errors for a list of MDBX-specific error codes. * * [in] err The error code @@ -358,9 +358,9 @@ typedef struct MDBX_envinfo { LIBMDBX_API const char *mdbx_strerror(int errnum); LIBMDBX_API const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen); -/* Create an LMDB environment handle. +/* Create an MDBX environment handle. * - * This function allocates memory for a MDB_env structure. To release + * This function allocates memory for a MDBX_env structure. To release * the allocated memory and discard the handle, call mdbx_env_close(). * Before the handle may be used, it must be opened using mdbx_env_open(). * Various other options may also need to be set before opening the handle, @@ -370,12 +370,12 @@ LIBMDBX_API const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen); * [out] env The address where the new handle will be stored * * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_create(MDB_env **penv); +LIBMDBX_API int mdbx_env_create(MDBX_env **penv); /* Open an environment handle. * * If this function fails, mdbx_env_close() must be called to discard - * the MDB_env handle. 
+ * the MDBX_env handle. * * [in] env An environment handle returned by mdbx_env_create() * [in] path The directory in which the database files reside. @@ -385,65 +385,65 @@ LIBMDBX_API int mdbx_env_create(MDB_env **penv); * or more of the values described here. * * Flags set by mdbx_env_set_flags() are also used: - * - MDB_NOSUBDIR - * By default, LMDB creates its environment in a directory whose + * - MDBX_NOSUBDIR + * By default, MDBX creates its environment in a directory whose * pathname is given in path, and creates its data and lock files * under that directory. With this option, path is used as-is for * the database main data file. The database lock file is the path * with "-lock" appended. * - * - MDB_RDONLY + * - MDBX_RDONLY * Open the environment in read-only mode. No write operations will - * be allowed. LMDB will still modify the lock file - except on + * be allowed. MDBX will still modify the lock file - except on * read-only filesystems, where MDBX does not use locks. * - * - MDB_WRITEMAP - * Use a writeable memory map unless MDB_RDONLY is set. This uses fewer + * - MDBX_WRITEMAP + * Use a writeable memory map unless MDBX_RDONLY is set. This uses fewer * mallocs but loses protection from application bugs like wild pointer * writes and other bad updates into the database. * This may be slightly faster for DBs that fit entirely in RAM, * but is slower for DBs larger than RAM. * Incompatible with nested transactions. - * Do not mix processes with and without MDB_WRITEMAP on the same + * Do not mix processes with and without MDBX_WRITEMAP on the same * environment. This can defeat durability (mdbx_env_sync etc). * - * - MDB_NOMETASYNC + * - MDBX_NOMETASYNC * Flush system buffers to disk only once per transaction, omit the * metadata flush. Defer that until the system flushes files to disk, - * or next non-MDB_RDONLY commit or mdbx_env_sync(). This optimization + * or next non-MDBX_RDONLY commit or mdbx_env_sync(). 
This optimization * maintains database integrity, but a system crash may undo the last * committed transaction. I.e. it preserves the ACI (atomicity, * consistency, isolation) but not D (durability) database property. * This flag may be changed at any time using mdbx_env_set_flags(). * - * - MDB_NOSYNC + * - MDBX_NOSYNC * Don't flush system buffers to disk when committing a transaction. * This optimization means a system crash can corrupt the database or * lose the last transactions if buffers are not yet flushed to disk. * The risk is governed by how often the system flushes dirty buffers * to disk and how often mdbx_env_sync() is called. However, if the - * filesystem preserves write order and the MDB_WRITEMAP and/or + * filesystem preserves write order and the MDBX_WRITEMAP and/or * MDBX_LIFORECLAIM flags are not used, transactions exhibit ACI * (atomicity, consistency, isolation) properties and only lose D * (durability). I.e. database integrity is maintained, but a system * crash may undo the final transactions. * - * Note that (MDB_NOSYNC | MDB_WRITEMAP) leaves the system with no + * Note that (MDBX_NOSYNC | MDBX_WRITEMAP) leaves the system with no * hint for when to write transactions to disk. - * Therefore the (MDB_MAPASYNC | MDB_WRITEMAP) may be preferable. + * Therefore the (MDBX_MAPASYNC | MDBX_WRITEMAP) may be preferable. * This flag may be changed at any time using mdbx_env_set_flags(). * - * - MDBX_UTTERLY_NOSYNC (internally MDB_NOSYNC | MDB_MAPASYNC) + * - MDBX_UTTERLY_NOSYNC (internally MDBX_NOSYNC | MDBX_MAPASYNC) * FIXME: TODO * - * - MDB_MAPASYNC - * When using MDB_WRITEMAP, use asynchronous flushes to disk. As with - * MDB_NOSYNC, a system crash can then corrupt the database or lose + * - MDBX_MAPASYNC + * When using MDBX_WRITEMAP, use asynchronous flushes to disk. As with + * MDBX_NOSYNC, a system crash can then corrupt the database or lose * the last transactions. Calling mdbx_env_sync() ensures on-disk * database integrity until next commit. 
This flag may be changed at * any time using mdbx_env_set_flags(). * - * - MDB_NOTLS + * - MDBX_NOTLS * Don't use Thread-Local Storage. Tie reader locktable slots to * MDBX_txn objects instead of to threads. I.e. mdbx_txn_reset() keeps * the slot reseved for the MDBX_txn object. A thread may use parallel @@ -451,9 +451,9 @@ LIBMDBX_API int mdbx_env_create(MDB_env **penv); * the user synchronizes its use. Applications that multiplex many * user threads over individual OS threads need this option. Such an * application must also serialize the write transactions in an OS - * thread, since LMDB's write locking is unaware of the user threads. + * thread, since MDBX's write locking is unaware of the user threads. * - * - MDB_NOLOCK (don't supported by MDBX) + * - MDBX_NOLOCK (don't supported by MDBX) * Don't do any locking. If concurrent access is anticipated, the * caller must manage all concurrency itself. For proper operation * the caller must enforce single-writer semantics, and must ensure @@ -461,13 +461,13 @@ LIBMDBX_API int mdbx_env_create(MDB_env **penv); * active. The simplest approach is to use an exclusive lock so that * no readers may be active at all when a writer begins. * - * - MDB_NORDAHEAD + * - MDBX_NORDAHEAD * Turn off readahead. Most operating systems perform readahead on * read requests by default. This option turns it off if the OS * supports it. Turning it off may help random read performance * when the DB is larger than RAM and system RAM is full. * - * - MDB_NOMEMINIT + * - MDBX_NOMEMINIT * Don't initialize malloc'd memory before writing to unused spaces * in the data file. By default, memory for pages written to the data * file is obtained using malloc. While these pages may be reused in @@ -480,9 +480,9 @@ LIBMDBX_API int mdbx_env_create(MDB_env **penv); * cost so some applications may want to disable it using this flag. 
This * option can be a problem for applications which handle sensitive data * like passwords, and it makes memory checkers like Valgrind noisy. This - * flag is not needed with MDB_WRITEMAP, which writes directly to the + * flag is not needed with MDBX_WRITEMAP, which writes directly to the * mmap instead of using malloc for pages. The initialization is also - * skipped if MDB_RESERVE is used; the caller is expected to overwrite + * skipped if MDBX_RESERVE is used; the caller is expected to overwrite * all of the memory that was reserved in that case. This flag may be * changed at any time using mdbx_env_set_flags(). * @@ -493,27 +493,27 @@ LIBMDBX_API int mdbx_env_create(MDB_env **penv); * * - MDBX_LIFORECLAIM * LIFO policy for reclaiming FreeDB records. This significantly reduce - * write IPOs in case MDB_NOSYNC with periodically checkpoints. + * write IPOs in case MDBX_NOSYNC with periodically checkpoints. * FIXME: TODO * * [in] mode The UNIX permissions to set on created files. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_VERSION_MISMATCH - the version of the LMDB library doesn't match the + * - MDBX_VERSION_MISMATCH - the version of the MDBX library doesn't match the * version that created the database environment. - * - MDB_INVALID - the environment file headers are corrupted. + * - MDBX_INVALID - the environment file headers are corrupted. * - MDBX_ENOENT - the directory specified by the path parameter * doesn't exist. * - MDBX_EACCES - the user didn't have permission to access * the environment files. * - MDBX_EAGAIN - the environment was locked by another process. 
*/ -LIBMDBX_API int mdbx_env_open(MDB_env *env, const char *path, unsigned flags, +LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, mode_t mode); -LIBMDBX_API int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, - mode_t mode, int *exclusive); +LIBMDBX_API int mdbx_env_open_ex(MDBX_env *env, const char *path, + unsigned flags, mode_t mode, int *exclusive); -/* Copy an LMDB environment to the specified path, with options. +/* Copy an MDBX environment to the specified path, with options. * * This function may be used to make a backup of an existing environment. * No lockfile is created, since it gets recreated at need. @@ -529,7 +529,7 @@ LIBMDBX_API int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, * to 0 or by bitwise OR'ing together one or more of the values * described here: * - * - MDB_CP_COMPACT + * - MDBX_CP_COMPACT * Perform compaction while copying: omit free pages and sequentially * renumber all pages in output. This option consumes little bit more * CPU for processing, but may running quickly than the default, on @@ -538,9 +538,9 @@ LIBMDBX_API int mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, * NOTE: Currently it fails if the environment has suffered a page leak. * * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_copy(MDB_env *env, const char *path, unsigned flags); +LIBMDBX_API int mdbx_env_copy(MDBX_env *env, const char *path, unsigned flags); -/* Copy an LMDB environment to the specified file descriptor, +/* Copy an MDBX environment to the specified file descriptor, * with options. * * This function may be used to make a backup of an existing environment. @@ -559,42 +559,42 @@ LIBMDBX_API int mdbx_env_copy(MDB_env *env, const char *path, unsigned flags); * options. * * Returns A non-zero error value on failure and 0 on success. 
*/ -LIBMDBX_API int mdbx_env_copy2fd(MDB_env *env, mdbx_filehandle_t fd, +LIBMDBX_API int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, unsigned flags); -/* Return statistics about the LMDB environment. +/* Return statistics about the MDBX environment. * * [in] env An environment handle returned by mdbx_env_create() - * [out] stat The address of an MDB_stat structure where the statistics + * [out] stat The address of an MDBX_stat structure where the statistics * will be copied */ -LIBMDBX_API int mdbx_env_stat(MDB_env *env, MDBX_stat *stat, size_t bytes); +LIBMDBX_API int mdbx_env_stat(MDBX_env *env, MDBX_stat *stat, size_t bytes); -/* Return information about the LMDB environment. +/* Return information about the MDBX environment. * * [in] env An environment handle returned by mdbx_env_create() - * [out] stat The address of an MDB_envinfo structure + * [out] stat The address of an MDBX_envinfo structure * where the information will be copied */ -LIBMDBX_API int mdbx_env_info(MDB_env *env, MDBX_envinfo *info, size_t bytes); +LIBMDBX_API int mdbx_env_info(MDBX_env *env, MDBX_envinfo *info, size_t bytes); /* Flush the data buffers to disk. * * Data is always written to disk when mdbx_txn_commit() is called, - * but the operating system may keep it buffered. LMDB always flushes + * but the operating system may keep it buffered. MDBX always flushes * the OS buffers upon commit as well, unless the environment was - * opened with MDB_NOSYNC or in part MDB_NOMETASYNC. This call is - * not valid if the environment was opened with MDB_RDONLY. + * opened with MDBX_NOSYNC or in part MDBX_NOMETASYNC. This call is + * not valid if the environment was opened with MDBX_RDONLY. * * [in] env An environment handle returned by mdbx_env_create() * [in] force If non-zero, force a synchronous flush. Otherwise if the - * environment has the MDB_NOSYNC flag set the flushes will be - * omitted, and with MDB_MAPASYNC they will be asynchronous. 
+ * environment has the MDBX_NOSYNC flag set the flushes will be + * omitted, and with MDBX_MAPASYNC they will be asynchronous. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EACCES - the environment is read-only. * - MDBX_EINVAL - an invalid parameter was specified. * - MDBX_EIO - an error occurred during synchronization. */ -LIBMDBX_API int mdbx_env_sync(MDB_env *env, int force); +LIBMDBX_API int mdbx_env_sync(MDBX_env *env, int force); /* Close the environment and release the memory map. * @@ -611,7 +611,7 @@ LIBMDBX_API int mdbx_env_sync(MDB_env *env, int force); * on opening next time, and transactions since the last non-weak * checkpoint (meta-page update) will rolledback for consistency * guarantee. */ -LIBMDBX_API void mdbx_env_close(MDB_env *env); +LIBMDBX_API void mdbx_env_close(MDBX_env *env); /* Set environment flags. * @@ -626,7 +626,7 @@ LIBMDBX_API void mdbx_env_close(MDB_env *env); * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff); +LIBMDBX_API int mdbx_env_set_flags(MDBX_env *env, unsigned flags, int onoff); /* Get environment flags. * @@ -636,7 +636,7 @@ LIBMDBX_API int mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff); * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_env_get_flags(MDB_env *env, unsigned *flags); +LIBMDBX_API int mdbx_env_get_flags(MDBX_env *env, unsigned *flags); /* Return the path that was used in mdbx_env_open(). * @@ -648,7 +648,7 @@ LIBMDBX_API int mdbx_env_get_flags(MDB_env *env, unsigned *flags); * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified. 
*/ -LIBMDBX_API int mdbx_env_get_path(MDB_env *env, const char **path); +LIBMDBX_API int mdbx_env_get_path(MDBX_env *env, const char **path); /* Return the file descriptor for the given environment. * @@ -661,7 +661,7 @@ LIBMDBX_API int mdbx_env_get_path(MDB_env *env, const char **path); * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *fd); +LIBMDBX_API int mdbx_env_get_fd(MDBX_env *env, mdbx_filehandle_t *fd); /* Set the size of the memory map to use for this environment. * @@ -681,7 +681,7 @@ LIBMDBX_API int mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *fd); * * If the mapsize is increased by another process, and data has grown * beyond the range of the current mapsize, mdbx_txn_begin() will - * return MDB_MAP_RESIZED. This function may be called with a size + * return MDBX_MAP_RESIZED. This function may be called with a size * of zero to adopt the new size. * * Any attempt to set a size smaller than the space already consumed by the @@ -694,7 +694,7 @@ LIBMDBX_API int mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *fd); * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified, * or the environment has an active write transaction. */ -LIBMDBX_API int mdbx_env_set_mapsize(MDB_env *env, size_t size); +LIBMDBX_API int mdbx_env_set_mapsize(MDBX_env *env, size_t size); /* Set the maximum number of threads/reader slots for the environment. * @@ -702,8 +702,8 @@ LIBMDBX_API int mdbx_env_set_mapsize(MDB_env *env, size_t size); * readers in the the environment. The default is 61. * Starting a read-only transaction normally ties a lock table slot to the * current thread until the environment closes or the thread exits. If - * MDB_NOTLS is in use, mdbx_txn_begin() instead ties the slot to the - * MDBX_txn object until it or the MDB_env object is destroyed. 
+ * MDBX_NOTLS is in use, mdbx_txn_begin() instead ties the slot to the + * MDBX_txn object until it or the MDBX_env object is destroyed. * This function may only be called after mdbx_env_create() and before * mdbx_env_open(). * @@ -714,7 +714,7 @@ LIBMDBX_API int mdbx_env_set_mapsize(MDB_env *env, size_t size); * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified, * or the environment is already open. */ -LIBMDBX_API int mdbx_env_set_maxreaders(MDB_env *env, unsigned readers); +LIBMDBX_API int mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers); /* Get the maximum number of threads/reader slots for the environment. * @@ -724,7 +724,7 @@ LIBMDBX_API int mdbx_env_set_maxreaders(MDB_env *env, unsigned readers); * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers); +LIBMDBX_API int mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers); /* Set the maximum number of named databases for the environment. * @@ -745,48 +745,48 @@ LIBMDBX_API int mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers); * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified, * or the environment is already open. */ -LIBMDBX_API int mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); +LIBMDBX_API int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs); -/* Get the maximum size of keys and MDB_DUPSORT data we can write. +/* Get the maximum size of keys and MDBX_DUPSORT data we can write. * * [in] env An environment handle returned by mdbx_env_create() * * Returns The maximum size of a key we can write. */ -LIBMDBX_API int mdbx_env_get_maxkeysize(MDB_env *env); +LIBMDBX_API int mdbx_env_get_maxkeysize(MDBX_env *env); LIBMDBX_API int mdbx_get_maxkeysize(size_t pagesize); -/* Set application information associated with the MDB_env. 
+/* Set application information associated with the MDBX_env. * * [in] env An environment handle returned by mdbx_env_create() * [in] ctx An arbitrary pointer for whatever the application needs. * * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_set_userctx(MDB_env *env, void *ctx); +LIBMDBX_API int mdbx_env_set_userctx(MDBX_env *env, void *ctx); -/* Get the application information associated with the MDB_env. +/* Get the application information associated with the MDBX_env. * * [in] env An environment handle returned by mdbx_env_create() * Returns The pointer set by mdbx_env_set_userctx(). */ -LIBMDBX_API void *mdbx_env_get_userctx(MDB_env *env); +LIBMDBX_API void *mdbx_env_get_userctx(MDBX_env *env); -/* A callback function for most LMDB assert() failures, +/* A callback function for most MDBX assert() failures, * called before printing the message and aborting. * * [in] env An environment handle returned by mdbx_env_create(). * [in] msg The assertion message, not including newline. */ -typedef void MDB_assert_func(MDB_env *env, const char *msg, - const char *function, unsigned line); +typedef void MDBX_assert_func(MDBX_env *env, const char *msg, + const char *function, unsigned line); /* Set or reset the assert() callback of the environment. * - * Disabled if liblmdb is buillt with MDB_DEBUG=0. - * NOTE: This hack should become obsolete as lmdb's error handling matures. + * Disabled if libmdbx is buillt with MDBX_DEBUG=0. + * NOTE: This hack should become obsolete as mdbx's error handling matures. * * [in] env An environment handle returned by mdbx_env_create(). - * [in] func An MDB_assert_func function, or 0. + * [in] func An MDBX_assert_func function, or 0. * * Returns A non-zero error value on failure and 0 on success. 
*/ -LIBMDBX_API int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); +LIBMDBX_API int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func); /* Create a transaction for use with the environment. * @@ -794,7 +794,7 @@ LIBMDBX_API int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); * or mdbx_txn_commit(). * NOTE: A transaction and its cursors must only be used by a single * thread, and a thread may only have a single transaction at a time. - * If MDB_NOTLS is in use, this does not apply to read-only transactions. + * If MDBX_NOTLS is in use, this does not apply to read-only transactions. * NOTE: Cursors may not span transactions. * * [in] env An environment handle returned by mdbx_env_create() @@ -808,29 +808,29 @@ LIBMDBX_API int mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func); * must be set to 0 or by bitwise OR'ing together one or more * of the values described here. * - * - MDB_RDONLY + * - MDBX_RDONLY * This transaction will not perform any write operations. * * [out] txn Address where the new MDBX_txn handle will be stored * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_PANIC - a fatal error occurred earlier and the environment + * - MDBX_PANIC - a fatal error occurred earlier and the environment * must be shut down. - * - MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's + * - MDBX_MAP_RESIZED - another process wrote data beyond this MDBX_env's * mapsize and this environment's map must be resized * as well. See mdbx_env_set_mapsize(). - * - MDB_READERS_FULL - a read-only transaction was requested and the reader + * - MDBX_READERS_FULL - a read-only transaction was requested and the reader * lock table is full. See mdbx_env_set_maxreaders(). * - MDBX_ENOMEM - out of memory. 
*/ -LIBMDBX_API int mdbx_txn_begin(MDB_env *env, MDBX_txn *parent, unsigned flags, +LIBMDBX_API int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, MDBX_txn **txn); -/* Returns the transaction's MDB_env +/* Returns the transaction's MDBX_env * * [in] txn A transaction handle returned by mdbx_txn_begin() */ -LIBMDBX_API MDB_env *mdbx_txn_env(MDBX_txn *txn); +LIBMDBX_API MDBX_env *mdbx_txn_env(MDBX_txn *txn); /* Return the transaction's ID. * @@ -878,10 +878,10 @@ LIBMDBX_API int mdbx_txn_abort(MDBX_txn *txn); * Abort the transaction like mdbx_txn_abort(), but keep the transaction * handle. Therefore mdbx_txn_renew() may reuse the handle. This saves * allocation overhead if the process will start a new read-only transaction - * soon, and also locking overhead if MDB_NOTLS is in use. The reader table + * soon, and also locking overhead if MDBX_NOTLS is in use. The reader table * lock is released, but the table slot stays tied to its thread or * MDBX_txn. Use mdbx_txn_abort() to discard a reset handle, and to free - * its lock table slot if MDB_NOTLS is in use. + * its lock table slot if MDBX_NOTLS is in use. * * Cursors opened within the transaction must not be used * again after this call, except with mdbx_cursor_renew(). @@ -904,7 +904,7 @@ LIBMDBX_API int mdbx_txn_reset(MDBX_txn *txn); * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_PANIC - a fatal error occurred earlier and the environment + * - MDBX_PANIC - a fatal error occurred earlier and the environment * must be shut down. * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_txn_renew(MDBX_txn *txn); @@ -937,61 +937,61 @@ LIBMDBX_API int mdbx_txn_renew(MDBX_txn *txn); * [in] flags Special options for this table. 
This parameter must be set * to 0 or by bitwise OR'ing together one or more of the values * described here: - * - MDB_REVERSEKEY + * - MDBX_REVERSEKEY * Keys are strings to be compared in reverse order, from the end * of the strings to the beginning. By default, Keys are treated as * strings and compared from beginning to end. - * - MDB_DUPSORT + * - MDBX_DUPSORT * Duplicate keys may be used in the table. Or, from another point of * view, keys may have multiple data items, stored in sorted order. By * default keys must be unique and may have only a single data item. - * - MDB_INTEGERKEY + * - MDBX_INTEGERKEY * Keys are binary integers in native byte order, either uin32_t or * uint64_t, and will be sorted as such. The keys must all be of the * same size. - * - MDB_DUPFIXED - * This flag may only be used in combination with MDB_DUPSORT. This + * - MDBX_DUPFIXED + * This flag may only be used in combination with MDBX_DUPSORT. This * option tells the library that the data items for this database are * all the same size, which allows further optimizations in storage and - * retrieval. When all data items are the same size, the MDB_GET_MULTIPLE, - * MDB_NEXT_MULTIPLE and MDB_PREV_MULTIPLE cursor operations may be used + * retrieval. When all data items are the same size, the MDBX_GET_MULTIPLE, + * MDBX_NEXT_MULTIPLE and MDBX_PREV_MULTIPLE cursor operations may be used * to retrieve multiple items at once. - * - MDB_INTEGERDUP + * - MDBX_INTEGERDUP * This option specifies that duplicate data items are binary integers, - * similar to MDB_INTEGERKEY keys. - * - MDB_REVERSEDUP + * similar to MDBX_INTEGERKEY keys. + * - MDBX_REVERSEDUP * This option specifies that duplicate data items should be compared as * strings in reverse order (the comparison is performed in the direction * from the last byte to the first). - * - MDB_CREATE + * - MDBX_CREATE * Create the named database if it doesn't exist. 
This option is not * allowed in a read-only transaction or a read-only environment. * - * [out] dbi Address where the new MDB_dbi handle will be stored + * [out] dbi Address where the new MDBX_dbi handle will be stored * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_NOTFOUND - the specified database doesn't exist in the - * environment and MDB_CREATE was not specified. - * - MDB_DBS_FULL - too many databases have been opened. + * - MDBX_NOTFOUND - the specified database doesn't exist in the + * environment and MDBX_CREATE was not specified. + * - MDBX_DBS_FULL - too many databases have been opened. * See mdbx_env_set_maxdbs(). */ LIBMDBX_API int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, - unsigned flags, MDB_dbi *dbi, - MDB_cmp_func *keycmp, MDB_cmp_func *datacmp); + unsigned flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); LIBMDBX_API int mdbx_dbi_open(MDBX_txn *txn, const char *name, unsigned flags, - MDB_dbi *dbi); + MDBX_dbi *dbi); /* Retrieve statistics for a database. * * [in] txn A transaction handle returned by mdbx_txn_begin() * [in] dbi A database handle returned by mdbx_dbi_open() - * [out] stat The address of an MDB_stat structure where the statistics + * [out] stat The address of an MDBX_stat structure where the statistics * will be copied * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_dbi_stat(MDBX_txn *txn, MDB_dbi dbi, MDBX_stat *stat, +LIBMDBX_API int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *stat, size_t bytes); /* Retrieve the DB flags for a database handle. @@ -1001,7 +1001,7 @@ LIBMDBX_API int mdbx_dbi_stat(MDBX_txn *txn, MDB_dbi dbi, MDBX_stat *stat, * [out] flags Address where the flags will be returned. * * Returns A non-zero error value on failure and 0 on success. 
*/ -LIBMDBX_API int mdbx_dbi_flags(MDBX_txn *txn, MDB_dbi dbi, unsigned *flags); +LIBMDBX_API int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags); /* Close a database handle. Normally unnecessary. * @@ -1011,7 +1011,7 @@ LIBMDBX_API int mdbx_dbi_flags(MDBX_txn *txn, MDB_dbi dbi, unsigned *flags); * the database handle or one of its cursors any further. Do not close * a handle if an existing transaction has modified its database. * Doing so can cause misbehavior from database corruption to errors - * like MDB_BAD_VALSIZE (since the DB name is gone). + * like MDBX_BAD_VALSIZE (since the DB name is gone). * * Closing a database handle is not necessary, but lets mdbx_dbi_open() * reuse the handle value. Usually it's better to set a bigger @@ -1020,7 +1020,7 @@ LIBMDBX_API int mdbx_dbi_flags(MDBX_txn *txn, MDB_dbi dbi, unsigned *flags); * [in] env An environment handle returned by mdbx_env_create() * [in] dbi A database handle returned by mdbx_dbi_open() */ -LIBMDBX_API int mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); +LIBMDBX_API int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi); /* Empty or delete+close a database. * @@ -1032,14 +1032,14 @@ LIBMDBX_API int mdbx_dbi_close(MDB_env *env, MDB_dbi dbi); * and close the DB handle. * * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_drop(MDBX_txn *txn, MDB_dbi dbi, int del); +LIBMDBX_API int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del); /* Get items from a database. * * This function retrieves key/data pairs from the database. The address * and length of the data associated with the specified key are returned * in the structure to which data refers. - * If the database supports duplicate keys (MDB_DUPSORT) then the + * If the database supports duplicate keys (MDBX_DUPSORT) then the * first data item for the key will be returned. Retrieval of other * items requires the use of mdbx_cursor_get(). 
* @@ -1058,9 +1058,9 @@ LIBMDBX_API int mdbx_drop(MDBX_txn *txn, MDB_dbi dbi, int del); * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_NOTFOUND - the key was not in the database. + * - MDBX_NOTFOUND - the key was not in the database. * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, +LIBMDBX_API int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data); /* Store items into a database. @@ -1068,7 +1068,7 @@ LIBMDBX_API int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, * This function stores key/data pairs in the database. The default behavior * is to enter the new key/data pair, replacing any previously existing key * if duplicates are disallowed, or adding a duplicate data item if - * duplicates are allowed (MDB_DUPSORT). + * duplicates are allowed (MDBX_DUPSORT). * * [in] txn A transaction handle returned by mdbx_txn_begin() * [in] dbi A database handle returned by mdbx_dbi_open() @@ -1078,49 +1078,49 @@ LIBMDBX_API int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, * set to 0 or by bitwise OR'ing together one or more of the * values described here. * - * - MDB_NODUPDATA + * - MDBX_NODUPDATA * Enter the new key/data pair only if it does not already appear * in the database. This flag may only be specified if the database - * was opened with MDB_DUPSORT. The function will return MDB_KEYEXIST + * was opened with MDBX_DUPSORT. The function will return MDBX_KEYEXIST * if the key/data pair already appears in the database. * - * - MDB_NOOVERWRITE + * - MDBX_NOOVERWRITE * Enter the new key/data pair only if the key does not already appear - * in the database. The function will return MDB_KEYEXIST if the key + * in the database. The function will return MDBX_KEYEXIST if the key * already appears in the database, even if the database supports - * duplicates (MDB_DUPSORT). 
The data parameter will be set to point + * duplicates (MDBX_DUPSORT). The data parameter will be set to point * to the existing item. * - * - MDB_CURRENT + * - MDBX_CURRENT * Update an single existing entry, but not add new ones. The function - * will return MDB_NOTFOUND if the given key not exist in the database. + * will return MDBX_NOTFOUND if the given key not exist in the database. * Or the MDBX_EMULTIVAL in case duplicates for the given key. * - * - MDB_RESERVE + * - MDBX_RESERVE * Reserve space for data of the given size, but don't copy the given * data. Instead, return a pointer to the reserved space, which the * caller can fill in later - before the next update operation or the * transaction ends. This saves an extra memcpy if the data is being * generated later. MDBX does nothing else with this memory, the caller * is expected to modify all of the space requested. This flag must not - * be specified if the database was opened with MDB_DUPSORT. + * be specified if the database was opened with MDBX_DUPSORT. * - * - MDB_APPEND + * - MDBX_APPEND * Append the given key/data pair to the end of the database. This option * allows fast bulk loading when keys are already known to be in the * correct order. Loading unsorted keys with this flag will cause * a MDBX_EKEYMISMATCH error. * - * - MDB_APPENDDUP + * - MDBX_APPENDDUP * As above, but for sorted dup data. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). - * - MDB_TXN_FULL - the transaction has too many dirty pages. + * - MDBX_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). + * - MDBX_TXN_FULL - the transaction has too many dirty pages. * - MDBX_EACCES - an attempt was made to write in a read-only transaction. * - MDBX_EINVAL - an invalid parameter was specified. 
*/ -LIBMDBX_API int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, +LIBMDBX_API int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, unsigned flags); /* Delete items from a database. @@ -1131,7 +1131,7 @@ LIBMDBX_API int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, * support sorted duplicate data items or not. If the data parameter * is non-NULL only the matching data item will be deleted. * - * This function will return MDB_NOTFOUND if the specified key/data + * This function will return MDBX_NOTFOUND if the specified key/data * pair is not in the database. * * [in] txn A transaction handle returned by mdbx_txn_begin() @@ -1143,7 +1143,7 @@ LIBMDBX_API int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, * possible errors are: * - MDBX_EACCES - an attempt was made to write in a read-only transaction. * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_del(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, +LIBMDBX_API int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data); /* Create a cursor handle. @@ -1159,13 +1159,13 @@ LIBMDBX_API int mdbx_del(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, * * [in] txn A transaction handle returned by mdbx_txn_begin() * [in] dbi A database handle returned by mdbx_dbi_open() - * [out] cursor Address where the new MDB_cursor handle will be stored + * [out] cursor Address where the new MDBX_cursor handle will be stored * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_open(MDBX_txn *txn, MDB_dbi dbi, - MDB_cursor **cursor); +LIBMDBX_API int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, + MDBX_cursor **cursor); /* Close a cursor handle. * @@ -1173,7 +1173,7 @@ LIBMDBX_API int mdbx_cursor_open(MDBX_txn *txn, MDB_dbi dbi, * Its transaction must still be live if it is a write-transaction. 
* * [in] cursor A cursor handle returned by mdbx_cursor_open() */ -LIBMDBX_API void mdbx_cursor_close(MDB_cursor *cursor); +LIBMDBX_API void mdbx_cursor_close(MDBX_cursor *cursor); /* Renew a cursor handle. * @@ -1190,37 +1190,37 @@ LIBMDBX_API void mdbx_cursor_close(MDB_cursor *cursor); * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_renew(MDBX_txn *txn, MDB_cursor *cursor); +LIBMDBX_API int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *cursor); /* Return the cursor's transaction handle. * * [in] cursor A cursor handle returned by mdbx_cursor_open() */ -LIBMDBX_API MDBX_txn *mdbx_cursor_txn(MDB_cursor *cursor); +LIBMDBX_API MDBX_txn *mdbx_cursor_txn(MDBX_cursor *cursor); /* Return the cursor's database handle. * * [in] cursor A cursor handle returned by mdbx_cursor_open() */ -LIBMDBX_API MDB_dbi mdbx_cursor_dbi(MDB_cursor *cursor); +LIBMDBX_API MDBX_dbi mdbx_cursor_dbi(MDBX_cursor *cursor); /* Retrieve by cursor. * * This function retrieves key/data pairs from the database. The address and * length of the key are returned in the object to which key refers (except - * for the case of the MDB_SET option, in which the key object is unchanged), + * for the case of the MDBX_SET option, in which the key object is unchanged), * and the address and length of the data are returned in the object to which * data refers. See mdbx_get() for restrictions on using the output values. * * [in] cursor A cursor handle returned by mdbx_cursor_open() * [in,out] key The key for a retrieved item * [in,out] data The data of a retrieved item - * [in] op A cursor operation MDB_cursor_op + * [in] op A cursor operation MDBX_cursor_op * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDB_NOTFOUND - no matching key found. + * - MDBX_NOTFOUND - no matching key found. * - MDBX_EINVAL - an invalid parameter was specified. 
*/ -LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDBX_val *key, - MDBX_val *data, MDB_cursor_op op); +LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); /* Store by cursor. * @@ -1233,46 +1233,46 @@ LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDBX_val *key, * [in] flags Options for this operation. This parameter * must be set to 0 or one of the values described here. * - * - MDB_CURRENT + * - MDBX_CURRENT * Replace the item at the current cursor position. The key parameter * must still be provided, and must match it, otherwise the function * return MDBX_EKEYMISMATCH. * * NOTE: MDBX unlike LMDB allows you to change the size of the data and - * automatically handles reordering for sorted duplicates (MDB_DUPSORT). + * automatically handles reordering for sorted duplicates (MDBX_DUPSORT). * - * - MDB_NODUPDATA + * - MDBX_NODUPDATA * Enter the new key/data pair only if it does not already appear in the * database. This flag may only be specified if the database was opened - * with MDB_DUPSORT. The function will return MDB_KEYEXIST if the + * with MDBX_DUPSORT. The function will return MDBX_KEYEXIST if the * key/data pair already appears in the database. * - * - MDB_NOOVERWRITE + * - MDBX_NOOVERWRITE * Enter the new key/data pair only if the key does not already appear - * in the database. The function will return MDB_KEYEXIST if the key + * in the database. The function will return MDBX_KEYEXIST if the key * already appears in the database, even if the database supports - * duplicates (MDB_DUPSORT). + * duplicates (MDBX_DUPSORT). * - * - MDB_RESERVE + * - MDBX_RESERVE * Reserve space for data of the given size, but don't copy the given * data. Instead, return a pointer to the reserved space, which the * caller can fill in later - before the next update operation or the * transaction ends. This saves an extra memcpy if the data is being * generated later. 
This flag must not be specified if the database - * was opened with MDB_DUPSORT. + * was opened with MDBX_DUPSORT. * - * - MDB_APPEND + * - MDBX_APPEND * Append the given key/data pair to the end of the database. No key * comparisons are performed. This option allows fast bulk loading when * keys are already known to be in the correct order. Loading unsorted - * keys with this flag will cause a MDB_KEYEXIST error. + * keys with this flag will cause a MDBX_KEYEXIST error. * - * - MDB_APPENDDUP + * - MDBX_APPENDDUP * As above, but for sorted dup data. * - * - MDB_MULTIPLE + * - MDBX_MULTIPLE * Store multiple contiguous data elements in a single request. This flag - * may only be specified if the database was opened with MDB_DUPFIXED. + * may only be specified if the database was opened with MDBX_DUPFIXED. * The data argument must be an array of two MDBX_vals. The iov_len of the * first MDBX_val must be the size of a single data element. The iov_base * of the first MDBX_val must point to the beginning of the array of @@ -1284,11 +1284,11 @@ LIBMDBX_API int mdbx_cursor_get(MDB_cursor *cursor, MDBX_val *key, * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EKEYMISMATCH - * - MDB_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). - * - MDB_TXN_FULL - the transaction has too many dirty pages. + * - MDBX_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). + * - MDBX_TXN_FULL - the transaction has too many dirty pages. * - MDBX_EACCES - an attempt was made to write in a read-only transaction. * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_put(MDB_cursor *cursor, MDBX_val *key, +LIBMDBX_API int mdbx_cursor_put(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, unsigned flags); /* Delete current key/data pair @@ -1299,20 +1299,20 @@ LIBMDBX_API int mdbx_cursor_put(MDB_cursor *cursor, MDBX_val *key, * [in] flags Options for this operation. 
This parameter must be set to 0 * or one of the values described here. * - * - MDB_NODUPDATA + * - MDBX_NODUPDATA * Delete all of the data items for the current key. This flag may only - * be specified if the database was opened with MDB_DUPSORT. + * be specified if the database was opened with MDBX_DUPSORT. * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_EACCES - an attempt was made to write in a read-only transaction. * - MDBX_EINVAL - an invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); +LIBMDBX_API int mdbx_cursor_del(MDBX_cursor *cursor, unsigned flags); /* Return count of duplicates for current key. * * This call is only valid on databases that support sorted duplicate data - * items MDB_DUPSORT. + * items MDBX_DUPSORT. * * [in] cursor A cursor handle returned by mdbx_cursor_open() * [out] countp Address where the count will be stored @@ -1321,7 +1321,7 @@ LIBMDBX_API int mdbx_cursor_del(MDB_cursor *cursor, unsigned flags); * possible errors are: * - MDBX_EINVAL - cursor is not initialized, or an invalid parameter * was specified. */ -LIBMDBX_API int mdbx_cursor_count(MDB_cursor *cursor, uint64_t *countp); +LIBMDBX_API int mdbx_cursor_count(MDBX_cursor *cursor, uint64_t *countp); /* Compare two data items according to a particular database. * @@ -1334,13 +1334,13 @@ LIBMDBX_API int mdbx_cursor_count(MDB_cursor *cursor, uint64_t *countp); * [in] b The second item to compare * * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ -LIBMDBX_API int mdbx_cmp(MDBX_txn *txn, MDB_dbi dbi, const MDBX_val *a, +LIBMDBX_API int mdbx_cmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b); /* Compare two data items according to a particular database. * * This returns a comparison as if the two items were data items of - * the specified database. The database must have the MDB_DUPSORT flag. + * the specified database. 
The database must have the MDBX_DUPSORT flag. * * [in] txn A transaction handle returned by mdbx_txn_begin() * [in] dbi A database handle returned by mdbx_dbi_open() @@ -1348,7 +1348,7 @@ LIBMDBX_API int mdbx_cmp(MDBX_txn *txn, MDB_dbi dbi, const MDBX_val *a, * [in] b The second item to compare * * Returns < 0 if a < b, 0 if a == b, > 0 if a > b */ -LIBMDBX_API int mdbx_dcmp(MDBX_txn *txn, MDB_dbi dbi, const MDBX_val *a, +LIBMDBX_API int mdbx_dcmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b); /* A callback function used to print a message from the library. @@ -1357,16 +1357,16 @@ LIBMDBX_API int mdbx_dcmp(MDBX_txn *txn, MDB_dbi dbi, const MDBX_val *a, * [in] ctx An arbitrary context pointer for the callback. * * Returns < 0 on failure, >= 0 on success. */ -typedef int(MDB_msg_func)(const char *msg, void *ctx); +typedef int(MDBX_msg_func)(const char *msg, void *ctx); /* Dump the entries in the reader lock table. * * [in] env An environment handle returned by mdbx_env_create() - * [in] func A MDB_msg_func function + * [in] func A MDBX_msg_func function * [in] ctx Anything the message function needs * * Returns < 0 on failure, >= 0 on success. */ -LIBMDBX_API int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); +LIBMDBX_API int mdbx_reader_list(MDBX_env *env, MDBX_msg_func *func, void *ctx); /* Check for stale entries in the reader lock table. * @@ -1374,21 +1374,21 @@ LIBMDBX_API int mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx); * [out] dead Number of stale slots that were cleared * * Returns 0 on success, non-zero on failure. 
*/ -LIBMDBX_API int mdbx_reader_check(MDB_env *env, int *dead); +LIBMDBX_API int mdbx_reader_check(MDBX_env *env, int *dead); LIBMDBX_API char *mdbx_dkey(const MDBX_val *key, char *const buf, const size_t bufsize); -LIBMDBX_API int mdbx_env_close_ex(MDB_env *env, int dont_sync); +LIBMDBX_API int mdbx_env_close_ex(MDBX_env *env, int dont_sync); /* Set threshold to force flush the data buffers to disk, - * even of MDB_NOSYNC, MDB_NOMETASYNC and MDB_MAPASYNC flags + * even of MDBX_NOSYNC, MDBX_NOMETASYNC and MDBX_MAPASYNC flags * in the environment. * * Data is always written to disk when mdbx_txn_commit() is called, * but the operating system may keep it buffered. MDBX always flushes * the OS buffers upon commit as well, unless the environment was - * opened with MDB_NOSYNC or in part MDB_NOMETASYNC. + * opened with MDBX_NOSYNC or in part MDBX_NOMETASYNC. * * The default is 0, than mean no any threshold checked, and no additional * flush will be made. @@ -1398,7 +1398,7 @@ LIBMDBX_API int mdbx_env_close_ex(MDB_env *env, int dont_sync); * flush would be made. * * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_set_syncbytes(MDB_env *env, size_t bytes); +LIBMDBX_API int mdbx_env_set_syncbytes(MDBX_env *env, size_t bytes); /* Returns a lag of the reading for the given transaction. * @@ -1413,7 +1413,7 @@ LIBMDBX_API int mdbx_env_set_syncbytes(MDB_env *env, size_t bytes); LIBMDBX_API int mdbx_txn_straggler(MDBX_txn *txn, int *percent); /* A callback function for killing a laggard readers, - * but also could waiting ones. Called in case of MDB_MAP_FULL error. + * but also could waiting ones. Called in case of MDBX_MAP_FULL error. * * [in] env An environment handle returned by mdbx_env_create(). * [in] pid pid of the reader process. 
@@ -1426,7 +1426,7 @@ LIBMDBX_API int mdbx_txn_straggler(MDBX_txn *txn, int *percent); * 0 on a race condition (no such reader), * 1 on success (reader was killed), * >1 on success (reader was SURE killed). */ -typedef int(MDBX_oom_func)(MDB_env *env, int pid, mdbx_tid_t tid, uint64_t txn, +typedef int(MDBX_oom_func)(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, unsigned gap, int retry); /* Set the OOM callback. @@ -1438,7 +1438,7 @@ typedef int(MDBX_oom_func)(MDB_env *env, int pid, mdbx_tid_t tid, uint64_t txn, * [in] oomfunc A MDBX_oom_func function or NULL to disable. * * Returns A non-zero error value on failure and 0 on success. */ -LIBMDBX_API int mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); +LIBMDBX_API int mdbx_env_set_oomfunc(MDBX_env *env, MDBX_oom_func *oom_func); /* Get the current oom_func callback. * @@ -1448,7 +1448,7 @@ LIBMDBX_API int mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oom_func); * [in] env An environment handle returned by mdbx_env_create(). * * Returns A MDBX_oom_func function or NULL if disabled. */ -LIBMDBX_API MDBX_oom_func *mdbx_env_get_oomfunc(MDB_env *env); +LIBMDBX_API MDBX_oom_func *mdbx_env_get_oomfunc(MDBX_env *env); #define MDBX_DBG_ASSERT 1 #define MDBX_DBG_PRINT 2 @@ -1484,27 +1484,27 @@ LIBMDBX_API int mdbx_canary_get(MDBX_txn *txn, mdbx_canary *canary); * - MDBX_RESULT_FALSE * when data available; * - Otherwise the error code. */ -LIBMDBX_API int mdbx_cursor_eof(MDB_cursor *mc); +LIBMDBX_API int mdbx_cursor_eof(MDBX_cursor *mc); /* Returns: MDBX_RESULT_TRUE, MDBX_RESULT_FALSE or Error code. */ -LIBMDBX_API int mdbx_cursor_on_first(MDB_cursor *mc); +LIBMDBX_API int mdbx_cursor_on_first(MDBX_cursor *mc); /* Returns: MDBX_RESULT_TRUE, MDBX_RESULT_FALSE or Error code. 
*/ -LIBMDBX_API int mdbx_cursor_on_last(MDB_cursor *mc); +LIBMDBX_API int mdbx_cursor_on_last(MDBX_cursor *mc); -LIBMDBX_API int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, +LIBMDBX_API int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *new_data, MDBX_val *old_data, unsigned flags); /* Same as mdbx_get(), but: * 1) if values_count is not NULL, then returns the count * of multi-values/duplicates for a given key. * 2) updates the key for pointing to the actual key's data inside DB. */ -LIBMDBX_API int mdbx_get_ex(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, +LIBMDBX_API int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, int *values_count); LIBMDBX_API int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr); -LIBMDBX_API int mdbx_dbi_sequence(MDBX_txn *txn, MDB_dbi dbi, uint64_t *result, +LIBMDBX_API int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, uint64_t increment); #ifdef __cplusplus diff --git a/src/bits.h b/src/bits.h index 14c1336c..cc2e6ab6 100644 --- a/src/bits.h +++ b/src/bits.h @@ -70,11 +70,11 @@ #include "./osal.h" -#ifndef MDB_DEBUG -# define MDB_DEBUG 0 +#ifndef MDBX_DEBUG +# define MDBX_DEBUG 0 #endif -#if MDB_DEBUG +#if MDBX_DEBUG # undef NDEBUG #endif @@ -168,19 +168,19 @@ typedef uint64_t txnid_t; * IDs are in the list. In the original back-bdb code, IDLs are * sorted in ascending order. For libmdb IDLs are sorted in * descending order. */ -typedef pgno_t *MDB_IDL; +typedef pgno_t *MDBX_IDL; /* An ID2 is an ID/pointer pair. */ -typedef struct MDB_ID2 { +typedef struct MDBX_ID2 { pgno_t mid; /* The ID */ void *mptr; /* The pointer */ -} MDB_ID2; +} MDBX_ID2; /* An ID2L is an ID2 List, a sorted array of ID2s. * The first element's mid member is a count of how many actual * elements are in the array. The mptr member of the first element is * unused. The array is sorted in ascending order by mid. 
*/ -typedef MDB_ID2 *MDB_ID2L; +typedef MDBX_ID2 *MDBX_ID2L; /* Used for offsets within a single page. * Since memory pages are typically 4 or 8KB in size, 12-13 bits, @@ -220,7 +220,7 @@ typedef struct MDBX_reader { } MDBX_reader; /* Information about a single database in the environment. */ -typedef struct MDB_db { +typedef struct MDBX_db { uint32_t md_xsize; /* also ksize for LEAF2 pages */ uint16_t md_flags; /* see mdbx_dbi_open */ uint16_t md_depth; /* depth of this tree */ @@ -230,20 +230,20 @@ typedef struct MDB_db { pgno_t md_overflow_pages; /* number of overflow pages */ pgno_t md_root; /* the root page of this tree */ uint64_t md_entries; /* number of data items */ -} MDB_db; +} MDBX_db; /* Meta page content. * A meta page is the start point for accessing a database snapshot. * Pages 0-1 are meta pages. Transaction N writes meta page (N % 2). */ -typedef struct MDB_meta { - /* Stamp identifying this as an LMDB file. It must be set - * to MDB_MAGIC. */ +typedef struct MDBX_meta { + /* Stamp identifying this as an MDBX file. It must be set + * to MDBX_MAGIC. */ uint32_t mm_magic; - /* Version number of this file. Must be set to MDB_DATA_VERSION. */ + /* Version number of this file. Must be set to MDBX_DATA_VERSION. */ uint32_t mm_version; - size_t mm_mapsize; /* size of mmap region */ - MDB_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ - /* The size of pages used in this DB */ + size_t mm_mapsize; /* size of mmap region */ + MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ + /* The size of pages used in this DB */ #define mm_psize mm_dbs[FREE_DBI].md_xsize /* Any persistent environment flags, see mdbx_env */ #define mm_flags mm_dbs[FREE_DBI].md_flags @@ -251,21 +251,21 @@ typedef struct MDB_meta { * Actually the file may be shorter if the freeDB lists the final pages. 
*/ pgno_t mm_last_pg; volatile txnid_t mm_txnid; /* txnid that committed this page */ -#define MDB_DATASIGN_NONE 0u -#define MDB_DATASIGN_WEAK 1u +#define MDBX_DATASIGN_NONE 0u +#define MDBX_DATASIGN_WEAK 1u volatile uint64_t mm_datasync_sign; -#define SIGN_IS_WEAK(sign) ((sign) == MDB_DATASIGN_WEAK) -#define SIGN_IS_STEADY(sign) ((sign) > MDB_DATASIGN_WEAK) +#define SIGN_IS_WEAK(sign) ((sign) == MDBX_DATASIGN_WEAK) +#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) #define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign) #define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) volatile mdbx_canary mm_canary; -} MDB_meta; +} MDBX_meta; /* Common header for all page types. The page type depends on mp_flags. * * P_BRANCH and P_LEAF pages have unsorted 'MDBX_node's at the end, with * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages - * omit mp_ptrs and pack sorted MDB_DUPFIXED values after the page header. + * omit mp_ptrs and pack sorted MDBX_DUPFIXED values after the page header. * * P_OVERFLOW records occupy one or more contiguous pages where only the * first has a page header. They hold the real data of F_BIGDATA nodes. @@ -274,9 +274,9 @@ typedef struct MDB_meta { * A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page. * (Duplicate data can also go in sub-databases, which use normal pages.) * - * P_META pages contain MDB_meta, the start point of an LMDB snapshot. + * P_META pages contain MDBX_meta, the start point of an MDBX snapshot. * - * Each non-metapage up to MDB_meta.mm_last_pg is reachable exactly once + * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a freeDB record. 
*/ typedef struct MDBX_page { union { @@ -289,8 +289,8 @@ typedef struct MDBX_page { #define P_OVERFLOW 0x04 /* overflow page */ #define P_META 0x08 /* meta page */ #define P_DIRTY 0x10 /* dirty page, also set for P_SUBP pages */ -#define P_LEAF2 0x20 /* for MDB_DUPFIXED records */ -#define P_SUBP 0x40 /* for MDB_DUPSORT sub-pages */ +#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ #define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ #define P_KEEP 0x8000 /* leave this page alone during spill */ uint16_t mp_flags; @@ -311,19 +311,19 @@ typedef struct MDBX_page { * The members define size and alignment, and silence type * aliasing warnings. They are not used directly; that could * mean incorrectly using several union members in parallel. */ -typedef union MDB_metabuf { +typedef union MDBX_metabuf { MDBX_page mb_page; struct { char mm_pad[PAGEHDRSZ]; - MDB_meta mm_meta; + MDBX_meta mm_meta; } mb_metabuf; -} MDB_metabuf; +} MDBX_metabuf; /* The header for the reader table (a memory-mapped lock file). */ typedef struct MDBX_lockinfo { - /* Stamp identifying this as an LMDB file. It must be set to MDB_MAGIC. */ + /* Stamp identifying this as an MDBX file. It must be set to MDBX_MAGIC. */ uint64_t mti_magic; - /* Format of this lock file. Must be set to MDB_LOCK_FORMAT. */ + /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */ uint64_t mti_format; /* Flags which environment was opened. */ uint64_t mti_envmode; @@ -348,11 +348,11 @@ typedef struct MDBX_lockinfo { /* Auxiliary DB info. * The information here is mostly static/read-only. There is * only a single copy of this record in the environment. 
*/ -typedef struct MDB_dbx { - MDBX_val md_name; /* name of the database */ - MDB_cmp_func *md_cmp; /* function for comparing keys */ - MDB_cmp_func *md_dcmp; /* function for comparing data items */ -} MDB_dbx; +typedef struct MDBX_dbx { + MDBX_val md_name; /* name of the database */ + MDBX_cmp_func *md_cmp; /* function for comparing keys */ + MDBX_cmp_func *md_dcmp; /* function for comparing data items */ +} MDBX_dbx; /* A database transaction. * Every operation requires a transaction handle. */ @@ -360,18 +360,18 @@ struct MDBX_txn { #define MDBX_MT_SIGNATURE (0x93D53A31) unsigned mt_signature; MDBX_txn *mt_parent; /* parent of a nested txn */ - /* Nested txn under this txn, set together with flag MDB_TXN_HAS_CHILD */ + /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ MDBX_txn *mt_child; pgno_t mt_next_pgno; /* next unallocated page */ /* The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; - MDB_env *mt_env; /* the DB environment */ - /* The list of reclaimed txns from freeDB */ - MDB_IDL mt_lifo_reclaimed; + MDBX_env *mt_env; /* the DB environment */ + /* The list of reclaimed txns from freeDB */ + MDBX_IDL mt_lifo_reclaimed; /* The list of pages that became unused during this transaction. */ - MDB_IDL mt_free_pages; + MDBX_IDL mt_free_pages; /* The list of loose pages that became unused and may be reused * in this transaction, linked through NEXT_LOOSE_PAGE(page). */ MDBX_page *mt_loose_pages; @@ -380,17 +380,17 @@ struct MDBX_txn { /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. */ - MDB_IDL mt_spill_pages; + MDBX_IDL mt_spill_pages; union { - /* For write txns: Modified pages. Sorted when not MDB_WRITEMAP. 
*/ - MDB_ID2L mt_rw_dirtylist; + /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ + MDBX_ID2L mt_rw_dirtylist; /* For read txns: This thread/txn's reader table slot, or NULL. */ MDBX_reader *mt_ro_reader; }; /* Array of records for each DB known in the environment. */ - MDB_dbx *mt_dbxs; - /* Array of MDB_db records for each known DB */ - MDB_db *mt_dbs; + MDBX_dbx *mt_dbxs; + /* Array of MDBX_db records for each known DB */ + MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ unsigned *mt_dbiseqs; @@ -398,34 +398,35 @@ struct MDBX_txn { #define DB_DIRTY 0x01 /* DB was written in this txn */ #define DB_STALE 0x02 /* Named-DB record is older than txnID */ #define DB_NEW 0x04 /* Named-DB handle opened in this txn */ -#define DB_VALID 0x08 /* DB handle is valid, see also MDB_VALID */ +#define DB_VALID 0x08 /* DB handle is valid, see also MDBX_VALID */ #define DB_USRVALID 0x10 /* As DB_VALID, but not set for FREE_DBI */ -#define DB_DUPDATA 0x20 /* DB is MDB_DUPSORT data */ +#define DB_DUPDATA 0x20 /* DB is MDBX_DUPSORT data */ /* In write txns, array of cursors for each DB */ - MDB_cursor **mt_cursors; + MDBX_cursor **mt_cursors; /* Array of flags for each DB */ uint8_t *mt_dbflags; /* Number of DB records in use, or 0 when the txn is finished. * This number only ever increments until the txn finishes; we * don't decrement it when individual DB handles are closed. 
*/ - MDB_dbi mt_numdbs; + MDBX_dbi mt_numdbs; /* Transaction Flags */ /* mdbx_txn_begin() flags */ -#define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC | MDB_NOSYNC | MDB_RDONLY) -#define MDB_TXN_NOMETASYNC \ - MDB_NOMETASYNC /* don't sync meta for this txn on commit */ -#define MDB_TXN_NOSYNC MDB_NOSYNC /* don't sync this txn on commit */ -#define MDB_TXN_RDONLY MDB_RDONLY /* read-only transaction */ - /* internal txn flags */ -#define MDB_TXN_WRITEMAP MDB_WRITEMAP /* copy of MDB_env flag in writers */ -#define MDB_TXN_FINISHED 0x01 /* txn is finished or never began */ -#define MDB_TXN_ERROR 0x02 /* txn is unusable after an error */ -#define MDB_TXN_DIRTY 0x04 /* must write, even if dirty list is empty */ -#define MDB_TXN_SPILLS 0x08 /* txn or a parent has spilled pages */ -#define MDB_TXN_HAS_CHILD 0x10 /* txn has an MDBX_txn.mt_child */ +#define MDBX_TXN_BEGIN_FLAGS (MDBX_NOMETASYNC | MDBX_NOSYNC | MDBX_RDONLY) +#define MDBX_TXN_NOMETASYNC \ + MDBX_NOMETASYNC /* don't sync meta for this txn on commit */ +#define MDBX_TXN_NOSYNC MDBX_NOSYNC /* don't sync this txn on commit */ +#define MDBX_TXN_RDONLY MDBX_RDONLY /* read-only transaction */ + /* internal txn flags */ +#define MDBX_TXN_WRITEMAP MDBX_WRITEMAP /* copy of MDBX_env flag in writers */ +#define MDBX_TXN_FINISHED 0x01 /* txn is finished or never began */ +#define MDBX_TXN_ERROR 0x02 /* txn is unusable after an error */ +#define MDBX_TXN_DIRTY 0x04 /* must write, even if dirty list is empty */ +#define MDBX_TXN_SPILLS 0x08 /* txn or a parent has spilled pages */ +#define MDBX_TXN_HAS_CHILD 0x10 /* txn has an MDBX_txn.mt_child */ /* most operations on the txn are currently illegal */ -#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED | MDB_TXN_ERROR | MDB_TXN_HAS_CHILD) +#define MDBX_TXN_BLOCKED \ + (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_HAS_CHILD) unsigned mt_flags; /* dirtylist room: Array size - dirty pages visible to this txn. 
* Includes ancestor txns' dirty pages not hidden by other txns' @@ -440,34 +441,34 @@ struct MDBX_txn { * raise this on a 64 bit machine. */ #define CURSOR_STACK 32 -struct MDB_xcursor; +struct MDBX_xcursor; /* Cursors are used for all DB operations. * A cursor holds a path of (page pointer, key index) from the DB - * root to a position in the DB, plus other state. MDB_DUPSORT + * root to a position in the DB, plus other state. MDBX_DUPSORT * cursors include an xcursor to the current data item. Write txns * track their cursors and keep them up to date when data moves. * Exception: An xcursor's pointer to a P_SUBP page can be stale. * (A node with F_DUPDATA but no F_SUBDATA contains a subpage). */ -struct MDB_cursor { +struct MDBX_cursor { #define MDBX_MC_SIGNATURE (0xFE05D5B1) #define MDBX_MC_READY4CLOSE (0x2817A047) #define MDBX_MC_WAIT4EOT (0x90E297A7) unsigned mc_signature; /* Next cursor on this DB in this txn */ - MDB_cursor *mc_next; + MDBX_cursor *mc_next; /* Backup of the original cursor if this cursor is a shadow */ - MDB_cursor *mc_backup; - /* Context used for databases with MDB_DUPSORT, otherwise NULL */ - struct MDB_xcursor *mc_xcursor; + MDBX_cursor *mc_backup; + /* Context used for databases with MDBX_DUPSORT, otherwise NULL */ + struct MDBX_xcursor *mc_xcursor; /* The transaction that owns this cursor */ MDBX_txn *mc_txn; /* The database handle this cursor operates on */ - MDB_dbi mc_dbi; + MDBX_dbi mc_dbi; /* The database record for this cursor */ - MDB_db *mc_db; + MDBX_db *mc_db; /* The database auxiliary record for this cursor */ - MDB_dbx *mc_dbx; + MDBX_dbx *mc_dbx; /* The mt_dbflag for this database */ uint8_t *mc_dbflag; uint16_t mc_snum; /* number of pushed pages */ @@ -488,16 +489,16 @@ struct MDB_cursor { * We could have gone to a fully recursive design, with arbitrarily * deep nesting of sub-databases. But for now we only handle these * levels - main DB, optional sub-DB, sorted-duplicate DB. 
*/ -typedef struct MDB_xcursor { +typedef struct MDBX_xcursor { /* A sub-cursor for traversing the Dup DB */ - MDB_cursor mx_cursor; + MDBX_cursor mx_cursor; /* The database record for this Dup DB */ - MDB_db mx_db; + MDBX_db mx_db; /* The auxiliary DB record for this Dup DB */ - MDB_dbx mx_dbx; + MDBX_dbx mx_dbx; /* The mt_dbflag for this Dup DB */ uint8_t mx_dbflag; -} MDB_xcursor; +} MDBX_xcursor; /* Check if there is an inited xcursor, so XCURSOR_REFRESH() is proper */ #define XCURSOR_INITED(mc) \ @@ -514,42 +515,42 @@ typedef struct MDB_xcursor { (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ } while (0) -/* State of FreeDB old pages, stored in the MDB_env */ -typedef struct MDB_pgstate { +/* State of FreeDB old pages, stored in the MDBX_env */ +typedef struct MDBX_pgstate { pgno_t *mf_pghead; /* Reclaimed freeDB pages, or NULL before use */ txnid_t mf_pglast; /* ID of last used record, or 0 if !mf_pghead */ -} MDB_pgstate; +} MDBX_pgstate; #define MDBX_LOCKINFO_WHOLE_SIZE \ ((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \ ~((size_t)MDBX_CACHELINE_SIZE - 1)) /* Lockfile format signature: version, features and field layout */ -#define MDB_LOCK_FORMAT \ +#define MDBX_LOCK_FORMAT \ (((uint64_t)(MDBX_OSAL_LOCK_SIGN) << 32) + \ ((MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1) << 16) + \ - (MDB_LOCK_VERSION) /* Flags which describe functionality */) + (MDBX_LOCK_VERSION) /* Flags which describe functionality */) /* The database environment. */ -struct MDB_env { +struct MDBX_env { #define MDBX_ME_SIGNATURE (0x9A899641) unsigned me_signature; mdbx_filehandle_t me_fd; /* The main data file */ mdbx_filehandle_t me_lfd; /* The lock file */ /* Failed to update the meta page. Probably an I/O error. */ -#define MDB_FATAL_ERROR 0x80000000U +#define MDBX_FATAL_ERROR 0x80000000U /* Some fields are initialized. 
*/ -#define MDB_ENV_ACTIVE 0x20000000U +#define MDBX_ENV_ACTIVE 0x20000000U /* me_txkey is set */ -#define MDB_ENV_TXKEY 0x10000000U +#define MDBX_ENV_TXKEY 0x10000000U uint32_t me_flags; /* see mdbx_env */ unsigned me_psize; /* DB page size, inited from me_os_psize */ unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ /* Max MDBX_lockinfo.mti_numreaders of interest to mdbx_env_close() */ unsigned me_close_readers; - MDB_dbi me_numdbs; /* number of DBs opened */ - MDB_dbi me_maxdbs; /* size of the DB table */ + MDBX_dbi me_numdbs; /* number of DBs opened */ + MDBX_dbi me_maxdbs; /* size of the DB table */ mdbx_pid_t me_pid; /* process ID of this env */ char *me_path; /* path to the DB files */ char *me_map; /* the memory map of the data file */ @@ -559,19 +560,19 @@ struct MDB_env { MDBX_txn *me_txn0; /* prealloc'd write transaction */ size_t me_mapsize; /* size of the data memory map */ pgno_t me_maxpg; /* me_mapsize / me_psize */ - MDB_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDB_db.md_flags */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ txnid_t me_pgoldest; /* ID of oldest reader last time we looked */ - MDB_pgstate me_pgstate; /* state of old pages from freeDB */ + MDBX_pgstate me_pgstate; /* state of old pages from freeDB */ #define me_pglast me_pgstate.mf_pglast #define me_pghead me_pgstate.mf_pghead MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ /* IDL of pages that became unused in a write txn */ - MDB_IDL me_free_pgs; - /* ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ - MDB_ID2L me_dirtylist; + MDBX_IDL me_free_pgs; + /* ID2L of pages written during a write txn. Length MDBX_IDL_UM_SIZE. 
*/ + MDBX_ID2L me_dirtylist; /* Max number of freelist items that can fit in a single overflow page */ unsigned me_maxfree_1pg; /* Max size of a node on a page */ @@ -579,8 +580,8 @@ struct MDB_env { unsigned me_maxkey_limit; /* max size of a key */ int me_live_reader; /* have liveness lock in reader table */ void *me_userctx; /* User-settable context */ -#if MDB_DEBUG - MDB_assert_func *me_assert_func; /* Callback for assertion failures */ +#if MDBX_DEBUG + MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif uint64_t me_sync_pending; /* Total dirty/non-sync'ed bytes * since the last mdbx_env_sync() */ @@ -592,10 +593,10 @@ struct MDB_env { }; /* Nested transaction */ -typedef struct MDB_ntxn { - MDBX_txn mnt_txn; /* the transaction */ - MDB_pgstate mnt_pgstate; /* parent transaction's saved freestate */ -} MDB_ntxn; +typedef struct MDBX_ntxn { + MDBX_txn mnt_txn; /* the transaction */ + MDBX_pgstate mnt_pgstate; /* parent transaction's saved freestate */ +} MDBX_ntxn; /*----------------------------------------------------------------------------*/ @@ -616,7 +617,7 @@ void mdbx_panic(const char *fmt, ...) #endif ; -#if MDB_DEBUG +#if MDBX_DEBUG #define mdbx_assert_enabled() unlikely(mdbx_runtime_flags &MDBX_DBG_ASSERT) @@ -633,7 +634,7 @@ void mdbx_panic(const char *fmt, ...) #else #define mdbx_assert_enabled() (0) #endif /* NDEBUG */ -#endif /* MDB_DEBUG */ +#endif /* MDBX_DEBUG */ #define mdbx_print(fmt, ...) 
\ mdbx_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) @@ -748,18 +749,18 @@ static __inline void mdbx_jitter4testing(bool tiny) { #endif } -int mdbx_reader_check0(MDB_env *env, int rlocked, int *dead); +int mdbx_reader_check0(MDBX_env *env, int rlocked, int *dead); -#define METAPAGE_1(env) (&((MDB_metabuf *)(env)->me_map)->mb_metabuf.mm_meta) +#define METAPAGE_1(env) (&((MDBX_metabuf *)(env)->me_map)->mb_metabuf.mm_meta) #define METAPAGE_2(env) \ - (&((MDB_metabuf *)((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) + (&((MDBX_metabuf *)((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) -static __inline MDB_meta *mdbx_meta_head(MDB_env *env) { +static __inline MDBX_meta *mdbx_meta_head(MDBX_env *env) { mdbx_jitter4testing(true); - MDB_meta *a = METAPAGE_1(env); + MDBX_meta *a = METAPAGE_1(env); mdbx_jitter4testing(true); - MDB_meta *b = METAPAGE_2(env); + MDBX_meta *b = METAPAGE_2(env); mdbx_jitter4testing(true); return (a->mm_txnid > b->mm_txnid) ? a : b; diff --git a/src/lck-posix.c b/src/lck-posix.c index f253c4f8..5eb6942c 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -16,18 +16,18 @@ /* Some platforms define the EOWNERDEAD error code * even though they don't support Robust Mutexes. - * Compile with -DMDB_USE_ROBUST=0. */ -#ifndef MDB_USE_ROBUST + * Compile with -DMDBX_USE_ROBUST=0. */ +#ifndef MDBX_USE_ROBUST /* Howard Chu: Android currently lacks Robust Mutex support */ #if defined(EOWNERDEAD) && \ !defined(ANDROID) /* LY: glibc before 2.10 has a troubles with Robust \ Mutex too. 
*/ \ && __GLIBC_PREREQ(2, 10) -#define MDB_USE_ROBUST 1 +#define MDBX_USE_ROBUST 1 #else -#define MDB_USE_ROBUST 0 +#define MDBX_USE_ROBUST 0 #endif -#endif /* MDB_USE_ROBUST */ +#endif /* MDBX_USE_ROBUST */ /*----------------------------------------------------------------------------*/ /* rthc */ @@ -91,13 +91,13 @@ static __inline int mdbx_lck_shared(int lfd) { return mdbx_lck_op(lfd, F_SETLKW, F_RDLCK, 0, 1); } -int mdbx_lck_downgrade(MDB_env *env) { return mdbx_lck_shared(env->me_lfd); } +int mdbx_lck_downgrade(MDBX_env *env) { return mdbx_lck_shared(env->me_lfd); } -int mdbx_rpid_set(MDB_env *env) { +int mdbx_rpid_set(MDBX_env *env) { return mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, env->me_pid, 1); } -int mdbx_rpid_clear(MDB_env *env) { +int mdbx_rpid_clear(MDBX_env *env) { return mdbx_lck_op(env->me_lfd, F_SETLKW, F_UNLCK, env->me_pid, 1); } @@ -107,7 +107,7 @@ int mdbx_rpid_clear(MDB_env *env) { * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) * MDBX_RESULT_FALSE, if pid is dead (lock acquired) * or otherwise the errcode. 
*/ -int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid) { +int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { int rc = mdbx_lck_op(env->me_lfd, F_GETLK, F_WRLCK, pid, 1); if (rc == 0) return MDBX_RESULT_FALSE; @@ -118,9 +118,9 @@ int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid) { /*---------------------------------------------------------------------------*/ -static int mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, int rc); +static int mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, int rc); -int mdbx_lck_init(MDB_env *env) { +int mdbx_lck_init(MDBX_env *env) { pthread_mutexattr_t ma; int rc = pthread_mutexattr_init(&ma); if (rc) @@ -130,7 +130,7 @@ int mdbx_lck_init(MDB_env *env) { if (rc) goto bailout; -#if MDB_USE_ROBUST +#if MDBX_USE_ROBUST #if __GLIBC_PREREQ(2, 12) rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); #else @@ -138,7 +138,7 @@ int mdbx_lck_init(MDB_env *env) { #endif if (rc) goto bailout; -#endif /* MDB_USE_ROBUST */ +#endif /* MDBX_USE_ROBUST */ #if _POSIX_C_SOURCE >= 199506L rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT); @@ -158,7 +158,7 @@ bailout: return rc; } -void mdbx_lck_destroy(MDB_env *env) { +void mdbx_lck_destroy(MDBX_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* try get exclusive access */ if (env->me_lck && mdbx_lck_exclusive(env->me_lfd) == 0) { @@ -173,28 +173,28 @@ void mdbx_lck_destroy(MDB_env *env) { } } -static int mdbx_robust_lock(MDB_env *env, pthread_mutex_t *mutex) { +static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) { int rc = pthread_mutex_lock(mutex); if (unlikely(rc != 0)) rc = mdbx_mutex_failed(env, mutex, rc); return rc; } -static int mdbx_robust_unlock(MDB_env *env, pthread_mutex_t *mutex) { +static int mdbx_robust_unlock(MDBX_env *env, pthread_mutex_t *mutex) { int rc = pthread_mutex_unlock(mutex); if (unlikely(rc != 0)) rc = mdbx_mutex_failed(env, mutex, rc); return rc; } -int mdbx_rdt_lock(MDB_env *env) { +int mdbx_rdt_lock(MDBX_env *env) 
{ mdbx_trace(">>"); int rc = mdbx_robust_lock(env, &env->me_lck->mti_rmutex); mdbx_trace("<< rc %d", rc); return rc; } -void mdbx_rdt_unlock(MDB_env *env) { +void mdbx_rdt_unlock(MDBX_env *env) { mdbx_trace(">>"); int rc = mdbx_robust_unlock(env, &env->me_lck->mti_rmutex); mdbx_trace("<< rc %d", rc); @@ -202,14 +202,14 @@ void mdbx_rdt_unlock(MDB_env *env) { mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); } -int mdbx_txn_lock(MDB_env *env) { +int mdbx_txn_lock(MDBX_env *env) { mdbx_trace(">>"); int rc = mdbx_robust_lock(env, &env->me_lck->mti_wmutex); mdbx_trace("<< rc %d", rc); - return MDBX_IS_ERROR(rc) ? rc : MDB_SUCCESS; + return MDBX_IS_ERROR(rc) ? rc : MDBX_SUCCESS; } -void mdbx_txn_unlock(MDB_env *env) { +void mdbx_txn_unlock(MDBX_env *env) { mdbx_trace(">>"); int rc = mdbx_robust_unlock(env, &env->me_lck->mti_wmutex); mdbx_trace("<< rc %d", rc); @@ -243,7 +243,7 @@ static int internal_seize_lck(int lfd) { return rc; } -int mdbx_lck_seize(MDB_env *env) { +int mdbx_lck_seize(MDBX_env *env) { assert(env->me_fd != INVALID_HANDLE_VALUE); if (env->me_lfd == INVALID_HANDLE_VALUE) { @@ -256,7 +256,7 @@ int mdbx_lck_seize(MDB_env *env) { return MDBX_RESULT_FALSE; } - if ((env->me_flags & MDB_RDONLY) == 0) { + if ((env->me_flags & MDBX_RDONLY) == 0) { /* Check that another process don't operates in without-lck mode. */ int rc = mdbx_lck_op(env->me_fd, F_SETLK, F_WRLCK, env->me_pid, 1); if (rc != 0) { @@ -273,20 +273,20 @@ int mdbx_lck_seize(MDB_env *env) { #define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) #endif -static int __cold mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, +static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, int rc) { -#if MDB_USE_ROBUST +#if MDBX_USE_ROBUST if (rc == EOWNERDEAD) { /* We own the mutex. Clean up after dead previous owner. 
*/ int rlocked = (mutex == &env->me_lck->mti_rmutex); - rc = MDB_SUCCESS; + rc = MDBX_SUCCESS; if (!rlocked) { if (unlikely(env->me_txn)) { /* env is hosed if the dead thread was ours */ - env->me_flags |= MDB_FATAL_ERROR; + env->me_flags |= MDBX_FATAL_ERROR; env->me_txn = NULL; - rc = MDB_PANIC; + rc = MDBX_PANIC; } } mdbx_notice("%cmutex owner died, %s", (rlocked ? 'r' : 'w'), @@ -299,17 +299,17 @@ static int __cold mdbx_mutex_failed(MDB_env *env, pthread_mutex_t *mutex, if (unlikely(mreco_rc)) mdbx_error("mutex recovery failed, %s", mdbx_strerror(mreco_rc)); - rc = (rc == MDB_SUCCESS) ? check_rc : rc; + rc = (rc == MDBX_SUCCESS) ? check_rc : rc; if (MDBX_IS_ERROR(rc)) pthread_mutex_unlock(mutex); return rc; } -#endif /* MDB_USE_ROBUST */ +#endif /* MDBX_USE_ROBUST */ mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(rc)); if (rc != EDEADLK) { - env->me_flags |= MDB_FATAL_ERROR; - rc = MDB_PANIC; + env->me_flags |= MDBX_FATAL_ERROR; + rc = MDBX_PANIC; } return rc; } diff --git a/src/lck-windows.c b/src/lck-windows.c index 7654f635..5b6551d6 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -125,13 +125,13 @@ static __inline BOOL funlock(mdbx_filehandle_t fd, off_t offset, size_t bytes) { #define LCK_BODY LCK_BODY_OFFSET, LCK_BODY_LEN #define LCK_WHOLE 0, LCK_MAXLEN -int mdbx_txn_lock(MDB_env *env) { +int mdbx_txn_lock(MDBX_env *env) { if (flock(env->me_fd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_BODY)) - return MDB_SUCCESS; + return MDBX_SUCCESS; return mdbx_get_errno_checked(); } -void mdbx_txn_unlock(MDB_env *env) { +void mdbx_txn_unlock(MDBX_env *env) { if (!funlock(env->me_fd, LCK_BODY)) mdbx_panic("%s failed: errcode %u", mdbx_func_, GetLastError()); } @@ -147,17 +147,17 @@ void mdbx_txn_unlock(MDB_env *env) { #define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN #define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN -int mdbx_rdt_lock(MDB_env *env) { +int mdbx_rdt_lock(MDBX_env *env) { if (env->me_lfd == INVALID_HANDLE_VALUE) - return MDB_SUCCESS; /* readonly database in 
readonly filesystem */ + return MDBX_SUCCESS; /* readonly database in readonly filesystem */ /* transite from S-? (used) to S-E (locked), e.g. exlcusive lock upper-part */ if (flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) - return MDB_SUCCESS; + return MDBX_SUCCESS; return mdbx_get_errno_checked(); } -void mdbx_rdt_unlock(MDB_env *env) { +void mdbx_rdt_unlock(MDBX_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* transite from S-E (locked) to S-? (used), e.g. unlock upper-part */ if (!funlock(env->me_lfd, LCK_UPPER)) @@ -181,9 +181,9 @@ void mdbx_rdt_unlock(MDB_env *env) { E-E = exclusive */ -int mdbx_lck_init(MDB_env *env) { +int mdbx_lck_init(MDBX_env *env) { (void)env; - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Seize state as exclusive (E-E and returns MDBX_RESULT_TRUE) @@ -238,7 +238,7 @@ static int internal_seize_lck(HANDLE lfd) { return rc; } -int mdbx_lck_seize(MDB_env *env) { +int mdbx_lck_seize(MDBX_env *env) { int rc; assert(env->me_fd != INVALID_HANDLE_VALUE); @@ -255,7 +255,7 @@ int mdbx_lck_seize(MDB_env *env) { rc = internal_seize_lck(env->me_lfd); mdbx_jitter4testing(false); - if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDB_RDONLY) == 0) { + if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { /* Check that another process don't operates in without-lck mode. * Doing such check by exclusive locking the body-part of db. Should be * noted: @@ -280,7 +280,7 @@ int mdbx_lck_seize(MDB_env *env) { } /* Transite from exclusive state (E-E) to used (S-?) */ -int mdbx_lck_downgrade(MDB_env *env) { +int mdbx_lck_downgrade(MDBX_env *env) { int rc; assert(env->me_fd != INVALID_HANDLE_VALUE); @@ -301,10 +301,10 @@ int mdbx_lck_downgrade(MDB_env *env) { mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, "S-E(locked) >> S-?(used)", GetLastError()); } - return MDB_SUCCESS /* 5) now at S-? (used), done */; + return MDBX_SUCCESS /* 5) now at S-? 
(used), done */; } -void mdbx_lck_destroy(MDB_env *env) { +void mdbx_lck_destroy(MDBX_env *env) { int rc; if (env->me_lfd != INVALID_HANDLE_VALUE) { @@ -353,14 +353,14 @@ void mdbx_lck_destroy(MDB_env *env) { /*----------------------------------------------------------------------------*/ /* reader checking (by pid) */ -int mdbx_rpid_set(MDB_env *env) { +int mdbx_rpid_set(MDBX_env *env) { (void)env; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int mdbx_rpid_clear(MDB_env *env) { +int mdbx_rpid_clear(MDBX_env *env) { (void)env; - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Checks reader by pid. @@ -369,7 +369,7 @@ int mdbx_rpid_clear(MDB_env *env) { * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) * MDBX_RESULT_FALSE, if pid is dead (lock acquired) * or otherwise the errcode. */ -int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid) { +int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { (void)env; HANDLE hProcess = OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, FALSE, pid); int rc; diff --git a/src/mdbx.c b/src/mdbx.c index fd234506..6e71ea2a 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -47,7 +47,7 @@ typedef struct rthc_entry_t { mdbx_thread_key_t key; } rthc_entry_t; -#if MDB_DEBUG +#if MDBX_DEBUG #define RTHC_INITIAL_LIMIT 1 #else #define RTHC_INITIAL_LIMIT 16 @@ -98,7 +98,7 @@ __cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin, *key = (mdbx_thread_key_t)0xBADBADBAD; #endif /* NDEBUG */ int rc = mdbx_thread_key_create(key); - if (rc != MDB_SUCCESS) + if (rc != MDBX_SUCCESS) return rc; mdbx_rthc_lock(); @@ -121,7 +121,7 @@ __cold int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin, rthc_table[rthc_count].end = end; ++rthc_count; mdbx_rthc_unlock(); - return MDB_SUCCESS; + return MDBX_SUCCESS; bailout: mdbx_thread_key_delete(*key); @@ -161,88 +161,88 @@ __cold void mdbx_rthc_remove(mdbx_thread_key_t key) { * [in] ids The IDL to search. * [in] id The ID to search for. 
* Returns The index of the first ID greater than or equal to id. */ -static unsigned mdbx_midl_search(MDB_IDL ids, pgno_t id); +static unsigned mdbx_midl_search(MDBX_IDL ids, pgno_t id); /* Allocate an IDL. * Allocates memory for an IDL of the given size. * Returns IDL on success, NULL on failure. */ -static MDB_IDL mdbx_midl_alloc(int num); +static MDBX_IDL mdbx_midl_alloc(int num); /* Free an IDL. * [in] ids The IDL to free. */ -static void mdbx_midl_free(MDB_IDL ids); +static void mdbx_midl_free(MDBX_IDL ids); /* Shrink an IDL. * Return the IDL to the default size if it has grown larger. * [in,out] idp Address of the IDL to shrink. */ -static void mdbx_midl_shrink(MDB_IDL *idp); +static void mdbx_midl_shrink(MDBX_IDL *idp); /* Make room for num additional elements in an IDL. * [in,out] idp Address of the IDL. * [in] num Number of elements to make room for. * Returns 0 on success, MDBX_ENOMEM on failure. */ -static int mdbx_midl_need(MDB_IDL *idp, unsigned num); +static int mdbx_midl_need(MDBX_IDL *idp, unsigned num); /* Append an ID onto an IDL. * [in,out] idp Address of the IDL to append to. * [in] id The ID to append. * Returns 0 on success, MDBX_ENOMEM if the IDL is too large. */ -static int mdbx_midl_append(MDB_IDL *idp, pgno_t id); +static int mdbx_midl_append(MDBX_IDL *idp, pgno_t id); /* Append an IDL onto an IDL. * [in,out] idp Address of the IDL to append to. * [in] app The IDL to append. * Returns 0 on success, MDBX_ENOMEM if the IDL is too large. */ -static int mdbx_midl_append_list(MDB_IDL *idp, MDB_IDL app); +static int mdbx_midl_append_list(MDBX_IDL *idp, MDBX_IDL app); /* Append an ID range onto an IDL. * [in,out] idp Address of the IDL to append to. * [in] id The lowest ID to append. * [in] n Number of IDs to append. * Returns 0 on success, MDBX_ENOMEM if the IDL is too large. 
*/ -static int mdbx_midl_append_range(MDB_IDL *idp, pgno_t id, unsigned n); +static int mdbx_midl_append_range(MDBX_IDL *idp, pgno_t id, unsigned n); /* Merge an IDL onto an IDL. The destination IDL must be big enough. * [in] idl The IDL to merge into. * [in] merge The IDL to merge. */ -static void mdbx_midl_xmerge(MDB_IDL idl, MDB_IDL merge); +static void mdbx_midl_xmerge(MDBX_IDL idl, MDBX_IDL merge); /* Sort an IDL. * [in,out] ids The IDL to sort. */ -static void mdbx_midl_sort(MDB_IDL ids); +static void mdbx_midl_sort(MDBX_IDL ids); /* Search for an ID in an ID2L. * [in] ids The ID2L to search. * [in] id The ID to search for. * Returns The index of the first ID2 whose mid member is greater than * or equal to id. */ -static unsigned mdbx_mid2l_search(MDB_ID2L ids, pgno_t id); +static unsigned mdbx_mid2l_search(MDBX_ID2L ids, pgno_t id); /* Insert an ID2 into a ID2L. * [in,out] ids The ID2L to insert into. * [in] id The ID2 to insert. * Returns 0 on success, -1 if the ID was already present in the ID2L. */ -static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id); +static int mdbx_mid2l_insert(MDBX_ID2L ids, MDBX_ID2 *id); /* Append an ID2 into a ID2L. * [in,out] ids The ID2L to append into. * [in] id The ID2 to append. * Returns 0 on success, -2 if the ID2L is too big. 
*/ -static int mdbx_mid2l_append(MDB_ID2L ids, MDB_ID2 *id); +static int mdbx_mid2l_append(MDBX_ID2L ids, MDBX_ID2 *id); /*----------------------------------------------------------------------------*/ int mdbx_runtime_flags = MDBX_DBG_PRINT -#if MDB_DEBUG +#if MDBX_DEBUG | MDBX_DBG_ASSERT #endif -#if MDB_DEBUG > 1 +#if MDBX_DEBUG > 1 | MDBX_DBG_TRACE #endif -#if MDB_DEBUG > 2 +#if MDBX_DEBUG > 2 | MDBX_DBG_AUDIT #endif -#if MDB_DEBUG > 3 +#if MDBX_DEBUG > 3 | MDBX_DBG_EXTRA #endif ; @@ -251,17 +251,17 @@ MDBX_debug_func *mdbx_debug_logger; int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn); -#if MDB_DEBUG +#if MDBX_DEBUG txnid_t mdbx_debug_edge; #endif /* Features under development */ -#ifndef MDB_DEVEL -#define MDB_DEVEL 0 +#ifndef MDBX_DEVEL +#define MDBX_DEVEL 0 #endif -/* Internal error codes, not exposed outside liblmdb */ -#define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) +/* Internal error codes, not exposed outside libmdbx */ +#define MDBX_NO_ROOT (MDBX_LAST_ERRCODE + 10) /* Debuging output value of a cursor DBI: Negative in a sub-cursor. */ #define DDBI(mc) \ @@ -272,7 +272,7 @@ txnid_t mdbx_debug_edge; * It is 32k or 64k, since value-PAGEBASE must fit in * MDBX_page.mp_upper. * - * LMDB will use database pages < OS pages if needed. + * MDBX will use database pages < OS pages if needed. * That causes more I/O in write transactions: The OS must * know (read) the whole page before writing a partial page. * @@ -295,22 +295,22 @@ txnid_t mdbx_debug_edge; * 2 because then there would no longer be a tree structure. With this * value, items larger than 2KB will go into overflow pages, and on * average only 1KB will be wasted. */ -#define MDB_MINKEYS 2 +#define MDBX_MINKEYS 2 -/* A stamp that identifies a file as an LMDB file. +/* A stamp that identifies a file as an MDBX file. * There's nothing special about this value other than that it is easily * recognizable, and it will reflect any byte order mismatches. 
*/ -#define MDB_MAGIC 0xBEEFC0DE +#define MDBX_MAGIC 0xBEEFC0DE /* The version number for a database's datafile format. */ -#define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) +#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 999 : 1) /* The version number for a database's lockfile format. */ -#define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) +#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 999 : 1) /* Key size which fits in a DKBUF. */ #define DKBUF_MAXKEYSIZE 511 /* FIXME */ -#if MDB_DEBUG +#if MDBX_DEBUG #define DKBUF char _kbuf[DKBUF_MAXKEYSIZE * 4 + 2] #define DKEY(x) mdbx_dkey(x, _kbuf, DKBUF_MAXKEYSIZE * 2 + 1) #define DVAL(x) \ @@ -345,7 +345,7 @@ txnid_t mdbx_debug_edge; * read transactions started by the same thread need no further locking to * proceed. * - * If MDB_NOTLS is set, the slot address is not saved in thread-specific data. + * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. * No reader table is used if the database is on a read-only filesystem. * * Since the database uses multi-version concurrency control, readers don't @@ -384,7 +384,7 @@ txnid_t mdbx_debug_edge; #define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) /* ITS#7713, change PAGEBASE to handle 65536 byte pages */ -#define PAGEBASE ((MDB_DEVEL) ? PAGEHDRSZ : 0) +#define PAGEBASE ((MDBX_DEVEL) ? PAGEHDRSZ : 0) /* Number of nodes on a page */ #define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1) @@ -463,7 +463,7 @@ typedef struct MDBX_node { #define F_DUPDATA 0x04 /* data has duplicates */ /* valid flags for mdbx_node_add() */ -#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDB_RESERVE | MDB_APPEND) +#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND) uint8_t mn_data[1]; /* key and data are appended here */ } MDBX_node; @@ -549,12 +549,12 @@ static __inline void SETDSZ(MDBX_node *node, unsigned size) { #define NODEKSZ(node) ((node)->mn_ksize) /* The address of a key in a LEAF2 page. 
- * LEAF2 pages are used for MDB_DUPFIXED sorted-duplicate sub-DBs. + * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs. * There are no node headers, keys are stored contiguously. */ #define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i) * (ks))) /* Set the node's key into keyptr, if requested. */ -#define MDB_GET_KEY(node, keyptr) \ +#define MDBX_GET_KEY(node, keyptr) \ do { \ if ((keyptr) != NULL) { \ (keyptr)->iov_len = NODEKSZ(node); \ @@ -563,24 +563,24 @@ static __inline void SETDSZ(MDBX_node *node, unsigned size) { } while (0) /* Set the node's key into key. */ -#define MDB_GET_KEY2(node, key) \ +#define MDBX_GET_KEY2(node, key) \ do { \ key.iov_len = NODEKSZ(node); \ key.iov_base = NODEKEY(node); \ } while (0) -#define MDB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ -#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) +#define MDBX_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define PERSISTENT_FLAGS (0xffff & ~(MDBX_VALID)) /* mdbx_dbi_open() flags */ #define VALID_FLAGS \ - (MDB_REVERSEKEY | MDB_DUPSORT | MDB_INTEGERKEY | MDB_DUPFIXED | \ - MDB_INTEGERDUP | MDB_REVERSEDUP | MDB_CREATE) + (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED | \ + MDBX_INTEGERDUP | MDBX_REVERSEDUP | MDBX_CREATE) /* max number of pages to commit in one writev() call */ -#define MDB_COMMIT_PAGES 64 -#if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ -#undef MDB_COMMIT_PAGES -#define MDB_COMMIT_PAGES IOV_MAX +#define MDBX_COMMIT_PAGES 64 +#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ +#undef MDBX_COMMIT_PAGES +#define MDBX_COMMIT_PAGES IOV_MAX #endif /* Check txn and dbi arguments to a function */ @@ -591,128 +591,130 @@ static __inline void SETDSZ(MDBX_node *node, unsigned size) { #define TXN_DBI_CHANGED(txn, dbi) \ ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) -static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags); -static int 
mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, +static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, int flags); +static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, int num, MDBX_page **mp); -static int mdbx_page_touch(MDB_cursor *mc); -static int mdbx_cursor_touch(MDB_cursor *mc); +static int mdbx_page_touch(MDBX_cursor *mc); +static int mdbx_cursor_touch(MDBX_cursor *mc); -#define MDB_END_NAMES \ +#define MDBX_END_NAMES \ { \ "committed", "empty-commit", "abort", "reset", "reset-tmp", "fail-begin", \ "fail-beginchild" \ } enum { /* mdbx_txn_end operation number, for logging */ - MDB_END_COMMITTED, - MDB_END_EMPTY_COMMIT, - MDB_END_ABORT, - MDB_END_RESET, - MDB_END_RESET_TMP, - MDB_END_FAIL_BEGIN, - MDB_END_FAIL_BEGINCHILD + MDBX_END_COMMITTED, + MDBX_END_EMPTY_COMMIT, + MDBX_END_ABORT, + MDBX_END_RESET, + MDBX_END_RESET_TMP, + MDBX_END_FAIL_BEGIN, + MDBX_END_FAIL_BEGINCHILD }; -#define MDB_END_OPMASK 0x0F /* mask for mdbx_txn_end() operation number */ -#define MDB_END_UPDATE 0x10 /* update env state (DBIs) */ -#define MDB_END_FREE 0x20 /* free txn unless it is MDB_env.me_txn0 */ -#define MDB_END_EOTDONE 0x40 /* txn's cursors already closed */ -#define MDB_END_SLOT 0x80 /* release any reader slot if MDB_NOTLS */ +#define MDBX_END_OPMASK 0x0F /* mask for mdbx_txn_end() operation number */ +#define MDBX_END_UPDATE 0x10 /* update env state (DBIs) */ +#define MDBX_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */ +#define MDBX_END_EOTDONE 0x40 /* txn's cursors already closed */ +#define MDBX_END_SLOT 0x80 /* release any reader slot if MDBX_NOTLS */ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode); -static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDBX_page **mp, int *lvl); -static int mdbx_page_search_root(MDB_cursor *mc, MDBX_val *key, int modify); -#define MDB_PS_MODIFY 1 -#define MDB_PS_ROOTONLY 2 -#define MDB_PS_FIRST 4 -#define MDB_PS_LAST 8 -static int mdbx_page_search(MDB_cursor *mc, MDBX_val *key, int flags); 
-static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); +static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **mp, + int *lvl); +static int mdbx_page_search_root(MDBX_cursor *mc, MDBX_val *key, int modify); +#define MDBX_PS_MODIFY 1 +#define MDBX_PS_ROOTONLY 2 +#define MDBX_PS_FIRST 4 +#define MDBX_PS_LAST 8 +static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags); +static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst); -#define MDB_SPLIT_REPLACE MDB_APPENDDUP /* newkey is not new */ -static int mdbx_page_split(MDB_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, +#define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */ +static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, pgno_t newpgno, unsigned nflags); -static int mdbx_read_header(MDB_env *env, MDB_meta *meta); -static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, - MDB_meta *pending); -static void mdbx_env_close0(MDB_env *env); +static int mdbx_read_header(MDBX_env *env, MDBX_meta *meta); +static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, + MDBX_meta *pending); +static void mdbx_env_close0(MDBX_env *env); -static MDBX_node *mdbx_node_search(MDB_cursor *mc, MDBX_val *key, int *exactp); -static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDBX_val *key, +static MDBX_node *mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, int *exactp); +static int mdbx_node_add(MDBX_cursor *mc, indx_t indx, MDBX_val *key, MDBX_val *data, pgno_t pgno, unsigned flags); -static void mdbx_node_del(MDB_cursor *mc, int ksize); +static void mdbx_node_del(MDBX_cursor *mc, int ksize); static void mdbx_node_shrink(MDBX_page *mp, indx_t indx); -static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); -static int mdbx_node_read(MDB_cursor *mc, MDBX_node *leaf, MDBX_val *data); -static size_t mdbx_leaf_size(MDB_env *env, MDBX_val *key, MDBX_val *data); -static size_t mdbx_branch_size(MDB_env *env, MDBX_val *key); 
+static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft); +static int mdbx_node_read(MDBX_cursor *mc, MDBX_node *leaf, MDBX_val *data); +static size_t mdbx_leaf_size(MDBX_env *env, MDBX_val *key, MDBX_val *data); +static size_t mdbx_branch_size(MDBX_env *env, MDBX_val *key); -static int mdbx_rebalance(MDB_cursor *mc); -static int mdbx_update_key(MDB_cursor *mc, MDBX_val *key); +static int mdbx_rebalance(MDBX_cursor *mc); +static int mdbx_update_key(MDBX_cursor *mc, MDBX_val *key); -static void mdbx_cursor_pop(MDB_cursor *mc); -static int mdbx_cursor_push(MDB_cursor *mc, MDBX_page *mp); +static void mdbx_cursor_pop(MDBX_cursor *mc); +static int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp); -static int mdbx_cursor_del0(MDB_cursor *mc); -static int mdbx_del0(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, +static int mdbx_cursor_del0(MDBX_cursor *mc); +static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, unsigned flags); -static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right); -static int mdbx_cursor_next(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, - MDB_cursor_op op); -static int mdbx_cursor_prev(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, - MDB_cursor_op op); -static int mdbx_cursor_set(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, - MDB_cursor_op op, int *exactp); -static int mdbx_cursor_first(MDB_cursor *mc, MDBX_val *key, MDBX_val *data); -static int mdbx_cursor_last(MDB_cursor *mc, MDBX_val *key, MDBX_val *data); +static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right); +static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op); +static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op); +static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op, int *exactp); +static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data); +static int 
mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data); -static void mdbx_cursor_init(MDB_cursor *mc, MDBX_txn *txn, MDB_dbi dbi, - MDB_xcursor *mx); -static void mdbx_xcursor_init0(MDB_cursor *mc); -static void mdbx_xcursor_init1(MDB_cursor *mc, MDBX_node *node); -static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); +static void mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi, + MDBX_xcursor *mx); +static void mdbx_xcursor_init0(MDBX_cursor *mc); +static void mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node); +static void mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, + int force); -static int mdbx_drop0(MDB_cursor *mc, int subs); +static int mdbx_drop0(MDBX_cursor *mc, int subs); -static MDB_cmp_func mdbx_cmp_memn, mdbx_cmp_memnr, mdbx_cmp_int_ai, +static MDBX_cmp_func mdbx_cmp_memn, mdbx_cmp_memnr, mdbx_cmp_int_ai, mdbx_cmp_int_a2, mdbx_cmp_int_ua; static const char *__mdbx_strerr(int errnum) { - /* Table of descriptions for LMDB errors */ + /* Table of descriptions for MDBX errors */ static const char *const tbl[] = { - "MDB_KEYEXIST: Key/data pair already exists", - "MDB_NOTFOUND: No matching key/data pair found", - "MDB_PAGE_NOTFOUND: Requested page not found", - "MDB_CORRUPTED: Database is corrupted", - "MDB_PANIC: Update of meta page failed or environment had fatal error", - "MDB_VERSION_MISMATCH: DB version mismatch libmdbx", - "MDB_INVALID: File is not an LMDB file", - "MDB_MAP_FULL: Environment mapsize limit reached", - "MDB_DBS_FULL: Too may DBI (maxdbs reached)", - "MDB_READERS_FULL: Too many readers (maxreaders reached)", - NULL /* MDB_TLS_FULL (-30789): unused in MDBX */, - "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too " + "MDBX_KEYEXIST: Key/data pair already exists", + "MDBX_NOTFOUND: No matching key/data pair found", + "MDBX_PAGE_NOTFOUND: Requested page not found", + "MDBX_CORRUPTED: Database is corrupted", + "MDBX_PANIC: Update of meta page failed or 
environment had fatal error", + "MDBX_VERSION_MISMATCH: DB version mismatch libmdbx", + "MDBX_INVALID: File is not an MDBX file", + "MDBX_MAP_FULL: Environment mapsize limit reached", + "MDBX_DBS_FULL: Too may DBI (maxdbs reached)", + "MDBX_READERS_FULL: Too many readers (maxreaders reached)", + NULL /* MDBX_TLS_FULL (-30789): unused in MDBX */, + "MDBX_TXN_FULL: Transaction has too many dirty pages - transaction too " "big", - "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", - "MDB_PAGE_FULL: Internal error - page has no more space", - "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", - "MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", - "MDB_BAD_RSLOT: Invalid reuse of reader locktable slot", - "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", - "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong " + "MDBX_CURSOR_FULL: Internal error - cursor stack limit reached", + "MDBX_PAGE_FULL: Internal error - page has no more space", + "MDBX_MAP_RESIZED: Database contents grew beyond environment mapsize", + "MDBX_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed", + "MDBX_BAD_RSLOT: Invalid reuse of reader locktable slot", + "MDBX_BAD_TXN: Transaction must abort, has a child, or is invalid", + "MDBX_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong " "DUPFIXED size", - "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", - "MDB_PROBLEM: Unexpected problem - txn should abort", + "MDBX_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", + "MDBX_PROBLEM: Unexpected problem - txn should abort", }; - if (errnum >= MDB_KEYEXIST && errnum <= MDB_LAST_ERRCODE) { - int i = errnum - MDB_KEYEXIST; + if (errnum >= MDBX_KEYEXIST && errnum <= MDBX_LAST_ERRCODE) { + int i = errnum - MDBX_KEYEXIST; return tbl[i]; } switch (errnum) { - case MDB_SUCCESS: - return "MDB_SUCCESS: Successful"; + case MDBX_SUCCESS: + return "MDBX_SUCCESS: 
Successful"; case MDBX_EMULTIVAL: return "MDBX_EMULTIVAL: Unable to update multi-value for the given key"; case MDBX_EBADSIGN: @@ -776,7 +778,7 @@ const char *__cold mdbx_strerror(int errnum) { return msg; } -static txnid_t mdbx_oomkick(MDB_env *env, txnid_t oldest); +static txnid_t mdbx_oomkick(MDBX_env *env, txnid_t oldest); void __cold mdbx_debug_log(int type, const char *function, int line, const char *fmt, ...) { @@ -879,7 +881,7 @@ static void mdbx_page_list(MDBX_page *mp) { return; case P_META: mdbx_print("Meta-page %" PRIu64 " txnid %" PRIu64 "\n", pgno, - ((MDB_meta *)PAGEDATA(mp))->mm_txnid); + ((MDBX_meta *)PAGEDATA(mp))->mm_txnid); return; default: mdbx_print("Bad page %" PRIu64 " flags 0x%X\n", pgno, mp->mp_flags); @@ -922,7 +924,7 @@ static void mdbx_page_list(MDBX_page *mp) { SIZELEFT(mp)); } -static void mdbx_cursor_chk(MDB_cursor *mc) { +static void mdbx_cursor_chk(MDBX_cursor *mc) { unsigned i; MDBX_node *node; MDBX_page *mp; @@ -951,21 +953,21 @@ static void mdbx_cursor_chk(MDB_cursor *mc) { * it matches the actual number of pages being used. * All named DBs must be open for a correct count. 
*/ static void mdbx_audit(MDBX_txn *txn) { - MDB_cursor mc; + MDBX_cursor mc; MDBX_val key, data; pgno_t freecount, count; - MDB_dbi i; + MDBX_dbi i; int rc; freecount = 0; mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); - while ((rc = mdbx_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + while ((rc = mdbx_cursor_get(&mc, &key, &data, MDBX_NEXT)) == 0) freecount += *(pgno_t *)data.iov_base; - mdbx_tassert(txn, rc == MDB_NOTFOUND); + mdbx_tassert(txn, rc == MDBX_NOTFOUND); count = 0; for (i = 0; i < txn->mt_numdbs; i++) { - MDB_xcursor mx; + MDBX_xcursor mx; if (!(txn->mt_dbflags[i] & DB_VALID)) continue; mdbx_cursor_init(&mc, txn, i, &mx); @@ -973,23 +975,23 @@ static void mdbx_audit(MDBX_txn *txn) { continue; count += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages + txn->mt_dbs[i].md_overflow_pages; - if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { - rc = mdbx_page_search(&mc, NULL, MDB_PS_FIRST); - for (; rc == MDB_SUCCESS; rc = mdbx_cursor_sibling(&mc, 1)) { + if (txn->mt_dbs[i].md_flags & MDBX_DUPSORT) { + rc = mdbx_page_search(&mc, NULL, MDBX_PS_FIRST); + for (; rc == MDBX_SUCCESS; rc = mdbx_cursor_sibling(&mc, 1)) { unsigned j; MDBX_page *mp; mp = mc.mc_pg[mc.mc_top]; for (j = 0; j < NUMKEYS(mp); j++) { MDBX_node *leaf = NODEPTR(mp, j); if (leaf->mn_flags & F_SUBDATA) { - MDB_db db; + MDBX_db db; memcpy(&db, NODEDATA(leaf), sizeof(db)); count += db.md_branch_pages + db.md_leaf_pages + db.md_overflow_pages; } } } - mdbx_tassert(txn, rc == MDB_NOTFOUND); + mdbx_tassert(txn, rc == MDBX_NOTFOUND); } } if (freecount + count + NUM_METAS != txn->mt_next_pgno) { @@ -1000,12 +1002,13 @@ static void mdbx_audit(MDBX_txn *txn) { } } -int mdbx_cmp(MDBX_txn *txn, MDB_dbi dbi, const MDBX_val *a, const MDBX_val *b) { +int mdbx_cmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, + const MDBX_val *b) { mdbx_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_cmp(a, b); } -int mdbx_dcmp(MDBX_txn *txn, MDB_dbi dbi, const MDBX_val *a, +int 
mdbx_dcmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { mdbx_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_dcmp(a, b); @@ -1013,9 +1016,9 @@ int mdbx_dcmp(MDBX_txn *txn, MDB_dbi dbi, const MDBX_val *a, /* Allocate memory for a page. * Re-use old malloc'd pages first for singletons, otherwise just malloc. - * Set MDB_TXN_ERROR on failure. */ + * Set MDBX_TXN_ERROR on failure. */ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { - MDB_env *env = txn->mt_env; + MDBX_env *env = txn->mt_env; size_t size = env->me_psize; MDBX_page *np = env->me_dpages; if (likely(num == 1 && np)) { @@ -1027,13 +1030,13 @@ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { size *= num; np = malloc(size); if (unlikely(!np)) { - txn->mt_flags |= MDB_TXN_ERROR; + txn->mt_flags |= MDBX_TXN_ERROR; return np; } VALGRIND_MEMPOOL_ALLOC(env, np, size); } - if ((env->me_flags & MDB_NOMEMINIT) == 0) { + if ((env->me_flags & MDBX_NOMEMINIT) == 0) { /* For a single page alloc, we init everything after the page header. * For multi-page, we init the final page; if the caller needed that * many pages they will be filling in at least up to the last page. */ @@ -1051,14 +1054,14 @@ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { /* Free a single page. * Saves single pages to a list, for future reuse. * (This is not used for multi-page overflow pages.) 
*/ -static __inline void mdbx_page_free(MDB_env *env, MDBX_page *mp) { +static __inline void mdbx_page_free(MDBX_env *env, MDBX_page *mp) { mp->mp_next = env->me_dpages; VALGRIND_MEMPOOL_FREE(env, mp); env->me_dpages = mp; } /* Free a dirty page */ -static void mdbx_dpage_free(MDB_env *env, MDBX_page *dp) { +static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp) { if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { mdbx_page_free(env, dp); } else { @@ -1070,8 +1073,8 @@ static void mdbx_dpage_free(MDB_env *env, MDBX_page *dp) { /* Return all dirty pages to dpage list */ static void mdbx_dlist_free(MDBX_txn *txn) { - MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_rw_dirtylist; + MDBX_env *env = txn->mt_env; + MDBX_ID2L dl = txn->mt_rw_dirtylist; size_t i, n = dl[0].mid; for (i = 1; i <= n; i++) { @@ -1080,11 +1083,11 @@ static void mdbx_dlist_free(MDBX_txn *txn) { dl[0].mid = 0; } -static void __cold mdbx_kill_page(MDB_env *env, pgno_t pgno) { +static void __cold mdbx_kill_page(MDBX_env *env, pgno_t pgno) { const size_t offs = env->me_psize * pgno; const size_t shift = offsetof(MDBX_page, mp_pages); - if (env->me_flags & MDB_WRITEMAP) { + if (env->me_flags & MDBX_WRITEMAP) { MDBX_page *mp = (MDBX_page *)(env->me_map + offs); memset(&mp->mp_pages, 0x6F /* 'o', 111 */, env->me_psize - shift); VALGRIND_MAKE_MEM_NOACCESS(&mp->mp_pages, env->me_psize - shift); @@ -1107,14 +1110,14 @@ static void __cold mdbx_kill_page(MDB_env *env, pgno_t pgno) { * * If the page wasn't dirtied in this txn, just add it * to this txn's free list. */ -static int mdbx_page_loose(MDB_cursor *mc, MDBX_page *mp) { +static int mdbx_page_loose(MDBX_cursor *mc, MDBX_page *mp) { int loose = 0; pgno_t pgno = mp->mp_pgno; MDBX_txn *txn = mc->mc_txn; if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) { if (txn->mt_parent) { - MDB_ID2 *dl = txn->mt_rw_dirtylist; + MDBX_ID2 *dl = txn->mt_rw_dirtylist; /* If txn has a parent, * make sure the page is in our dirty list. 
*/ if (dl[0].mid) { @@ -1122,8 +1125,8 @@ static int mdbx_page_loose(MDB_cursor *mc, MDBX_page *mp) { if (x <= dl[0].mid && dl[x].mid == pgno) { if (unlikely(mp != dl[x].mptr)) { /* bad cursor? */ mc->mc_flags &= ~(C_INITIALIZED | C_EOF); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PROBLEM; + txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PROBLEM; } /* ok, it's ours */ loose = 1; @@ -1152,7 +1155,7 @@ static int mdbx_page_loose(MDB_cursor *mc, MDBX_page *mp) { return rc; } - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. @@ -1164,15 +1167,15 @@ static int mdbx_page_loose(MDB_cursor *mc, MDBX_page *mp) { * [in] all No shortcuts. Needed except after a full mdbx_page_flush(). * * Returns 0 on success, non-zero on failure. */ -static int mdbx_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { +static int mdbx_pages_xkeep(MDBX_cursor *mc, unsigned pflags, int all) { const unsigned Mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP; MDBX_txn *txn = mc->mc_txn; - MDB_cursor *m3, *m0 = mc; - MDB_xcursor *mx; + MDBX_cursor *m3, *m0 = mc; + MDBX_xcursor *mx; MDBX_page *dp, *mp; MDBX_node *leaf; unsigned i, j; - int rc = MDB_SUCCESS, level; + int rc = MDBX_SUCCESS, level; /* Mark pages seen by cursors: First m0, then tracked cursors */ for (i = txn->mt_numdbs;;) { @@ -1210,7 +1213,7 @@ mark_done: if (pgno == P_INVALID) continue; if (unlikely((rc = mdbx_page_get(m0, pgno, &dp, &level)) != - MDB_SUCCESS)) + MDBX_SUCCESS)) break; if ((dp->mp_flags & Mask) == pflags && level <= 1) dp->mp_flags ^= P_KEEP; @@ -1224,11 +1227,11 @@ mark_done: static int mdbx_page_flush(MDBX_txn *txn, int keep); /* Spill pages from the dirty list back to disk. - * This is intended to prevent running into MDB_TXN_FULL situations, + * This is intended to prevent running into MDBX_TXN_FULL situations, * but note that they may still occur in a few cases: * * 1) our estimate of the txn size could be too small. 
Currently this - * seems unlikely, except with a large number of MDB_MULTIPLE items. + * seems unlikely, except with a large number of MDBX_MULTIPLE items. * * 2) child txns may run out of space if their parents dirtied a * lot of pages and never spilled them. TODO: we probably should do @@ -1236,7 +1239,7 @@ static int mdbx_page_flush(MDBX_txn *txn, int keep); * the parent's dirtyroom is below a given threshold. * * Otherwise, if not using nested txns, it is expected that apps will - * not run into MDB_TXN_FULL any more. The pages are flushed to disk + * not run into MDBX_TXN_FULL any more. The pages are flushed to disk * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. * If the txn never references them again, they can be left alone. * If the txn only reads them, they can be used without any fuss. @@ -1257,15 +1260,15 @@ static int mdbx_page_flush(MDBX_txn *txn, int keep); * [in] data For a put operation, the data being stored. * * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_page_spill(MDB_cursor *m0, MDBX_val *key, MDBX_val *data) { +static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { MDBX_txn *txn = m0->mc_txn; MDBX_page *dp; - MDB_ID2L dl = txn->mt_rw_dirtylist; + MDBX_ID2L dl = txn->mt_rw_dirtylist; unsigned i, j, need; int rc; if (m0->mc_flags & C_SUB) - return MDB_SUCCESS; + return MDBX_SUCCESS; /* Estimate how much space this op will take */ i = m0->mc_db->md_depth; @@ -1279,15 +1282,15 @@ static int mdbx_page_spill(MDB_cursor *m0, MDBX_val *key, MDBX_val *data) { need = i; if (txn->mt_dirtyroom > i) - return MDB_SUCCESS; + return MDBX_SUCCESS; if (!txn->mt_spill_pages) { - txn->mt_spill_pages = mdbx_midl_alloc(MDB_IDL_UM_MAX); + txn->mt_spill_pages = mdbx_midl_alloc(MDBX_IDL_UM_MAX); if (unlikely(!txn->mt_spill_pages)) return MDBX_ENOMEM; } else { /* purge deleted slots */ - MDB_IDL sl = txn->mt_spill_pages; + MDBX_IDL sl = txn->mt_spill_pages; unsigned num = sl[0]; j = 0; for (i = 1; i <= num; i++) { @@ -1299,7 +1302,7 @@ static int mdbx_page_spill(MDB_cursor *m0, MDBX_val *key, MDBX_val *data) { /* Preserve pages which may soon be dirtied again */ rc = mdbx_pages_xkeep(m0, P_DIRTY, 1); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* Less aggressive spill - we originally spilled the entire dirty list, @@ -1308,8 +1311,8 @@ static int mdbx_page_spill(MDB_cursor *m0, MDBX_val *key, MDBX_val *data) { * of those pages will need to be used again. So now we spill only 1/8th * of the dirty pages. Testing revealed this to be a good tradeoff, * better than 1/2, 1/4, or 1/10. */ - if (need < MDB_IDL_UM_MAX / 8) - need = MDB_IDL_UM_MAX / 8; + if (need < MDBX_IDL_UM_MAX / 8) + need = MDBX_IDL_UM_MAX / 8; /* Save the page IDs of all the pages we're flushing */ /* flush from the tail forward, this saves a lot of shifting later on. 
*/ @@ -1335,7 +1338,7 @@ static int mdbx_page_spill(MDB_cursor *m0, MDBX_val *key, MDBX_val *data) { continue; } rc = mdbx_midl_append(&txn->mt_spill_pages, pn); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; need--; } @@ -1343,45 +1346,45 @@ static int mdbx_page_spill(MDB_cursor *m0, MDBX_val *key, MDBX_val *data) { /* Flush the spilled part of dirty list */ rc = mdbx_page_flush(txn, i); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* Reset any dirty pages we kept that page_flush didn't see */ rc = mdbx_pages_xkeep(m0, P_DIRTY | P_KEEP, i); bailout: - txn->mt_flags |= rc ? MDB_TXN_ERROR : MDB_TXN_SPILLS; + txn->mt_flags |= rc ? MDBX_TXN_ERROR : MDBX_TXN_SPILLS; return rc; } -static __inline uint64_t mdbx_meta_sign(MDB_meta *meta) { - uint64_t sign = MDB_DATASIGN_NONE; +static __inline uint64_t mdbx_meta_sign(MDBX_meta *meta) { + uint64_t sign = MDBX_DATASIGN_NONE; #if 0 /* TODO */ sign = hippeus_hash64(&meta->mm_mapsize, - sizeof(MDB_meta) - offsetof(MDB_meta, mm_mapsize), - meta->mm_version | (uint64_t)MDB_MAGIC << 32); + sizeof(MDBX_meta) - offsetof(MDBX_meta, mm_mapsize), + meta->mm_version | (uint64_t)MDBX_MAGIC << 32); #else (void)meta; #endif - /* LY: newer returns MDB_DATASIGN_NONE or MDB_DATASIGN_WEAK */ - return (sign > MDB_DATASIGN_WEAK) ? sign : ~sign; + /* LY: newer returns MDBX_DATASIGN_NONE or MDBX_DATASIGN_WEAK */ + return (sign > MDBX_DATASIGN_WEAK) ? sign : ~sign; } -static __inline MDB_meta *mdbx_env_meta_flipflop(const MDB_env *env, - MDB_meta *meta) { +static __inline MDBX_meta *mdbx_env_meta_flipflop(const MDBX_env *env, + MDBX_meta *meta) { return (meta == METAPAGE_1(env)) ? 
METAPAGE_2(env) : METAPAGE_1(env); } -static __inline int mdbx_meta_lt(const MDB_meta *a, const MDB_meta *b) { +static __inline int mdbx_meta_lt(const MDBX_meta *a, const MDBX_meta *b) { if (META_IS_STEADY(a) == META_IS_STEADY(b)) return a->mm_txnid < b->mm_txnid; return META_IS_STEADY(b); } /* Find oldest txnid still referenced. */ -static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { - const MDB_meta *const a = METAPAGE_1(env); - const MDB_meta *const b = METAPAGE_2(env); +static txnid_t mdbx_find_oldest(MDBX_env *env, int *laggard) { + const MDBX_meta *const a = METAPAGE_1(env); + const MDBX_meta *const b = METAPAGE_2(env); txnid_t oldest = mdbx_meta_lt(a, b) ? b->mm_txnid : a->mm_txnid; int i, reader; @@ -1404,10 +1407,10 @@ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) { /* Add a page to the txn's dirty list */ static void mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { - MDB_ID2 mid; - int rc, (*insert)(MDB_ID2L, MDB_ID2 *); + MDBX_ID2 mid; + int rc, (*insert)(MDBX_ID2L, MDBX_ID2 *); - if (txn->mt_flags & MDB_TXN_WRITEMAP) { + if (txn->mt_flags & MDBX_TXN_WRITEMAP) { insert = mdbx_mid2l_append; } else { insert = mdbx_mid2l_insert; @@ -1420,7 +1423,7 @@ static void mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { } /* Allocate page numbers and memory for writing. Maintain me_pglast, - * me_pghead and mt_next_pgno. Set MDB_TXN_ERROR on failure. + * me_pghead and mt_next_pgno. Set MDBX_TXN_ERROR on failure. * * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. 
@@ -1443,16 +1446,17 @@ static void mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { #define MDBX_ALLOC_ALL \ (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW | MDBX_ALLOC_KICK) -static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { +static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, + int flags) { int rc; MDBX_txn *txn = mc->mc_txn; - MDB_env *env = txn->mt_env; + MDBX_env *env = txn->mt_env; pgno_t pgno, *mop = env->me_pghead; unsigned i = 0, j, mop_len = mop ? mop[0] : 0, n2 = num - 1; MDBX_page *np; txnid_t oldest = 0, last = 0; - MDB_cursor_op op; - MDB_cursor m2; + MDBX_cursor_op op; + MDBX_cursor m2; int found_oldest = 0; if (likely(flags & MDBX_ALLOC_GC)) { @@ -1475,19 +1479,19 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { mdbx_debug("db %d use loose page %" PRIaPGNO, DDBI(mc), np->mp_pgno); ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize); *mp = np; - return MDB_SUCCESS; + return MDBX_SUCCESS; } } /* If our dirty list is already full, we can't do anything */ if (unlikely(txn->mt_dirtyroom == 0)) { - rc = MDB_TXN_FULL; + rc = MDBX_TXN_FULL; goto fail; } for (;;) { /* oom-kick retry loop */ - for (op = MDB_FIRST;; - op = (flags & MDBX_LIFORECLAIM) ? MDB_PREV : MDB_NEXT) { + for (op = MDBX_FIRST;; + op = (flags & MDBX_LIFORECLAIM) ? MDBX_PREV : MDBX_NEXT) { MDBX_val key, data; MDBX_node *leaf; pgno_t *idl; @@ -1495,7 +1499,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { /* Seek a big enough contiguous page range. Prefer * pages at the tail, just truncating the list. 
*/ if (likely(flags & MDBX_ALLOC_CACHE) && mop_len > n2 && - (!(flags & MDBX_COALESCE) || op == MDB_FIRST)) { + (!(flags & MDBX_COALESCE) || op == MDBX_FIRST)) { i = mop_len; do { pgno = mop[i]; @@ -1504,7 +1508,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { } while (--i > n2); } - if (op == MDB_FIRST) { /* 1st iteration */ + if (op == MDBX_FIRST) { /* 1st iteration */ /* Prepare to fetch more and coalesce */ if (unlikely(!(flags & MDBX_ALLOC_GC))) break; @@ -1519,12 +1523,12 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { /* Begin from oldest reader if any */ if (oldest > 2) { last = oldest - 1; - op = MDB_SET_RANGE; + op = MDBX_SET_RANGE; } } else if (env->me_pglast) { /* Continue lookup from env->me_pglast to higher/last */ last = env->me_pglast; - op = MDB_SET_RANGE; + op = MDBX_SET_RANGE; } key.iov_base = &last; @@ -1533,7 +1537,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { if (!(flags & MDBX_LIFORECLAIM)) { /* Do not fetch more if the record will be too recent */ - if (op != MDB_FIRST && ++last >= oldest) { + if (op != MDBX_FIRST && ++last >= oldest) { if (!found_oldest) { oldest = mdbx_find_oldest(env, NULL); found_oldest = 1; @@ -1544,8 +1548,8 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { } rc = mdbx_cursor_get(&m2, &key, NULL, op); - if (rc == MDB_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { - if (op == MDB_SET_RANGE) + if (rc == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { + if (op == MDBX_SET_RANGE) continue; found_oldest = 1; if (oldest < mdbx_find_oldest(env, NULL)) { @@ -1553,12 +1557,12 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { last = oldest - 1; key.iov_base = &last; key.iov_len = sizeof(last); - op = MDB_SET_RANGE; + op = MDBX_SET_RANGE; rc = mdbx_cursor_get(&m2, &key, NULL, op); } } if (unlikely(rc)) { - if (rc == MDB_NOTFOUND) + if (rc == MDBX_NOTFOUND) 
break; goto fail; } @@ -1588,7 +1592,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { np = m2.mc_pg[m2.mc_top]; leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); - if (unlikely((rc = mdbx_node_read(&m2, leaf, &data)) != MDB_SUCCESS)) + if (unlikely((rc = mdbx_node_read(&m2, leaf, &data)) != MDBX_SUCCESS)) goto fail; if ((flags & MDBX_LIFORECLAIM) && !txn->mt_lifo_reclaimed) { @@ -1634,11 +1638,11 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { if (unlikely((flags & MDBX_ALLOC_CACHE) == 0)) { /* force gc reclaim mode */ - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Don't try to coalesce too much. */ - if (mop_len > MDB_IDL_UM_SIZE / 2) + if (mop_len > MDBX_IDL_UM_SIZE / 2) break; if (flags & MDBX_COALESCE) { if (mop_len /* current size */ >= env->me_maxfree_1pg / 2 || @@ -1661,27 +1665,27 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { /* Use new pages from the map when nothing suitable in the freeDB */ i = 0; pgno = txn->mt_next_pgno; - rc = MDB_MAP_FULL; + rc = MDBX_MAP_FULL; if (likely(pgno + num <= env->me_maxpg)) { - rc = MDB_NOTFOUND; + rc = MDBX_NOTFOUND; if (likely(flags & MDBX_ALLOC_NEW)) goto done; } if ((flags & MDBX_ALLOC_GC) && - ((flags & MDBX_ALLOC_KICK) || rc == MDB_MAP_FULL)) { - MDB_meta *head = mdbx_meta_head(env); - MDB_meta *tail = mdbx_env_meta_flipflop(env, head); + ((flags & MDBX_ALLOC_KICK) || rc == MDBX_MAP_FULL)) { + MDBX_meta *head = mdbx_meta_head(env); + MDBX_meta *tail = mdbx_env_meta_flipflop(env, head); if (oldest == tail->mm_txnid && META_IS_WEAK(head) && !META_IS_WEAK(tail)) { - MDB_meta meta = *head; + MDBX_meta meta = *head; /* LY: Here an oom was happened: * - all pages had allocated; * - reclaiming was stopped at the last steady-sync; * - the head-sync is weak. * Now we need make a sync to resume reclaiming. 
If both - * MDB_NOSYNC and MDB_MAPASYNC flags are set, then assume that + * MDBX_NOSYNC and MDBX_MAPASYNC flags are set, then assume that * utterly no-sync write mode was requested. In such case * don't make a steady-sync, but only a legacy-mode checkpoint, * just for resume reclaiming only, not for data consistency. */ @@ -1691,12 +1695,12 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N', tail->mm_txnid, META_IS_WEAK(tail) ? 'W' : 'N', oldest); - int me_flags = env->me_flags & MDB_WRITEMAP; + int me_flags = env->me_flags & MDBX_WRITEMAP; if ((env->me_flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC) me_flags |= MDBX_UTTERLY_NOSYNC; mdbx_assert(env, env->me_sync_pending > 0); - if (mdbx_env_sync_locked(env, me_flags, &meta) == MDB_SUCCESS) { + if (mdbx_env_sync_locked(env, me_flags, &meta) == MDBX_SUCCESS) { txnid_t snap = mdbx_find_oldest(env, NULL); if (snap > oldest) { continue; @@ -1704,7 +1708,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { } } - if (rc == MDB_MAP_FULL) { + if (rc == MDBX_MAP_FULL) { txnid_t snap = mdbx_oomkick(env, oldest); if (snap > oldest) { oldest = snap; @@ -1716,7 +1720,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { fail: if (mp) { *mp = NULL; - txn->mt_flags |= MDB_TXN_ERROR; + txn->mt_flags |= MDBX_TXN_ERROR; } assert(rc); return rc; @@ -1724,7 +1728,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDBX_page **mp, int flags) { done: assert(mp && num); - if (env->me_flags & MDB_WRITEMAP) { + if (env->me_flags & MDBX_WRITEMAP) { np = (MDBX_page *)(env->me_map + env->me_psize * pgno); /* LY: reset no-access flag from mdbx_kill_page() */ VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); @@ -1755,7 +1759,7 @@ done: mdbx_page_dirty(txn, np); *mp = np; - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Copy the used portions of a non-overflow page. 
@@ -1788,7 +1792,7 @@ static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, unsigned psize) { * [out] ret the writable page, if any. * ret is unchanged if mp wasn't spilled. */ static int mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, MDBX_page **ret) { - MDB_env *env = txn->mt_env; + MDBX_env *env = txn->mt_env; const MDBX_txn *tx2; unsigned x; pgno_t pgno = mp->mp_pgno, pn = pgno << 1; @@ -1801,12 +1805,12 @@ static int mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, MDBX_page **ret) { MDBX_page *np; int num; if (txn->mt_dirtyroom == 0) - return MDB_TXN_FULL; + return MDBX_TXN_FULL; if (IS_OVERFLOW(mp)) num = mp->mp_pages; else num = 1; - if (env->me_flags & MDB_WRITEMAP) { + if (env->me_flags & MDBX_WRITEMAP) { np = mp; } else { np = mdbx_page_malloc(txn, num); @@ -1835,24 +1839,24 @@ static int mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, MDBX_page **ret) { break; } } - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Touch a page: make it dirty and re-insert into tree with updated pgno. - * Set MDB_TXN_ERROR on failure. + * Set MDBX_TXN_ERROR on failure. * * [in] mc cursor pointing to the page to be touched * * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_touch(MDB_cursor *mc) { +static int mdbx_page_touch(MDBX_cursor *mc) { MDBX_page *mp = mc->mc_pg[mc->mc_top], *np; MDBX_txn *txn = mc->mc_txn; - MDB_cursor *m2, *m3; + MDBX_cursor *m2, *m3; pgno_t pgno; int rc; if (!F_ISSET(mp->mp_flags, P_DIRTY)) { - if (txn->mt_flags & MDB_TXN_SPILLS) { + if (txn->mt_flags & MDBX_TXN_SPILLS) { np = NULL; rc = mdbx_page_unspill(txn, mp, &np); if (unlikely(rc)) @@ -1877,7 +1881,7 @@ static int mdbx_page_touch(MDB_cursor *mc) { mc->mc_db->md_root = pgno; } } else if (txn->mt_parent && !IS_SUBP(mp)) { - MDB_ID2 mid, *dl = txn->mt_rw_dirtylist; + MDBX_ID2 mid, *dl = txn->mt_rw_dirtylist; pgno = mp->mp_pgno; /* If txn has a parent, make sure the page is in our * dirty list. 
*/ @@ -1886,13 +1890,13 @@ static int mdbx_page_touch(MDB_cursor *mc) { if (x <= dl[0].mid && dl[x].mid == pgno) { if (unlikely(mp != dl[x].mptr)) { /* bad cursor? */ mc->mc_flags &= ~(C_INITIALIZED | C_EOF); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PROBLEM; + txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PROBLEM; } return 0; } } - mdbx_cassert(mc, dl[0].mid < MDB_IDL_UM_MAX); + mdbx_cassert(mc, dl[0].mid < MDBX_IDL_UM_MAX); /* No - copy it */ np = mdbx_page_malloc(txn, 1); if (unlikely(!np)) @@ -1937,11 +1941,11 @@ done: return 0; fail: - txn->mt_flags |= MDB_TXN_ERROR; + txn->mt_flags |= MDBX_TXN_ERROR; return rc; } -int mdbx_env_sync(MDB_env *env, int force) { +int mdbx_env_sync(MDBX_env *env, int force) { if (unlikely(!env)) return MDBX_EINVAL; @@ -1949,42 +1953,42 @@ int mdbx_env_sync(MDB_env *env, int force) { return MDBX_EBADSIGN; if (unlikely(!env->me_lck)) - return MDB_PANIC; + return MDBX_PANIC; - unsigned flags = env->me_flags & ~MDB_NOMETASYNC; - if (unlikely(flags & (MDB_RDONLY | MDB_FATAL_ERROR))) + unsigned flags = env->me_flags & ~MDBX_NOMETASYNC; + if (unlikely(flags & (MDBX_RDONLY | MDBX_FATAL_ERROR))) return MDBX_EACCESS; int rc = mdbx_txn_lock(env); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; - MDB_meta *head = mdbx_meta_head(env); + MDBX_meta *head = mdbx_meta_head(env); if (!META_IS_STEADY(head) || env->me_sync_pending || env->me_mapsize != head->mm_mapsize) { if (force || head->mm_mapsize != env->me_mapsize || (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold)) - flags &= MDB_WRITEMAP /* clear flags for full steady sync */; + flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; if (env->me_sync_pending > env->me_psize * 16 && - (flags & MDB_NOSYNC) == 0) { - assert(((flags ^ env->me_flags) & MDB_WRITEMAP) == 0); + (flags & MDBX_NOSYNC) == 0) { + assert(((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); size_t used_size = env->me_psize * (head->mm_last_pg + 1); 
mdbx_txn_unlock(env); /* LY: pre-sync without holding lock to reduce latency for writer(s) */ - if (flags & MDB_WRITEMAP) { - rc = mdbx_msync(env->me_map, used_size, flags & MDB_MAPASYNC); + if (flags & MDBX_WRITEMAP) { + rc = mdbx_msync(env->me_map, used_size, flags & MDBX_MAPASYNC); } else { rc = mdbx_filesync(env->me_fd, false); } - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = mdbx_txn_lock(env); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; /* LY: head may be changed. */ @@ -1993,9 +1997,9 @@ int mdbx_env_sync(MDB_env *env, int force) { if (!META_IS_STEADY(head) || env->me_sync_pending || env->me_mapsize != head->mm_mapsize) { - MDB_meta meta = *head; + MDBX_meta meta = *head; rc = mdbx_env_sync_locked(env, flags, &meta); - if (unlikely(rc != MDB_SUCCESS)) { + if (unlikely(rc != MDBX_SUCCESS)) { mdbx_txn_unlock(env); return rc; } @@ -2003,22 +2007,22 @@ int mdbx_env_sync(MDB_env *env, int force) { } mdbx_txn_unlock(env); - assert(rc == MDB_SUCCESS); - return MDB_SUCCESS; + assert(rc == MDBX_SUCCESS); + return MDBX_SUCCESS; } /* Back up parent txn's cursors, then grab the originals for tracking */ static int mdbx_cursor_shadow(MDBX_txn *src, MDBX_txn *dst) { - MDB_cursor *mc, *bk; - MDB_xcursor *mx; + MDBX_cursor *mc, *bk; + MDBX_xcursor *mx; size_t size; int i; for (i = src->mt_numdbs; --i >= 0;) { if ((mc = src->mt_cursors[i]) != NULL) { - size = sizeof(MDB_cursor); + size = sizeof(MDBX_cursor); if (mc->mc_xcursor) - size += sizeof(MDB_xcursor); + size += sizeof(MDBX_xcursor); for (; mc; mc = bk->mc_next) { bk = malloc(size); if (unlikely(!bk)) @@ -2032,7 +2036,7 @@ static int mdbx_cursor_shadow(MDBX_txn *src, MDBX_txn *dst) { mc->mc_txn = dst; mc->mc_dbflag = &dst->mt_dbflags[i]; if ((mx = mc->mc_xcursor) != NULL) { - *(MDB_xcursor *)(bk + 1) = *mx; + *(MDBX_xcursor *)(bk + 1) = *mx; mx->mx_cursor.mc_txn = dst; } mc->mc_next = dst->mt_cursors[i]; @@ -2040,7 +2044,7 @@ static int 
mdbx_cursor_shadow(MDBX_txn *src, MDBX_txn *dst) { } } } - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Close this write txn's cursors, give parent txn's cursors back to parent. @@ -2050,8 +2054,8 @@ static int mdbx_cursor_shadow(MDBX_txn *src, MDBX_txn *dst) { * * Returns 0 on success, non-zero on failure. */ static void mdbx_cursors_eot(MDBX_txn *txn, unsigned merge) { - MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; - MDB_xcursor *mx; + MDBX_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; + MDBX_xcursor *mx; int i; for (i = txn->mt_numdbs; --i >= 0;) { @@ -2074,7 +2078,7 @@ static void mdbx_cursors_eot(MDBX_txn *txn, unsigned merge) { /* Abort nested txn */ *mc = *bk; if ((mx = mc->mc_xcursor) != NULL) - *mx = *(MDB_xcursor *)(bk + 1); + *mx = *(MDBX_xcursor *)(bk + 1); } bk->mc_signature = 0; free(bk); @@ -2093,46 +2097,46 @@ static void mdbx_cursors_eot(MDBX_txn *txn, unsigned merge) { /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { - MDB_env *env = txn->mt_env; + MDBX_env *env = txn->mt_env; unsigned i, nr; int rc; if (unlikely(env->me_pid != mdbx_getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; + env->me_flags |= MDBX_FATAL_ERROR; + return MDBX_PANIC; } - if (flags & MDB_TXN_RDONLY) { - txn->mt_flags = MDB_TXN_RDONLY; + if (flags & MDBX_TXN_RDONLY) { + txn->mt_flags = MDBX_TXN_RDONLY; MDBX_reader *r = txn->mt_ro_reader; - if (likely(env->me_flags & MDB_ENV_TXKEY)) { - mdbx_assert(env, !(env->me_flags & MDB_NOTLS)); + if (likely(env->me_flags & MDBX_ENV_TXKEY)) { + mdbx_assert(env, !(env->me_flags & MDBX_NOTLS)); r = mdbx_thread_rthc_get(env->me_txkey); if (likely(r)) { mdbx_assert(env, r->mr_pid == env->me_pid); mdbx_assert(env, r->mr_tid == mdbx_thread_self()); } } else { - mdbx_assert(env, !env->me_lck || (env->me_flags & MDB_NOTLS)); + mdbx_assert(env, !env->me_lck || (env->me_flags & MDBX_NOTLS)); } if (likely(r)) { if (unlikely(r->mr_pid != 
env->me_pid || r->mr_txnid != ~(txnid_t)0)) - return MDB_BAD_RSLOT; + return MDBX_BAD_RSLOT; } else if (env->me_lck) { mdbx_pid_t pid = env->me_pid; mdbx_tid_t tid = mdbx_thread_self(); - mdbx_assert(env, env->me_lck->mti_magic == MDB_MAGIC); - mdbx_assert(env, env->me_lck->mti_format == MDB_LOCK_FORMAT); + mdbx_assert(env, env->me_lck->mti_magic == MDBX_MAGIC); + mdbx_assert(env, env->me_lck->mti_format == MDBX_LOCK_FORMAT); rc = mdbx_rdt_lock(env); if (unlikely(MDBX_IS_ERROR(rc))) return rc; - rc = MDB_SUCCESS; + rc = MDBX_SUCCESS; if (unlikely(env->me_live_reader != pid)) { rc = mdbx_rpid_set(env); - if (unlikely(rc != MDB_SUCCESS)) { + if (unlikely(rc != MDBX_SUCCESS)) { mdbx_rdt_unlock(env); return rc; } @@ -2151,7 +2155,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { rc = mdbx_reader_check0(env, 1, NULL); if (rc != MDBX_RESULT_TRUE) { mdbx_rdt_unlock(env); - return (rc == MDB_SUCCESS) ? MDB_READERS_FULL : rc; + return (rc == MDBX_SUCCESS) ? MDBX_READERS_FULL : rc; } } @@ -2172,12 +2176,12 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { r->mr_pid = pid; mdbx_rdt_unlock(env); - if (likely(env->me_flags & MDB_ENV_TXKEY)) + if (likely(env->me_flags & MDBX_ENV_TXKEY)) mdbx_thread_rthc_set(env->me_txkey, r); } while (1) { - MDB_meta *const meta = mdbx_meta_head(txn->mt_env); + MDBX_meta *const meta = mdbx_meta_head(txn->mt_env); mdbx_jitter4testing(false); const txnid_t snap = meta->mm_txnid; mdbx_jitter4testing(false); @@ -2191,7 +2195,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { /* Snap the state from current meta-head */ txn->mt_txnid = snap; txn->mt_next_pgno = meta->mm_last_pg + 1; - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); txn->mt_canary = meta->mm_canary; /* LY: Retry on a race, ITS#7970. 
*/ @@ -2209,11 +2213,11 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { return rc; mdbx_jitter4testing(false); - MDB_meta *meta = mdbx_meta_head(env); + MDBX_meta *meta = mdbx_meta_head(env); mdbx_jitter4testing(false); txn->mt_canary = meta->mm_canary; txn->mt_txnid = meta->mm_txnid + 1; -#if MDB_DEBUG +#if MDBX_DEBUG if (unlikely(txn->mt_txnid == mdbx_debug_edge)) { if (!mdbx_debug_logger) mdbx_runtime_flags |= @@ -2224,7 +2228,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { #endif if (unlikely(txn->mt_txnid < meta->mm_txnid)) { mdbx_debug("txnid overflow!"); - rc = MDB_TXN_FULL; + rc = MDBX_TXN_FULL; goto bailout; } @@ -2232,7 +2236,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { txn->mt_child = NULL; txn->mt_loose_pages = NULL; txn->mt_loose_count = 0; - txn->mt_dirtyroom = MDB_IDL_UM_MAX; + txn->mt_dirtyroom = MDBX_IDL_UM_MAX; txn->mt_rw_dirtylist = env->me_dirtylist; txn->mt_rw_dirtylist[0].mid = 0; txn->mt_free_pages = env->me_free_pgs; @@ -2243,7 +2247,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { env->me_txn = txn; memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned)); /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDB_db)); + memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); /* Moved to here to avoid a data race in read TXNs */ txn->mt_next_pgno = meta->mm_last_pg + 1; } @@ -2254,22 +2258,22 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { unsigned x = env->me_dbflags[i]; txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; txn->mt_dbflags[i] = - (x & MDB_VALID) ? DB_VALID | DB_USRVALID | DB_STALE : 0; + (x & MDBX_VALID) ? 
DB_VALID | DB_USRVALID | DB_STALE : 0; } txn->mt_dbflags[MAIN_DBI] = DB_VALID | DB_USRVALID; txn->mt_dbflags[FREE_DBI] = DB_VALID; - if (unlikely(env->me_flags & MDB_FATAL_ERROR)) { + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { mdbx_debug("environment had fatal error, must shutdown!"); - rc = MDB_PANIC; + rc = MDBX_PANIC; } else if (unlikely(env->me_maxpg < txn->mt_next_pgno)) { - rc = MDB_MAP_RESIZED; + rc = MDBX_MAP_RESIZED; } else { - return MDB_SUCCESS; + return MDBX_SUCCESS; } bailout: - assert(rc != MDB_SUCCESS); - mdbx_txn_end(txn, MDB_END_SLOT | MDB_END_FAIL_BEGIN); + assert(rc != MDBX_SUCCESS); + mdbx_txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN); return rc; } @@ -2282,22 +2286,22 @@ int mdbx_txn_renew(MDBX_txn *txn) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY | MDB_TXN_FINISHED))) + if (unlikely(!F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY | MDBX_TXN_FINISHED))) return MDBX_EINVAL; - rc = mdbx_txn_renew0(txn, MDB_TXN_RDONLY); - if (rc == MDB_SUCCESS) { + rc = mdbx_txn_renew0(txn, MDBX_TXN_RDONLY); + if (rc == MDBX_SUCCESS) { mdbx_debug("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "", - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); } return rc; } -int mdbx_txn_begin(MDB_env *env, MDBX_txn *parent, unsigned flags, +int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, MDBX_txn **ret) { MDBX_txn *txn; - MDB_ntxn *ntxn; + MDBX_ntxn *ntxn; int rc, size, tsize; if (unlikely(!env || !ret)) @@ -2307,14 +2311,14 @@ int mdbx_txn_begin(MDB_env *env, MDBX_txn *parent, unsigned flags, return MDBX_EBADSIGN; if (unlikely(env->me_pid != mdbx_getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; + env->me_flags |= MDBX_FATAL_ERROR; + return MDBX_PANIC; } - flags &= MDB_TXN_BEGIN_FLAGS; - flags |= env->me_flags & MDB_WRITEMAP; + flags &= MDBX_TXN_BEGIN_FLAGS; + flags |= env->me_flags & MDBX_WRITEMAP; - if (unlikely(env->me_flags & MDB_RDONLY & + if (unlikely(env->me_flags & MDBX_RDONLY & ~flags)) /* write txn in RDONLY env */ return MDBX_EACCESS; @@ -2324,14 +2328,14 @@ int mdbx_txn_begin(MDB_env *env, MDBX_txn *parent, unsigned flags, /* Nested transactions: Max 1 child, write txns only, no writemap */ flags |= parent->mt_flags; - if (unlikely(flags & (MDB_RDONLY | MDB_WRITEMAP | MDB_TXN_BLOCKED))) { - return (parent->mt_flags & MDB_TXN_RDONLY) ? MDBX_EINVAL : MDB_BAD_TXN; + if (unlikely(flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED))) { + return (parent->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EINVAL : MDBX_BAD_TXN; } - /* Child txns save MDB_pgstate and use own copy of cursors */ - size = env->me_maxdbs * (sizeof(MDB_db) + sizeof(MDB_cursor *) + 1); - size += tsize = sizeof(MDB_ntxn); - } else if (flags & MDB_RDONLY) { - size = env->me_maxdbs * (sizeof(MDB_db) + 1); + /* Child txns save MDBX_pgstate and use own copy of cursors */ + size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); + size += tsize = sizeof(MDBX_ntxn); + } else if (flags & MDBX_RDONLY) { + size = env->me_maxdbs * (sizeof(MDBX_db) + 1); size += tsize = sizeof(MDBX_txn); } else { /* Reuse preallocated write txn. 
However, do not touch it until @@ -2344,18 +2348,18 @@ int mdbx_txn_begin(MDB_env *env, MDBX_txn *parent, unsigned flags, return MDBX_ENOMEM; } txn->mt_dbxs = env->me_dbxs; /* static */ - txn->mt_dbs = (MDB_db *)((char *)txn + tsize); + txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); txn->mt_dbflags = (uint8_t *)txn + size - env->me_maxdbs; txn->mt_flags = flags; txn->mt_env = env; if (parent) { unsigned i; - txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); txn->mt_dbiseqs = parent->mt_dbiseqs; - txn->mt_rw_dirtylist = malloc(sizeof(MDB_ID2) * MDB_IDL_UM_SIZE); + txn->mt_rw_dirtylist = malloc(sizeof(MDBX_ID2) * MDBX_IDL_UM_SIZE); if (!txn->mt_rw_dirtylist || - !(txn->mt_free_pages = mdbx_midl_alloc(MDB_IDL_UM_MAX))) { + !(txn->mt_free_pages = mdbx_midl_alloc(MDBX_IDL_UM_MAX))) { free(txn->mt_rw_dirtylist); free(txn); return MDBX_ENOMEM; @@ -2365,19 +2369,19 @@ int mdbx_txn_begin(MDB_env *env, MDBX_txn *parent, unsigned flags, txn->mt_rw_dirtylist[0].mid = 0; txn->mt_spill_pages = NULL; txn->mt_next_pgno = parent->mt_next_pgno; - parent->mt_flags |= MDB_TXN_HAS_CHILD; + parent->mt_flags |= MDBX_TXN_HAS_CHILD; parent->mt_child = txn; txn->mt_parent = parent; txn->mt_numdbs = parent->mt_numdbs; - memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); /* Copy parent's mt_dbflags, but clear DB_NEW */ for (i = 0; i < txn->mt_numdbs; i++) txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; rc = 0; - ntxn = (MDB_ntxn *)txn; + ntxn = (MDBX_ntxn *)txn; ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ if (env->me_pghead) { - size = MDB_IDL_SIZEOF(env->me_pghead); + size = MDBX_IDL_SIZEOF(env->me_pghead); env->me_pghead = mdbx_midl_alloc(env->me_pghead[0]); if (likely(env->me_pghead)) memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); @@ -2387,8 +2391,8 @@ int 
mdbx_txn_begin(MDB_env *env, MDBX_txn *parent, unsigned flags, if (likely(!rc)) rc = mdbx_cursor_shadow(parent, txn); if (unlikely(rc)) - mdbx_txn_end(txn, MDB_END_FAIL_BEGINCHILD); - } else { /* MDB_RDONLY */ + mdbx_txn_end(txn, MDBX_END_FAIL_BEGINCHILD); + } else { /* MDBX_RDONLY */ txn->mt_dbiseqs = env->me_dbiseqs; renew: rc = mdbx_txn_renew0(txn, flags); @@ -2401,14 +2405,14 @@ int mdbx_txn_begin(MDB_env *env, MDBX_txn *parent, unsigned flags, txn->mt_signature = MDBX_MT_SIGNATURE; *ret = txn; mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "", - txn->mt_txnid, (flags & MDB_RDONLY) ? 'r' : 'w', (void *)txn, + txn->mt_txnid, (flags & MDBX_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); } return rc; } -MDB_env *mdbx_txn_env(MDBX_txn *txn) { +MDBX_env *mdbx_txn_env(MDBX_txn *txn) { if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) return NULL; return txn->mt_env; @@ -2422,14 +2426,14 @@ uint64_t mdbx_txn_id(MDBX_txn *txn) { /* Export or close DBI handles opened in this txn. 
*/ static void mdbx_dbis_update(MDBX_txn *txn, int keep) { - MDB_dbi n = txn->mt_numdbs; - MDB_env *env = txn->mt_env; + MDBX_dbi n = txn->mt_numdbs; + MDBX_env *env = txn->mt_env; uint8_t *tdbflags = txn->mt_dbflags; for (unsigned i = n; --i >= CORE_DBS;) { if (tdbflags[i] & DB_NEW) { if (keep) { - env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID; + env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDBX_VALID; } else { char *ptr = env->me_dbxs[i].md_name.iov_base; if (ptr) { @@ -2451,41 +2455,41 @@ static void mdbx_dbis_update(MDBX_txn *txn, int keep) { * [in] txn the transaction handle to end * [in] mode why and how to end the transaction */ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { - MDB_env *env = txn->mt_env; - static const char *const names[] = MDB_END_NAMES; + MDBX_env *env = txn->mt_env; + static const char *const names[] = MDBX_END_NAMES; if (unlikely(txn->mt_env->me_pid != mdbx_getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; + env->me_flags |= MDBX_FATAL_ERROR; + return MDBX_PANIC; } /* Export or close DBI handles opened in this txn */ - mdbx_dbis_update(txn, mode & MDB_END_UPDATE); + mdbx_dbis_update(txn, mode & MDBX_END_UPDATE); mdbx_debug("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO "", - names[mode & MDB_END_OPMASK], txn->mt_txnid, - (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *)txn, + names[mode & MDBX_END_OPMASK], txn->mt_txnid, + (txn->mt_flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { + if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { if (txn->mt_ro_reader) { txn->mt_ro_reader->mr_txnid = ~(txnid_t)0; - if (mode & MDB_END_SLOT) { - if ((env->me_flags & MDB_ENV_TXKEY) == 0) + if (mode & MDBX_END_SLOT) { + if ((env->me_flags & MDBX_ENV_TXKEY) == 0) txn->mt_ro_reader->mr_pid = 0; txn->mt_ro_reader = NULL; } } mdbx_coherent_barrier(); txn->mt_numdbs = 0; /* prevent further DBI activity */ - txn->mt_flags |= MDB_TXN_FINISHED; + txn->mt_flags |= MDBX_TXN_FINISHED; - } else if (!F_ISSET(txn->mt_flags, MDB_TXN_FINISHED)) { + } else if (!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED)) { pgno_t *pghead = env->me_pghead; - if (!(mode & MDB_END_EOTDONE)) /* !(already closed cursors) */ + if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ mdbx_cursors_eot(txn, 0); - if (!(env->me_flags & MDB_WRITEMAP)) { + if (!(env->me_flags & MDBX_WRITEMAP)) { mdbx_dlist_free(txn); } @@ -2497,7 +2501,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { } } txn->mt_numdbs = 0; - txn->mt_flags = MDB_TXN_FINISHED; + txn->mt_flags = MDBX_TXN_FINISHED; if (!txn->mt_parent) { mdbx_midl_shrink(&txn->mt_free_pages); @@ -2513,8 +2517,8 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { mdbx_txn_unlock(env); } else { txn->mt_parent->mt_child = NULL; - txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD; - env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; + txn->mt_parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; + env->me_pgstate = ((MDBX_ntxn *)txn)->mnt_pgstate; mdbx_midl_free(txn->mt_free_pages); mdbx_midl_free(txn->mt_spill_pages); free(txn->mt_rw_dirtylist); @@ -2523,12 +2527,12 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { mdbx_midl_free(pghead); } - if (mode & MDB_END_FREE) { + if (mode & MDBX_END_FREE) { txn->mt_signature = 0; free(txn); } - return MDB_SUCCESS; + return MDBX_SUCCESS; } int mdbx_txn_reset(MDBX_txn *txn) { @@ -2539,11 
+2543,11 @@ int mdbx_txn_reset(MDBX_txn *txn) { return MDBX_EBADSIGN; /* This call is only valid for read-only txns */ - if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY))) + if (unlikely(!(txn->mt_flags & MDBX_TXN_RDONLY))) return MDBX_EINVAL; /* LY: don't close DBI-handles in MDBX mode */ - return mdbx_txn_end(txn, MDB_END_RESET | MDB_END_UPDATE); + return mdbx_txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); } int mdbx_txn_abort(MDBX_txn *txn) { @@ -2553,15 +2557,15 @@ int mdbx_txn_abort(MDBX_txn *txn) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) + if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) /* LY: don't close DBI-handles in MDBX mode */ - return mdbx_txn_end(txn, MDB_END_ABORT | MDB_END_UPDATE | MDB_END_SLOT | - MDB_END_FREE); + return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | + MDBX_END_FREE); if (txn->mt_child) mdbx_txn_abort(txn->mt_child); - return mdbx_txn_end(txn, MDB_END_ABORT | MDB_END_SLOT | MDB_END_FREE); + return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); } static __inline int mdbx_backlog_size(MDBX_txn *txn) { @@ -2572,7 +2576,7 @@ static __inline int mdbx_backlog_size(MDBX_txn *txn) { /* LY: Prepare a backlog of pages to modify FreeDB itself, * while reclaiming is prohibited. It should be enough to prevent search * in mdbx_page_alloc() during a deleting, when freeDB tree is unbalanced. */ -static int mdbx_prep_backlog(MDBX_txn *txn, MDB_cursor *mc) { +static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { /* LY: extra page(s) for b-tree rebalancing */ const int extra = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) ? 
2 : 1; @@ -2584,14 +2588,14 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDB_cursor *mc) { while (unlikely(mdbx_backlog_size(txn) < extra)) { rc = mdbx_page_alloc(mc, 1, NULL, MDBX_ALLOC_GC); if (unlikely(rc)) { - if (unlikely(rc != MDB_NOTFOUND)) + if (unlikely(rc != MDBX_NOTFOUND)) return rc; break; } } } - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Save the freelist as of this transaction to the freeDB. @@ -2601,8 +2605,8 @@ static int mdbx_freelist_save(MDBX_txn *txn) { /* env->me_pghead[] can grow and shrink during this call. * env->me_pglast and txn->mt_free_pages[] can only grow. * Page numbers cannot disappear from txn->mt_free_pages[]. */ - MDB_cursor mc; - MDB_env *env = txn->mt_env; + MDBX_cursor mc; + MDBX_env *env = txn->mt_env; int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; txnid_t pglast = 0, head_id = 0; pgno_t freecnt = 0, *free_pgs, *mop; @@ -2612,9 +2616,10 @@ static int mdbx_freelist_save(MDBX_txn *txn) { mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); - /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ - clean_limit = (env->me_flags & (MDB_NOMEMINIT | MDB_WRITEMAP)) ? SSIZE_MAX - : maxfree_1pg; + /* MDBX_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */ + clean_limit = (env->me_flags & (MDBX_NOMEMINIT | MDBX_WRITEMAP)) + ? 
SSIZE_MAX + : maxfree_1pg; again: for (;;) { @@ -2649,8 +2654,8 @@ again: pglast = txn->mt_lifo_reclaimed[++cleanup_idx]; key.iov_base = &pglast; key.iov_len = sizeof(pglast); - rc = mdbx_cursor_get(&mc, &key, NULL, MDB_SET); - if (likely(rc != MDB_NOTFOUND)) { + rc = mdbx_cursor_get(&mc, &key, NULL, MDBX_SET); + if (likely(rc != MDBX_NOTFOUND)) { if (unlikely(rc)) goto bailout; rc = mdbx_prep_backlog(txn, &mc); @@ -2682,8 +2687,8 @@ again: if (freecnt < txn->mt_free_pages[0]) { if (unlikely(!freecnt)) { /* Make sure last page of freeDB is touched and on freelist */ - rc = mdbx_page_search(&mc, NULL, MDB_PS_LAST | MDB_PS_MODIFY); - if (unlikely(rc && rc != MDB_NOTFOUND)) + rc = mdbx_page_search(&mc, NULL, MDBX_PS_LAST | MDBX_PS_MODIFY); + if (unlikely(rc && rc != MDBX_NOTFOUND)) goto bailout; } free_pgs = txn->mt_free_pages; @@ -2692,8 +2697,8 @@ again: key.iov_base = &txn->mt_txnid; do { freecnt = free_pgs[0]; - data.iov_len = MDB_IDL_SIZEOF(free_pgs); - rc = mdbx_cursor_put(&mc, &key, &data, MDB_RESERVE); + data.iov_len = MDBX_IDL_SIZEOF(free_pgs); + rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE); if (unlikely(rc)) goto bailout; /* Retry if mt_free_pages[] grew during the Put() */ @@ -2742,14 +2747,14 @@ again: if (likely(rc == 0)) /* LY: ok, reclaimed from freedb. */ continue; - if (unlikely(rc != MDB_NOTFOUND)) + if (unlikely(rc != MDBX_NOTFOUND)) /* LY: other troubles... */ goto bailout; /* LY: freedb is empty, will look any free txn-id in high2low order. */ if (unlikely(env->me_pglast < 1)) { /* LY: not any txn in the past of freedb. 
*/ - rc = MDB_MAP_FULL; + rc = MDBX_MAP_FULL; goto bailout; } @@ -2786,7 +2791,7 @@ again: key.iov_len = sizeof(head_id); key.iov_base = &head_id; data.iov_len = (head_room + 1) * sizeof(pgno_t); - rc = mdbx_cursor_put(&mc, &key, &data, MDB_RESERVE); + rc = mdbx_cursor_put(&mc, &key, &data, MDBX_RESERVE); if (unlikely(rc)) goto bailout; /* IDL is initially empty, zero out at least the length */ @@ -2807,12 +2812,12 @@ again: if (txn->mt_loose_pages) { MDBX_page *mp = txn->mt_loose_pages; unsigned count = txn->mt_loose_count; - MDB_IDL loose; + MDBX_IDL loose; /* Room for loose pages + temp IDL with same */ if ((rc = mdbx_midl_need(&env->me_pghead, 2 * count + 1)) != 0) goto bailout; mop = env->me_pghead; - loose = mop + MDB_IDL_ALLOCLEN(mop) - count; + loose = mop + MDBX_IDL_ALLOCLEN(mop) - count; for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) loose[++count] = mp->mp_pgno; loose[0] = count; @@ -2824,7 +2829,7 @@ again: } /* Fill in the reserved me_pghead records */ - rc = MDB_SUCCESS; + rc = MDBX_SUCCESS; if (mop_len) { MDBX_val key, data; key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ @@ -2851,7 +2856,7 @@ again: id = txn->mt_lifo_reclaimed[refill_idx--]; key.iov_base = &id; key.iov_len = sizeof(id); - rc = mdbx_cursor_get(&mc, &key, &data, MDB_SET); + rc = mdbx_cursor_get(&mc, &key, &data, MDBX_SET); if (unlikely(rc)) goto bailout; } @@ -2870,7 +2875,7 @@ again: save = mop[0]; mop[0] = len; - rc = mdbx_cursor_put(&mc, &key, &data, MDB_CURRENT); + rc = mdbx_cursor_put(&mc, &key, &data, MDBX_CURRENT); mdbx_tassert( txn, cleanup_idx == (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)); @@ -2879,7 +2884,7 @@ again: goto bailout; if (!lifo) { - rc = mdbx_cursor_next(&mc, &key, &data, MDB_NEXT); + rc = mdbx_cursor_next(&mc, &key, &data, MDBX_NEXT); if (unlikely(rc)) goto bailout; } @@ -2914,21 +2919,21 @@ bailout: * [in] keep number of initial pages in dirtylist to keep dirty. * Returns 0 on success, non-zero on failure. 
*/ static int mdbx_page_flush(MDBX_txn *txn, int keep) { - MDB_env *env = txn->mt_env; - MDB_ID2L dl = txn->mt_rw_dirtylist; + MDBX_env *env = txn->mt_env; + MDBX_ID2L dl = txn->mt_rw_dirtylist; unsigned psize = env->me_psize, j; int i, pagecount = dl[0].mid, rc; size_t size = 0, pos = 0; pgno_t pgno = 0; MDBX_page *dp = NULL; - struct iovec iov[MDB_COMMIT_PAGES]; + struct iovec iov[MDBX_COMMIT_PAGES]; ssize_t wpos = 0, wsize = 0; size_t next_pos = 1; /* impossible pos, so pos != next_pos */ int n = 0; j = i = keep; - if (env->me_flags & MDB_WRITEMAP) { + if (env->me_flags & MDBX_WRITEMAP) { /* Clear dirty flags */ while (++i <= pagecount) { dp = dl[i].mptr; @@ -2963,12 +2968,12 @@ static int mdbx_page_flush(MDBX_txn *txn, int keep) { size *= dp->mp_pages; env->me_sync_pending += size; } - /* Write up to MDB_COMMIT_PAGES dirty pages at a time. */ - if (pos != next_pos || n == MDB_COMMIT_PAGES || wsize + size > MAX_WRITE) { + /* Write up to MDBX_COMMIT_PAGES dirty pages at a time. */ + if (pos != next_pos || n == MDBX_COMMIT_PAGES || wsize + size > MAX_WRITE) { if (n) { /* Write previous page(s) */ rc = mdbx_pwritev(env->me_fd, iov, n, wpos, wsize); - if (unlikely(rc != MDB_SUCCESS)) { + if (unlikely(rc != MDBX_SUCCESS)) { mdbx_debug("Write error: %s", strerror(rc)); return rc; } @@ -3004,7 +3009,7 @@ done: i--; txn->mt_dirtyroom += i - j; dl[0].mid = j; - return MDB_SUCCESS; + return MDBX_SUCCESS; } int mdbx_txn_commit(MDBX_txn *txn) { @@ -3016,38 +3021,38 @@ int mdbx_txn_commit(MDBX_txn *txn) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; - MDB_env *env = txn->mt_env; + MDBX_env *env = txn->mt_env; if (unlikely(env->me_pid != mdbx_getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; + env->me_flags |= MDBX_FATAL_ERROR; + return MDBX_PANIC; } if (txn->mt_child) { rc = mdbx_txn_commit(txn->mt_child); txn->mt_child = NULL; - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto fail; } /* 
mdbx_txn_end() mode for a commit which writes nothing */ unsigned end_mode = - MDB_END_EMPTY_COMMIT | MDB_END_UPDATE | MDB_END_SLOT | MDB_END_FREE; - if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) + MDBX_END_EMPTY_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; + if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) goto done; - if (unlikely(txn->mt_flags & (MDB_TXN_FINISHED | MDB_TXN_ERROR))) { + if (unlikely(txn->mt_flags & (MDBX_TXN_FINISHED | MDBX_TXN_ERROR))) { mdbx_debug("error flag is set, can't commit"); if (txn->mt_parent) - txn->mt_parent->mt_flags |= MDB_TXN_ERROR; - rc = MDB_BAD_TXN; + txn->mt_parent->mt_flags |= MDBX_TXN_ERROR; + rc = MDBX_BAD_TXN; goto fail; } if (txn->mt_parent) { MDBX_txn *parent = txn->mt_parent; MDBX_page **lp; - MDB_ID2L dst, src; - MDB_IDL pspill; + MDBX_ID2L dst, src; + MDBX_IDL pspill; unsigned i, x, y, len, ps_len; /* Append our reclaim list to parent's */ @@ -3055,7 +3060,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { if (parent->mt_lifo_reclaimed) { rc = mdbx_midl_append_list(&parent->mt_lifo_reclaimed, txn->mt_lifo_reclaimed); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto fail; mdbx_midl_free(txn->mt_lifo_reclaimed); } else @@ -3065,11 +3070,11 @@ int mdbx_txn_commit(MDBX_txn *txn) { /* Append our free list to parent's */ rc = mdbx_midl_append_list(&parent->mt_free_pages, txn->mt_free_pages); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto fail; mdbx_midl_free(txn->mt_free_pages); /* Failures after this must either undo the changes - * to the parent or set MDB_TXN_ERROR in the parent. */ + * to the parent or set MDBX_TXN_ERROR in the parent. */ parent->mt_next_pgno = txn->mt_next_pgno; parent->mt_flags = txn->mt_flags; @@ -3078,7 +3083,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { mdbx_cursors_eot(txn, 1); /* Update parent's DB table. 
*/ - memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); parent->mt_numdbs = txn->mt_numdbs; parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI]; parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; @@ -3146,7 +3151,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { } } } else { /* Simplify the above for single-ancestor case */ - len = MDB_IDL_UM_MAX - txn->mt_dirtyroom; + len = MDBX_IDL_UM_MAX - txn->mt_dirtyroom; } /* Merge our dirty list with parent's */ y = src[0].mid; @@ -3166,8 +3171,8 @@ int mdbx_txn_commit(MDBX_txn *txn) { /* TODO: Prevent failure here, so parent does not fail */ rc = mdbx_midl_append_list(&parent->mt_spill_pages, txn->mt_spill_pages); - if (unlikely(rc != MDB_SUCCESS)) - parent->mt_flags |= MDB_TXN_ERROR; + if (unlikely(rc != MDBX_SUCCESS)) + parent->mt_flags |= MDBX_TXN_ERROR; mdbx_midl_free(txn->mt_spill_pages); mdbx_midl_sort(parent->mt_spill_pages); } else { @@ -3182,7 +3187,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { parent->mt_loose_count += txn->mt_loose_count; parent->mt_child = NULL; - mdbx_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); + mdbx_midl_free(((MDBX_ntxn *)txn)->mnt_pgstate.mf_pghead); txn->mt_signature = 0; free(txn); return rc; @@ -3195,10 +3200,10 @@ int mdbx_txn_commit(MDBX_txn *txn) { } mdbx_cursors_eot(txn, 0); - end_mode |= MDB_END_EOTDONE; + end_mode |= MDBX_END_EOTDONE; if (!txn->mt_rw_dirtylist[0].mid && - !(txn->mt_flags & (MDB_TXN_DIRTY | MDB_TXN_SPILLS))) + !(txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS))) goto done; mdbx_debug( @@ -3207,28 +3212,28 @@ int mdbx_txn_commit(MDBX_txn *txn) { /* Update DB root pointers */ if (txn->mt_numdbs > CORE_DBS) { - MDB_cursor mc; - MDB_dbi i; + MDBX_cursor mc; + MDBX_dbi i; MDBX_val data; - data.iov_len = sizeof(MDB_db); + data.iov_len = sizeof(MDBX_db); mdbx_cursor_init(&mc, txn, MAIN_DBI, NULL); for (i = CORE_DBS; i < txn->mt_numdbs; i++) { if (txn->mt_dbflags[i] & 
DB_DIRTY) { if (unlikely(TXN_DBI_CHANGED(txn, i))) { - rc = MDB_BAD_DBI; + rc = MDBX_BAD_DBI; goto fail; } data.iov_base = &txn->mt_dbs[i]; rc = mdbx_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, F_SUBDATA); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto fail; } } } rc = mdbx_freelist_save(txn); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto fail; mdbx_midl_free(env->me_pghead); @@ -3239,8 +3244,8 @@ int mdbx_txn_commit(MDBX_txn *txn) { mdbx_audit(txn); rc = mdbx_page_flush(txn, 0); - if (likely(rc == MDB_SUCCESS)) { - MDB_meta meta; + if (likely(rc == MDBX_SUCCESS)) { + MDBX_meta meta; meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; @@ -3250,9 +3255,9 @@ int mdbx_txn_commit(MDBX_txn *txn) { rc = mdbx_env_sync_locked(env, env->me_flags | txn->mt_flags, &meta); } - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto fail; - end_mode = MDB_END_COMMITTED | MDB_END_UPDATE | MDB_END_EOTDONE; + end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; done: return mdbx_txn_end(txn, end_mode); @@ -3264,19 +3269,19 @@ fail: /* Read the environment parameters of a DB environment * before mapping it into memory. */ -static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { - assert(offsetof(MDB_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); - memset(meta, 0, sizeof(MDB_meta)); - meta->mm_datasync_sign = MDB_DATASIGN_WEAK; +static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { + assert(offsetof(MDBX_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); + memset(meta, 0, sizeof(MDBX_meta)); + meta->mm_datasync_sign = MDBX_DATASIGN_WEAK; unsigned offset = 0; /* Read both meta pages so we can use the latest one. */ for (int loops_left = 2; --loops_left >= 0;) { - MDB_metabuf buf; + MDBX_metabuf buf; /* We don't know the page size on first time, so use a minimum value. 
*/ int rc = mdbx_pread(env->me_fd, &buf, sizeof(buf), offset); - if (rc != MDB_SUCCESS) { + if (rc != MDBX_SUCCESS) { mdbx_debug("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(buf), rc, mdbx_strerror(rc)); return rc; @@ -3285,19 +3290,19 @@ static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { MDBX_page *p = (MDBX_page *)&buf; if (!F_ISSET(p->mp_flags, P_META)) { mdbx_debug("page %" PRIaPGNO " not a meta-page", p->mp_pgno); - return MDB_INVALID; + return MDBX_INVALID; } - MDB_meta *m = PAGEDATA(p); - if (m->mm_magic != MDB_MAGIC) { + MDBX_meta *m = PAGEDATA(p); + if (m->mm_magic != MDBX_MAGIC) { mdbx_debug("meta[%u] has invalid magic", offset); - return MDB_INVALID; + return MDBX_INVALID; } - if (m->mm_version != MDB_DATA_VERSION) { + if (m->mm_version != MDBX_DATA_VERSION) { mdbx_debug("database is version %u, expected version %u", m->mm_version, - MDB_DATA_VERSION); - return MDB_VERSION_MISMATCH; + MDBX_DATA_VERSION); + return MDBX_VERSION_MISMATCH; } /* LY: check signature as a checksum */ @@ -3325,31 +3330,31 @@ static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) { if (META_IS_WEAK(meta)) { mdbx_debug("both meta-pages are weak, database is corrupted"); - return MDB_CORRUPTED; + return MDBX_CORRUPTED; } - return MDB_SUCCESS; + return MDBX_SUCCESS; } -/* Fill in most of the zeroed MDB_meta for an empty database environment */ -static void __cold mdbx_meta_model(const MDB_env *env, MDB_meta *model) { +/* Fill in most of the zeroed MDBX_meta for an empty database environment */ +static void __cold mdbx_meta_model(const MDBX_env *env, MDBX_meta *model) { memset(model, 0, sizeof(*model)); - model->mm_magic = MDB_MAGIC; - model->mm_version = MDB_DATA_VERSION; + model->mm_magic = MDBX_MAGIC; + model->mm_version = MDBX_DATA_VERSION; model->mm_mapsize = env->me_mapsize; model->mm_psize = env->me_psize; model->mm_last_pg = NUM_METAS - 1; model->mm_flags = (uint16_t)env->me_flags; - model->mm_flags |= MDB_INTEGERKEY; /* this is 
mm_dbs[FREE_DBI].md_flags */ + model->mm_flags |= MDBX_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ model->mm_dbs[FREE_DBI].md_root = P_INVALID; model->mm_dbs[MAIN_DBI].md_root = P_INVALID; model->mm_datasync_sign = mdbx_meta_sign(model); } /* Write the environment parameters of a freshly created DB environment. */ -static int __cold mdbx_env_init_metas(const MDB_env *env, MDB_meta *model) { +static int __cold mdbx_env_init_metas(const MDBX_env *env, MDBX_meta *model) { mdbx_debug("writing new meta pages"); - assert(offsetof(MDB_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); + assert(offsetof(MDBX_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); unsigned page_size = env->me_psize; MDBX_page *first = calloc(NUM_METAS, page_size); @@ -3357,12 +3362,12 @@ static int __cold mdbx_env_init_metas(const MDB_env *env, MDB_meta *model) { return MDBX_ENOMEM; first->mp_pgno = 0; first->mp_flags = P_META; - MDB_meta *first_meta = (MDB_meta *)PAGEDATA(first); + MDBX_meta *first_meta = (MDBX_meta *)PAGEDATA(first); MDBX_page *second = (MDBX_page *)((char *)first + page_size); second->mp_pgno = 1; second->mp_flags = P_META; - MDB_meta *second_meta = (MDB_meta *)PAGEDATA(second); + MDBX_meta *second_meta = (MDBX_meta *)PAGEDATA(second); *first_meta = *model; model->mm_txnid += 1; @@ -3374,15 +3379,15 @@ static int __cold mdbx_env_init_metas(const MDB_env *env, MDB_meta *model) { return rc; } -static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, - MDB_meta *pending) { +static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, + MDBX_meta *pending) { int rc; - MDB_meta *head = mdbx_meta_head(env); + MDBX_meta *head = mdbx_meta_head(env); size_t prev_mapsize = head->mm_mapsize; size_t used_size = env->me_psize * (pending->mm_last_pg + 1); mdbx_assert(env, pending != METAPAGE_1(env) && pending != METAPAGE_2(env)); - mdbx_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0); + mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); 
mdbx_assert(env, !META_IS_STEADY(head) || env->me_sync_pending != 0 || env->me_mapsize != prev_mapsize); @@ -3391,23 +3396,23 @@ static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, if (unlikely(pending->mm_mapsize != prev_mapsize)) { if (pending->mm_mapsize < prev_mapsize) { /* LY: currently this can't happen, but force full-sync. */ - flags &= MDB_WRITEMAP; + flags &= MDBX_WRITEMAP; } else { /* Persist any increases of mapsize config */ } } if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold) - flags &= MDB_WRITEMAP; + flags &= MDBX_WRITEMAP; /* LY: step#1 - sync previously written/updated data-pages */ - if (env->me_sync_pending && (flags & MDB_NOSYNC) == 0) { - assert(((flags ^ env->me_flags) & MDB_WRITEMAP) == 0); - if (flags & MDB_WRITEMAP) { - rc = mdbx_msync(env->me_map, used_size, flags & MDB_MAPASYNC); - if (unlikely(rc != MDB_SUCCESS)) + if (env->me_sync_pending && (flags & MDBX_NOSYNC) == 0) { + assert(((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + if (flags & MDBX_WRITEMAP) { + rc = mdbx_msync(env->me_map, used_size, flags & MDBX_MAPASYNC); + if (unlikely(rc != MDBX_SUCCESS)) goto fail; - if ((flags & MDB_MAPASYNC) == 0) + if ((flags & MDBX_MAPASYNC) == 0) env->me_sync_pending = 0; } else { bool fullsync = false; @@ -3425,7 +3430,7 @@ static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, fullsync = true; } rc = mdbx_filesync(env->me_fd, fullsync); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto fail; env->me_sync_pending = 0; } @@ -3437,17 +3442,17 @@ static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, } else { pending->mm_datasync_sign = (flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC - ? MDB_DATASIGN_NONE - : MDB_DATASIGN_WEAK; + ? MDBX_DATASIGN_NONE + : MDBX_DATASIGN_WEAK; } - volatile MDB_meta *target = + volatile MDBX_meta *target = (pending->mm_txnid == head->mm_txnid || META_IS_WEAK(head)) ? 
head : mdbx_env_meta_flipflop(env, head); off_t offset = (char *)target - env->me_map; - MDB_meta *stay = mdbx_env_meta_flipflop(env, (MDB_meta *)target); + MDBX_meta *stay = mdbx_env_meta_flipflop(env, (MDBX_meta *)target); mdbx_debug( "writing meta %d (%s, was %" PRIaTXN "/%s, stay %s %" PRIaTXN "/%s), root %" PRIaPGNO ", " @@ -3462,12 +3467,12 @@ static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, META_IS_WEAK(pending) ? "Weak" : META_IS_STEADY(pending) ? "Steady" : "Legacy"); - if (env->me_flags & MDB_WRITEMAP) { + if (env->me_flags & MDBX_WRITEMAP) { /* LY: 'invalidate' the meta. */ mdbx_jitter4testing(true); - if (target->mm_datasync_sign != MDB_DATASIGN_WEAK || + if (target->mm_datasync_sign != MDBX_DATASIGN_WEAK || target->mm_txnid != pending->mm_txnid) { - target->mm_datasync_sign = MDB_DATASIGN_WEAK; + target->mm_datasync_sign = MDBX_DATASIGN_WEAK; mdbx_jitter4testing(true); target->mm_txnid = 0; mdbx_jitter4testing(true); @@ -3485,18 +3490,18 @@ static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, target->mm_datasync_sign = pending->mm_datasync_sign; mdbx_jitter4testing(true); } else { - pending->mm_magic = MDB_MAGIC; - pending->mm_version = MDB_DATA_VERSION; - rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDB_meta), offset); - if (unlikely(rc != MDB_SUCCESS)) { + pending->mm_magic = MDBX_MAGIC; + pending->mm_version = MDBX_DATA_VERSION; + rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), offset); + if (unlikely(rc != MDBX_SUCCESS)) { undo: mdbx_debug("write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Try write some old data back, to prevent it from being used. 
*/ - mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDB_meta), offset); + mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDBX_meta), offset); goto fail; } - mdbx_invalidate_cache(env->me_map + offset, sizeof(MDB_meta)); + mdbx_invalidate_cache(env->me_map + offset, sizeof(MDBX_meta)); } /* Memory ordering issues are irrelevant; since the entire writer @@ -3506,16 +3511,16 @@ static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, * how stale their view of these values is. */ /* LY: step#3 - sync meta-pages. */ - if ((flags & (MDB_NOSYNC | MDB_NOMETASYNC)) == 0) { - assert(((flags ^ env->me_flags) & MDB_WRITEMAP) == 0); - if (flags & MDB_WRITEMAP) { + if ((flags & (MDBX_NOSYNC | MDBX_NOMETASYNC)) == 0) { + assert(((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + if (flags & MDBX_WRITEMAP) { char *ptr = env->me_map + (offset & ~(env->me_os_psize - 1)); - rc = mdbx_msync(ptr, env->me_os_psize, flags & MDB_MAPASYNC); - if (unlikely(rc != MDB_SUCCESS)) + rc = mdbx_msync(ptr, env->me_os_psize, flags & MDBX_MAPASYNC); + if (unlikely(rc != MDBX_SUCCESS)) goto fail; } else { rc = mdbx_filesync(env->me_fd, false); - if (rc != MDB_SUCCESS) + if (rc != MDBX_SUCCESS) goto undo; } } @@ -3524,22 +3529,22 @@ static int mdbx_env_sync_locked(MDB_env *env, unsigned flags, if (unlikely(pending->mm_mapsize < prev_mapsize)) { mdbx_assert(env, pending->mm_mapsize == env->me_mapsize); rc = mdbx_ftruncate(env->me_fd, pending->mm_mapsize); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto fail; rc = mdbx_mremap_size((void **)&env->me_map, prev_mapsize, pending->mm_mapsize); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto fail; } - return MDB_SUCCESS; + return MDBX_SUCCESS; fail: - env->me_flags |= MDB_FATAL_ERROR; + env->me_flags |= MDBX_FATAL_ERROR; return rc; } -int __cold mdbx_env_get_maxkeysize(MDB_env *env) { +int __cold mdbx_env_get_maxkeysize(MDBX_env *env) { if (!env || env->me_signature != MDBX_ME_SIGNATURE || 
!env->me_maxkey_limit) return MDBX_EINVAL; return env->me_maxkey_limit; @@ -3547,13 +3552,13 @@ int __cold mdbx_env_get_maxkeysize(MDB_env *env) { static __inline ssize_t mdbx_calc_nodemax(ssize_t pagesize) { assert(pagesize > 0); - return (((pagesize - PAGEHDRSZ) / MDB_MINKEYS) & -(ssize_t)2) - + return (((pagesize - PAGEHDRSZ) / MDBX_MINKEYS) & -(ssize_t)2) - sizeof(indx_t); } static __inline ssize_t mdbx_calc_maxkey(ssize_t nodemax) { assert(nodemax > 0); - return nodemax - (NODESIZE + sizeof(MDB_db)); + return nodemax - (NODESIZE + sizeof(MDBX_db)); } int mdbx_get_maxkeysize(size_t pagesize) { @@ -3568,7 +3573,7 @@ int mdbx_get_maxkeysize(size_t pagesize) { return (maxkey > 0 && maxkey < INT_MAX) ? (int)maxkey : -MDBX_EINVAL; } -static void __cold mdbx_env_setup_limits(MDB_env *env, size_t pagesize) { +static void __cold mdbx_env_setup_limits(MDBX_env *env, size_t pagesize) { env->me_maxfree_1pg = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; env->me_maxpg = env->me_mapsize / pagesize; @@ -3577,8 +3582,8 @@ static void __cold mdbx_env_setup_limits(MDB_env *env, size_t pagesize) { assert(env->me_maxkey_limit > 42 && env->me_maxkey_limit < pagesize); } -int __cold mdbx_env_create(MDB_env **penv) { - MDB_env *env = calloc(1, sizeof(MDB_env)); +int __cold mdbx_env_create(MDBX_env **penv) { + MDBX_env *env = calloc(1, sizeof(MDBX_env)); if (!env) return MDBX_ENOMEM; @@ -3589,28 +3594,28 @@ int __cold mdbx_env_create(MDB_env **penv) { env->me_pid = mdbx_getpid(); mdbx_env_setup_limits(env, env->me_os_psize = mdbx_syspagesize()); if (!is_power2(env->me_os_psize)) - return MDB_INCOMPATIBLE; + return MDBX_INCOMPATIBLE; VALGRIND_CREATE_MEMPOOL(env, 0, 0); env->me_signature = MDBX_ME_SIGNATURE; *penv = env; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -static int __cold mdbx_env_map(MDB_env *env, void *addr, size_t usedsize) { +static int __cold mdbx_env_map(MDBX_env *env, void *addr, size_t usedsize) { unsigned flags = env->me_flags; int rc; - if (flags & MDB_WRITEMAP) 
{ + if (flags & MDBX_WRITEMAP) { rc = mdbx_ftruncate(env->me_fd, env->me_mapsize); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; } env->me_map = addr; - rc = mdbx_mmap((void **)&env->me_map, env->me_mapsize, flags & MDB_WRITEMAP, + rc = mdbx_mmap((void **)&env->me_map, env->me_mapsize, flags & MDBX_WRITEMAP, env->me_fd); - if (unlikely(rc != MDB_SUCCESS)) { + if (unlikely(rc != MDBX_SUCCESS)) { env->me_map = NULL; return rc; } @@ -3630,7 +3635,7 @@ static int __cold mdbx_env_map(MDB_env *env, void *addr, size_t usedsize) { #endif #ifdef MADV_REMOVE - if (flags & MDB_WRITEMAP) + if (flags & MDBX_WRITEMAP) (void)madvise(env->me_map + usedsize, env->me_mapsize - usedsize, MADV_REMOVE); #else @@ -3640,27 +3645,27 @@ static int __cold mdbx_env_map(MDB_env *env, void *addr, size_t usedsize) { #if defined(MADV_RANDOM) && defined(MADV_WILLNEED) /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ if (madvise(env->me_map, env->me_mapsize, - (flags & MDB_NORDAHEAD) ? MADV_RANDOM : MADV_WILLNEED)) + (flags & MDBX_NORDAHEAD) ? MADV_RANDOM : MADV_WILLNEED)) return errno; #endif /* Lock meta pages to avoid unexpected write, * before the data pages would be synchronized. */ - if (flags & MDB_WRITEMAP) { + if (flags & MDBX_WRITEMAP) { rc = mdbx_mlock(env->me_map, env->me_psize * 2); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; } #ifdef USE_VALGRIND env->me_valgrind_handle = - VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "lmdb"); + VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "mdbx"); #endif - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { +int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { if (unlikely(!env)) return MDBX_EINVAL; @@ -3674,7 +3679,7 @@ int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { * sure there are no active txns. 
*/ if (env->me_map) { int rc; - MDB_meta *meta; + MDBX_meta *meta; if (env->me_txn) return MDBX_EINVAL; @@ -3694,22 +3699,22 @@ int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) { #endif rc = mdbx_ftruncate(env->me_fd, size); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; env->me_mapsize = size; /* FIXME: update meta */ rc = mdbx_env_map(env, NULL, usedsize); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; } env->me_mapsize = size; if (env->me_psize) env->me_maxpg = env->me_mapsize / env->me_psize; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int __cold mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { +int __cold mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { if (unlikely(!env)) return MDBX_EINVAL; @@ -3720,10 +3725,10 @@ int __cold mdbx_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { return MDBX_EINVAL; env->me_maxdbs = dbs + CORE_DBS; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int __cold mdbx_env_set_maxreaders(MDB_env *env, unsigned readers) { +int __cold mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) { if (unlikely(!env || readers < 1)) return MDBX_EINVAL; @@ -3734,10 +3739,10 @@ int __cold mdbx_env_set_maxreaders(MDB_env *env, unsigned readers) { return MDBX_EINVAL; env->me_maxreaders = readers; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int __cold mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers) { +int __cold mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers) { if (!env || !readers) return MDBX_EINVAL; @@ -3745,16 +3750,16 @@ int __cold mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers) { return MDBX_EBADSIGN; *readers = env->me_maxreaders; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -/* Further setup required for opening an LMDB environment */ -static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { +/* Further setup required for opening an MDBX environment */ +static int __cold mdbx_setup_dxb(MDBX_env *env, 
MDBX_meta *meta, int lck_rc) { int rc = MDBX_RESULT_FALSE; int err = mdbx_read_header(env, meta); - if (unlikely(err != MDB_SUCCESS)) { + if (unlikely(err != MDBX_SUCCESS)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || - (env->me_flags & MDB_RDONLY) != 0) + (env->me_flags & MDBX_RDONLY) != 0) return err; mdbx_debug("create new database"); @@ -3786,26 +3791,26 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { /* mdbx_env_map() may grow the datafile. Write the metapages * first, so the file will be valid if initialization fails. */ err = mdbx_env_init_metas(env, meta); - if (unlikely(err != MDB_SUCCESS)) + if (unlikely(err != MDBX_SUCCESS)) return err; err = mdbx_ftruncate(env->me_fd, env->me_mapsize); - if (unlikely(err != MDB_SUCCESS)) + if (unlikely(err != MDBX_SUCCESS)) return err; } else { off_t size; err = mdbx_filesize(env->me_fd, &size); - if (unlikely(err != MDB_SUCCESS)) + if (unlikely(err != MDBX_SUCCESS)) return err; if (size != (off_t)env->me_mapsize) { mdbx_trace("filesize mismatch"); - if ((env->me_flags & MDB_RDONLY) || + if ((env->me_flags & MDBX_RDONLY) || lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) return MDBX_WANNA_RECOVERY /* LY: could not mdbx_ftruncate */; err = mdbx_ftruncate(env->me_fd, env->me_mapsize); - if (unlikely(err != MDB_SUCCESS)) + if (unlikely(err != MDBX_SUCCESS)) return err; } } @@ -3814,23 +3819,23 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { if (err) return err; - MDB_meta *const head = mdbx_meta_head(env); + MDBX_meta *const head = mdbx_meta_head(env); if (head->mm_txnid != meta->mm_txnid) { mdbx_trace("head->mm_txnid (%" PRIaTXN ") != (%" PRIaTXN ") meta->mm_txnid", head->mm_txnid, meta->mm_txnid); if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { assert(META_IS_STEADY(meta) && !META_IS_STEADY(head)); - if (env->me_flags & MDB_RDONLY) { + if (env->me_flags & MDBX_RDONLY) { mdbx_trace("exclusive, but read-only, unable 
recovery/rollback"); return MDBX_WANNA_RECOVERY /* LY: could not recovery/rollback */; } /* LY: rollback weak checkpoint */ - MDB_meta rollback = *head; + MDBX_meta rollback = *head; rollback.mm_txnid = 0; if (rollback.mm_txnid == meta->mm_txnid) rollback = *meta; - err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDB_meta), + err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta), (uint8_t *)head - (uint8_t *)env->me_map); if (err) return err; @@ -3853,13 +3858,13 @@ static int __cold mdbx_setup_dxb(MDB_env *env, MDB_meta *meta, int lck_rc) { /****************************************************************************/ /* Open and/or initialize the lock region for the environment. */ -static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { +static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { assert(env->me_fd != INVALID_HANDLE_VALUE); assert(env->me_lfd == INVALID_HANDLE_VALUE); int err = mdbx_openfile(lck_pathname, O_RDWR | O_CREAT, mode, &env->me_lfd); - if (err != MDB_SUCCESS) { - if (err != MDBX_EROFS || (env->me_flags & MDB_RDONLY) == 0) + if (err != MDBX_SUCCESS) { + if (err != MDBX_EROFS || (env->me_flags & MDBX_RDONLY) == 0) return err; /* LY: without-lck mode (e.g. 
on read-only filesystem) */ env->me_lfd = INVALID_HANDLE_VALUE; @@ -3876,7 +3881,7 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { off_t size; err = mdbx_filesize(env->me_lfd, &size); - if (unlikely(err != MDB_SUCCESS)) + if (unlikely(err != MDBX_SUCCESS)) return err; if (rc == MDBX_RESULT_TRUE) { @@ -3885,14 +3890,14 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { env->me_os_psize); #ifndef NDEBUG err = mdbx_ftruncate(env->me_lfd, size = 0); - if (unlikely(err != MDB_SUCCESS)) + if (unlikely(err != MDBX_SUCCESS)) return err; #endif mdbx_jitter4testing(false); if (size != wanna) { err = mdbx_ftruncate(env->me_lfd, wanna); - if (unlikely(err != MDB_SUCCESS)) + if (unlikely(err != MDBX_SUCCESS)) return err; size = wanna; } @@ -3901,7 +3906,7 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { void *addr = NULL; err = mdbx_mmap(&addr, size, true, env->me_lfd); - if (unlikely(err != MDB_SUCCESS)) + if (unlikely(err != MDBX_SUCCESS)) return err; assert(addr != nullptr); env->me_lck = addr; @@ -3936,18 +3941,18 @@ static int __cold mdbx_setup_lck(MDB_env *env, char *lck_pathname, int mode) { if (err) return err; - env->me_lck->mti_magic = MDB_MAGIC; - env->me_lck->mti_format = MDB_LOCK_FORMAT; + env->me_lck->mti_magic = MDBX_MAGIC; + env->me_lck->mti_format = MDBX_LOCK_FORMAT; } else { - if (env->me_lck->mti_magic != MDB_MAGIC) { + if (env->me_lck->mti_magic != MDBX_MAGIC) { mdbx_debug("lock region has invalid magic"); - return MDB_INVALID; + return MDBX_INVALID; } - if (env->me_lck->mti_format != MDB_LOCK_FORMAT) { + if (env->me_lck->mti_format != MDBX_LOCK_FORMAT) { mdbx_debug("lock region has format+version 0x%" PRIx64 ", expected 0x%" PRIx64, - env->me_lck->mti_format, MDB_LOCK_FORMAT); - return MDB_VERSION_MISMATCH; + env->me_lck->mti_format, MDBX_LOCK_FORMAT); + return MDBX_VERSION_MISMATCH; } } @@ -3958,17 +3963,17 @@ static int __cold mdbx_setup_lck(MDB_env *env, 
char *lck_pathname, int mode) { * at runtime. Changing other flags requires closing the * environment and re-opening it with the new flags. */ #define CHANGEABLE \ - (MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC | MDB_NOMEMINIT | \ + (MDBX_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC | MDBX_NOMEMINIT | \ MDBX_COALESCE | MDBX_PAGEPERTURB) #define CHANGELESS \ - (MDB_NOSUBDIR | MDB_RDONLY | MDB_WRITEMAP | MDB_NOTLS | MDB_NORDAHEAD | \ + (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM) #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE | CHANGELESS) #error "Persistent DB flags & env flags overlap, but both go in mm_flags" #endif -int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, +int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, mode_t mode, int *exclusive) { int oflags, rc, len; char *lck_pathname, *dxb_pathname; @@ -3984,7 +3989,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, return MDBX_EINVAL; len = strlen(path); - if (flags & MDB_NOSUBDIR) { + if (flags & MDBX_NOSUBDIR) { rc = len + sizeof(MDBX_LOCK_SUFFIX) + len + 1; } else { rc = len + sizeof(MDBX_LOCKNAME) + len + sizeof(MDBX_DATANAME); @@ -3993,7 +3998,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, if (!lck_pathname) return MDBX_ENOMEM; - if (flags & MDB_NOSUBDIR) { + if (flags & MDBX_NOSUBDIR) { dxb_pathname = lck_pathname + len + sizeof(MDBX_LOCK_SUFFIX); sprintf(lck_pathname, "%s" MDBX_LOCK_SUFFIX, path); strcpy(dxb_pathname, path); @@ -4003,39 +4008,39 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, sprintf(dxb_pathname, "%s" MDBX_DATANAME, path); } - rc = MDB_SUCCESS; + rc = MDBX_SUCCESS; flags |= env->me_flags; - if (flags & MDB_RDONLY) { + if (flags & MDBX_RDONLY) { /* LY: silently ignore irrelevant flags when * we're only getting read access */ - flags &= ~(MDB_WRITEMAP | MDB_MAPASYNC | MDB_NOSYNC | MDB_NOMETASYNC 
| - MDBX_COALESCE | MDBX_LIFORECLAIM | MDB_NOMEMINIT); + flags &= ~(MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NOSYNC | MDBX_NOMETASYNC | + MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_NOMEMINIT); } else { - if (!((env->me_free_pgs = mdbx_midl_alloc(MDB_IDL_UM_MAX)) && - (env->me_dirtylist = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) + if (!((env->me_free_pgs = mdbx_midl_alloc(MDBX_IDL_UM_MAX)) && + (env->me_dirtylist = calloc(MDBX_IDL_UM_SIZE, sizeof(MDBX_ID2))))) rc = MDBX_ENOMEM; } - env->me_flags = flags |= MDB_ENV_ACTIVE; + env->me_flags = flags |= MDBX_ENV_ACTIVE; if (rc) goto bailout; env->me_path = mdbx_strdup(path); - env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); + env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDBX_dbx)); env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned)); if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) { rc = MDBX_ENOMEM; goto bailout; } - env->me_dbxs[FREE_DBI].md_cmp = mdbx_cmp_int_ai; /* aligned MDB_INTEGERKEY */ + env->me_dbxs[FREE_DBI].md_cmp = mdbx_cmp_int_ai; /* aligned MDBX_INTEGERKEY */ - if (F_ISSET(flags, MDB_RDONLY)) + if (F_ISSET(flags, MDBX_RDONLY)) oflags = O_RDONLY; else oflags = O_RDWR | O_CREAT; rc = mdbx_openfile(dxb_pathname, oflags, mode, &env->me_fd); - if (rc != MDB_SUCCESS) + if (rc != MDBX_SUCCESS) goto bailout; const int lck_rc = mdbx_setup_lck(env, lck_pathname, mode); @@ -4044,7 +4049,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, goto bailout; } - MDB_meta meta; + MDBX_meta meta; const int dxb_rc = mdbx_setup_dxb(env, &meta, lck_rc); if (MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; @@ -4053,7 +4058,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, mdbx_debug("opened dbenv %p", (void *)env); const unsigned mode_flags = - MDB_WRITEMAP | MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC; + MDBX_WRITEMAP | MDBX_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC; if (lck_rc == 
MDBX_RESULT_TRUE) { env->me_lck->mti_envmode = env->me_flags & mode_flags; if (exclusive == NULL || *exclusive < 2) { @@ -4061,7 +4066,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, * in case exclusive==1, just leave value as is. */ rc = mdbx_lck_downgrade(env); mdbx_debug("lck-downgrade: rc %i ", rc); - if (rc != MDB_SUCCESS) + if (rc != MDBX_SUCCESS) goto bailout; } } else { @@ -4071,43 +4076,43 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, } if ((env->me_lck->mti_envmode ^ env->me_flags) & mode_flags) { /* LY: Current mode/flags incompatible with requested. */ - rc = MDB_INCOMPATIBLE; + rc = MDBX_INCOMPATIBLE; goto bailout; } } - if (env->me_lck && (env->me_flags & MDB_NOTLS) == 0) { + if (env->me_lck && (env->me_flags & MDBX_NOTLS) == 0) { rc = mdbx_rthc_alloc(&env->me_txkey, &env->me_lck->mti_readers[0], &env->me_lck->mti_readers[env->me_maxreaders]); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; - env->me_flags |= MDB_ENV_TXKEY; + env->me_flags |= MDBX_ENV_TXKEY; } - if ((flags & MDB_RDONLY) == 0) { + if ((flags & MDBX_RDONLY) == 0) { MDBX_txn *txn; int tsize = sizeof(MDBX_txn), size = tsize + - env->me_maxdbs * (sizeof(MDB_db) + sizeof(MDB_cursor *) + + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + sizeof(unsigned) + 1); if ((env->me_pbuf = calloc(1, env->me_psize)) && (txn = calloc(1, size))) { - txn->mt_dbs = (MDB_db *)((char *)txn + tsize); - txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); + txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); + txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); txn->mt_dbflags = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); txn->mt_env = env; txn->mt_dbxs = env->me_dbxs; - txn->mt_flags = MDB_TXN_FINISHED; + txn->mt_flags = MDBX_TXN_FINISHED; env->me_txn0 = txn; } else { rc = MDBX_ENOMEM; } } -#if MDB_DEBUG - if 
(rc == MDB_SUCCESS) { - MDB_meta *meta = mdbx_meta_head(env); - MDB_db *db = &meta->mm_dbs[MAIN_DBI]; +#if MDBX_DEBUG + if (rc == MDBX_SUCCESS) { + MDBX_meta *meta = mdbx_meta_head(env); + MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; int toggle = ((char *)meta == PAGEDATA(env->me_map)) ? 0 : 1; mdbx_debug("opened database version %u, pagesize %u", meta->mm_version, @@ -4129,16 +4134,16 @@ bailout: return rc; } -int __cold mdbx_env_open(MDB_env *env, const char *path, unsigned flags, +int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, mode_t mode) { return mdbx_env_open_ex(env, path, flags, mode, NULL); } /* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ -static void __cold mdbx_env_close0(MDB_env *env) { - if (!(env->me_flags & MDB_ENV_ACTIVE)) +static void __cold mdbx_env_close0(MDBX_env *env) { + if (!(env->me_flags & MDBX_ENV_ACTIVE)) return; - env->me_flags &= ~MDB_ENV_ACTIVE; + env->me_flags &= ~MDBX_ENV_ACTIVE; /* Doing this here since me_dbxs may not exist during mdbx_env_close */ if (env->me_dbxs) { @@ -4157,9 +4162,9 @@ static void __cold mdbx_env_close0(MDB_env *env) { free(env->me_txn0); mdbx_midl_free(env->me_free_pgs); - if (env->me_flags & MDB_ENV_TXKEY) { + if (env->me_flags & MDBX_ENV_TXKEY) { mdbx_rthc_remove(env->me_txkey); - env->me_flags &= ~MDB_ENV_TXKEY; + env->me_flags &= ~MDBX_ENV_TXKEY; } if (env->me_map) { @@ -4187,9 +4192,9 @@ static void __cold mdbx_env_close0(MDB_env *env) { } } -int __cold mdbx_env_close_ex(MDB_env *env, int dont_sync) { +int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { MDBX_page *dp; - int rc = MDB_SUCCESS; + int rc = MDBX_SUCCESS; if (unlikely(!env)) return MDBX_EINVAL; @@ -4214,7 +4219,7 @@ int __cold mdbx_env_close_ex(MDB_env *env, int dont_sync) { return rc; } -void __cold mdbx_env_close(MDB_env *env) { mdbx_env_close_ex(env, 0); } +void __cold mdbx_env_close(MDBX_env *env) { mdbx_env_close_ex(env, 0); } /* LY: fast enough on most arches * @@ -4292,7 +4297,7 @@ 
static int __hot mdbx_cmp_int_a2(const MDBX_val *a, const MDBX_val *b) { /* Compare two items pointing at unsigneds of unknown alignment. * - * This is also set as MDB_INTEGERDUP|MDB_DUPFIXED's MDB_dbx.md_dcmp. */ + * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */ static int __hot mdbx_cmp_int_ua(const MDBX_val *a, const MDBX_val *b) { mdbx_assert(NULL, a->iov_len == b->iov_len); #if UNALIGNED_OK @@ -4372,7 +4377,7 @@ static int __hot mdbx_cmp_memnr(const MDBX_val *a, const MDBX_val *b) { * in *exactp (1 or 0). * Updates the cursor index with the index of the found entry. * If no entry larger or equal to the key is found, returns NULL. */ -static MDBX_node *__hot mdbx_node_search(MDB_cursor *mc, MDBX_val *key, +static MDBX_node *__hot mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, int *exactp) { unsigned i = 0, nkeys; int low, high; @@ -4380,7 +4385,7 @@ static MDBX_node *__hot mdbx_node_search(MDB_cursor *mc, MDBX_val *key, MDBX_page *mp = mc->mc_pg[mc->mc_top]; MDBX_node *node = NULL; MDBX_val nodekey; - MDB_cmp_func *cmp; + MDBX_cmp_func *cmp; DKBUF; nkeys = NUMKEYS(mp); @@ -4453,8 +4458,8 @@ static MDBX_node *__hot mdbx_node_search(MDB_cursor *mc, MDBX_val *key, } #if 0 /* unused for now */ -static void mdbx_cursor_adjust(MDB_cursor *mc, func) { - MDB_cursor *m2; +static void mdbx_cursor_adjust(MDBX_cursor *mc, func) { + MDBX_cursor *m2; for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) { @@ -4465,7 +4470,7 @@ static void mdbx_cursor_adjust(MDB_cursor *mc, func) { #endif /* Pop a page off the top of the cursor's stack. 
*/ -static void mdbx_cursor_pop(MDB_cursor *mc) { +static void mdbx_cursor_pop(MDBX_cursor *mc) { if (mc->mc_snum) { mdbx_debug("popped page %" PRIaPGNO " off db %d cursor %p", mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); @@ -4480,25 +4485,25 @@ static void mdbx_cursor_pop(MDB_cursor *mc) { } /* Push a page onto the top of the cursor's stack. - * Set MDB_TXN_ERROR on failure. */ -static int mdbx_cursor_push(MDB_cursor *mc, MDBX_page *mp) { + * Set MDBX_TXN_ERROR on failure. */ +static int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { mdbx_debug("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, DDBI(mc), (void *)mc); if (unlikely(mc->mc_snum >= CURSOR_STACK)) { - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CURSOR_FULL; + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_CURSOR_FULL; } mc->mc_top = mc->mc_snum++; mc->mc_pg[mc->mc_top] = mp; mc->mc_ki[mc->mc_top] = 0; - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Find the address of the page corresponding to a given page number. - * Set MDB_TXN_ERROR on failure. + * Set MDBX_TXN_ERROR on failure. * * [in] mc the cursor accessing the page. * [in] pgno the page number for the page to retrieve. @@ -4508,18 +4513,18 @@ static int mdbx_cursor_push(MDB_cursor *mc, MDBX_page *mp) { * 0=mapped page. * * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDBX_page **ret, +static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, int *lvl) { MDBX_txn *txn = mc->mc_txn; - MDB_env *env = txn->mt_env; + MDBX_env *env = txn->mt_env; MDBX_page *p = NULL; int level; - if (!(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_WRITEMAP))) { + if (!(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_WRITEMAP))) { MDBX_txn *tx2 = txn; level = 1; do { - MDB_ID2L dl = tx2->mt_rw_dirtylist; + MDBX_ID2L dl = tx2->mt_rw_dirtylist; unsigned x; /* Spilled pages were dirtied in this txn and flushed * because the dirty list got full. 
Bring this page @@ -4544,8 +4549,8 @@ static int mdbx_page_get(MDB_cursor *mc, pgno_t pgno, MDBX_page **ret, if (unlikely(pgno >= txn->mt_next_pgno)) { mdbx_debug("page %" PRIaPGNO " not found", pgno); - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PAGE_NOTFOUND; + txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PAGE_NOTFOUND; } level = 0; @@ -4556,12 +4561,12 @@ done: *ret = p; if (lvl) *lvl = level; - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Finish mdbx_page_search() / mdbx_page_search_lowest(). * The cursor is at the root page, set up the rest of it. */ -static int mdbx_page_search_root(MDB_cursor *mc, MDBX_val *key, int flags) { +static int mdbx_page_search_root(MDBX_cursor *mc, MDBX_val *key, int flags) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; int rc; DKBUF; @@ -4578,9 +4583,9 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDBX_val *key, int flags) { mdbx_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); mdbx_debug("found index 0 to page %" PRIaPGNO "", NODEPGNO(NODEPTR(mp, 0))); - if (flags & (MDB_PS_FIRST | MDB_PS_LAST)) { + if (flags & (MDBX_PS_FIRST | MDBX_PS_LAST)) { i = 0; - if (flags & MDB_PS_LAST) { + if (flags & MDBX_PS_LAST) { i = NUMKEYS(mp) - 1; /* if already init'd, see if we're already in right place */ if (mc->mc_flags & C_INITIALIZED) { @@ -4617,7 +4622,7 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDBX_val *key, int flags) { return rc; ready: - if (flags & MDB_PS_MODIFY) { + if (flags & MDBX_PS_MODIFY) { if (unlikely((rc = mdbx_page_touch(mc)) != 0)) return rc; mp = mc->mc_pg[mc->mc_top]; @@ -4627,8 +4632,8 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDBX_val *key, int flags) { if (unlikely(!IS_LEAF(mp))) { mdbx_debug("internal error, index points to a page with 0x%02x flags!?", mp->mp_flags); - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_CORRUPTED; + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_CORRUPTED; } mdbx_debug("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno, @@ -4636,7 +4641,7 @@ static 
int mdbx_page_search_root(MDB_cursor *mc, MDBX_val *key, int flags) { mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Search for the lowest key under the current branch page. @@ -4644,7 +4649,7 @@ static int mdbx_page_search_root(MDB_cursor *mc, MDBX_val *key, int flags) { * before calling mdbx_page_search_root(), because the callers * are all in situations where the current page is known to * be underfilled. */ -static int mdbx_page_search_lowest(MDB_cursor *mc) { +static int mdbx_page_search_lowest(MDBX_cursor *mc) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; MDBX_node *node = NODEPTR(mp, 0); int rc; @@ -4655,7 +4660,7 @@ static int mdbx_page_search_lowest(MDB_cursor *mc) { mc->mc_ki[mc->mc_top] = 0; if (unlikely(rc = mdbx_cursor_push(mc, mp))) return rc; - return mdbx_page_search_root(mc, NULL, MDB_PS_FIRST); + return mdbx_page_search_root(mc, NULL, MDBX_PS_FIRST); } /* Search for the page a given key should be in. @@ -4663,29 +4668,30 @@ static int mdbx_page_search_lowest(MDB_cursor *mc) { * * [in,out] mc the cursor for this operation. * [in] key the key to search for, or NULL for first/last page. - * [in] flags If MDB_PS_MODIFY is set, visited pages in the DB + * [in] flags If MDBX_PS_MODIFY is set, visited pages in the DB * are touched (updated with new page numbers). - * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. + * If MDBX_PS_FIRST or MDBX_PS_LAST is set, find first or last + * leaf. * This is used by mdbx_cursor_first() and mdbx_cursor_last(). - * If MDB_PS_ROOTONLY set, just fetch root node, no further + * If MDBX_PS_ROOTONLY set, just fetch root node, no further * lookups. * * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_page_search(MDB_cursor *mc, MDBX_val *key, int flags) { +static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) { int rc; pgno_t root; /* Make sure the txn is still viable, then find the root from * the txn's db table and set it as the root of the cursor's stack. */ - if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) { + if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) { mdbx_debug("transaction has failed, must abort"); - return MDB_BAD_TXN; + return MDBX_BAD_TXN; } else { /* Make sure we're using an up-to-date root */ if (unlikely(*mc->mc_dbflag & DB_STALE)) { - MDB_cursor mc2; + MDBX_cursor mc2; if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) - return MDB_BAD_DBI; + return MDBX_BAD_DBI; mdbx_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); rc = mdbx_page_search(&mc2, &mc->mc_dbx->md_name, 0); if (rc) @@ -4695,21 +4701,21 @@ static int mdbx_page_search(MDB_cursor *mc, MDBX_val *key, int flags) { int exact = 0; MDBX_node *leaf = mdbx_node_search(&mc2, &mc->mc_dbx->md_name, &exact); if (!exact) - return MDB_NOTFOUND; + return MDBX_NOTFOUND; if (unlikely((leaf->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) - return MDB_INCOMPATIBLE; /* not a named DB */ + return MDBX_INCOMPATIBLE; /* not a named DB */ rc = mdbx_node_read(&mc2, leaf, &data); if (rc) return rc; uint16_t md_flags; - memcpy(&md_flags, ((char *)data.iov_base + offsetof(MDB_db, md_flags)), + memcpy(&md_flags, ((char *)data.iov_base + offsetof(MDBX_db, md_flags)), sizeof(uint16_t)); /* The txn may not know this DBI, or another process may * have dropped and recreated the DB with other flags. 
*/ if (unlikely((mc->mc_db->md_flags & PERSISTENT_FLAGS) != md_flags)) - return MDB_INCOMPATIBLE; - memcpy(mc->mc_db, data.iov_base, sizeof(MDB_db)); + return MDBX_INCOMPATIBLE; + memcpy(mc->mc_db, data.iov_base, sizeof(MDBX_db)); } *mc->mc_dbflag &= ~DB_STALE; } @@ -4717,7 +4723,7 @@ static int mdbx_page_search(MDB_cursor *mc, MDBX_val *key, int flags) { if (unlikely(root == P_INVALID)) { /* Tree is empty. */ mdbx_debug("tree is empty"); - return MDB_NOTFOUND; + return MDBX_NOTFOUND; } } @@ -4732,23 +4738,23 @@ static int mdbx_page_search(MDB_cursor *mc, MDBX_val *key, int flags) { mdbx_debug("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root, mc->mc_pg[0]->mp_flags); - if (flags & MDB_PS_MODIFY) { + if (flags & MDBX_PS_MODIFY) { if (unlikely(rc = mdbx_page_touch(mc))) return rc; } - if (flags & MDB_PS_ROOTONLY) - return MDB_SUCCESS; + if (flags & MDBX_PS_ROOTONLY) + return MDBX_SUCCESS; return mdbx_page_search_root(mc, key, flags); } -static int mdbx_ovpage_free(MDB_cursor *mc, MDBX_page *mp) { +static int mdbx_ovpage_free(MDBX_cursor *mc, MDBX_page *mp) { MDBX_txn *txn = mc->mc_txn; pgno_t pg = mp->mp_pgno; unsigned x = 0, ovpages = mp->mp_pages; - MDB_env *env = txn->mt_env; - MDB_IDL sl = txn->mt_spill_pages; + MDBX_env *env = txn->mt_env; + MDBX_IDL sl = txn->mt_spill_pages; pgno_t pn = pg << 1; int rc; @@ -4765,7 +4771,7 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDBX_page *mp) { (sl && (x = mdbx_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) { unsigned i, j; pgno_t *mop; - MDB_ID2 *dl, ix, iy; + MDBX_ID2 *dl, ix, iy; rc = mdbx_midl_need(&env->me_pghead, ovpages); if (unlikely(rc)) return rc; @@ -4788,13 +4794,13 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDBX_page *mp) { } else { mdbx_cassert(mc, x > 1); j = ++(dl[0].mid); - dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ - txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PROBLEM; + dl[j] = ix; /* Unsorted. OK when MDBX_TXN_ERROR. 
*/ + txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PROBLEM; } } txn->mt_dirtyroom++; - if (!(env->me_flags & MDB_WRITEMAP)) + if (!(env->me_flags & MDBX_WRITEMAP)) mdbx_dpage_free(env, mp); release: /* Insert in me_pghead */ @@ -4821,7 +4827,7 @@ static int mdbx_ovpage_free(MDB_cursor *mc, MDBX_page *mp) { * [out] data Updated to point to the node's data. * * Returns 0 on success, non-zero on failure. */ -static __inline int mdbx_node_read(MDB_cursor *mc, MDBX_node *leaf, +static __inline int mdbx_node_read(MDBX_cursor *mc, MDBX_node *leaf, MDBX_val *data) { MDBX_page *omp; /* overflow page */ pgno_t pgno; @@ -4830,7 +4836,7 @@ static __inline int mdbx_node_read(MDB_cursor *mc, MDBX_node *leaf, if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { data->iov_len = NODEDSZ(leaf); data->iov_base = NODEDATA(leaf); - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Read overflow data. */ @@ -4842,12 +4848,12 @@ static __inline int mdbx_node_read(MDB_cursor *mc, MDBX_node *leaf, } data->iov_base = PAGEDATA(omp); - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data) { - MDB_cursor mc; - MDB_xcursor mx; +int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { + MDBX_cursor mc; + MDBX_xcursor mx; int exact = 0; DKBUF; @@ -4862,11 +4868,11 @@ int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data) { if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; + if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; mdbx_cursor_init(&mc, txn, dbi, &mx); - return mdbx_cursor_set(&mc, key, data, MDB_SET, &exact); + return mdbx_cursor_set(&mc, key, data, MDBX_SET, &exact); } /* Find a sibling for a page. @@ -4878,13 +4884,13 @@ int mdbx_get(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data) { * otherwise the left sibling. * * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { +static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { int rc; MDBX_node *indx; MDBX_page *mp; if (unlikely(mc->mc_snum < 2)) { - return MDB_NOTFOUND; /* root has no siblings */ + return MDBX_NOTFOUND; /* root has no siblings */ } mdbx_cursor_pop(mc); @@ -4896,7 +4902,7 @@ static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { : (mc->mc_ki[mc->mc_top] == 0)) { mdbx_debug("no more keys left, moving to %s sibling", move_right ? "right" : "left"); - if (unlikely((rc = mdbx_cursor_sibling(mc, move_right)) != MDB_SUCCESS)) { + if (unlikely((rc = mdbx_cursor_sibling(mc, move_right)) != MDBX_SUCCESS)) { /* undo cursor_pop before returning */ mc->mc_top++; mc->mc_snum++; @@ -4923,18 +4929,18 @@ static int mdbx_cursor_sibling(MDB_cursor *mc, int move_right) { if (!move_right) mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Move the cursor to the next data item. */ -static int mdbx_cursor_next(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, - MDB_cursor_op op) { +static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { MDBX_page *mp; MDBX_node *leaf; int rc; - if ((mc->mc_flags & C_DEL) && op == MDB_NEXT_DUP) - return MDB_NOTFOUND; + if ((mc->mc_flags & C_DEL) && op == MDBX_NEXT_DUP) + return MDBX_NOTFOUND; if (!(mc->mc_flags & C_INITIALIZED)) return mdbx_cursor_first(mc, key, data); @@ -4942,25 +4948,26 @@ static int mdbx_cursor_next(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, mp = mc->mc_pg[mc->mc_top]; if (mc->mc_flags & C_EOF) { if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) - return MDB_NOTFOUND; + return MDBX_NOTFOUND; mc->mc_flags ^= C_EOF; } - if (mc->mc_db->md_flags & MDB_DUPSORT) { + if (mc->mc_db->md_flags & MDBX_DUPSORT) { leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_NEXT || op == MDB_NEXT_DUP) { - rc = 
mdbx_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); - if (op != MDB_NEXT || rc != MDB_NOTFOUND) { - if (likely(rc == MDB_SUCCESS)) - MDB_GET_KEY(leaf, key); + if (op == MDBX_NEXT || op == MDBX_NEXT_DUP) { + rc = + mdbx_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); + if (op != MDBX_NEXT || rc != MDBX_NOTFOUND) { + if (likely(rc == MDBX_SUCCESS)) + MDBX_GET_KEY(leaf, key); return rc; } } } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (op == MDB_NEXT_DUP) - return MDB_NOTFOUND; + if (op == MDBX_NEXT_DUP) + return MDBX_NOTFOUND; } } @@ -4973,7 +4980,7 @@ static int mdbx_cursor_next(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { mdbx_debug("=====> move to next sibling page"); - if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDB_SUCCESS)) { + if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return rc; } @@ -4991,7 +4998,7 @@ skip: if (IS_LEAF2(mp)) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->iov_len); - return MDB_SUCCESS; + return MDBX_SUCCESS; } mdbx_cassert(mc, IS_LEAF(mp)); @@ -5001,29 +5008,29 @@ skip: mdbx_xcursor_init1(mc, leaf); } if (data) { - if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDBX_SUCCESS)) return rc; if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; } } - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; + MDBX_GET_KEY(leaf, key); + return MDBX_SUCCESS; } /* Move the cursor to the previous data item. 
*/ -static int mdbx_cursor_prev(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, - MDB_cursor_op op) { +static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { MDBX_page *mp; MDBX_node *leaf; int rc; - if ((mc->mc_flags & C_DEL) && op == MDB_PREV_DUP) - return MDB_NOTFOUND; + if ((mc->mc_flags & C_DEL) && op == MDBX_PREV_DUP) + return MDBX_NOTFOUND; if (!(mc->mc_flags & C_INITIALIZED)) { rc = mdbx_cursor_last(mc, key, data); @@ -5033,15 +5040,16 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, } mp = mc->mc_pg[mc->mc_top]; - if ((mc->mc_db->md_flags & MDB_DUPSORT) && + if ((mc->mc_db->md_flags & MDBX_DUPSORT) && mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) { leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_PREV || op == MDB_PREV_DUP) { - rc = mdbx_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); - if (op != MDB_PREV || rc != MDB_NOTFOUND) { - if (likely(rc == MDB_SUCCESS)) { - MDB_GET_KEY(leaf, key); + if (op == MDBX_PREV || op == MDBX_PREV_DUP) { + rc = + mdbx_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); + if (op != MDBX_PREV || rc != MDBX_NOTFOUND) { + if (likely(rc == MDBX_SUCCESS)) { + MDBX_GET_KEY(leaf, key); mc->mc_flags &= ~C_EOF; } return rc; @@ -5049,8 +5057,8 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, } } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (op == MDB_PREV_DUP) - return MDB_NOTFOUND; + if (op == MDBX_PREV_DUP) + return MDBX_NOTFOUND; } } @@ -5061,7 +5069,7 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, if (mc->mc_ki[mc->mc_top] == 0) { mdbx_debug("=====> move to prev sibling page"); - if ((rc = mdbx_cursor_sibling(mc, 0)) != MDB_SUCCESS) { + if ((rc = mdbx_cursor_sibling(mc, 0)) != MDBX_SUCCESS) { return rc; } mp = mc->mc_pg[mc->mc_top]; @@ -5078,7 +5086,7 @@ static int mdbx_cursor_prev(MDB_cursor *mc, 
MDBX_val *key, MDBX_val *data, if (IS_LEAF2(mp)) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->iov_len); - return MDB_SUCCESS; + return MDBX_SUCCESS; } mdbx_cassert(mc, IS_LEAF(mp)); @@ -5088,33 +5096,33 @@ static int mdbx_cursor_prev(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_xcursor_init1(mc, leaf); } if (data) { - if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDBX_SUCCESS)) return rc; if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; } } - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; + MDBX_GET_KEY(leaf, key); + return MDBX_SUCCESS; } /* Set the cursor on a specific data item. */ -static int mdbx_cursor_set(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, - MDB_cursor_op op, int *exactp) { +static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op, int *exactp) { int rc; MDBX_page *mp; MDBX_node *leaf = NULL; DKBUF; - if ((mc->mc_db->md_flags & MDB_INTEGERKEY) && + if ((mc->mc_db->md_flags & MDBX_INTEGERKEY) && unlikely(key->iov_len != sizeof(uint32_t) && key->iov_len != sizeof(uint64_t))) { - mdbx_cassert(mc, !"key-size is invalid for MDB_INTEGERKEY"); - return MDB_BAD_VALSIZE; + mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + return MDBX_BAD_VALSIZE; } if (mc->mc_xcursor) @@ -5127,14 +5135,14 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, mp = mc->mc_pg[mc->mc_top]; if (!NUMKEYS(mp)) { mc->mc_ki[mc->mc_top] = 0; - return MDB_NOTFOUND; + return MDBX_NOTFOUND; } if (mp->mp_flags & P_LEAF2) { nodekey.iov_len = mc->mc_db->md_xsize; nodekey.iov_base = LEAF2KEY(mp, 0, nodekey.iov_len); } else { leaf = NODEPTR(mp, 0); - MDB_GET_KEY2(leaf, nodekey); + MDBX_GET_KEY2(leaf, nodekey); } rc = mc->mc_dbx->md_cmp(key, 
&nodekey); if (rc == 0) { @@ -5154,7 +5162,7 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, nodekey.iov_base = LEAF2KEY(mp, nkeys - 1, nodekey.iov_len); } else { leaf = NODEPTR(mp, nkeys - 1); - MDB_GET_KEY2(leaf, nodekey); + MDBX_GET_KEY2(leaf, nodekey); } rc = mc->mc_dbx->md_cmp(key, &nodekey); if (rc == 0) { @@ -5172,7 +5180,7 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, LEAF2KEY(mp, mc->mc_ki[mc->mc_top], nodekey.iov_len); } else { leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDB_GET_KEY2(leaf, nodekey); + MDBX_GET_KEY2(leaf, nodekey); } rc = mc->mc_dbx->md_cmp(key, &nodekey); if (rc == 0) { @@ -5195,24 +5203,24 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, if (i == mc->mc_top) { /* There are no other pages */ mc->mc_ki[mc->mc_top] = nkeys; - return MDB_NOTFOUND; + return MDBX_NOTFOUND; } } if (!mc->mc_top) { /* There are no other pages */ mc->mc_ki[mc->mc_top] = 0; - if (op == MDB_SET_RANGE && !exactp) { + if (op == MDBX_SET_RANGE && !exactp) { rc = 0; goto set1; } else - return MDB_NOTFOUND; + return MDBX_NOTFOUND; } } else { mc->mc_pg[0] = 0; } rc = mdbx_page_search(mc, key, 0); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; mp = mc->mc_pg[mc->mc_top]; @@ -5221,13 +5229,13 @@ static int mdbx_cursor_set(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, set2: leaf = mdbx_node_search(mc, key, exactp); if (exactp != NULL && !*exactp) { - /* MDB_SET specified and not an exact match. */ - return MDB_NOTFOUND; + /* MDBX_SET specified and not an exact match. 
*/ + return MDBX_NOTFOUND; } if (leaf == NULL) { mdbx_debug("===> inexact leaf not found, goto sibling"); - if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDB_SUCCESS)) { + if (unlikely((rc = mdbx_cursor_sibling(mc, 1)) != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return rc; /* no entries matched */ } @@ -5241,11 +5249,11 @@ set1: mc->mc_flags &= ~C_EOF; if (IS_LEAF2(mp)) { - if (op == MDB_SET_RANGE || op == MDB_SET_KEY) { + if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->iov_len); } - return MDB_SUCCESS; + return MDBX_SUCCESS; } if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5253,50 +5261,50 @@ set1: } if (likely(data)) { if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) { + if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); } else { int ex2, *ex2p; - if (op == MDB_GET_BOTH) { + if (op == MDBX_GET_BOTH) { ex2p = &ex2; ex2 = 0; } else { ex2p = NULL; } rc = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, - MDB_SET_RANGE, ex2p); - if (unlikely(rc != MDB_SUCCESS)) + MDBX_SET_RANGE, ex2p); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } - } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { + } else if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { MDBX_val olddata; - if (unlikely((rc = mdbx_node_read(mc, leaf, &olddata)) != MDB_SUCCESS)) + if (unlikely((rc = mdbx_node_read(mc, leaf, &olddata)) != MDBX_SUCCESS)) return rc; rc = mc->mc_dbx->md_dcmp(data, &olddata); if (rc) { - if (op == MDB_GET_BOTH || rc > 0) - return MDB_NOTFOUND; + if (op == MDBX_GET_BOTH || rc > 0) + return MDBX_NOTFOUND; rc = 0; } *data = olddata; } else { if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdbx_node_read(mc, 
leaf, data)) != MDBX_SUCCESS)) return rc; } } /* The key already matches in all other cases */ - if (op == MDB_SET_RANGE || op == MDB_SET_KEY) - MDB_GET_KEY(leaf, key); + if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) + MDBX_GET_KEY(leaf, key); mdbx_debug("==> cursor placed on key [%s], data [%s]", DKEY(key), DVAL(data)); return rc; } /* Move the cursor to the first item in the database. */ -static int mdbx_cursor_first(MDB_cursor *mc, MDBX_val *key, MDBX_val *data) { +static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { int rc; MDBX_node *leaf; @@ -5304,8 +5312,8 @@ static int mdbx_cursor_first(MDB_cursor *mc, MDBX_val *key, MDBX_val *data) { mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdbx_page_search(mc, NULL, MDB_PS_FIRST); - if (unlikely(rc != MDB_SUCCESS)) + rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); @@ -5319,7 +5327,7 @@ static int mdbx_cursor_first(MDB_cursor *mc, MDBX_val *key, MDBX_val *data) { if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->iov_len); - return MDB_SUCCESS; + return MDBX_SUCCESS; } if (likely(data)) { @@ -5329,16 +5337,16 @@ static int mdbx_cursor_first(MDB_cursor *mc, MDBX_val *key, MDBX_val *data) { if (unlikely(rc)) return rc; } else { - if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDBX_SUCCESS)) return rc; } } - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; + MDBX_GET_KEY(leaf, key); + return MDBX_SUCCESS; } /* Move the cursor to the last item in the database. 
*/ -static int mdbx_cursor_last(MDB_cursor *mc, MDBX_val *key, MDBX_val *data) { +static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { int rc; MDBX_node *leaf; @@ -5347,8 +5355,8 @@ static int mdbx_cursor_last(MDB_cursor *mc, MDBX_val *key, MDBX_val *data) { if (likely((mc->mc_flags & (C_EOF | C_DEL)) != C_EOF)) { if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdbx_page_search(mc, NULL, MDB_PS_LAST); - if (unlikely(rc != MDB_SUCCESS)) + rc = mdbx_page_search(mc, NULL, MDBX_PS_LAST); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); @@ -5362,7 +5370,7 @@ static int mdbx_cursor_last(MDB_cursor *mc, MDBX_val *key, MDBX_val *data) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->iov_len); - return MDB_SUCCESS; + return MDBX_SUCCESS; } if (likely(data)) { @@ -5372,20 +5380,20 @@ static int mdbx_cursor_last(MDB_cursor *mc, MDBX_val *key, MDBX_val *data) { if (unlikely(rc)) return rc; } else { - if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdbx_node_read(mc, leaf, data)) != MDBX_SUCCESS)) return rc; } } - MDB_GET_KEY(leaf, key); - return MDB_SUCCESS; + MDBX_GET_KEY(leaf, key); + return MDBX_SUCCESS; } -int mdbx_cursor_get(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, - MDB_cursor_op op) { +int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { int rc; int exact = 0; - int (*mfunc)(MDB_cursor * mc, MDBX_val * key, MDBX_val * data); + int (*mfunc)(MDBX_cursor * mc, MDBX_val * key, MDBX_val * data); if (unlikely(mc == NULL)) return MDBX_EINVAL; @@ -5393,28 +5401,28 @@ int mdbx_cursor_get(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; + if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) + 
return MDBX_BAD_TXN; switch (op) { - case MDB_GET_CURRENT: { + case MDBX_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return MDBX_EINVAL; MDBX_page *mp = mc->mc_pg[mc->mc_top]; unsigned nkeys = NUMKEYS(mp); if (mc->mc_ki[mc->mc_top] >= nkeys) { mc->mc_ki[mc->mc_top] = nkeys; - return MDB_NOTFOUND; + return MDBX_NOTFOUND; } assert(nkeys > 0); - rc = MDB_SUCCESS; + rc = MDBX_SUCCESS; if (IS_LEAF2(mp)) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->iov_len); } else { MDBX_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - MDB_GET_KEY(leaf, key); + MDBX_GET_KEY(leaf, key); if (data) { if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { @@ -5424,7 +5432,7 @@ int mdbx_cursor_get(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, return rc; } rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, - MDB_GET_CURRENT); + MDBX_GET_CURRENT); } else { rc = mdbx_node_read(mc, leaf, data); } @@ -5434,93 +5442,93 @@ int mdbx_cursor_get(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, } break; } - case MDB_GET_BOTH: - case MDB_GET_BOTH_RANGE: + case MDBX_GET_BOTH: + case MDBX_GET_BOTH_RANGE: if (unlikely(data == NULL)) return MDBX_EINVAL; if (unlikely(mc->mc_xcursor == NULL)) - return MDB_INCOMPATIBLE; + return MDBX_INCOMPATIBLE; /* FALLTHRU */ - case MDB_SET: - case MDB_SET_KEY: - case MDB_SET_RANGE: + case MDBX_SET: + case MDBX_SET_KEY: + case MDBX_SET_RANGE: if (unlikely(key == NULL)) return MDBX_EINVAL; - rc = - mdbx_cursor_set(mc, key, data, op, op == MDB_SET_RANGE ? NULL : &exact); + rc = mdbx_cursor_set(mc, key, data, op, + op == MDBX_SET_RANGE ? 
NULL : &exact); break; - case MDB_GET_MULTIPLE: + case MDBX_GET_MULTIPLE: if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) return MDBX_EINVAL; - if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) - return MDB_INCOMPATIBLE; - rc = MDB_SUCCESS; + if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + rc = MDBX_SUCCESS; if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) break; goto fetchm; - case MDB_NEXT_MULTIPLE: + case MDBX_NEXT_MULTIPLE: if (unlikely(data == NULL)) return MDBX_EINVAL; - if (unlikely(!(mc->mc_db->md_flags & MDB_DUPFIXED))) - return MDB_INCOMPATIBLE; - rc = mdbx_cursor_next(mc, key, data, MDB_NEXT_DUP); - if (rc == MDB_SUCCESS) { + if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_DUP); + if (rc == MDBX_SUCCESS) { if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - MDB_cursor *mx; + MDBX_cursor *mx; fetchm: mx = &mc->mc_xcursor->mx_cursor; data->iov_len = NUMKEYS(mx->mc_pg[mx->mc_top]) * mx->mc_db->md_xsize; data->iov_base = PAGEDATA(mx->mc_pg[mx->mc_top]); mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top]) - 1; } else { - rc = MDB_NOTFOUND; + rc = MDBX_NOTFOUND; } } break; - case MDB_PREV_MULTIPLE: + case MDBX_PREV_MULTIPLE: if (data == NULL) return MDBX_EINVAL; - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) - return MDB_INCOMPATIBLE; - rc = MDB_SUCCESS; + if (!(mc->mc_db->md_flags & MDBX_DUPFIXED)) + return MDBX_INCOMPATIBLE; + rc = MDBX_SUCCESS; if (!(mc->mc_flags & C_INITIALIZED)) rc = mdbx_cursor_last(mc, key, data); - if (rc == MDB_SUCCESS) { - MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; + if (rc == MDBX_SUCCESS) { + MDBX_cursor *mx = &mc->mc_xcursor->mx_cursor; if (mx->mc_flags & C_INITIALIZED) { rc = mdbx_cursor_sibling(mx, 0); - if (rc == MDB_SUCCESS) + if (rc == MDBX_SUCCESS) goto fetchm; } else { - rc = MDB_NOTFOUND; + rc = MDBX_NOTFOUND; } } break; 
- case MDB_NEXT: - case MDB_NEXT_DUP: - case MDB_NEXT_NODUP: + case MDBX_NEXT: + case MDBX_NEXT_DUP: + case MDBX_NEXT_NODUP: rc = mdbx_cursor_next(mc, key, data, op); break; - case MDB_PREV: - case MDB_PREV_DUP: - case MDB_PREV_NODUP: + case MDBX_PREV: + case MDBX_PREV_DUP: + case MDBX_PREV_NODUP: rc = mdbx_cursor_prev(mc, key, data, op); break; - case MDB_FIRST: + case MDBX_FIRST: rc = mdbx_cursor_first(mc, key, data); break; - case MDB_FIRST_DUP: + case MDBX_FIRST_DUP: mfunc = mdbx_cursor_first; mmove: if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) return MDBX_EINVAL; if (unlikely(mc->mc_xcursor == NULL)) - return MDB_INCOMPATIBLE; + return MDBX_INCOMPATIBLE; { MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { - MDB_GET_KEY(leaf, key); + MDBX_GET_KEY(leaf, key); rc = mdbx_node_read(mc, leaf, data); break; } @@ -5529,10 +5537,10 @@ int mdbx_cursor_get(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); break; - case MDB_LAST: + case MDBX_LAST: rc = mdbx_cursor_last(mc, key, data); break; - case MDB_LAST_DUP: + case MDBX_LAST_DUP: mfunc = mdbx_cursor_last; goto mmove; default: @@ -5547,17 +5555,17 @@ int mdbx_cursor_get(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, /* Touch all the pages in the cursor stack. Set mc_top. * Makes sure all the pages are writable, before attempting a write operation. * [in] mc The cursor to operate on. 
*/ -static int mdbx_cursor_touch(MDB_cursor *mc) { - int rc = MDB_SUCCESS; +static int mdbx_cursor_touch(MDBX_cursor *mc) { + int rc = MDBX_SUCCESS; if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY | DB_DUPDATA))) { /* Touch DB record of named DB */ - MDB_cursor mc2; - MDB_xcursor mcx; + MDBX_cursor mc2; + MDBX_xcursor mcx; if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) - return MDB_BAD_DBI; + return MDBX_BAD_DBI; mdbx_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx); - rc = mdbx_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); + rc = mdbx_page_search(&mc2, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); if (unlikely(rc)) return rc; *mc->mc_dbflag |= DB_DIRTY; @@ -5573,15 +5581,15 @@ static int mdbx_cursor_touch(MDB_cursor *mc) { } /* Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDB_NOSPILL 0x8000 +#define MDBX_NOSPILL 0x8000 -int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, +int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, unsigned flags) { - MDB_env *env; + MDBX_env *env; MDBX_page *fp, *sub_root = NULL; uint16_t fp_flags; MDBX_val xdata, *rdata, dkey, olddata; - MDB_db dummy; + MDBX_db dummy; int do_sub = 0, insert_key, insert_data; unsigned mcount = 0, dcount = 0, nospill; size_t nsize; @@ -5598,67 +5606,68 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, env = mc->mc_txn->mt_env; /* Check this first so counter will always be zero on any early failures. 
*/ - if (flags & MDB_MULTIPLE) { + if (flags & MDBX_MULTIPLE) { dcount = data[1].iov_len; data[1].iov_len = 0; - if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED))) - return MDB_INCOMPATIBLE; + if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; } - if (flags & MDB_RESERVE) { - if (unlikely(mc->mc_db->md_flags & (MDB_DUPSORT | MDB_REVERSEDUP))) - return MDB_INCOMPATIBLE; + if (flags & MDBX_RESERVE) { + if (unlikely(mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_REVERSEDUP))) + return MDBX_INCOMPATIBLE; data->iov_base = nullptr; } - nospill = flags & MDB_NOSPILL; - flags &= ~MDB_NOSPILL; + nospill = flags & MDBX_NOSPILL; + flags &= ~MDBX_NOSPILL; - if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? MDBX_EACCESS : MDB_BAD_TXN; + if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS + : MDBX_BAD_TXN; if (unlikely(key->iov_len > env->me_maxkey_limit)) - return MDB_BAD_VALSIZE; + return MDBX_BAD_VALSIZE; - if (unlikely(data->iov_len > ((mc->mc_db->md_flags & MDB_DUPSORT) + if (unlikely(data->iov_len > ((mc->mc_db->md_flags & MDBX_DUPSORT) ? 
env->me_maxkey_limit : MDBX_MAXDATASIZE))) - return MDB_BAD_VALSIZE; + return MDBX_BAD_VALSIZE; - if ((mc->mc_db->md_flags & MDB_INTEGERKEY) && + if ((mc->mc_db->md_flags & MDBX_INTEGERKEY) && unlikely(key->iov_len != sizeof(uint32_t) && key->iov_len != sizeof(uint64_t))) { - mdbx_cassert(mc, !"key-size is invalid for MDB_INTEGERKEY"); - return MDB_BAD_VALSIZE; + mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + return MDBX_BAD_VALSIZE; } - if ((mc->mc_db->md_flags & MDB_INTEGERDUP) && + if ((mc->mc_db->md_flags & MDBX_INTEGERDUP) && unlikely(data->iov_len != sizeof(uint32_t) && data->iov_len != sizeof(uint64_t))) { - mdbx_cassert(mc, !"data-size is invalid MDB_INTEGERDUP"); - return MDB_BAD_VALSIZE; + mdbx_cassert(mc, !"data-size is invalid MDBX_INTEGERDUP"); + return MDBX_BAD_VALSIZE; } mdbx_debug("==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, DDBI(mc), DKEY(key), key ? key->iov_len : 0, - DVAL((flags & MDB_RESERVE) ? nullptr : data), data->iov_len); + DVAL((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len); int dupdata_flag = 0; - if (flags & MDB_CURRENT) { - /* Опция MDB_CURRENT означает, что запрошено обновление текущей записи, + if (flags & MDBX_CURRENT) { + /* Опция MDBX_CURRENT означает, что запрошено обновление текущей записи, * на которой сейчас стоит курсор. Проверяем что переданный ключ совпадает * со значением в текущей позиции курсора. * Здесь проще вызвать mdbx_cursor_get(), так как для обслуживания таблиц - * с MDB_DUPSORT также требуется текущий размер данных. */ + * с MDBX_DUPSORT также требуется текущий размер данных. 
*/ MDBX_val current_key, current_data; - rc = mdbx_cursor_get(mc, &current_key, &current_data, MDB_GET_CURRENT); - if (unlikely(rc != MDB_SUCCESS)) + rc = mdbx_cursor_get(mc, &current_key, &current_data, MDBX_GET_CURRENT); + if (unlikely(rc != MDBX_SUCCESS)) return rc; if (mc->mc_dbx->md_cmp(key, &current_key) != 0) return MDBX_EKEYMISMATCH; - if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { + if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdbx_cassert(mc, @@ -5670,9 +5679,9 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, if (mc->mc_xcursor->mx_db.md_entries > 1 || current_data.iov_len != data->iov_len) { rc = mdbx_cursor_del(mc, 0); - if (rc != MDB_SUCCESS) + if (rc != MDBX_SUCCESS) return rc; - flags -= MDB_CURRENT; + flags -= MDBX_CURRENT; } } } @@ -5683,17 +5692,17 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_snum = 0; mc->mc_top = 0; mc->mc_flags &= ~C_INITIALIZED; - rc = MDB_NO_ROOT; - } else if ((flags & MDB_CURRENT) == 0) { + rc = MDBX_NO_ROOT; + } else if ((flags & MDBX_CURRENT) == 0) { int exact = 0; MDBX_val d2; - if (flags & MDB_APPEND) { + if (flags & MDBX_APPEND) { MDBX_val k2; rc = mdbx_cursor_last(mc, &k2, &d2); if (rc == 0) { rc = mc->mc_dbx->md_cmp(key, &k2); if (rc > 0) { - rc = MDB_NOTFOUND; + rc = MDBX_NOTFOUND; mc->mc_ki[mc->mc_top]++; } else { /* new key is <= last key */ @@ -5701,14 +5710,14 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, } } } else { - rc = mdbx_cursor_set(mc, key, &d2, MDB_SET, &exact); + rc = mdbx_cursor_set(mc, key, &d2, MDBX_SET, &exact); } - if ((flags & MDB_NOOVERWRITE) && rc == 0) { + if ((flags & MDBX_NOOVERWRITE) && rc == 0) { mdbx_debug("duplicate key [%s]", DKEY(key)); *data = d2; - return MDB_KEYEXIST; + return MDBX_KEYEXIST; } - if (rc && unlikely(rc != MDB_NOTFOUND)) + if (rc && unlikely(rc != MDBX_NOTFOUND)) return rc; } @@ -5716,7 +5725,7 @@ int
mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, /* Cursor is positioned, check for room in the dirty list */ if (!nospill) { - if (flags & MDB_MULTIPLE) { + if (flags & MDBX_MULTIPLE) { rdata = &xdata; xdata.iov_len = data->iov_len * dcount; } else { @@ -5726,7 +5735,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, return rc2; } - if (rc == MDB_NO_ROOT) { + if (rc == MDBX_NO_ROOT) { MDBX_page *np; /* new database, write a root leaf page */ mdbx_debug("allocating new root leaf page"); @@ -5738,7 +5747,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_db->md_root = np->mp_pgno; mc->mc_db->md_depth++; *mc->mc_dbflag |= DB_DIRTY; - if ((mc->mc_db->md_flags & (MDB_DUPSORT | MDB_DUPFIXED)) == MDB_DUPFIXED) + if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED) np->mp_flags |= P_LEAF2; mc->mc_flags |= C_INITIALIZED; } else { @@ -5752,13 +5761,13 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, if (insert_key) { /* The key does not exist */ mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); - if ((mc->mc_db->md_flags & MDB_DUPSORT) && + if ((mc->mc_db->md_flags & MDBX_DUPSORT) && LEAFSIZE(key, data) > env->me_nodemax) { /* Too big for a node, insert in sub-DB. Set up an empty * "old sub-page" for prep_subDB to expand to a full page. 
*/ fp_flags = P_LEAF | P_DIRTY; fp = env->me_pbuf; - fp->mp_leaf2_ksize = (uint16_t)data->iov_len; /* used if MDB_DUPFIXED */ + fp->mp_leaf2_ksize = (uint16_t)data->iov_len; /* used if MDBX_DUPFIXED */ fp->mp_lower = fp->mp_upper = (PAGEHDRSZ - PAGEBASE); olddata.iov_len = PAGEHDRSZ; goto prep_subDB; @@ -5769,7 +5778,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, char *ptr; unsigned ksize = mc->mc_db->md_xsize; if (key->iov_len != ksize) - return MDB_BAD_VALSIZE; + return MDBX_BAD_VALSIZE; ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); memcpy(ptr, key->iov_base, ksize); fix_parent: @@ -5786,12 +5795,12 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, if (mc->mc_ki[mc->mc_top]) rc2 = mdbx_update_key(mc, key); else - rc2 = MDB_SUCCESS; + rc2 = MDBX_SUCCESS; mc->mc_top += dtop; if (rc2) return rc2; } - return MDB_SUCCESS; + return MDBX_SUCCESS; } more:; @@ -5800,7 +5809,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, olddata.iov_base = NODEDATA(leaf); /* DB has dups? */ - if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) { + if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { /* Prepare (sub-)page/sub-DB to accept the new item, if needed. * fp: old sub-page or a header faking it. * mp: new (sub-)page. offset: growth in page size. @@ -5814,14 +5823,14 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, /* does data match? 
*/ if (!mc->mc_dbx->md_dcmp(data, &olddata)) { - if (unlikely(flags & (MDB_NODUPDATA | MDB_APPENDDUP))) - return MDB_KEYEXIST; + if (unlikely(flags & (MDBX_NODUPDATA | MDBX_APPENDDUP))) + return MDBX_KEYEXIST; /* overwrite it */ goto current; } /* Just overwrite the current item */ - if (flags & MDB_CURRENT) + if (flags & MDBX_CURRENT) goto current; /* Back up original data item */ @@ -5833,7 +5842,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, fp->mp_flags = P_LEAF | P_DIRTY | P_SUBP; fp->mp_lower = (PAGEHDRSZ - PAGEBASE); xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { + if (mc->mc_db->md_flags & MDBX_DUPFIXED) { fp->mp_flags |= P_LEAF2; fp->mp_leaf2_ksize = (uint16_t)data->iov_len; xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */ @@ -5852,7 +5861,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, fp = olddata.iov_base; switch (flags) { default: - if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) { + if (!(mc->mc_db->md_flags & MDBX_DUPFIXED)) { offset = EVEN(NODESIZE + sizeof(indx_t) + data->iov_len); break; } @@ -5861,9 +5870,9 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, offset *= 4; /* space for 4 more */ break; } - /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */ - case MDB_CURRENT | MDB_NODUPDATA: - case MDB_CURRENT: + /* FALLTHRU: Big enough MDBX_DUPFIXED sub-page */ + case MDBX_CURRENT | MDBX_NODUPDATA: + case MDBX_CURRENT: fp->mp_flags |= P_DIRTY; fp->mp_pgno = mp->mp_pgno; mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; @@ -5880,19 +5889,19 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, prep_subDB: dummy.md_xsize = 0; dummy.md_flags = 0; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { + if (mc->mc_db->md_flags & MDBX_DUPFIXED) { fp_flags |= P_LEAF2; dummy.md_xsize = fp->mp_leaf2_ksize; - dummy.md_flags = MDB_DUPFIXED; - if (mc->mc_db->md_flags & MDB_INTEGERDUP) - dummy.md_flags |= MDB_INTEGERKEY; + 
dummy.md_flags = MDBX_DUPFIXED; + if (mc->mc_db->md_flags & MDBX_INTEGERDUP) + dummy.md_flags |= MDBX_INTEGERKEY; } dummy.md_depth = 1; dummy.md_branch_pages = 0; dummy.md_leaf_pages = 1; dummy.md_overflow_pages = 0; dummy.md_entries = NUMKEYS(fp); - xdata.iov_len = sizeof(MDB_db); + xdata.iov_len = sizeof(MDBX_db); xdata.iov_base = &dummy; if ((rc = mdbx_page_alloc(mc, 1, &mp, MDBX_ALLOC_ALL))) return rc; @@ -5925,9 +5934,9 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, goto new_sub; } current: - /* LMDB passes F_SUBDATA in 'flags' to write a DB record */ + /* MDBX passes F_SUBDATA in 'flags' to write a DB record */ if (unlikely((leaf->mn_flags ^ flags) & F_SUBDATA)) - return MDB_INCOMPATIBLE; + return MDBX_INCOMPATIBLE; /* overflow page overwrites need special handling */ if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { MDBX_page *omp; @@ -5942,7 +5951,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, /* Is the ov page large enough? */ if (ovpages >= dpages) { if (!(omp->mp_flags & P_DIRTY) && - (level || (env->me_flags & MDB_WRITEMAP))) { + (level || (env->me_flags & MDBX_WRITEMAP))) { rc = mdbx_page_unspill(mc->mc_txn, omp, &omp); if (unlikely(rc)) return rc; @@ -5956,7 +5965,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(level > 1)) { /* It is writable only in a parent txn */ MDBX_page *np = mdbx_page_malloc(mc->mc_txn, ovpages); - MDB_ID2 id2; + MDBX_ID2 id2; if (unlikely(!np)) return MDBX_ENOMEM; id2.mid = pg; @@ -5966,10 +5975,10 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, mdbx_cassert(mc, rc2 == 0); /* Currently we make the page look as with put() in the - * parent txn, in case the user peeks at MDB_RESERVEd + * parent txn, in case the user peeks at MDBX_RESERVEd * or unused parts. Some users treat ovpages specially. */ size_t whole = (size_t)env->me_psize * ovpages; - /* Skip the part where LMDB will put *data. 
+ /* Skip the part where MDBX will put *data. * Copy end of page, adjusting alignment so * compiler may copy words instead of bytes. */ size_t off = (PAGEHDRSZ + data->iov_len) & -(ssize_t)sizeof(size_t); @@ -5979,21 +5988,21 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, omp = np; } SETDSZ(leaf, data->iov_len); - if (F_ISSET(flags, MDB_RESERVE)) + if (F_ISSET(flags, MDBX_RESERVE)) data->iov_base = PAGEDATA(omp); else memcpy(PAGEDATA(omp), data->iov_base, data->iov_len); - return MDB_SUCCESS; + return MDBX_SUCCESS; } } - if ((rc2 = mdbx_ovpage_free(mc, omp)) != MDB_SUCCESS) + if ((rc2 = mdbx_ovpage_free(mc, omp)) != MDBX_SUCCESS) return rc2; } else if (data->iov_len == olddata.iov_len) { assert(EVEN(key->iov_len) == EVEN(leaf->mn_ksize)); /* same size, just replace it. Note that we could * also reuse this node if the new data is smaller, * but instead we opt to shrink the node in that case. */ - if (F_ISSET(flags, MDB_RESERVE)) + if (F_ISSET(flags, MDBX_RESERVE)) data->iov_base = olddata.iov_base; else if (!(mc->mc_flags & C_SUB)) memcpy(olddata.iov_base, data->iov_base, data->iov_len); @@ -6010,7 +6019,7 @@ int mdbx_cursor_put(MDB_cursor *mc, MDBX_val *key, MDBX_val *data, (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); goto fix_parent; } - return MDB_SUCCESS; + return MDBX_SUCCESS; } mdbx_node_del(mc, 0); } @@ -6023,17 +6032,17 @@ new_sub: : mdbx_leaf_size(env, key, rdata); if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) { if ((flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) - nflags &= ~MDB_APPEND; /* sub-page may need room to grow */ + nflags &= ~MDBX_APPEND; /* sub-page may need room to grow */ if (!insert_key) - nflags |= MDB_SPLIT_REPLACE; + nflags |= MDBX_SPLIT_REPLACE; rc = mdbx_page_split(mc, key, rdata, P_INVALID, nflags); } else { /* There is room already in this leaf page. 
*/ rc = mdbx_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags); if (likely(rc == 0)) { /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; + MDBX_cursor *m2, *m3; + MDBX_dbi dbi = mc->mc_dbi; unsigned i = mc->mc_top; MDBX_page *mp = mc->mc_pg[i]; @@ -6053,7 +6062,7 @@ new_sub: } } - if (likely(rc == MDB_SUCCESS)) { + if (likely(rc == MDBX_SUCCESS)) { /* Now store the actual data in the child DB. Note that we're * storing the user data in the keys field, so there are strict * size limits on dupdata. The actual data fields of the child @@ -6065,14 +6074,14 @@ new_sub: xdata.iov_len = 0; xdata.iov_base = ""; MDBX_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (flags & MDB_CURRENT) { - xflags = (flags & MDB_NODUPDATA) - ? MDB_CURRENT | MDB_NOOVERWRITE | MDB_NOSPILL - : MDB_CURRENT | MDB_NOSPILL; + if (flags & MDBX_CURRENT) { + xflags = (flags & MDBX_NODUPDATA) + ? MDBX_CURRENT | MDBX_NOOVERWRITE | MDBX_NOSPILL + : MDBX_CURRENT | MDBX_NOSPILL; } else { mdbx_xcursor_init1(mc, leaf); - xflags = (flags & MDB_NODUPDATA) ? MDB_NOOVERWRITE | MDB_NOSPILL - : MDB_NOSPILL; + xflags = (flags & MDBX_NODUPDATA) ? 
MDBX_NOOVERWRITE | MDBX_NOSPILL + : MDBX_NOSPILL; } if (sub_root) mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; @@ -6086,8 +6095,8 @@ new_sub: } if (!(leaf->mn_flags & F_SUBDATA) || sub_root) { /* Adjust other cursors pointing to mp */ - MDB_cursor *m2; - MDB_xcursor *mx = mc->mc_xcursor; + MDBX_cursor *m2; + MDBX_xcursor *mx = mc->mc_xcursor; unsigned i = mc->mc_top; MDBX_page *mp = mc->mc_pg[i]; int nkeys = NUMKEYS(mp); @@ -6107,12 +6116,12 @@ new_sub: } } ecount = mc->mc_xcursor->mx_db.md_entries; - if (flags & MDB_APPENDDUP) - xflags |= MDB_APPEND; + if (flags & MDBX_APPENDDUP) + xflags |= MDBX_APPEND; rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); if (flags & F_SUBDATA) { void *db = NODEDATA(leaf); - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; } @@ -6127,7 +6136,7 @@ new_sub: * make sure the cursor is marked valid. */ mc->mc_flags |= C_INITIALIZED; } - if (flags & MDB_MULTIPLE) { + if (flags & MDBX_MULTIPLE) { if (!rc) { mcount++; /* let caller know how many succeeded, if any */ @@ -6141,15 +6150,15 @@ new_sub: } return rc; bad_sub: - if (unlikely(rc == MDB_KEYEXIST)) + if (unlikely(rc == MDBX_KEYEXIST)) /* should not happen, we deleted that item */ - rc = MDB_PROBLEM; + rc = MDBX_PROBLEM; } - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return rc; } -int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { +int mdbx_cursor_del(MDBX_cursor *mc, unsigned flags) { MDBX_node *leaf; MDBX_page *mp; int rc; @@ -6160,16 +6169,17 @@ int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(mc->mc_txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) - return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? 
MDBX_EACCESS : MDB_BAD_TXN; + if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS + : MDBX_BAD_TXN; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return MDBX_EINVAL; if (unlikely(mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))) - return MDB_NOTFOUND; + return MDBX_NOTFOUND; - if (unlikely(!(flags & MDB_NOSPILL) && + if (unlikely(!(flags & MDBX_NOSPILL) && (rc = mdbx_page_spill(mc, NULL, NULL)))) return rc; @@ -6183,7 +6193,7 @@ int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { - if (flags & MDB_NODUPDATA) { + if (flags & MDBX_NODUPDATA) { /* mdbx_cursor_del0() will subtract the final entry */ mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; @@ -6191,7 +6201,7 @@ int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); } - rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); + rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL); if (unlikely(rc)) return rc; /* If sub-DB still has entries, we're done */ @@ -6199,9 +6209,9 @@ int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { if (leaf->mn_flags & F_SUBDATA) { /* update subDB info */ void *db = NODEDATA(leaf); - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); + memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } else { - MDB_cursor *m2; + MDBX_cursor *m2; /* shrink fake page */ mdbx_node_shrink(mp, mc->mc_ki[mc->mc_top]); leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); @@ -6238,9 +6248,9 @@ int mdbx_cursor_del(MDB_cursor *mc, unsigned flags) { goto fail; } } - /* LMDB passes F_SUBDATA in 'flags' to delete a DB record */ + /* MDBX passes F_SUBDATA in 'flags' to delete a DB record */ else if (unlikely((leaf->mn_flags ^ flags) & F_SUBDATA)) { - rc = 
MDB_INCOMPATIBLE; + rc = MDBX_INCOMPATIBLE; goto fail; } @@ -6259,12 +6269,12 @@ del_key: return mdbx_cursor_del0(mc); fail: - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return rc; } /* Allocate and initialize new pages for a database. - * Set MDB_TXN_ERROR on failure. + * Set MDBX_TXN_ERROR on failure. * * [in] mc a cursor on the database being added to. * [in] flags flags defining what type of page is being allocated. @@ -6273,7 +6283,7 @@ fail: * [out] mp Address of a page, or NULL on failure. * * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, +static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, int num, MDBX_page **mp) { MDBX_page *np; int rc; @@ -6296,7 +6306,7 @@ static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, } *mp = np; - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Calculate the size of a leaf node. @@ -6312,7 +6322,7 @@ static int mdbx_page_new(MDB_cursor *mc, uint32_t flags, int num, * [in] data The data for the node. * * Returns The number of bytes needed to store the node. */ -static __inline size_t mdbx_leaf_size(MDB_env *env, MDBX_val *key, +static __inline size_t mdbx_leaf_size(MDBX_env *env, MDBX_val *key, MDBX_val *data) { size_t sz; @@ -6337,7 +6347,7 @@ static __inline size_t mdbx_leaf_size(MDB_env *env, MDBX_val *key, * [in] key The key for the node. * * Returns The number of bytes needed to store the node. */ -static __inline size_t mdbx_branch_size(MDB_env *env, MDBX_val *key) { +static __inline size_t mdbx_branch_size(MDBX_env *env, MDBX_val *key) { size_t sz; sz = INDXSIZE(key); @@ -6353,7 +6363,7 @@ static __inline size_t mdbx_branch_size(MDB_env *env, MDBX_val *key) { } /* Add a node to the page pointed to by the cursor. - * Set MDB_TXN_ERROR on failure. + * Set MDBX_TXN_ERROR on failure. * * [in] mc The cursor for this operation. * [in] indx The index on the page where the new node should be added. 
@@ -6365,10 +6375,10 @@ static __inline size_t mdbx_branch_size(MDB_env *env, MDBX_val *key) { * Returns 0 on success, non-zero on failure. Possible errors are: * * MDBX_ENOMEM - failed to allocate overflow pages for the node. - * MDB_PAGE_FULL - there is insufficient room in the page. This error + * MDBX_PAGE_FULL - there is insufficient room in the page. This error * should never happen since all callers already calculate * the page's free space before calling this function. */ -static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDBX_val *key, +static int mdbx_node_add(MDBX_cursor *mc, indx_t indx, MDBX_val *key, MDBX_val *data, pgno_t pgno, unsigned flags) { unsigned i; size_t node_size = NODESIZE; @@ -6402,7 +6412,7 @@ static int mdbx_node_add(MDB_cursor *mc, indx_t indx, MDBX_val *key, /* Just using these for counting */ mp->mp_lower += sizeof(indx_t); mp->mp_upper -= ksize - sizeof(indx_t); - return MDB_SUCCESS; + return MDBX_SUCCESS; } room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); @@ -6466,21 +6476,21 @@ update: if (unlikely(ofp == NULL)) { if (unlikely(F_ISSET(flags, F_BIGDATA))) memcpy(ndata, data->iov_base, sizeof(pgno_t)); - else if (F_ISSET(flags, MDB_RESERVE)) + else if (F_ISSET(flags, MDBX_RESERVE)) data->iov_base = ndata; else if (likely(ndata != data->iov_base)) memcpy(ndata, data->iov_base, data->iov_len); } else { memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); ndata = PAGEDATA(ofp); - if (F_ISSET(flags, MDB_RESERVE)) + if (F_ISSET(flags, MDBX_RESERVE)) data->iov_base = ndata; else if (likely(ndata != data->iov_base)) memcpy(ndata, data->iov_base, data->iov_len); } } - return MDB_SUCCESS; + return MDBX_SUCCESS; full: mdbx_debug("not enough room in page %" PRIaPGNO ", got %u ptrs", mp->mp_pgno, @@ -6488,15 +6498,15 @@ full: mdbx_debug("upper-lower = %u - %u = %" PRIiPTR "", mp->mp_upper, mp->mp_lower, room); mdbx_debug("node size = %" PRIuPTR "", node_size); - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - return MDB_PAGE_FULL; + 
mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PAGE_FULL; } /* Delete the specified node from a page. * [in] mc Cursor pointing to the node to delete. * [in] ksize The size of a node. Only used if the page is - * part of a MDB_DUPFIXED database. */ -static void mdbx_node_del(MDB_cursor *mc, int ksize) { + * part of a MDBX_DUPFIXED database. */ +static void mdbx_node_del(MDBX_cursor *mc, int ksize) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; indx_t indx = mc->mc_ki[mc->mc_top]; unsigned sz; @@ -6598,8 +6608,8 @@ static void mdbx_node_shrink(MDBX_page *mp, indx_t indx) { * depend only on the parent DB. * * [in] mc The main cursor whose sorted-dups cursor is to be initialized. */ -static void mdbx_xcursor_init0(MDB_cursor *mc) { - MDB_xcursor *mx = mc->mc_xcursor; +static void mdbx_xcursor_init0(MDBX_cursor *mc) { + MDBX_xcursor *mx = mc->mc_xcursor; mx->mx_cursor.mc_xcursor = NULL; mx->mx_cursor.mc_txn = mc->mc_txn; @@ -6619,13 +6629,13 @@ static void mdbx_xcursor_init0(MDB_cursor *mc) { /* Final setup of a sorted-dups cursor. * Sets up the fields that depend on the data from the main cursor. * [in] mc The main cursor whose sorted-dups cursor is to be initialized. - * [in] node The data containing the MDB_db record for the sorted-dup database. + * [in] node The data containing the MDBX_db record for the sorted-dup database. 
*/ -static void mdbx_xcursor_init1(MDB_cursor *mc, MDBX_node *node) { - MDB_xcursor *mx = mc->mc_xcursor; +static void mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node) { + MDBX_xcursor *mx = mc->mc_xcursor; if (node->mn_flags & F_SUBDATA) { - memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); + memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDBX_db)); mx->mx_cursor.mc_pg[0] = 0; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; @@ -6645,11 +6655,11 @@ static void mdbx_xcursor_init1(MDB_cursor *mc, MDBX_node *node) { mx->mx_cursor.mc_flags = C_INITIALIZED | C_SUB; mx->mx_cursor.mc_pg[0] = fp; mx->mx_cursor.mc_ki[0] = 0; - if (mc->mc_db->md_flags & MDB_DUPFIXED) { - mx->mx_db.md_flags = MDB_DUPFIXED; + if (mc->mc_db->md_flags & MDBX_DUPFIXED) { + mx->mx_db.md_flags = MDBX_DUPFIXED; mx->mx_db.md_xsize = fp->mp_leaf2_ksize; - if (mc->mc_db->md_flags & MDB_INTEGERDUP) - mx->mx_db.md_flags |= MDB_INTEGERKEY; + if (mc->mc_db->md_flags & MDBX_INTEGERDUP) + mx->mx_db.md_flags |= MDBX_INTEGERKEY; } } mdbx_debug("Sub-db -%u root page %" PRIaPGNO "", mx->mx_cursor.mc_dbi, @@ -6669,9 +6679,9 @@ static void mdbx_xcursor_init1(MDB_cursor *mc, MDBX_node *node) { * [in] mc The main cursor whose sorted-dups cursor is to be fixed up. * [in] src_mx The xcursor of an up-to-date cursor. * [in] new_dupdata True if converting from a non-F_DUPDATA item. */ -static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, +static void mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, int new_dupdata) { - MDB_xcursor *mx = mc->mc_xcursor; + MDBX_xcursor *mx = mc->mc_xcursor; if (new_dupdata) { mx->mx_cursor.mc_snum = 1; @@ -6690,8 +6700,8 @@ static void mdbx_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, } /* Initialize a cursor for a given transaction and database. 
*/ -static void mdbx_cursor_init(MDB_cursor *mc, MDBX_txn *txn, MDB_dbi dbi, - MDB_xcursor *mx) { +static void mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi, + MDBX_xcursor *mx) { mc->mc_signature = MDBX_MC_SIGNATURE; mc->mc_next = NULL; mc->mc_backup = NULL; @@ -6706,20 +6716,20 @@ static void mdbx_cursor_init(MDB_cursor *mc, MDBX_txn *txn, MDB_dbi dbi, mc->mc_flags = 0; mc->mc_ki[0] = 0; mc->mc_xcursor = NULL; - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { + if (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) { mdbx_tassert(txn, mx != NULL); mx->mx_cursor.mc_signature = MDBX_MC_SIGNATURE; mc->mc_xcursor = mx; mdbx_xcursor_init0(mc); } if (unlikely(*mc->mc_dbflag & DB_STALE)) { - mdbx_page_search(mc, NULL, MDB_PS_ROOTONLY); + mdbx_page_search(mc, NULL, MDBX_PS_ROOTONLY); } } -int mdbx_cursor_open(MDBX_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { - MDB_cursor *mc; - size_t size = sizeof(MDB_cursor); +int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) { + MDBX_cursor *mc; + size_t size = sizeof(MDBX_cursor); if (unlikely(!ret || !txn)) return MDBX_EINVAL; @@ -6730,17 +6740,17 @@ int mdbx_cursor_open(MDBX_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; + if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; - if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) + if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) return MDBX_EINVAL; - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) - size += sizeof(MDB_xcursor); + if (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) + size += sizeof(MDBX_xcursor); if (likely((mc = malloc(size)) != NULL)) { - mdbx_cursor_init(mc, txn, dbi, (MDB_xcursor *)(mc + 1)); + mdbx_cursor_init(mc, txn, dbi, (MDBX_xcursor *)(mc + 1)); if (txn->mt_cursors) { mc->mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = mc; @@ -6752,10 
+6762,10 @@ int mdbx_cursor_open(MDBX_txn *txn, MDB_dbi dbi, MDB_cursor **ret) { *ret = mc; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int mdbx_cursor_renew(MDBX_txn *txn, MDB_cursor *mc) { +int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) { if (unlikely(!mc || !txn)) return MDBX_EINVAL; @@ -6773,7 +6783,7 @@ int mdbx_cursor_renew(MDBX_txn *txn, MDB_cursor *mc) { return MDBX_EINVAL; if (unlikely((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)) { - MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; if (*prev == mc) @@ -6781,36 +6791,36 @@ int mdbx_cursor_renew(MDBX_txn *txn, MDB_cursor *mc) { mc->mc_signature = MDBX_MC_READY4CLOSE; } - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; + if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; mdbx_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Return the count of duplicate data items for the current key */ -int mdbx_cursor_count(MDB_cursor *mc, uint64_t *countp) { +int mdbx_cursor_count(MDBX_cursor *mc, uint64_t *countp) { if (unlikely(mc == NULL || countp == NULL)) return MDBX_EINVAL; if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(mc->mc_txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; + if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return MDBX_EINVAL; if (!mc->mc_snum) { *countp = 0; - return MDB_NOTFOUND; + return MDBX_NOTFOUND; } MDBX_page *mp = mc->mc_pg[mc->mc_top]; if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)) { *countp = 0; - return MDB_NOTFOUND; + return MDBX_NOTFOUND; } *countp = 1; @@ -6822,10 +6832,10 @@ int mdbx_cursor_count(MDB_cursor *mc, uint64_t *countp) { *countp = mc->mc_xcursor->mx_db.md_entries; } } - return MDB_SUCCESS; + return 
MDBX_SUCCESS; } -void mdbx_cursor_close(MDB_cursor *mc) { +void mdbx_cursor_close(MDBX_cursor *mc) { if (mc) { mdbx_ensure(NULL, mc->mc_signature == MDBX_MC_SIGNATURE || mc->mc_signature == MDBX_MC_READY4CLOSE); @@ -6834,7 +6844,7 @@ void mdbx_cursor_close(MDB_cursor *mc) { * A read-only txn (!C_UNTRACK) may have been freed already, * so do not peek inside it. Only write txns track cursors. */ if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { - MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; if (*prev == mc) @@ -6850,24 +6860,24 @@ void mdbx_cursor_close(MDB_cursor *mc) { } } -MDBX_txn *mdbx_cursor_txn(MDB_cursor *mc) { +MDBX_txn *mdbx_cursor_txn(MDBX_cursor *mc) { if (unlikely(!mc || mc->mc_signature != MDBX_MC_SIGNATURE)) return NULL; return mc->mc_txn; } -MDB_dbi mdbx_cursor_dbi(MDB_cursor *mc) { +MDBX_dbi mdbx_cursor_dbi(MDBX_cursor *mc) { if (unlikely(!mc || mc->mc_signature != MDBX_MC_SIGNATURE)) return INT_MIN; return mc->mc_dbi; } /* Replace the key for a branch node with a new key. - * Set MDB_TXN_ERROR on failure. + * Set MDBX_TXN_ERROR on failure. * [in] mc Cursor pointing to the node to operate on. * [in] key The new key to use. * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_update_key(MDB_cursor *mc, MDBX_val *key) { +static int mdbx_update_key(MDBX_cursor *mc, MDBX_val *key) { MDBX_page *mp; MDBX_node *node; char *base; @@ -6880,7 +6890,7 @@ static int mdbx_update_key(MDB_cursor *mc, MDBX_val *key) { mp = mc->mc_pg[mc->mc_top]; node = NODEPTR(mp, indx); ptr = mp->mp_ptrs[indx]; - if (MDB_DEBUG) { + if (MDBX_DEBUG) { MDBX_val k2; char kbuf2[DKBUF_MAXKEYSIZE * 2 + 1]; k2.iov_base = NODEKEY(node); @@ -6903,7 +6913,7 @@ static int mdbx_update_key(MDB_cursor *mc, MDBX_val *key) { mdbx_debug("Not enough room, delta = %d, splitting...", delta); pgno = NODEPGNO(node); mdbx_node_del(mc, 0); - return mdbx_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE); + return mdbx_page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); } numkeys = NUMKEYS(mp); @@ -6927,18 +6937,19 @@ static int mdbx_update_key(MDB_cursor *mc, MDBX_val *key) { if (key->iov_len) memcpy(NODEKEY(node), key->iov_base, key->iov_len); - return MDB_SUCCESS; + return MDBX_SUCCESS; } -static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); +static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); /* Perform act while tracking temporary cursor mn */ #define WITH_CURSOR_TRACKING(mn, act) \ do { \ - MDB_cursor mc_dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ + MDBX_cursor mc_dummy, *tracked, \ + **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ if ((mn).mc_flags & C_SUB) { \ mc_dummy.mc_flags = C_INITIALIZED; \ - mc_dummy.mc_xcursor = (MDB_xcursor *)&(mn); \ + mc_dummy.mc_xcursor = (MDBX_xcursor *)&(mn); \ tracked = &mc_dummy; \ } else { \ tracked = &(mn); \ @@ -6950,11 +6961,11 @@ static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); } while (0) /* Move a node from csrc to cdst. 
*/ -static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { +static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { MDBX_node *srcnode; MDBX_val key, data; pgno_t srcpg; - MDB_cursor mn; + MDBX_cursor mn; int rc; unsigned flags; @@ -7038,7 +7049,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { /* Add the node to the destination page. */ rc = mdbx_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; /* Delete the node from the source page. */ @@ -7046,8 +7057,8 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { { /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = csrc->mc_dbi; + MDBX_cursor *m2, *m3; + MDBX_dbi dbi = csrc->mc_dbi; MDBX_page *mpd, *mps; mps = csrc->mc_pg[csrc->mc_top]; @@ -7119,7 +7130,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { mn.mc_top--; /* We want mdbx_rebalance to find mn when doing fixups */ WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; } if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { @@ -7129,7 +7140,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { csrc->mc_ki[csrc->mc_top] = 0; rc = mdbx_update_key(csrc, &nullkey); csrc->mc_ki[csrc->mc_top] = ix; - mdbx_cassert(csrc, rc == MDB_SUCCESS); + mdbx_cassert(csrc, rc == MDBX_SUCCESS); } } @@ -7149,7 +7160,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { mn.mc_top--; /* We want mdbx_rebalance to find mn when doing fixups */ WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; } if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { @@ -7159,11 +7170,11 @@ static int mdbx_node_move(MDB_cursor 
*csrc, MDB_cursor *cdst, int fromleft) { cdst->mc_ki[cdst->mc_top] = 0; rc = mdbx_update_key(cdst, &nullkey); cdst->mc_ki[cdst->mc_top] = ix; - mdbx_cassert(cdst, rc == MDB_SUCCESS); + mdbx_cassert(cdst, rc == MDBX_SUCCESS); } } - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Merge one page into another. @@ -7175,7 +7186,7 @@ static int mdbx_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { * [in] cdst Cursor pointing to the destination page. * * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { +static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { MDBX_page *psrc, *pdst; MDBX_node *srcnode; MDBX_val key, data; @@ -7206,7 +7217,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { key.iov_base = PAGEDATA(psrc); for (i = 0; i < NUMKEYS(psrc); i++, j++) { rc = mdbx_node_add(cdst, j, &key, NULL, 0, 0); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; key.iov_base = (char *)key.iov_base + key.iov_len; } @@ -7214,7 +7225,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { for (i = 0; i < NUMKEYS(psrc); i++, j++) { srcnode = NODEPTR(psrc, i); if (i == 0 && IS_BRANCH(psrc)) { - MDB_cursor mn; + MDBX_cursor mn; MDBX_node *s2; mdbx_cursor_copy(csrc, &mn); mn.mc_xcursor = NULL; @@ -7239,7 +7250,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { data.iov_base = NODEDATA(srcnode); rc = mdbx_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; } } @@ -7273,8 +7284,8 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { csrc->mc_db->md_branch_pages--; { /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = csrc->mc_dbi; + MDBX_cursor *m2, *m3; + MDBX_dbi dbi = csrc->mc_dbi; unsigned top = csrc->mc_top; for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { @@ 
-7315,7 +7326,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) { /* Copy the contents of a cursor. * [in] csrc The cursor to copy from. * [out] cdst The cursor to copy to. */ -static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) { +static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { unsigned i; cdst->mc_txn = csrc->mc_txn; @@ -7335,11 +7346,11 @@ static void mdbx_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) { /* Rebalance the tree after a delete operation. * [in] mc Cursor pointing to the page where rebalancing should begin. * Returns 0 on success, non-zero on failure. */ -static int mdbx_rebalance(MDB_cursor *mc) { +static int mdbx_rebalance(MDBX_cursor *mc) { MDBX_node *node; int rc, fromleft; unsigned ptop, minkeys, thresh; - MDB_cursor mn; + MDBX_cursor mn; indx_t oldki; if (IS_BRANCH(mc->mc_pg[mc->mc_top])) { @@ -7358,7 +7369,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { mdbx_debug("no need to rebalance page %" PRIaPGNO ", above fill threshold", mc->mc_pg[mc->mc_top]->mp_pgno); - return MDB_SUCCESS; + return MDBX_SUCCESS; } if (mc->mc_snum < 2) { @@ -7366,7 +7377,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { unsigned nkeys = NUMKEYS(mp); if (IS_SUBP(mp)) { mdbx_debug("Can't rebalance a subpage, ignoring"); - return MDB_SUCCESS; + return MDBX_SUCCESS; } if (nkeys == 0) { mdbx_debug("tree is completely empty"); @@ -7381,8 +7392,8 @@ static int mdbx_rebalance(MDB_cursor *mc) { mc->mc_top = 0; mc->mc_flags &= ~C_INITIALIZED; { - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; + MDBX_cursor *m2, *m3; + MDBX_dbi dbi = mc->mc_dbi; for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { if (mc->mc_flags & C_SUB) @@ -7417,8 +7428,8 @@ static int mdbx_rebalance(MDB_cursor *mc) { } { /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; + MDBX_cursor *m2, *m3; + MDBX_dbi dbi = mc->mc_dbi; for (m2 = 
mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { if (mc->mc_flags & C_SUB) @@ -7441,7 +7452,7 @@ static int mdbx_rebalance(MDB_cursor *mc) { } } else mdbx_debug("root page doesn't need rebalancing"); - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* The parent (branch page) must have at least 2 pointers, @@ -7513,13 +7524,13 @@ static int mdbx_rebalance(MDB_cursor *mc) { } /* Complete a delete operation started by mdbx_cursor_del(). */ -static int mdbx_cursor_del0(MDB_cursor *mc) { +static int mdbx_cursor_del0(MDBX_cursor *mc) { int rc; MDBX_page *mp; indx_t ki; unsigned nkeys; - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; + MDBX_cursor *m2, *m3; + MDBX_dbi dbi = mc->mc_dbi; ki = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; @@ -7536,7 +7547,7 @@ static int mdbx_cursor_del0(MDB_cursor *mc) { if (m3->mc_pg[mc->mc_top] == mp) { if (m3->mc_ki[mc->mc_top] == ki) { m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDB_DUPSORT) { + if (mc->mc_db->md_flags & MDBX_DUPSORT) { /* Sub-cursor referred into dataset which is gone */ m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); } @@ -7551,7 +7562,7 @@ static int mdbx_cursor_del0(MDB_cursor *mc) { } rc = mdbx_rebalance(mc); - if (likely(rc == MDB_SUCCESS)) { + if (likely(rc == MDBX_SUCCESS)) { /* DB is totally empty now, just bail out. * Other cursors adjustments were already done * by mdbx_rebalance and aren't needed here. 
*/ @@ -7575,13 +7586,13 @@ static int mdbx_cursor_del0(MDB_cursor *mc) { if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { if (m3->mc_ki[mc->mc_top] >= nkeys) { rc = mdbx_cursor_sibling(m3, 1); - if (rc == MDB_NOTFOUND) { + if (rc == MDBX_NOTFOUND) { m3->mc_flags |= C_EOF; - rc = MDB_SUCCESS; + rc = MDBX_SUCCESS; continue; } } - if (mc->mc_db->md_flags & MDB_DUPSORT) { + if (mc->mc_db->md_flags & MDBX_DUPSORT) { MDBX_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); /* If this node has dupdata, it may need to be reinited @@ -7603,11 +7614,11 @@ static int mdbx_cursor_del0(MDB_cursor *mc) { } if (unlikely(rc)) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return rc; } -int mdbx_del(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data) { +int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { if (unlikely(!key || !txn)) return MDBX_EINVAL; @@ -7617,17 +7628,17 @@ int mdbx_del(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data) { if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) - return (txn->mt_flags & MDB_TXN_RDONLY) ? MDBX_EACCESS : MDB_BAD_TXN; + if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (txn->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS : MDBX_BAD_TXN; return mdbx_del0(txn, dbi, key, data, 0); } -static int mdbx_del0(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, +static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, unsigned flags) { - MDB_cursor mc; - MDB_xcursor mx; - MDB_cursor_op op; + MDBX_cursor mc; + MDBX_xcursor mx; + MDBX_cursor_op op; MDBX_val rdata; int rc, exact = 0; DKBUF; @@ -7638,12 +7649,12 @@ static int mdbx_del0(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, mdbx_cursor_init(&mc, txn, dbi, &mx); if (data) { - op = MDB_GET_BOTH; + op = MDBX_GET_BOTH; rdata = *data; data = &rdata; } else { - op = MDB_SET; - flags |= MDB_NODUPDATA; + op = MDBX_SET; + flags |= MDBX_NODUPDATA; } rc = mdbx_cursor_set(&mc, key, data, op, &exact); if (likely(rc == 0)) { @@ -7663,7 +7674,7 @@ static int mdbx_del0(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, } /* Split a page and insert a new node. - * Set MDB_TXN_ERROR on failure. + * Set MDBX_TXN_ERROR on failure. * [in,out] mc Cursor pointing to the page and desired insertion index. * The cursor will be updated to point to the actual page and index where * the node got inserted after the split. @@ -7672,20 +7683,20 @@ static int mdbx_del0(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, * [in] newpgno The page number, if the new node is a branch node. * [in] nflags The NODE_ADD_FLAGS for the new node. * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_page_split(MDB_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, +static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, pgno_t newpgno, unsigned nflags) { unsigned flags; - int rc = MDB_SUCCESS, new_root = 0, did_split = 0; + int rc = MDBX_SUCCESS, new_root = 0, did_split = 0; indx_t newindx; pgno_t pgno = 0; int i, j, split_indx, nkeys, pmax; - MDB_env *env = mc->mc_txn->mt_env; + MDBX_env *env = mc->mc_txn->mt_env; MDBX_node *node; MDBX_val sepkey, rkey, xdata, *rdata = &xdata; MDBX_page *copy = NULL; MDBX_page *mp, *rp, *pp; int ptop; - MDB_cursor mn; + MDBX_cursor mn; DKBUF; mp = mc->mc_pg[mc->mc_top]; @@ -7723,7 +7734,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, /* Add left (implicit) pointer. */ if (unlikely((rc = mdbx_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != - MDB_SUCCESS)) { + MDBX_SUCCESS)) { /* undo the pre-push */ mc->mc_pg[0] = mc->mc_pg[1]; mc->mc_ki[0] = mc->mc_ki[1]; @@ -7744,7 +7755,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, mn.mc_pg[mn.mc_top] = rp; mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1; - if (nflags & MDB_APPEND) { + if (nflags & MDBX_APPEND) { mn.mc_ki[mn.mc_top] = 0; sepkey = *newkey; split_indx = newindx; @@ -7889,7 +7900,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, /* We want other splits to find mn when doing fixups */ WITH_CURSOR_TRACKING( mn, rc = mdbx_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0)); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto done; /* root split? 
*/ @@ -7918,12 +7929,12 @@ static int mdbx_page_split(MDB_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, rc = mdbx_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); mn.mc_top++; } - if (unlikely(rc != MDB_SUCCESS)) { - if (rc == MDB_NOTFOUND) /* improper mdbx_cursor_sibling() result */ - rc = MDB_PROBLEM; + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ + rc = MDBX_PROBLEM; goto done; } - if (nflags & MDB_APPEND) { + if (nflags & MDBX_APPEND) { mc->mc_pg[mc->mc_top] = rp; mc->mc_ki[mc->mc_top] = 0; rc = mdbx_node_add(mc, 0, newkey, newdata, newpgno, nflags); @@ -8001,7 +8012,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, } } } - if (nflags & MDB_RESERVE) { + if (nflags & MDBX_RESERVE) { node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!(node->mn_flags & F_BIGDATA)) newdata->iov_base = NODEDATA(node); @@ -8023,8 +8034,8 @@ static int mdbx_page_split(MDB_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, { /* Adjust other cursors pointing to mp */ - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi; + MDBX_cursor *m2, *m3; + MDBX_dbi dbi = mc->mc_dbi; nkeys = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { @@ -8056,7 +8067,7 @@ static int mdbx_page_split(MDB_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, m3->mc_top++; } if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) + if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDBX_SPLIT_REPLACE)) m3->mc_ki[mc->mc_top]++; if (m3->mc_ki[mc->mc_top] >= nkeys) { m3->mc_pg[mc->mc_top] = rp; @@ -8081,14 +8092,14 @@ done: if (copy) /* tmp page */ mdbx_page_free(env, copy); if (unlikely(rc)) - mc->mc_txn->mt_flags |= MDB_TXN_ERROR; + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return rc; } -int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, +int mdbx_put(MDBX_txn *txn, 
MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, unsigned flags) { - MDB_cursor mc; - MDB_xcursor mx; + MDBX_cursor mc; + MDBX_xcursor mx; if (unlikely(!key || !data || !txn)) return MDBX_EINVAL; @@ -8100,23 +8111,23 @@ int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; if (unlikely(flags & - ~(MDB_NOOVERWRITE | MDB_NODUPDATA | MDB_RESERVE | MDB_APPEND | - MDB_APPENDDUP | MDB_CURRENT))) + ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_RESERVE | + MDBX_APPEND | MDBX_APPENDDUP | MDBX_CURRENT))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) - return (txn->mt_flags & MDB_TXN_RDONLY) ? MDBX_EACCESS : MDB_BAD_TXN; + if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; mdbx_cursor_init(&mc, txn, dbi, &mx); mc.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &mc; - int rc = MDB_SUCCESS; + int rc = MDBX_SUCCESS; /* LY: support for update (explicit overwrite) */ - if (flags & MDB_CURRENT) { - rc = mdbx_cursor_get(&mc, key, NULL, MDB_SET); - if (likely(rc == MDB_SUCCESS) && - (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)) { + if (flags & MDBX_CURRENT) { + rc = mdbx_cursor_get(&mc, key, NULL, MDBX_SET); + if (likely(rc == MDBX_SUCCESS) && + (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT)) { /* LY: allows update (explicit overwrite) only for unique keys */ MDBX_node *leaf = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -8127,21 +8138,21 @@ int mdbx_put(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, } } - if (likely(rc == MDB_SUCCESS)) + if (likely(rc == MDBX_SUCCESS)) rc = mdbx_cursor_put(&mc, key, data, flags); txn->mt_cursors[dbi] = mc.mc_next; return rc; } -#ifndef MDB_WBUF -#define MDB_WBUF (1024 * 1024) +#ifndef MDBX_WBUF +#define MDBX_WBUF (1024 * 1024) #endif -#define MDB_EOF 0x10 /* mdbx_env_copyfd1() is done reading */ +#define MDBX_EOF 0x10 /* 
mdbx_env_copyfd1() is done reading */ /* State needed for a double-buffering compacting copy. */ typedef struct mdbx_copy { - MDB_env *mc_env; + MDBX_env *mc_env; MDBX_txn *mc_txn; mdbx_condmutex_t mc_condmutex; char *mc_wbuf[2]; @@ -8151,9 +8162,9 @@ typedef struct mdbx_copy { pgno_t mc_next_pgno; mdbx_filehandle_t mc_fd; int mc_toggle; /* Buffer number in provider */ - int mc_new; /* (0-2 buffers to write) | (MDB_EOF at end) */ + int mc_new; /* (0-2 buffers to write) | (MDBX_EOF at end) */ /* Error code. Never cleared if set. Both threads can set nonzero - * to fail the copy. Not mutex-protected, LMDB expects atomic int. */ + * to fail the copy. Not mutex-protected, MDBX expects atomic int. */ volatile int mc_error; } mdbx_copy; @@ -8167,14 +8178,14 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { while (!my->mc_error) { while (!my->mc_new) mdbx_condmutex_wait(&my->mc_condmutex); - if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ + if (my->mc_new == 0 + MDBX_EOF) /* 0 buffers, just EOF */ break; wsize = my->mc_wlen[toggle]; ptr = my->mc_wbuf[toggle]; again: if (wsize > 0 && !my->mc_error) { int rc = mdbx_write(my->mc_fd, ptr, wsize); - if (rc != MDB_SUCCESS) + if (rc != MDBX_SUCCESS) my->mc_error = rc; } @@ -8195,10 +8206,10 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { return (THREAD_RESULT)0; } -/* Give buffer and/or MDB_EOF to writer thread, await unused buffer. +/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. * * [in] my control structure. - * [in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). */ + * [in] adjust (1 to hand off 1 buffer) | (MDBX_EOF when ending). */ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { mdbx_condmutex_lock(&my->mc_condmutex); my->mc_new += adjust; @@ -8218,7 +8229,7 @@ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { * [in,out] pg database root. 
* [in] flags includes F_DUPDATA if it is a sorted-duplicate sub-DB. */ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { - MDB_cursor mc; + MDBX_cursor mc; MDBX_node *ni; MDBX_page *mo, *mp, *leaf; char *buf, *ptr; @@ -8227,7 +8238,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { /* Empty DB, nothing to do */ if (*pg == P_INVALID) - return MDB_SUCCESS; + return MDBX_SUCCESS; memset(&mc, 0, sizeof(mc)); mc.mc_snum = 1; @@ -8236,7 +8247,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { rc = mdbx_page_get(&mc, *pg, &mc.mc_pg[0], NULL); if (rc) return rc; - rc = mdbx_page_search_root(&mc, NULL, MDB_PS_FIRST); + rc = mdbx_page_search_root(&mc, NULL, MDBX_PS_FIRST); if (rc) return rc; @@ -8281,7 +8292,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { rc = mdbx_page_get(&mc, pgno, &omp, NULL); if (rc) goto done; - if (my->mc_wlen[toggle] >= MDB_WBUF) { + if (my->mc_wlen[toggle] >= MDBX_WBUF) { rc = mdbx_env_cthr_toggle(my, 1); if (rc) goto done; @@ -8301,7 +8312,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { toggle = my->mc_toggle; } } else if (ni->mn_flags & F_SUBDATA) { - MDB_db db; + MDBX_db db; /* Need writable leaf */ if (mp != leaf) { @@ -8344,7 +8355,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { continue; } } - if (my->mc_wlen[toggle] >= MDB_WBUF) { + if (my->mc_wlen[toggle] >= MDBX_WBUF) { rc = mdbx_env_cthr_toggle(my, 1); if (rc) goto done; @@ -8371,8 +8382,8 @@ done: } /* Copy environment with compaction. 
*/ -static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { - MDB_meta *mm; +static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { + MDBX_meta *mm; MDBX_page *mp; mdbx_copy my; MDBX_txn *txn = NULL; @@ -8383,13 +8394,13 @@ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { memset(&my, 0, sizeof(my)); if ((rc = mdbx_condmutex_init(&my.mc_condmutex)) != 0) return rc; - rc = mdbx_memalign_alloc(env->me_os_psize, MDB_WBUF * 2, + rc = mdbx_memalign_alloc(env->me_os_psize, MDBX_WBUF * 2, (void **)&my.mc_wbuf[0]); - if (rc != MDB_SUCCESS) + if (rc != MDBX_SUCCESS) goto done; - memset(my.mc_wbuf[0], 0, MDB_WBUF * 2); - my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; + memset(my.mc_wbuf[0], 0, MDBX_WBUF * 2); + my.mc_wbuf[1] = my.mc_wbuf[0] + MDBX_WBUF; my.mc_next_pgno = NUM_METAS; my.mc_env = env; my.mc_fd = fd; @@ -8397,7 +8408,7 @@ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { if (rc) goto done; - rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); if (rc) goto finish; @@ -8405,14 +8416,14 @@ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { memset(mp, 0, NUM_METAS * env->me_psize); mp->mp_pgno = 0; mp->mp_flags = P_META; - mm = (MDB_meta *)PAGEDATA(mp); + mm = (MDBX_meta *)PAGEDATA(mp); mdbx_meta_model(env, mm); mp = (MDBX_page *)(my.mc_wbuf[0] + env->me_psize); mp->mp_pgno = 1; mp->mp_flags = P_META; - *(MDB_meta *)PAGEDATA(mp) = *mm; - mm = (MDB_meta *)PAGEDATA(mp); + *(MDBX_meta *)PAGEDATA(mp) = *mm; + mm = (MDBX_meta *)PAGEDATA(mp); /* Set metapage 1 with current main DB */ root = new_root = txn->mt_dbs[MAIN_DBI].md_root; @@ -8420,12 +8431,12 @@ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { /* Count free pages + freeDB pages. Subtract from last_pg * to find the new last_pg, which also becomes the new root. 
*/ pgno_t freecount = 0; - MDB_cursor mc; + MDBX_cursor mc; MDBX_val key, data; mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); - while ((rc = mdbx_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) + while ((rc = mdbx_cursor_get(&mc, &key, &data, MDBX_NEXT)) == 0) freecount += *(pgno_t *)data.iov_base; - if (rc != MDB_NOTFOUND) + if (rc != MDBX_NOTFOUND) goto finish; freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + txn->mt_dbs[FREE_DBI].md_leaf_pages + @@ -8447,14 +8458,14 @@ static int __cold mdbx_env_compact(MDB_env *env, mdbx_filehandle_t fd) { my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_txn = txn; rc = mdbx_env_cwalk(&my, &root, 0); - if (rc == MDB_SUCCESS && root != new_root) { - rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ + if (rc == MDBX_SUCCESS && root != new_root) { + rc = MDBX_INCOMPATIBLE; /* page leak or corrupt DB */ } finish: if (rc) my.mc_error = rc; - mdbx_env_cthr_toggle(&my, 1 | MDB_EOF); + mdbx_env_cthr_toggle(&my, 1 | MDBX_EOF); rc = mdbx_thread_join(thr); mdbx_txn_abort(txn); @@ -8465,18 +8476,18 @@ done: } /* Copy environment as-is. */ -static int __cold mdbx_env_copy_asis(MDB_env *env, mdbx_filehandle_t fd) { +static int __cold mdbx_env_copy_asis(MDBX_env *env, mdbx_filehandle_t fd) { MDBX_txn *txn = NULL; int rc; /* Do the lock/unlock of the reader mutex before starting the * write txn. Otherwise other read txns could block writers. */ - rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); if (unlikely(rc)) return rc; /* We must start the actual read txn after blocking writers */ - rc = mdbx_txn_end(txn, MDB_END_RESET_TMP); + rc = mdbx_txn_end(txn, MDBX_END_RESET_TMP); if (unlikely(rc)) goto bailout; /* FIXME: or just return? 
*/ @@ -8485,7 +8496,7 @@ static int __cold mdbx_env_copy_asis(MDB_env *env, mdbx_filehandle_t fd) { if (unlikely(rc)) goto bailout; - rc = mdbx_txn_renew0(txn, MDB_RDONLY); + rc = mdbx_txn_renew0(txn, MDBX_RDONLY); if (rc) { mdbx_txn_unlock(env); goto bailout; @@ -8494,7 +8505,7 @@ static int __cold mdbx_env_copy_asis(MDB_env *env, mdbx_filehandle_t fd) { rc = mdbx_write(fd, env->me_map, env->me_psize * NUM_METAS); mdbx_txn_unlock(env); - if (rc == MDB_SUCCESS) + if (rc == MDBX_SUCCESS) rc = mdbx_ftruncate(fd, txn->mt_next_pgno * env->me_psize); bailout: @@ -8502,20 +8513,20 @@ bailout: return rc; } -int __cold mdbx_env_copy2fd(MDB_env *env, mdbx_filehandle_t fd, +int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, unsigned flags) { - if (flags & MDB_CP_COMPACT) + if (flags & MDBX_CP_COMPACT) return mdbx_env_compact(env, fd); else return mdbx_env_copy_asis(env, fd); } -int __cold mdbx_env_copy(MDB_env *env, const char *path, unsigned flags) { +int __cold mdbx_env_copy(MDBX_env *env, const char *path, unsigned flags) { int rc, len; char *lck_pathname; mdbx_filehandle_t newfd = INVALID_HANDLE_VALUE; - if (env->me_flags & MDB_NOSUBDIR) { + if (env->me_flags & MDBX_NOSUBDIR) { lck_pathname = (char *)path; } else { len = strlen(path); @@ -8530,7 +8541,7 @@ int __cold mdbx_env_copy(MDB_env *env, const char *path, unsigned flags) { * We don't want the OS to cache the writes, since the source data is * already in the OS cache. 
*/ rc = mdbx_openfile(lck_pathname, O_WRONLY | O_CREAT | O_EXCL, 0666, &newfd); - if (rc == MDB_SUCCESS) { + if (rc == MDBX_SUCCESS) { if (env->me_psize >= env->me_os_psize) { #ifdef F_NOCACHE /* __APPLE__ */ (void)fcntl(newfd, F_NOCACHE, 1); @@ -8543,19 +8554,19 @@ int __cold mdbx_env_copy(MDB_env *env, const char *path, unsigned flags) { rc = mdbx_env_copy2fd(env, newfd, flags); } - if (!(env->me_flags & MDB_NOSUBDIR)) + if (!(env->me_flags & MDBX_NOSUBDIR)) free(lck_pathname); if (newfd != INVALID_HANDLE_VALUE) { int err = mdbx_closefile(newfd); - if (rc == MDB_SUCCESS && err != rc) + if (rc == MDBX_SUCCESS && err != rc) rc = err; } return rc; } -int __cold mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff) { +int __cold mdbx_env_set_flags(MDBX_env *env, unsigned flags, int onoff) { if (unlikely(flags & ~CHANGEABLE)) return MDBX_EINVAL; @@ -8569,73 +8580,73 @@ int __cold mdbx_env_set_flags(MDB_env *env, unsigned flags, int onoff) { env->me_flags &= ~flags; mdbx_txn_unlock(env); - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int __cold mdbx_env_get_flags(MDB_env *env, unsigned *arg) { +int __cold mdbx_env_get_flags(MDBX_env *env, unsigned *arg) { if (unlikely(!env || !arg)) return MDBX_EINVAL; *arg = env->me_flags & (CHANGEABLE | CHANGELESS); - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int __cold mdbx_env_set_userctx(MDB_env *env, void *ctx) { +int __cold mdbx_env_set_userctx(MDBX_env *env, void *ctx) { if (unlikely(!env)) return MDBX_EINVAL; env->me_userctx = ctx; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -void *__cold mdbx_env_get_userctx(MDB_env *env) { +void *__cold mdbx_env_get_userctx(MDBX_env *env) { return env ? 
env->me_userctx : NULL; } -int __cold mdbx_env_set_assert(MDB_env *env, MDB_assert_func *func) { +int __cold mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { if (unlikely(!env)) return MDBX_EINVAL; -#if MDB_DEBUG +#if MDBX_DEBUG env->me_assert_func = func; - return MDB_SUCCESS; + return MDBX_SUCCESS; #else (void)func; return MDBX_ENOSYS; #endif } -int __cold mdbx_env_get_path(MDB_env *env, const char **arg) { +int __cold mdbx_env_get_path(MDBX_env *env, const char **arg) { if (unlikely(!env || !arg)) return MDBX_EINVAL; *arg = env->me_path; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int __cold mdbx_env_get_fd(MDB_env *env, mdbx_filehandle_t *arg) { +int __cold mdbx_env_get_fd(MDBX_env *env, mdbx_filehandle_t *arg) { if (unlikely(!env || !arg)) return MDBX_EINVAL; *arg = env->me_fd; - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Common code for mdbx_dbi_stat() and mdbx_env_stat(). * [in] env the environment to operate in. - * [in] db the MDB_db record containing the stats to return. - * [out] arg the address of an MDB_stat structure to receive the stats. + * [in] db the MDBX_db record containing the stats to return. + * [out] arg the address of an MDBX_stat structure to receive the stats. * Returns 0, this function always succeeds. 
*/ -static int __cold mdbx_stat0(MDB_env *env, MDB_db *db, MDBX_stat *arg) { +static int __cold mdbx_stat0(MDBX_env *env, MDBX_db *db, MDBX_stat *arg) { arg->ms_psize = env->me_psize; arg->ms_depth = db->md_depth; arg->ms_branch_pages = db->md_branch_pages; arg->ms_leaf_pages = db->md_leaf_pages; arg->ms_overflow_pages = db->md_overflow_pages; arg->ms_entries = db->md_entries; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int __cold mdbx_env_stat(MDB_env *env, MDBX_stat *arg, size_t bytes) { - MDB_meta *meta; +int __cold mdbx_env_stat(MDBX_env *env, MDBX_stat *arg, size_t bytes) { + MDBX_meta *meta; if (unlikely(env == NULL || arg == NULL)) return MDBX_EINVAL; @@ -8646,8 +8657,8 @@ int __cold mdbx_env_stat(MDB_env *env, MDBX_stat *arg, size_t bytes) { return mdbx_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); } -int __cold mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) { - MDB_meta *meta; +int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { + MDBX_meta *meta; if (unlikely(env == NULL || arg == NULL)) return MDBX_EINVAL; @@ -8655,7 +8666,7 @@ int __cold mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) { if (bytes != sizeof(MDBX_envinfo)) return MDBX_EINVAL; - MDB_meta *m1, *m2; + MDBX_meta *m1, *m2; MDBX_reader *r; unsigned i; @@ -8689,32 +8700,32 @@ int __cold mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) { } } - return MDB_SUCCESS; + return MDBX_SUCCESS; } -static MDB_cmp_func *mdbx_default_keycmp(unsigned flags) { - return (flags & MDB_REVERSEKEY) ? mdbx_cmp_memnr : (flags & MDB_INTEGERKEY) - ? mdbx_cmp_int_a2 - : mdbx_cmp_memn; +static MDBX_cmp_func *mdbx_default_keycmp(unsigned flags) { + return (flags & MDBX_REVERSEKEY) ? mdbx_cmp_memnr : (flags & MDBX_INTEGERKEY) + ? mdbx_cmp_int_a2 + : mdbx_cmp_memn; } -static MDB_cmp_func *mdbx_default_datacmp(unsigned flags) { - return !(flags & MDB_DUPSORT) +static MDBX_cmp_func *mdbx_default_datacmp(unsigned flags) { + return !(flags & MDBX_DUPSORT) ? 
0 - : ((flags & MDB_INTEGERDUP) + : ((flags & MDBX_INTEGERDUP) ? mdbx_cmp_int_ua - : ((flags & MDB_REVERSEDUP) ? mdbx_cmp_memnr - : mdbx_cmp_memn)); + : ((flags & MDBX_REVERSEDUP) ? mdbx_cmp_memnr + : mdbx_cmp_memn)); } -static int mdbx_dbi_bind(MDBX_txn *txn, const MDB_dbi dbi, unsigned user_flags, - MDB_cmp_func *keycmp, MDB_cmp_func *datacmp) { +static int mdbx_dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { /* LY: so, accepting only three cases for the table's flags: * 1) user_flags and both comparators are zero * = assume that a by-default mode/flags is requested for reading; * 2) user_flags exactly the same * = assume that the target mode/flags are requested properly; - * 3) user_flags differs, but table is empty and MDB_CREATE is provided + * 3) user_flags differs, but table is empty and MDBX_CREATE is provided * = assume that a properly create request with custom flags; */ if ((user_flags ^ txn->mt_dbs[dbi].md_flags) & PERSISTENT_FLAGS) { @@ -8724,45 +8735,45 @@ static int mdbx_dbi_bind(MDBX_txn *txn, const MDB_dbi dbi, unsigned user_flags, /* no comparators were provided and flags are zero, * seems that is case #1 above */ user_flags = txn->mt_dbs[dbi].md_flags; - } else if ((user_flags & MDB_CREATE) && txn->mt_dbs[dbi].md_entries == 0) { - if (txn->mt_flags & MDB_TXN_RDONLY) + } else if ((user_flags & MDBX_CREATE) && txn->mt_dbs[dbi].md_entries == 0) { + if (txn->mt_flags & MDBX_TXN_RDONLY) return /* FIXME: return extended info */ MDBX_EACCESS; /* make sure flags changes get committed */ txn->mt_dbs[dbi].md_flags = user_flags & PERSISTENT_FLAGS; - txn->mt_flags |= MDB_TXN_DIRTY; + txn->mt_flags |= MDBX_TXN_DIRTY; } else { - return /* FIXME: return extended info */ MDB_INCOMPATIBLE; + return /* FIXME: return extended info */ MDBX_INCOMPATIBLE; } } - if (!txn->mt_dbxs[dbi].md_cmp || MDB_DEBUG) { + if (!txn->mt_dbxs[dbi].md_cmp || MDBX_DEBUG) { if (!keycmp) keycmp = 
mdbx_default_keycmp(user_flags); assert(!txn->mt_dbxs[dbi].md_cmp || txn->mt_dbxs[dbi].md_cmp == keycmp); txn->mt_dbxs[dbi].md_cmp = keycmp; } - if (!txn->mt_dbxs[dbi].md_dcmp || MDB_DEBUG) { + if (!txn->mt_dbxs[dbi].md_dcmp || MDBX_DEBUG) { if (!datacmp) datacmp = mdbx_default_datacmp(user_flags); assert(!txn->mt_dbxs[dbi].md_dcmp || txn->mt_dbxs[dbi].md_dcmp == datacmp); txn->mt_dbxs[dbi].md_dcmp = datacmp; } - return MDB_SUCCESS; + return MDBX_SUCCESS; } int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, - MDB_dbi *dbi, MDB_cmp_func *keycmp, - MDB_cmp_func *datacmp) { + MDBX_dbi *dbi, MDBX_cmp_func *keycmp, + MDBX_cmp_func *datacmp) { if (unlikely(!txn || !dbi || (user_flags & ~VALID_FLAGS) != 0)) return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; + if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; /* main table? */ if (!table_name) { @@ -8779,7 +8790,7 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, /* Is the DB already open? */ size_t len = strlen(table_name); - MDB_dbi scan, slot = txn->mt_numdbs; + MDBX_dbi scan, slot = txn->mt_numdbs; for (scan = txn->mt_numdbs; --scan >= CORE_DBS;) { if (!txn->mt_dbxs[scan].md_name.iov_len) { /* Remember this free slot */ @@ -8795,28 +8806,29 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, /* Fail, if no free slot and max hit */ if (unlikely(slot >= txn->mt_env->me_maxdbs)) - return MDB_DBS_FULL; + return MDBX_DBS_FULL; /* Cannot mix named table with some main-table flags */ - if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT | MDB_INTEGERKEY))) - return (user_flags & MDB_CREATE) ? MDB_INCOMPATIBLE : MDB_NOTFOUND; + if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & + (MDBX_DUPSORT | MDBX_INTEGERKEY))) + return (user_flags & MDBX_CREATE) ? 
MDBX_INCOMPATIBLE : MDBX_NOTFOUND; /* Find the DB info */ int exact = 0; MDBX_val key, data; key.iov_len = len; key.iov_base = (void *)table_name; - MDB_cursor mc; + MDBX_cursor mc; mdbx_cursor_init(&mc, txn, MAIN_DBI, NULL); - int rc = mdbx_cursor_set(&mc, &key, &data, MDB_SET, &exact); - if (unlikely(rc != MDB_SUCCESS)) { - if (rc != MDB_NOTFOUND || !(user_flags & MDB_CREATE)) + int rc = mdbx_cursor_set(&mc, &key, &data, MDBX_SET, &exact); + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE)) return rc; } else { /* make sure this is actually a table */ MDBX_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if (unlikely((node->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) - return MDB_INCOMPATIBLE; + return MDBX_INCOMPATIBLE; } /* Done here so we cannot fail after creating a new DB */ @@ -8828,18 +8840,19 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, unsigned dbflag = DB_NEW | DB_VALID | DB_USRVALID; if (unlikely(rc)) { - /* MDB_NOTFOUND and MDB_CREATE: Create new DB */ - assert(rc == MDB_NOTFOUND); - MDB_db db_dummy; + /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ + assert(rc == MDBX_NOTFOUND); + MDBX_db db_dummy; memset(&db_dummy, 0, sizeof(db_dummy)); db_dummy.md_root = P_INVALID; db_dummy.md_flags = user_flags & PERSISTENT_FLAGS; data.iov_len = sizeof(db_dummy); data.iov_base = &db_dummy; - WITH_CURSOR_TRACKING(mc, rc = mdbx_cursor_put(&mc, &key, &data, - F_SUBDATA | MDB_NOOVERWRITE)); + WITH_CURSOR_TRACKING( + mc, + rc = mdbx_cursor_put(&mc, &key, &data, F_SUBDATA | MDBX_NOOVERWRITE)); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; dbflag |= DB_DIRTY; @@ -8853,9 +8866,9 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, txn->mt_dbflags[slot] = dbflag; txn->mt_dbiseqs[slot] = (txn->mt_env->me_dbiseqs[slot] += 1); - txn->mt_dbs[slot] = *(MDB_db *)data.iov_base; + txn->mt_dbs[slot] = 
*(MDBX_db *)data.iov_base; rc = mdbx_dbi_bind(txn, slot, user_flags, keycmp, datacmp); - if (unlikely(rc != MDB_SUCCESS)) { + if (unlikely(rc != MDBX_SUCCESS)) { assert((dbflag & DB_DIRTY) == 0); /* cleanup slot */ txn->mt_dbxs[slot].md_name.iov_base = NULL; @@ -8874,11 +8887,11 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, } int mdbx_dbi_open(MDBX_txn *txn, const char *table_name, unsigned table_flags, - MDB_dbi *dbi) { + MDBX_dbi *dbi) { return mdbx_dbi_open_ex(txn, table_name, table_flags, dbi, nullptr, nullptr); } -int __cold mdbx_dbi_stat(MDBX_txn *txn, MDB_dbi dbi, MDBX_stat *arg, +int __cold mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *arg, size_t bytes) { if (unlikely(!arg || !txn)) return MDBX_EINVAL; @@ -8892,19 +8905,19 @@ int __cold mdbx_dbi_stat(MDBX_txn *txn, MDB_dbi dbi, MDBX_stat *arg, if (unlikely(bytes != sizeof(MDBX_stat))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; + if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; if (unlikely(txn->mt_dbflags[dbi] & DB_STALE)) { - MDB_cursor mc; - MDB_xcursor mx; + MDBX_cursor mc; + MDBX_xcursor mx; /* Stale, must read the DB's root. cursor_init does it for us. 
*/ mdbx_cursor_init(&mc, txn, dbi, &mx); } return mdbx_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); } -int mdbx_dbi_close(MDB_env *env, MDB_dbi dbi) { +int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { char *ptr; if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) return MDBX_EINVAL; @@ -8913,17 +8926,17 @@ int mdbx_dbi_close(MDB_env *env, MDB_dbi dbi) { ptr = env->me_dbxs[dbi].md_name.iov_base; /* If there was no name, this was already closed */ if (unlikely(!ptr)) - return MDB_BAD_DBI; + return MDBX_BAD_DBI; env->me_dbxs[dbi].md_name.iov_base = NULL; env->me_dbxs[dbi].md_name.iov_len = 0; env->me_dbflags[dbi] = 0; env->me_dbiseqs[dbi]++; free(ptr); - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int mdbx_dbi_flags(MDBX_txn *txn, MDB_dbi dbi, unsigned *flags) { +int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { if (unlikely(!txn || !flags)) return MDBX_EINVAL; @@ -8934,21 +8947,21 @@ int mdbx_dbi_flags(MDBX_txn *txn, MDB_dbi dbi, unsigned *flags) { return MDBX_EINVAL; *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Add all the DB's pages to the free list. * [in] mc Cursor on the DB to free. * [in] subs non-Zero to check for sub-DBs in this DB. * Returns 0 on success, non-zero on failure. */ -static int mdbx_drop0(MDB_cursor *mc, int subs) { +static int mdbx_drop0(MDBX_cursor *mc, int subs) { int rc; - rc = mdbx_page_search(mc, NULL, MDB_PS_FIRST); - if (likely(rc == MDB_SUCCESS)) { + rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); + if (likely(rc == MDBX_SUCCESS)) { MDBX_txn *txn = mc->mc_txn; MDBX_node *ni; - MDB_cursor mx; + MDBX_cursor mx; unsigned i; /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. 
@@ -9004,7 +9017,7 @@ static int mdbx_drop0(MDB_cursor *mc, int subs) { mc->mc_ki[mc->mc_top] = i; rc = mdbx_cursor_sibling(mc, 1); if (rc) { - if (unlikely(rc != MDB_NOTFOUND)) + if (unlikely(rc != MDBX_NOTFOUND)) goto done; /* no more siblings, go back to beginning * of previous level. */ @@ -9021,16 +9034,16 @@ static int mdbx_drop0(MDB_cursor *mc, int subs) { rc = mdbx_midl_append(&txn->mt_free_pages, mc->mc_db->md_root); done: if (unlikely(rc)) - txn->mt_flags |= MDB_TXN_ERROR; - } else if (rc == MDB_NOTFOUND) { - rc = MDB_SUCCESS; + txn->mt_flags |= MDBX_TXN_ERROR; + } else if (rc == MDBX_NOTFOUND) { + rc = MDBX_SUCCESS; } mc->mc_flags &= ~C_INITIALIZED; return rc; } -int mdbx_drop(MDBX_txn *txn, MDB_dbi dbi, int del) { - MDB_cursor *mc, *m2; +int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) { + MDBX_cursor *mc, *m2; int rc; if (unlikely(1 < (unsigned)del || !txn)) @@ -9043,9 +9056,9 @@ int mdbx_drop(MDBX_txn *txn, MDB_dbi dbi, int del) { return MDBX_EINVAL; if (unlikely(TXN_DBI_CHANGED(txn, dbi))) - return MDB_BAD_DBI; + return MDBX_BAD_DBI; - if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) + if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) return MDBX_EACCESS; rc = mdbx_cursor_open(txn, dbi, &mc); @@ -9054,7 +9067,7 @@ int mdbx_drop(MDBX_txn *txn, MDB_dbi dbi, int del) { /* FIXME: locking to avoid races ? 
*/ - rc = mdbx_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); + rc = mdbx_drop0(mc, mc->mc_db->md_flags & MDBX_DUPSORT); /* Invalidate the dropped DB's cursors */ for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) m2->mc_flags &= ~(C_INITIALIZED | C_EOF); @@ -9068,7 +9081,7 @@ int mdbx_drop(MDBX_txn *txn, MDB_dbi dbi, int del) { txn->mt_dbflags[dbi] = DB_STALE; mdbx_dbi_close(txn->mt_env, dbi); } else { - txn->mt_flags |= MDB_TXN_ERROR; + txn->mt_flags |= MDBX_TXN_ERROR; } } else { /* reset the DB record, mark it dirty */ @@ -9081,14 +9094,14 @@ int mdbx_drop(MDBX_txn *txn, MDB_dbi dbi, int del) { txn->mt_dbs[dbi].md_root = P_INVALID; txn->mt_dbs[dbi].md_seq = 0; - txn->mt_flags |= MDB_TXN_DIRTY; + txn->mt_flags |= MDBX_TXN_DIRTY; } leave: mdbx_cursor_close(mc); return rc; } -int mdbx_set_compare(MDBX_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { +int mdbx_set_compare(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { if (unlikely(!txn)) return MDBX_EINVAL; @@ -9099,10 +9112,10 @@ int mdbx_set_compare(MDBX_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { return MDBX_EINVAL; txn->mt_dbxs[dbi].md_cmp = cmp; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int mdbx_set_dupsort(MDBX_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { +int mdbx_set_dupsort(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { if (unlikely(!txn)) return MDBX_EINVAL; @@ -9113,10 +9126,10 @@ int mdbx_set_dupsort(MDBX_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp) { return MDBX_EINVAL; txn->mt_dbxs[dbi].md_dcmp = cmp; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int __cold mdbx_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) { +int __cold mdbx_reader_list(MDBX_env *env, MDBX_msg_func *func, void *ctx) { char buf[64]; int rc = 0, first = 1; @@ -9190,7 +9203,7 @@ static int __cold mdbx_pid_insert(mdbx_pid_t *ids, mdbx_pid_t pid) { return 0; } -int __cold mdbx_reader_check(MDB_env *env, int *dead) { +int __cold mdbx_reader_check(MDBX_env *env, int *dead) { if (unlikely(!env || env->me_signature != 
MDBX_ME_SIGNATURE)) return MDBX_EINVAL; if (dead) @@ -9198,12 +9211,12 @@ int __cold mdbx_reader_check(MDB_env *env, int *dead) { return mdbx_reader_check0(env, 0, dead); } -int __cold mdbx_reader_check0(MDB_env *env, int rdt_locked, int *dead) { +int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { assert(rdt_locked >= 0); if (unlikely(env->me_pid != mdbx_getpid())) { - env->me_flags |= MDB_FATAL_ERROR; - return MDB_PANIC; + env->me_flags |= MDBX_FATAL_ERROR; + return MDBX_PANIC; } unsigned snap_nreaders = env->me_lck->mti_numreaders; @@ -9274,7 +9287,7 @@ int __cold mdbx_reader_check0(MDB_env *env, int rdt_locked, int *dead) { return rc; } -static unsigned __hot mdbx_midl_search(MDB_IDL ids, pgno_t id) { +static unsigned __hot mdbx_midl_search(MDBX_IDL ids, pgno_t id) { /* binary search of id in ids * if found, returns position of id * if not found, returns first position greater than id */ @@ -9304,8 +9317,8 @@ static unsigned __hot mdbx_midl_search(MDB_IDL ids, pgno_t id) { return cursor; } -static MDB_IDL mdbx_midl_alloc(int num) { - MDB_IDL ids = malloc((num + 2) * sizeof(pgno_t)); +static MDBX_IDL mdbx_midl_alloc(int num) { + MDBX_IDL ids = malloc((num + 2) * sizeof(pgno_t)); if (likely(ids)) { *ids++ = num; *ids = 0; @@ -9313,25 +9326,25 @@ static MDB_IDL mdbx_midl_alloc(int num) { return ids; } -static void mdbx_midl_free(MDB_IDL ids) { +static void mdbx_midl_free(MDBX_IDL ids) { if (ids) free(ids - 1); } -static void mdbx_midl_shrink(MDB_IDL *idp) { - MDB_IDL ids = *idp - 1; - if (unlikely(*ids > MDB_IDL_UM_MAX)) { - /* shrink to MDB_IDL_UM_MAX */ - ids = realloc(ids, (MDB_IDL_UM_MAX + 2) * sizeof(pgno_t)); +static void mdbx_midl_shrink(MDBX_IDL *idp) { + MDBX_IDL ids = *idp - 1; + if (unlikely(*ids > MDBX_IDL_UM_MAX)) { + /* shrink to MDBX_IDL_UM_MAX */ + ids = realloc(ids, (MDBX_IDL_UM_MAX + 2) * sizeof(pgno_t)); if (likely(ids)) { - *ids++ = MDB_IDL_UM_MAX; + *ids++ = MDBX_IDL_UM_MAX; *idp = ids; } } } -static int 
mdbx_midl_grow(MDB_IDL *idp, int num) { - MDB_IDL idn = *idp - 1; +static int mdbx_midl_grow(MDBX_IDL *idp, int num) { + MDBX_IDL idn = *idp - 1; /* grow it */ idn = realloc(idn, (*idn + num + 2) * sizeof(pgno_t)); if (unlikely(!idn)) @@ -9341,8 +9354,8 @@ static int mdbx_midl_grow(MDB_IDL *idp, int num) { return 0; } -static int mdbx_midl_need(MDB_IDL *idp, unsigned num) { - MDB_IDL ids = *idp; +static int mdbx_midl_need(MDBX_IDL *idp, unsigned num) { + MDBX_IDL ids = *idp; num += ids[0]; if (num > ids[-1]) { num = (num + num / 4 + (256 + 2)) & -256; @@ -9355,11 +9368,11 @@ static int mdbx_midl_need(MDB_IDL *idp, unsigned num) { return 0; } -static int mdbx_midl_append(MDB_IDL *idp, pgno_t id) { - MDB_IDL ids = *idp; +static int mdbx_midl_append(MDBX_IDL *idp, pgno_t id) { + MDBX_IDL ids = *idp; /* Too big? */ if (ids[0] >= ids[-1]) { - if (mdbx_midl_grow(idp, MDB_IDL_UM_MAX)) + if (mdbx_midl_grow(idp, MDBX_IDL_UM_MAX)) return MDBX_ENOMEM; ids = *idp; } @@ -9368,8 +9381,8 @@ static int mdbx_midl_append(MDB_IDL *idp, pgno_t id) { return 0; } -static int mdbx_midl_append_list(MDB_IDL *idp, MDB_IDL app) { - MDB_IDL ids = *idp; +static int mdbx_midl_append_list(MDBX_IDL *idp, MDBX_IDL app) { + MDBX_IDL ids = *idp; /* Too big? */ if (ids[0] + app[0] >= ids[-1]) { if (mdbx_midl_grow(idp, app[0])) @@ -9381,11 +9394,11 @@ static int mdbx_midl_append_list(MDB_IDL *idp, MDB_IDL app) { return 0; } -static int mdbx_midl_append_range(MDB_IDL *idp, pgno_t id, unsigned n) { +static int mdbx_midl_append_range(MDBX_IDL *idp, pgno_t id, unsigned n) { pgno_t *ids = *idp, len = ids[0]; /* Too big? 
*/ if (len + n > ids[-1]) { - if (mdbx_midl_grow(idp, n | MDB_IDL_UM_MAX)) + if (mdbx_midl_grow(idp, n | MDBX_IDL_UM_MAX)) return MDBX_ENOMEM; ids = *idp; } @@ -9396,7 +9409,7 @@ static int mdbx_midl_append_range(MDB_IDL *idp, pgno_t id, unsigned n) { return 0; } -static void __hot mdbx_midl_xmerge(MDB_IDL idl, MDB_IDL merge) { +static void __hot mdbx_midl_xmerge(MDBX_IDL idl, MDBX_IDL merge) { pgno_t old_id, merge_id, i = merge[0], j = idl[0], k = i + j, total = k; idl[0] = ~(pgno_t)0; /* delimiter for idl scan below */ old_id = idl[j]; @@ -9418,7 +9431,7 @@ static void __hot mdbx_midl_xmerge(MDB_IDL idl, MDB_IDL merge) { (b) = tmp_pgno; \ } while (0) -static void __hot mdbx_midl_sort(MDB_IDL ids) { +static void __hot mdbx_midl_sort(MDBX_IDL ids) { /* Max possible depth of int-indexed tree * 2 items/level */ int istack[sizeof(int) * CHAR_BIT * 2]; int i, j, k, l, ir, jstack; @@ -9484,7 +9497,7 @@ static void __hot mdbx_midl_sort(MDB_IDL ids) { } } -static unsigned __hot mdbx_mid2l_search(MDB_ID2L ids, pgno_t id) { +static unsigned __hot mdbx_mid2l_search(MDBX_ID2L ids, pgno_t id) { /* binary search of id in ids * if found, returns position of id * if not found, returns first position greater than id */ @@ -9514,7 +9527,7 @@ static unsigned __hot mdbx_mid2l_search(MDB_ID2L ids, pgno_t id) { return cursor; } -static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id) { +static int mdbx_mid2l_insert(MDBX_ID2L ids, MDBX_ID2 *id) { unsigned x = mdbx_mid2l_search(ids, id->mid); if (unlikely(x < 1)) return /* internal error */ -2; @@ -9522,7 +9535,7 @@ static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id) { if (x <= ids[0].mid && ids[x].mid == id->mid) return /* duplicate */ -1; - if (unlikely(ids[0].mid >= MDB_IDL_UM_MAX)) + if (unlikely(ids[0].mid >= MDBX_IDL_UM_MAX)) return /* too big */ -2; /* insert id */ @@ -9533,9 +9546,9 @@ static int mdbx_mid2l_insert(MDB_ID2L ids, MDB_ID2 *id) { return 0; } -static int mdbx_mid2l_append(MDB_ID2L ids, MDB_ID2 *id) { +static int 
mdbx_mid2l_append(MDBX_ID2L ids, MDBX_ID2 *id) { /* Too big? */ - if (unlikely(ids[0].mid >= MDB_IDL_UM_MAX)) + if (unlikely(ids[0].mid >= MDBX_IDL_UM_MAX)) return -2; ids[0].mid++; @@ -9550,14 +9563,14 @@ int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn) { if (logger != (MDBX_debug_func *)MDBX_DBG_DNT) mdbx_debug_logger = logger; if (edge_txn != (long)MDBX_DBG_DNT) { -#if MDB_DEBUG +#if MDBX_DEBUG mdbx_debug_edge = edge_txn; #endif } return ret; } -static txnid_t __cold mdbx_oomkick(MDB_env *env, txnid_t oldest) { +static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { mdbx_debug("DB size maxed out"); int retry; @@ -9612,7 +9625,7 @@ static txnid_t __cold mdbx_oomkick(MDB_env *env, txnid_t oldest) { return mdbx_find_oldest(env, NULL); } -int __cold mdbx_env_set_syncbytes(MDB_env *env, size_t bytes) { +int __cold mdbx_env_set_syncbytes(MDBX_env *env, size_t bytes) { if (unlikely(!env)) return MDBX_EINVAL; @@ -9620,10 +9633,10 @@ int __cold mdbx_env_set_syncbytes(MDB_env *env, size_t bytes) { return MDBX_EBADSIGN; env->me_sync_threshold = bytes; - return env->me_map ? mdbx_env_sync(env, 0) : MDB_SUCCESS; + return env->me_map ? mdbx_env_sync(env, 0) : MDBX_SUCCESS; } -int __cold mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oomfunc) { +int __cold mdbx_env_set_oomfunc(MDBX_env *env, MDBX_oom_func *oomfunc) { if (unlikely(!env)) return MDBX_EINVAL; @@ -9631,10 +9644,10 @@ int __cold mdbx_env_set_oomfunc(MDB_env *env, MDBX_oom_func *oomfunc) { return MDBX_EBADSIGN; env->me_oom_func = oomfunc; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -MDBX_oom_func *__cold mdbx_env_get_oomfunc(MDB_env *env) { +MDBX_oom_func *__cold mdbx_env_get_oomfunc(MDBX_env *env) { return likely(env && env->me_signature == MDBX_ME_SIGNATURE) ? 
env->me_oom_func : NULL; @@ -9655,8 +9668,8 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) if (unlikely(!txn->mt_ro_reader)) return -1; - MDB_env *env = txn->mt_env; - MDB_meta *meta = mdbx_meta_head(env); + MDBX_env *env = txn->mt_env; + MDBX_meta *meta = mdbx_meta_head(env); if (percent) { size_t maxpg = env->me_maxpg; size_t last = meta->mm_last_pg + 1; @@ -9683,9 +9696,9 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, const char *type; if (pg == P_INVALID) - return MDB_SUCCESS; /* empty db */ + return MDBX_SUCCESS; /* empty db */ - MDB_cursor mc; + MDBX_cursor mc; memset(&mc, 0, sizeof(mc)); mc.mc_snum = 1; mc.mc_txn = ctx->mw_txn; @@ -9694,7 +9707,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, if (rc) return rc; if (pg != mp->mp_pgno) - return MDB_CORRUPTED; + return MDBX_CORRUPTED; nkeys = NUMKEYS(mp); header_size = IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower; @@ -9708,7 +9721,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, case P_BRANCH: type = "branch"; if (nkeys < 1) - return MDB_CORRUPTED; + return MDBX_CORRUPTED; break; case P_LEAF: type = "leaf"; @@ -9725,7 +9738,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, case P_META: case P_OVERFLOW: default: - return MDB_CORRUPTED; + return MDBX_CORRUPTED; } for (align_bytes = i = 0; i < nkeys; @@ -9760,12 +9773,12 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, if (rc) return rc; if (*opg != omp->mp_pgno) - return MDB_CORRUPTED; + return MDBX_CORRUPTED; /* LY: Don't use mask here, e.g bitwise * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). * Pages should not me marked dirty/loose or otherwise. 
*/ if (P_OVERFLOW != omp->mp_flags) - return MDB_CORRUPTED; + return MDBX_CORRUPTED; over_header = PAGEHDRSZ; over_payload = NODEDSZ(node); @@ -9782,7 +9795,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, payload_size += NODEDSZ(node); if (node->mn_flags & F_SUBDATA) { - MDB_db *db = NODEDATA(node); + MDBX_db *db = NODEDATA(node); char *name = NULL; if (!(node->mn_flags & F_DUPDATA)) { @@ -9805,7 +9818,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, void *user) { if (unlikely(!txn)) - return MDB_BAD_TXN; + return MDBX_BAD_TXN; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; @@ -9814,9 +9827,9 @@ int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, ctx.mw_user = user; ctx.mw_visitor = visitor; - int rc = visitor(0, 2, user, "lmdb", "meta", 2, sizeof(MDB_meta) * 2, + int rc = visitor(0, 2, user, "mdbx", "meta", 2, sizeof(MDBX_meta) * 2, PAGEHDRSZ * 2, - (txn->mt_env->me_psize - sizeof(MDB_meta) - PAGEHDRSZ) * 2); + (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * 2); if (!rc) rc = mdbx_env_walk(&ctx, "free", txn->mt_dbs[FREE_DBI].md_root, 0); if (!rc) @@ -9833,29 +9846,29 @@ int mdbx_canary_put(MDBX_txn *txn, const mdbx_canary *canary) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; + if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; - if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) + if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) return MDBX_EACCESS; if (likely(canary)) { if (txn->mt_canary.x == canary->x && txn->mt_canary.y == canary->y && txn->mt_canary.z == canary->z) - return MDB_SUCCESS; + return MDBX_SUCCESS; txn->mt_canary.x = canary->x; txn->mt_canary.y = canary->y; txn->mt_canary.z = canary->z; } txn->mt_canary.v = txn->mt_txnid; - if 
((txn->mt_flags & MDB_TXN_DIRTY) == 0) { - MDB_env *env = txn->mt_env; - txn->mt_flags |= MDB_TXN_DIRTY; + if ((txn->mt_flags & MDBX_TXN_DIRTY) == 0) { + MDBX_env *env = txn->mt_env; + txn->mt_flags |= MDBX_TXN_DIRTY; env->me_sync_pending += env->me_psize; } - return MDB_SUCCESS; + return MDBX_SUCCESS; } int mdbx_canary_get(MDBX_txn *txn, mdbx_canary *canary) { @@ -9865,10 +9878,10 @@ int mdbx_canary_get(MDBX_txn *txn, mdbx_canary *canary) { return MDBX_EBADSIGN; *canary = txn->mt_canary; - return MDB_SUCCESS; + return MDBX_SUCCESS; } -int mdbx_cursor_on_first(MDB_cursor *mc) { +int mdbx_cursor_on_first(MDBX_cursor *mc) { if (unlikely(mc == NULL)) return MDBX_EINVAL; @@ -9886,7 +9899,7 @@ int mdbx_cursor_on_first(MDB_cursor *mc) { return MDBX_RESULT_TRUE; } -int mdbx_cursor_on_last(MDB_cursor *mc) { +int mdbx_cursor_on_last(MDBX_cursor *mc) { if (unlikely(mc == NULL)) return MDBX_EINVAL; @@ -9905,7 +9918,7 @@ int mdbx_cursor_on_last(MDB_cursor *mc) { return MDBX_RESULT_TRUE; } -int mdbx_cursor_eof(MDB_cursor *mc) { +int mdbx_cursor_eof(MDBX_cursor *mc) { if (unlikely(mc == NULL)) return MDBX_EINVAL; @@ -9944,7 +9957,7 @@ static int mdbx_is_samedata(const MDBX_val *a, const MDBX_val *b) { * Для не-уникальных ключей также возможен второй сценарий использования, * когда посредством old_data из записей с одинаковым ключом для * удаления/обновления выбирается конкретная. Для выбора этого сценария - * во flags следует одновременно указать MDB_CURRENT и MDB_NOOVERWRITE. + * во flags следует одновременно указать MDBX_CURRENT и MDBX_NOOVERWRITE. * Именно эта комбинация выбрана, так как она лишена смысла, и этим позволяет * идентифицировать запрос такого сценария. * @@ -9953,7 +9966,7 @@ static int mdbx_is_samedata(const MDBX_val *a, const MDBX_val *b) { * - внешняя аллокация курсоров, в том числе на стеке (без malloc). * - получения статуса страницы по адресу (знать о P_DIRTY). 
*/ -int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *new_data, +int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *new_data, MDBX_val *old_data, unsigned flags) { if (unlikely(!key || !old_data || !txn || old_data == new_data)) return MDBX_EINVAL; @@ -9964,40 +9977,40 @@ int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *new_data, if (unlikely(old_data->iov_base == NULL && old_data->iov_len)) return MDBX_EINVAL; - if (unlikely(new_data == NULL && !(flags & MDB_CURRENT))) + if (unlikely(new_data == NULL && !(flags & MDBX_CURRENT))) return MDBX_EINVAL; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; if (unlikely(flags & - ~(MDB_NOOVERWRITE | MDB_NODUPDATA | MDB_RESERVE | MDB_APPEND | - MDB_APPENDDUP | MDB_CURRENT))) + ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_RESERVE | + MDBX_APPEND | MDBX_APPENDDUP | MDBX_CURRENT))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & (MDB_TXN_RDONLY | MDB_TXN_BLOCKED))) - return (txn->mt_flags & MDB_TXN_RDONLY) ? MDBX_EACCESS : MDB_BAD_TXN; + if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (txn->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS : MDBX_BAD_TXN; - MDB_cursor mc; - MDB_xcursor mx; + MDBX_cursor mc; + MDBX_xcursor mx; mdbx_cursor_init(&mc, txn, dbi, &mx); mc.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &mc; int rc; MDBX_val present_key = *key; - if (F_ISSET(flags, MDB_CURRENT | MDB_NOOVERWRITE)) { + if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) { /* в old_data значение для выбора конкретного дубликата */ - if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDB_DUPSORT))) { + if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT))) { rc = MDBX_EINVAL; goto bailout; } /* убираем лишний бит, он был признаком запрошенного режима */ - flags -= MDB_NOOVERWRITE; + flags -= MDBX_NOOVERWRITE; - rc = mdbx_cursor_get(&mc, &present_key, old_data, MDB_GET_BOTH); - if (rc != MDB_SUCCESS) + rc = mdbx_cursor_get(&mc, &present_key, old_data, MDBX_GET_BOTH); + if (rc != MDBX_SUCCESS) goto bailout; if (new_data) { @@ -10011,20 +10024,20 @@ int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *new_data, if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) return MDBX_EINVAL; MDBX_val present_data; - rc = mdbx_cursor_get(&mc, &present_key, &present_data, MDB_SET_KEY); - if (unlikely(rc != MDB_SUCCESS)) { + rc = mdbx_cursor_get(&mc, &present_key, &present_data, MDBX_SET_KEY); + if (unlikely(rc != MDBX_SUCCESS)) { old_data->iov_base = NULL; old_data->iov_len = rc; - if (rc != MDB_NOTFOUND || (flags & MDB_CURRENT)) + if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT)) goto bailout; - } else if (flags & MDB_NOOVERWRITE) { - rc = MDB_KEYEXIST; + } else if (flags & MDBX_NOOVERWRITE) { + rc = MDBX_KEYEXIST; *old_data = present_data; goto bailout; } else { MDBX_page *page = mc.mc_pg[mc.mc_top]; - if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { - if (flags & MDB_CURRENT) { + if (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) { + if (flags & MDBX_CURRENT) { /* для не-уникальных ключей позволяем update/delete только если ключ * один */ MDBX_node *leaf = NODEPTR(page, 
mc.mc_ki[mc.mc_top]); @@ -10041,14 +10054,14 @@ int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *new_data, *old_data = *new_data; goto bailout; } - /* В оригинальной LMDB фладок MDB_CURRENT здесь приведет - * к замене данных без учета MDB_DUPSORT сортировки, + /* В оригинальной LMDB фладок MDBX_CURRENT здесь приведет + * к замене данных без учета MDBX_DUPSORT сортировки, * но здесь это в любом случае допустимо, так как мы * проверили что для ключа есть только одно значение. */ - } else if ((flags & MDB_NODUPDATA) && + } else if ((flags & MDBX_NODUPDATA) && mdbx_is_samedata(&present_data, new_data)) { - /* если данные совпадают и установлен MDB_NODUPDATA */ - rc = MDB_KEYEXIST; + /* если данные совпадают и установлен MDBX_NODUPDATA */ + rc = MDBX_KEYEXIST; goto bailout; } } else { @@ -10057,7 +10070,7 @@ int mdbx_replace(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *new_data, *old_data = *new_data; goto bailout; } - flags |= MDB_CURRENT; + flags |= MDBX_CURRENT; } if (page->mp_flags & P_DIRTY) { @@ -10085,7 +10098,7 @@ bailout: return rc; } -int mdbx_get_ex(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, +int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, int *values_count) { DKBUF; mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); @@ -10099,17 +10112,17 @@ int mdbx_get_ex(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; + if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; - MDB_cursor mc; - MDB_xcursor mx; + MDBX_cursor mc; + MDBX_xcursor mx; mdbx_cursor_init(&mc, txn, dbi, &mx); int exact = 0; - int rc = mdbx_cursor_set(&mc, key, data, MDB_SET_KEY, &exact); - if (unlikely(rc != MDB_SUCCESS)) { - if (rc == MDB_NOTFOUND && values_count) + int rc = mdbx_cursor_set(&mc, key, data, MDBX_SET_KEY, &exact); + if (unlikely(rc != 
MDBX_SUCCESS)) { + if (rc == MDBX_NOTFOUND && values_count) *values_count = 0; return rc; } @@ -10125,7 +10138,7 @@ int mdbx_get_ex(MDBX_txn *txn, MDB_dbi dbi, MDBX_val *key, MDBX_val *data, } } } - return MDB_SUCCESS; + return MDBX_SUCCESS; } /* Функция сообщает находится ли указанный адрес в "грязной" странице у @@ -10157,10 +10170,10 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(txn->mt_flags & MDB_TXN_RDONLY)) + if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) return MDBX_RESULT_FALSE; - const MDB_env *env = txn->mt_env; + const MDBX_env *env = txn->mt_env; const uintptr_t mask = ~(uintptr_t)(env->me_psize - 1); const MDBX_page *page = (const MDBX_page *)((uintptr_t)ptr & mask); @@ -10202,7 +10215,7 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { return MDBX_RESULT_TRUE; } -int mdbx_dbi_sequence(MDBX_txn *txn, MDB_dbi dbi, uint64_t *result, +int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, uint64_t increment) { if (unlikely(!txn)) return MDBX_EINVAL; @@ -10214,17 +10227,17 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDB_dbi dbi, uint64_t *result, return MDBX_EINVAL; if (unlikely(TXN_DBI_CHANGED(txn, dbi))) - return MDB_BAD_DBI; + return MDBX_BAD_DBI; - MDB_db *dbs = &txn->mt_dbs[dbi]; + MDBX_db *dbs = &txn->mt_dbs[dbi]; if (likely(result)) *result = dbs->md_seq; if (likely(increment > 0)) { - if (unlikely(txn->mt_flags & MDB_TXN_BLOCKED)) - return MDB_BAD_TXN; + if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; - if (unlikely(F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))) + if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) return MDBX_EACCESS; uint64_t new = dbs->md_seq + increment; @@ -10233,9 +10246,9 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDB_dbi dbi, uint64_t *result, assert(new > dbs->md_seq); dbs->md_seq = new; - txn->mt_flags |= MDB_TXN_DIRTY; + txn->mt_flags |= MDBX_TXN_DIRTY; txn->mt_dbflags[dbi] |= 
DB_DIRTY; } - return MDB_SUCCESS; + return MDBX_SUCCESS; } diff --git a/src/midl.h b/src/midl.h index 35e37ad5..8c983c24 100644 --- a/src/midl.h +++ b/src/midl.h @@ -14,21 +14,21 @@ /* IDL sizes - likely should be even bigger * limiting factors: sizeof(pgno_t), thread stack size */ -#define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ -#define MDB_IDL_DB_SIZE (1 << MDB_IDL_LOGN) -#define MDB_IDL_UM_SIZE (1 << (MDB_IDL_LOGN + 1)) +#define MDBX_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ +#define MDBX_IDL_DB_SIZE (1 << MDBX_IDL_LOGN) +#define MDBX_IDL_UM_SIZE (1 << (MDBX_IDL_LOGN + 1)) -#define MDB_IDL_DB_MAX (MDB_IDL_DB_SIZE - 1) -#define MDB_IDL_UM_MAX (MDB_IDL_UM_SIZE - 1) +#define MDBX_IDL_DB_MAX (MDBX_IDL_DB_SIZE - 1) +#define MDBX_IDL_UM_MAX (MDBX_IDL_UM_SIZE - 1) -#define MDB_IDL_SIZEOF(ids) (((ids)[0] + 1) * sizeof(pgno_t)) -#define MDB_IDL_IS_ZERO(ids) ((ids)[0] == 0) -#define MDB_IDL_CPY(dst, src) (memcpy(dst, src, MDB_IDL_SIZEOF(src))) -#define MDB_IDL_FIRST(ids) ((ids)[1]) -#define MDB_IDL_LAST(ids) ((ids)[(ids)[0]]) +#define MDBX_IDL_SIZEOF(ids) (((ids)[0] + 1) * sizeof(pgno_t)) +#define MDBX_IDL_IS_ZERO(ids) ((ids)[0] == 0) +#define MDBX_IDL_CPY(dst, src) (memcpy(dst, src, MDBX_IDL_SIZEOF(src))) +#define MDBX_IDL_FIRST(ids) ((ids)[1]) +#define MDBX_IDL_LAST(ids) ((ids)[(ids)[0]]) /* Current max length of an #mdbx_midl_alloc()ed IDL */ -#define MDB_IDL_ALLOCLEN(ids) ((ids)[-1]) +#define MDBX_IDL_ALLOCLEN(ids) ((ids)[-1]) /* Append ID to IDL. The IDL must be big enough. 
*/ #define mdbx_midl_xappend(idl, id) \ diff --git a/src/osal.c b/src/osal.c index b0e1d6b1..a8550fef 100644 --- a/src/osal.c +++ b/src/osal.c @@ -20,7 +20,7 @@ static int waitstatus2errcode(DWORD result) { switch (result) { case WAIT_OBJECT_0: - return MDB_SUCCESS; + return MDBX_SUCCESS; case WAIT_FAILED: return mdbx_get_errno_checked(); case WAIT_ABANDONED: @@ -48,16 +48,16 @@ __extern_C __declspec(dllimport) void __cdecl _assert(char const *message, #endif /* _MSC_VER */ #ifndef mdbx_assert_fail -void __cold mdbx_assert_fail(MDB_env *env, const char *msg, const char *func, +void __cold mdbx_assert_fail(MDBX_env *env, const char *msg, const char *func, int line) { -#if MDB_DEBUG +#if MDBX_DEBUG if (env && env->me_assert_func) { env->me_assert_func(env, msg, func, line); return; } #else (void)env; -#endif /* MDB_DEBUG */ +#endif /* MDBX_DEBUG */ if (mdbx_debug_logger) mdbx_debug_log(MDBX_DBG_ASSERT, func, line, "assert: %s\n", msg); @@ -139,10 +139,10 @@ int mdbx_asprintf(char **strp, const char *fmt, ...) { int mdbx_memalign_alloc(size_t alignment, size_t bytes, void **result) { #if _MSC_VER *result = _aligned_malloc(bytes, alignment); - return *result ? MDB_SUCCESS : MDBX_ENOMEM /* ERROR_OUTOFMEMORY */; + return *result ? MDBX_SUCCESS : MDBX_ENOMEM /* ERROR_OUTOFMEMORY */; #elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L *result = memalign(alignment, bytes); - return *result ? MDB_SUCCESS : errno; + return *result ? 
MDBX_SUCCESS : errno; #elif _POSIX_VERSION >= 200112L *result = NULL; return posix_memalign(result, alignment, bytes); @@ -166,7 +166,7 @@ void mdbx_memalign_free(void *ptr) { int mdbx_condmutex_init(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) - int rc = MDB_SUCCESS; + int rc = MDBX_SUCCESS; condmutex->event = NULL; condmutex->mutex = CreateMutex(NULL, FALSE, NULL); if (!condmutex->mutex) @@ -203,13 +203,15 @@ int mdbx_condmutex_destroy(mdbx_condmutex_t *condmutex) { int rc = MDBX_EINVAL; #if defined(_WIN32) || defined(_WIN64) if (condmutex->event) { - rc = CloseHandle(condmutex->event) ? MDB_SUCCESS : mdbx_get_errno_checked(); - if (rc == MDB_SUCCESS) + rc = + CloseHandle(condmutex->event) ? MDBX_SUCCESS : mdbx_get_errno_checked(); + if (rc == MDBX_SUCCESS) condmutex->event = NULL; } if (condmutex->mutex) { - rc = CloseHandle(condmutex->mutex) ? MDB_SUCCESS : mdbx_get_errno_checked(); - if (rc == MDB_SUCCESS) + rc = + CloseHandle(condmutex->mutex) ? MDBX_SUCCESS : mdbx_get_errno_checked(); + if (rc == MDBX_SUCCESS) condmutex->mutex = NULL; } #else @@ -238,7 +240,7 @@ int mdbx_condmutex_lock(mdbx_condmutex_t *condmutex) { int mdbx_condmutex_unlock(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) - return ReleaseMutex(condmutex->mutex) ? MDB_SUCCESS + return ReleaseMutex(condmutex->mutex) ? MDBX_SUCCESS : mdbx_get_errno_checked(); #else return pthread_mutex_unlock(&condmutex->mutex); @@ -247,7 +249,7 @@ int mdbx_condmutex_unlock(mdbx_condmutex_t *condmutex) { int mdbx_condmutex_signal(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) - return SetEvent(condmutex->event) ? MDB_SUCCESS : mdbx_get_errno_checked(); + return SetEvent(condmutex->event) ? 
MDBX_SUCCESS : mdbx_get_errno_checked(); #else return pthread_cond_signal(&condmutex->cond); #endif @@ -270,7 +272,7 @@ int mdbx_condmutex_wait(mdbx_condmutex_t *condmutex) { int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) InitializeCriticalSection(fastmutex); - return MDB_SUCCESS; + return MDBX_SUCCESS; #else return pthread_mutex_init(fastmutex, NULL); #endif @@ -279,7 +281,7 @@ int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) DeleteCriticalSection(fastmutex); - return MDB_SUCCESS; + return MDBX_SUCCESS; #else return pthread_mutex_destroy(fastmutex); #endif @@ -288,7 +290,7 @@ int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) EnterCriticalSection(fastmutex); - return MDB_SUCCESS; + return MDBX_SUCCESS; #else return pthread_mutex_lock(fastmutex); #endif @@ -297,7 +299,7 @@ int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) LeaveCriticalSection(fastmutex); - return MDB_SUCCESS; + return MDBX_SUCCESS; #else return pthread_mutex_unlock(fastmutex); #endif @@ -320,7 +322,7 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode, case O_RDONLY: DesiredAccess = GENERIC_READ; break; - case O_WRONLY: /* assume for mdb_env_copy() and friends output */ + case O_WRONLY: /* assume for MDBX_env_copy() and friends output */ DesiredAccess = GENERIC_WRITE; ShareMode = 0; FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH; @@ -378,14 +380,14 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode, (void)fcntl(*fd, F_SETFD, flags | FD_CLOEXEC); #endif #endif - return MDB_SUCCESS; + return MDBX_SUCCESS; } int mdbx_closefile(mdbx_filehandle_t fd) { #if defined(_WIN32) || defined(_WIN64) - return 
CloseHandle(fd) ? MDB_SUCCESS : mdbx_get_errno_checked(); + return CloseHandle(fd) ? MDBX_SUCCESS : mdbx_get_errno_checked(); #else - return (close(fd) == 0) ? MDB_SUCCESS : errno; + return (close(fd) == 0) ? MDBX_SUCCESS : errno; #endif } @@ -402,16 +404,16 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) { DWORD read = 0; if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) { int rc = mdbx_get_errno_checked(); - return (rc == MDB_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc; + return (rc == MDBX_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc; } #else ssize_t read = pread(fd, buf, bytes, offset); if (read < 0) { int rc = errno; - return (rc == MDB_SUCCESS) ? /* paranoia */ MDBX_EIO : rc; + return (rc == MDBX_SUCCESS) ? /* paranoia */ MDBX_EIO : rc; } #endif - return (bytes == (size_t)read) ? MDB_SUCCESS : MDBX_ENODATA; + return (bytes == (size_t)read) ? MDBX_SUCCESS : MDBX_ENODATA; } int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, @@ -427,7 +429,7 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, DWORD written; if (likely(WriteFile(fd, buf, (DWORD)bytes, &written, &ov))) - return (bytes == written) ? MDB_SUCCESS : MDBX_EIO /* ERROR_WRITE_FAULT */; + return (bytes == written) ? MDBX_SUCCESS : MDBX_EIO /* ERROR_WRITE_FAULT */; return mdbx_get_errno_checked(); #else int rc; @@ -435,7 +437,7 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, do { written = pwrite(fd, buf, bytes, offset); if (likely(bytes == (size_t)written)) - return MDB_SUCCESS; + return MDBX_SUCCESS; rc = errno; } while (rc == EINTR); return (written < 0) ? rc : MDBX_EIO /* Use which error code (ENOSPC)? 
*/; @@ -448,12 +450,12 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, size_t written = 0; for (int i = 0; i < iovcnt; ++i) { int rc = mdbx_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; written += iov[i].iov_len; offset += iov[i].iov_len; } - return (expected_written == written) ? MDB_SUCCESS + return (expected_written == written) ? MDBX_SUCCESS : MDBX_EIO /* ERROR_WRITE_FAULT */; #else int rc; @@ -461,7 +463,7 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, do { written = pwritev(fd, iov, iovcnt, offset); if (likely(expected_written == (size_t)written)) - return MDB_SUCCESS; + return MDBX_SUCCESS; rc = errno; } while (rc == EINTR); return (written < 0) ? rc : MDBX_EIO /* Use which error code? */; @@ -508,7 +510,7 @@ int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { #ifdef SIGPIPE pthread_sigmask(SIG_SETMASK, &old, NULL); #endif - return MDB_SUCCESS; + return MDBX_SUCCESS; } ptr += written; bytes -= written; @@ -518,19 +520,19 @@ int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync) { #if defined(_WIN32) || defined(_WIN64) (void)fullsync; - return FlushFileBuffers(fd) ? MDB_SUCCESS : mdbx_get_errno_checked(); + return FlushFileBuffers(fd) ? 
MDBX_SUCCESS : mdbx_get_errno_checked(); #elif __GLIBC_PREREQ(2, 16) || _BSD_SOURCE || _XOPEN_SOURCE || \ (__GLIBC_PREREQ(2, 8) && _POSIX_C_SOURCE >= 200112L) for (;;) { #if _POSIX_C_SOURCE >= 199309L || _XOPEN_SOURCE >= 500 || \ defined(_POSIX_SYNCHRONIZED_IO) if (!fullsync && fdatasync(fd) == 0) - return MDB_SUCCESS; + return MDBX_SUCCESS; #else (void)fullsync; #endif if (fsync(fd) == 0) - return MDB_SUCCESS; + return MDBX_SUCCESS; int rc = errno; if (rc != EINTR) return rc; @@ -554,7 +556,7 @@ int mdbx_filesize(mdbx_filehandle_t fd, off_t *length) { *length = st.st_size; #endif - return MDB_SUCCESS; + return MDBX_SUCCESS; } int mdbx_ftruncate(mdbx_filehandle_t fd, off_t length) { @@ -562,10 +564,10 @@ int mdbx_ftruncate(mdbx_filehandle_t fd, off_t length) { LARGE_INTEGER li; li.QuadPart = length; return (SetFilePointerEx(fd, li, NULL, FILE_BEGIN) && SetEndOfFile(fd)) - ? MDB_SUCCESS + ? MDBX_SUCCESS : mdbx_get_errno_checked(); #else - return ftruncate(fd, length) == 0 ? MDB_SUCCESS : errno; + return ftruncate(fd, length) == 0 ? MDBX_SUCCESS : errno; #endif } @@ -574,7 +576,7 @@ int mdbx_ftruncate(mdbx_filehandle_t fd, off_t length) { int mdbx_thread_key_create(mdbx_thread_key_t *key) { #if defined(_WIN32) || defined(_WIN64) *key = TlsAlloc(); - return (*key != TLS_OUT_OF_INDEXES) ? MDB_SUCCESS : mdbx_get_errno_checked(); + return (*key != TLS_OUT_OF_INDEXES) ? MDBX_SUCCESS : mdbx_get_errno_checked(); #else return pthread_key_create(key, mdbx_rthc_dtor); #endif @@ -617,7 +619,7 @@ int mdbx_thread_create(mdbx_thread_t *thread, void *arg) { #if defined(_WIN32) || defined(_WIN64) *thread = CreateThread(NULL, 0, start_routine, arg, 0, NULL); - return *thread ? MDB_SUCCESS : mdbx_get_errno_checked(); + return *thread ? 
MDBX_SUCCESS : mdbx_get_errno_checked(); #else return pthread_create(thread, NULL, start_routine, arg); #endif @@ -638,10 +640,11 @@ int mdbx_thread_join(mdbx_thread_t thread) { int mdbx_msync(void *addr, size_t length, int async) { #if defined(_WIN32) || defined(_WIN64) if (async) - return MDB_SUCCESS; - return FlushViewOfFile(addr, length) ? MDB_SUCCESS : mdbx_get_errno_checked(); + return MDBX_SUCCESS; + return FlushViewOfFile(addr, length) ? MDBX_SUCCESS + : mdbx_get_errno_checked(); #else - return (msync(addr, length, async ? MS_ASYNC : MS_SYNC) == 0) ? MDB_SUCCESS + return (msync(addr, length, async ? MS_ASYNC : MS_SYNC) == 0) ? MDBX_SUCCESS : errno; #endif } @@ -654,7 +657,7 @@ int mdbx_mremap_size(void **address, size_t old_size, size_t new_size) { return ERROR_CALL_NOT_IMPLEMENTED; #else *address = mremap(*address, old_size, new_size, 0, address); - return (*address != MAP_FAILED) ? MDB_SUCCESS : errno; + return (*address != MAP_FAILED) ? MDBX_SUCCESS : errno; #endif } @@ -666,31 +669,31 @@ int mdbx_mmap(void **address, size_t length, int rw, mdbx_filehandle_t fd) { return mdbx_get_errno_checked(); *address = MapViewOfFileEx(h, rw ? FILE_MAP_WRITE : FILE_MAP_READ, 0, 0, length, *address); - int rc = (*address != MAP_FAILED) ? MDB_SUCCESS : mdbx_get_errno_checked(); + int rc = (*address != MAP_FAILED) ? MDBX_SUCCESS : mdbx_get_errno_checked(); CloseHandle(h); return rc; #else *address = mmap(NULL, length, rw ? PROT_READ | PROT_WRITE : PROT_READ, MAP_SHARED, fd, 0); - return (*address != MAP_FAILED) ? MDB_SUCCESS : errno; + return (*address != MAP_FAILED) ? MDBX_SUCCESS : errno; #endif } int mdbx_munmap(void *address, size_t length) { #if defined(_WIN32) || defined(_WIN64) (void)length; - return UnmapViewOfFile(address) ? MDB_SUCCESS : mdbx_get_errno_checked(); + return UnmapViewOfFile(address) ? MDBX_SUCCESS : mdbx_get_errno_checked(); #else - return (munmap(address, length) == 0) ? MDB_SUCCESS : errno; + return (munmap(address, length) == 0) ? 
MDBX_SUCCESS : errno; #endif } int mdbx_mlock(const void *address, size_t length) { #if defined(_WIN32) || defined(_WIN64) - return VirtualLock((void *)address, length) ? MDB_SUCCESS + return VirtualLock((void *)address, length) ? MDBX_SUCCESS : mdbx_get_errno_checked(); #else - return (mlock(address, length) == 0) ? MDB_SUCCESS : errno; + return (mlock(address, length) == 0) ? MDBX_SUCCESS : errno; #endif } diff --git a/src/osal.h b/src/osal.h index 8698fea9..16a12302 100644 --- a/src/osal.h +++ b/src/osal.h @@ -315,7 +315,7 @@ static __inline void mdbx_invalidate_cache(void *addr, size_t nbytes) { /*----------------------------------------------------------------------------*/ #ifndef mdbx_assert_fail -void mdbx_assert_fail(MDB_env *env, const char *msg, const char *func, +void mdbx_assert_fail(MDBX_env *env, const char *msg, const char *func, int line); #endif /* mdbx_assert_fail */ @@ -453,20 +453,20 @@ void mdbx_osal_jitter(bool tiny); #define MDBX_OSAL_LOCK_SIGN MDBX_TETRAD('P', 'T', 'M', 'X') #endif -int mdbx_lck_init(MDB_env *env); +int mdbx_lck_init(MDBX_env *env); -int mdbx_lck_seize(MDB_env *env); -int mdbx_lck_downgrade(MDB_env *env); -void mdbx_lck_destroy(MDB_env *env); +int mdbx_lck_seize(MDBX_env *env); +int mdbx_lck_downgrade(MDBX_env *env); +void mdbx_lck_destroy(MDBX_env *env); -int mdbx_rdt_lock(MDB_env *env); -void mdbx_rdt_unlock(MDB_env *env); +int mdbx_rdt_lock(MDBX_env *env); +void mdbx_rdt_unlock(MDBX_env *env); -int mdbx_txn_lock(MDB_env *env); -void mdbx_txn_unlock(MDB_env *env); +int mdbx_txn_lock(MDBX_env *env); +void mdbx_txn_unlock(MDBX_env *env); -int mdbx_rpid_set(MDB_env *env); -int mdbx_rpid_clear(MDB_env *env); +int mdbx_rpid_set(MDBX_env *env); +int mdbx_rpid_clear(MDBX_env *env); /* Checks reader by pid. * @@ -474,7 +474,7 @@ int mdbx_rpid_clear(MDB_env *env); * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) * MDBX_RESULT_FALSE, if pid is dead (lock acquired) * or otherwise the errcode. 
*/ -int mdbx_rpid_check(MDB_env *env, mdbx_pid_t pid); +int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid); /*----------------------------------------------------------------------------*/ diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 3b614f8a..930f8b58 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -34,12 +34,12 @@ typedef struct flagbit { char *name; } flagbit; -flagbit dbflags[] = {{MDB_DUPSORT, "dupsort"}, - {MDB_INTEGERKEY, "integerkey"}, - {MDB_REVERSEKEY, "reversekey"}, - {MDB_DUPFIXED, "dupfixed"}, - {MDB_REVERSEDUP, "reversedup"}, - {MDB_INTEGERDUP, "integerdup"}, +flagbit dbflags[] = {{MDBX_DUPSORT, "dupsort"}, + {MDBX_INTEGERKEY, "integerkey"}, + {MDBX_REVERSEKEY, "reversekey"}, + {MDBX_DUPFIXED, "dupfixed"}, + {MDBX_REVERSEDUP, "reversedup"}, + {MDBX_INTEGERDUP, "integerdup"}, {0, NULL}}; static volatile sig_atomic_t gotsignal; @@ -75,7 +75,7 @@ static __attribute__((constructor)) void init_walk(void) { uint64_t total_unused_bytes; int exclusive = 2; -MDB_env *env; +MDBX_env *env; MDBX_txn *txn, *locktxn; MDBX_envinfo envinfo; MDBX_stat envstat; @@ -298,19 +298,19 @@ static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, } } - return gotsignal ? EINTR : MDB_SUCCESS; + return gotsignal ? 
EINTR : MDBX_SUCCESS; } typedef int(visitor)(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data); -static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent); +static int process_db(MDBX_dbi dbi, char *name, visitor *handler, int silent); static int handle_userdb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { (void)record_number; (void)key; (void)data; - return MDB_SUCCESS; + return MDBX_SUCCESS; } static int handle_freedb(const uint64_t record_number, const MDBX_val *key, @@ -332,7 +332,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, data->iov_len); else { number = *iptr++; - if (number >= MDB_IDL_UM_MAX) + if (number >= MDBX_IDL_UM_MAX) problem_add("entry", record_number, "wrong idl length", "%" PRIiPTR "", number); else if ((number + 1) * sizeof(pgno_t) != data->iov_len) @@ -378,7 +378,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, } } - return MDB_SUCCESS; + return MDBX_SUCCESS; } static int handle_maindb(const uint64_t record_number, const MDBX_val *key, @@ -400,14 +400,14 @@ static int handle_maindb(const uint64_t record_number, const MDBX_val *key, rc = process_db(-1, name, handle_userdb, 0); free(name); - if (rc != MDB_INCOMPATIBLE) + if (rc != MDBX_INCOMPATIBLE) return rc; return handle_userdb(record_number, key, data); } -static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { - MDB_cursor *mc; +static int process_db(MDBX_dbi dbi, char *name, visitor *handler, int silent) { + MDBX_cursor *mc; MDBX_stat ms; MDBX_val key, data; MDBX_val prev_key, prev_data; @@ -424,7 +424,7 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { if (rc) { if (!name || rc != - MDB_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ { + MDBX_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ { error(" - mdbx_open '%s' failed, error %d %s\n", name ? 
name : "main", rc, mdbx_strerror(rc)); } @@ -439,7 +439,7 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { fflush(NULL); } skipped_subdb++; - return MDB_SUCCESS; + return MDBX_SUCCESS; } if (!silent && verbose) { @@ -488,8 +488,8 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { saved_list = problems_push(); prev_key.iov_base = NULL; prev_data.iov_len = 0; - rc = mdbx_cursor_get(mc, &key, &data, MDB_FIRST); - while (rc == MDB_SUCCESS) { + rc = mdbx_cursor_get(mc, &key, &data, MDBX_FIRST); + while (rc == MDBX_SUCCESS) { if (gotsignal) { print(" - interrupted by signal\n"); fflush(NULL); @@ -500,20 +500,20 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { if (key.iov_len > maxkeysize) { problem_add("entry", record_count, "key length exceeds max-key-size", "%" PRIuPTR " > %u", key.iov_len, maxkeysize); - } else if ((flags & MDB_INTEGERKEY) && key.iov_len != sizeof(uint64_t) && + } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != sizeof(uint64_t) && key.iov_len != sizeof(uint32_t)) { problem_add("entry", record_count, "wrong key length", "%" PRIuPTR " != 4or8", key.iov_len); } - if ((flags & MDB_INTEGERDUP) && data.iov_len != sizeof(uint64_t) && + if ((flags & MDBX_INTEGERDUP) && data.iov_len != sizeof(uint64_t) && data.iov_len != sizeof(uint32_t)) { problem_add("entry", record_count, "wrong data length", "%" PRIuPTR " != 4or8", data.iov_len); } if (prev_key.iov_base) { - if ((flags & MDB_DUPFIXED) && prev_data.iov_len != data.iov_len) { + if ((flags & MDBX_DUPFIXED) && prev_data.iov_len != data.iov_len) { problem_add("entry", record_count, "different data length", "%" PRIuPTR " != %" PRIuPTR "", prev_data.iov_len, data.iov_len); @@ -524,9 +524,9 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { problem_add("entry", record_count, "broken ordering of entries", NULL); } else if (cmp == 0) { ++dups; - if (!(flags & MDB_DUPSORT)) + if 
(!(flags & MDBX_DUPSORT)) problem_add("entry", record_count, "duplicated entries", NULL); - else if (flags & MDB_INTEGERDUP) { + else if (flags & MDBX_INTEGERDUP) { cmp = mdbx_dcmp(txn, dbi, &prev_data, &data); if (cmp > 0) problem_add("entry", record_count, @@ -534,9 +534,9 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { } } } else if (verbose) { - if (flags & MDB_INTEGERKEY) + if (flags & MDBX_INTEGERKEY) print(" - fixed key-size %" PRIuPTR "\n", key.iov_len); - if (flags & (MDB_INTEGERDUP | MDB_DUPFIXED)) + if (flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) print(" - fixed data-size %" PRIuPTR "\n", data.iov_len); } @@ -552,9 +552,9 @@ static int process_db(MDB_dbi dbi, char *name, visitor *handler, int silent) { prev_key = key; prev_data = data; - rc = mdbx_cursor_get(mc, &key, &data, MDB_NEXT); + rc = mdbx_cursor_get(mc, &key, &data, MDBX_NEXT); } - if (rc != MDB_NOTFOUND) + if (rc != MDBX_NOTFOUND) error(" - mdbx_cursor_get failed, error %d %s\n", rc, mdbx_strerror(rc)); else rc = 0; @@ -612,7 +612,7 @@ int main(int argc, char *argv[]) { int i, rc; char *prog = argv[0]; char *envname; - int envflags = MDB_RDONLY; + int envflags = MDBX_RDONLY; int problems_maindb = 0, problems_freedb = 0, problems_meta = 0; int dont_traversal = 0; struct timespec timestamp_start, timestamp_finish; @@ -644,10 +644,10 @@ int main(int argc, char *argv[]) { quiet = 1; break; case 'n': - envflags |= MDB_NOSUBDIR; + envflags |= MDBX_NOSUBDIR; break; case 'w': - envflags &= ~MDB_RDONLY; + envflags &= ~MDBX_RDONLY; break; case 'c': exclusive = 0; @@ -679,7 +679,7 @@ int main(int argc, char *argv[]) { envname = argv[optind]; print("Running mdbx_chk for '%s' in %s mode...\n", envname, - (envflags & MDB_RDONLY) ? "read-only" : "write-lock"); + (envflags & MDBX_RDONLY) ? 
"read-only" : "write-lock"); fflush(NULL); rc = mdbx_env_create(&env); @@ -697,14 +697,14 @@ int main(int argc, char *argv[]) { rc = mdbx_env_open_ex(env, envname, envflags, 0664, &exclusive); if (rc) { error("mdbx_env_open failed, error %d %s\n", rc, mdbx_strerror(rc)); - if (rc == MDBX_WANNA_RECOVERY && (envflags & MDB_RDONLY)) + if (rc == MDBX_WANNA_RECOVERY && (envflags & MDBX_RDONLY)) print("Please run %s in the read-write mode (with '-w' option).\n", prog); goto bailout; } if (verbose) print(" - %s mode\n", exclusive ? "monopolistic" : "cooperative"); - if (!(envflags & MDB_RDONLY)) { + if (!(envflags & MDBX_RDONLY)) { rc = mdbx_txn_begin(env, NULL, 0, &locktxn); if (rc) { error("mdbx_txn_begin(lock-write) failed, error %d %s\n", rc, @@ -721,7 +721,7 @@ int main(int argc, char *argv[]) { } maxkeysize = rc; - rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); if (rc) { error("mdbx_txn_begin(read-only) failed, error %d %s\n", rc, mdbx_strerror(rc)); diff --git a/src/tools/mdbx_copy.1 b/src/tools/mdbx_copy.1 index 6c3fd6f8..825fb269 100644 --- a/src/tools/mdbx_copy.1 +++ b/src/tools/mdbx_copy.1 @@ -2,9 +2,9 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDB_COPY 1 "2014/06/20" "LMDB 0.9.14" +.TH MDBX_COPY 1 "2014/06/20" "LMDB 0.9.14" .SH NAME -mdbx_copy \- LMDB environment copy tool +mdbx_copy \- MDBX environment copy tool .SH SYNOPSIS .B mdbx_copy [\c @@ -19,7 +19,7 @@ mdbx_copy \- LMDB environment copy tool .SH DESCRIPTION The .B mdbx_copy -utility copies an LMDB environment. The environment can +utility copies an MDBX environment. The environment can be copied regardless of whether it is currently in use. No lockfile is created, since it gets recreated at need. 
diff --git a/src/tools/mdbx_copy.c b/src/tools/mdbx_copy.c index 528d5edb..9295c7ff 100644 --- a/src/tools/mdbx_copy.c +++ b/src/tools/mdbx_copy.c @@ -23,16 +23,16 @@ static void sighandle(int sig) { (void)sig; } int main(int argc, char *argv[]) { int rc; - MDB_env *env = NULL; + MDBX_env *env = NULL; const char *progname = argv[0], *act; - unsigned flags = MDB_RDONLY; + unsigned flags = MDBX_RDONLY; unsigned cpflags = 0; for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) { if (argv[1][1] == 'n' && argv[1][2] == '\0') - flags |= MDB_NOSUBDIR; + flags |= MDBX_NOSUBDIR; else if (argv[1][1] == 'c' && argv[1][2] == '\0') - cpflags |= MDB_CP_COMPACT; + cpflags |= MDBX_CP_COMPACT; else if (argv[1][1] == 'V' && argv[1][2] == '\0') { printf("%s (%s, build %s)\n", mdbx_version.git.describe, mdbx_version.git.datetime, mdbx_build.datetime); @@ -57,10 +57,10 @@ int main(int argc, char *argv[]) { act = "opening environment"; rc = mdbx_env_create(&env); - if (rc == MDB_SUCCESS) { + if (rc == MDBX_SUCCESS) { rc = mdbx_env_open(env, argv[1], flags, 0640); } - if (rc == MDB_SUCCESS) { + if (rc == MDBX_SUCCESS) { act = "copying"; if (argc == 2) rc = mdbx_env_copy2fd(env, STDOUT_FILENO, cpflags); diff --git a/src/tools/mdbx_dump.1 b/src/tools/mdbx_dump.1 index 12cb239e..f8dd62a1 100644 --- a/src/tools/mdbx_dump.1 +++ b/src/tools/mdbx_dump.1 @@ -2,9 +2,9 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDB_DUMP 1 "2014/06/20" "LMDB 0.9.14" +.TH MDBX_DUMP 1 "2014/06/20" "LMDB 0.9.14" .SH NAME -mdbx_dump \- LMDB environment export tool +mdbx_dump \- MDBX environment export tool .SH SYNOPSIS .B mdbx_dump [\c @@ -42,7 +42,7 @@ List the databases stored in the environment. Just the names will be listed, no data will be output. .TP .BR \-n -Dump an LMDB database which does not use subdirectories. +Dump an MDBX database which does not use subdirectories. 
.TP .BR \-p If characters in either the key or data items are printing characters (as diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index 1f4fa78c..6cff2560 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -31,12 +31,12 @@ typedef struct flagbit { char *name; } flagbit; -flagbit dbflags[] = {{MDB_REVERSEKEY, "reversekey"}, - {MDB_DUPSORT, "dupsort"}, - {MDB_INTEGERKEY, "integerkey"}, - {MDB_DUPFIXED, "dupfixed"}, - {MDB_INTEGERDUP, "integerdup"}, - {MDB_REVERSEDUP, "reversedup"}, +flagbit dbflags[] = {{MDBX_REVERSEKEY, "reversekey"}, + {MDBX_DUPSORT, "dupsort"}, + {MDBX_INTEGERKEY, "integerkey"}, + {MDBX_DUPFIXED, "dupfixed"}, + {MDBX_INTEGERDUP, "integerdup"}, + {MDBX_REVERSEDUP, "reversedup"}, {0, NULL}}; static volatile sig_atomic_t gotsig; @@ -84,8 +84,8 @@ static void byte(MDBX_val *v) { } /* Dump in BDB-compatible format */ -static int dumpit(MDBX_txn *txn, MDB_dbi dbi, char *name) { - MDB_cursor *mc; +static int dumpit(MDBX_txn *txn, MDBX_dbi dbi, char *name) { + MDBX_cursor *mc; MDBX_stat ms; MDBX_val key, data; MDBX_envinfo info; @@ -125,7 +125,7 @@ static int dumpit(MDBX_txn *txn, MDB_dbi dbi, char *name) { if (rc) return rc; - while ((rc = mdbx_cursor_get(mc, &key, &data, MDB_NEXT)) == MDB_SUCCESS) { + while ((rc = mdbx_cursor_get(mc, &key, &data, MDBX_NEXT)) == MDBX_SUCCESS) { if (gotsig) { rc = EINTR; break; @@ -139,8 +139,8 @@ static int dumpit(MDBX_txn *txn, MDB_dbi dbi, char *name) { } } printf("DATA=END\n"); - if (rc == MDB_NOTFOUND) - rc = MDB_SUCCESS; + if (rc == MDBX_NOTFOUND) + rc = MDBX_SUCCESS; return rc; } @@ -154,9 +154,9 @@ static void usage(char *prog) { int main(int argc, char *argv[]) { int i, rc; - MDB_env *env; + MDBX_env *env; MDBX_txn *txn; - MDB_dbi dbi; + MDBX_dbi dbi; char *prog = argv[0]; char *envname; char *subname = NULL; @@ -196,7 +196,7 @@ int main(int argc, char *argv[]) { } break; case 'n': - envflags |= MDB_NOSUBDIR; + envflags |= MDBX_NOSUBDIR; break; case 'p': mode |= PRINT; @@ -235,14 
+235,14 @@ int main(int argc, char *argv[]) { mdbx_env_set_maxdbs(env, 2); } - rc = mdbx_env_open(env, envname, envflags | MDB_RDONLY, 0664); + rc = mdbx_env_open(env, envname, envflags | MDBX_RDONLY, 0664); if (rc) { fprintf(stderr, "mdbx_env_open failed, error %d %s\n", rc, mdbx_strerror(rc)); goto env_close; } - rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); if (rc) { fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, mdbx_strerror(rc)); @@ -256,7 +256,7 @@ int main(int argc, char *argv[]) { } if (alldbs) { - MDB_cursor *cursor; + MDBX_cursor *cursor; MDBX_val key; int count = 0; @@ -266,9 +266,9 @@ int main(int argc, char *argv[]) { mdbx_strerror(rc)); goto txn_abort; } - while ((rc = mdbx_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { + while ((rc = mdbx_cursor_get(cursor, &key, NULL, MDBX_NEXT_NODUP)) == 0) { char *str; - MDB_dbi db2; + MDBX_dbi db2; if (memchr(key.iov_base, '\0', key.iov_len)) continue; count++; @@ -276,7 +276,7 @@ int main(int argc, char *argv[]) { memcpy(str, key.iov_base, key.iov_len); str[key.iov_len] = '\0'; rc = mdbx_dbi_open(txn, str, 0, &db2); - if (rc == MDB_SUCCESS) { + if (rc == MDBX_SUCCESS) { if (list) { printf("%s\n", str); list++; @@ -295,15 +295,15 @@ int main(int argc, char *argv[]) { if (!count) { fprintf(stderr, "%s: %s does not contain multiple databases\n", prog, envname); - rc = MDB_NOTFOUND; - } else if (rc == MDB_INCOMPATIBLE) { + rc = MDBX_NOTFOUND; + } else if (rc == MDBX_INCOMPATIBLE) { /* LY: the record it not a named sub-db. 
*/ - rc = MDB_SUCCESS; + rc = MDBX_SUCCESS; } } else { rc = dumpit(txn, dbi, subname); } - if (rc && rc != MDB_NOTFOUND) + if (rc && rc != MDBX_NOTFOUND) fprintf(stderr, "%s: %s: %s\n", prog, envname, mdbx_strerror(rc)); mdbx_dbi_close(env, dbi); diff --git a/src/tools/mdbx_load.1 b/src/tools/mdbx_load.1 index e326a523..ac04101c 100644 --- a/src/tools/mdbx_load.1 +++ b/src/tools/mdbx_load.1 @@ -2,9 +2,9 @@ .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDB_LOAD 1 "2014/06/20" "LMDB 0.9.14" +.TH MDBX_LOAD 1 "2014/06/20" "LMDB 0.9.14" .SH NAME -mdbx_load \- LMDB environment import tool +mdbx_load \- MDBX environment import tool .SH SYNOPSIS .B mdbx_load [\c @@ -24,7 +24,7 @@ mdbx_load \- LMDB environment import tool The .B mdbx_load utility reads from the standard input and loads it into the -LMDB environment +MDBX environment .BR envpath . The input to @@ -43,7 +43,7 @@ Write the library version number to the standard output, and exit. Read from the specified file instead of from the standard input. .TP .BR \-n -Load an LMDB database which does not use subdirectories. +Load an MDBX database which does not use subdirectories. .TP .BR \-s \ subdb Load a specific subdatabase. If no database is specified, data is loaded into the main database. 
diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index 531c9632..924cdaeb 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -51,12 +51,12 @@ typedef struct flagbit { #define S(s) s, STRLENOF(s) -flagbit dbflags[] = {{MDB_REVERSEKEY, S("reversekey")}, - {MDB_DUPSORT, S("dupsort")}, - {MDB_INTEGERKEY, S("integerkey")}, - {MDB_DUPFIXED, S("dupfixed")}, - {MDB_INTEGERDUP, S("integerdup")}, - {MDB_REVERSEDUP, S("reversedup")}, +flagbit dbflags[] = {{MDBX_REVERSEKEY, S("reversekey")}, + {MDBX_DUPSORT, S("dupsort")}, + {MDBX_INTEGERKEY, S("integerkey")}, + {MDBX_DUPFIXED, S("dupfixed")}, + {MDBX_INTEGERDUP, S("integerdup")}, + {MDBX_REVERSEDUP, S("reversedup")}, {0, NULL, 0}}; static void readhdr(void) { @@ -294,10 +294,10 @@ static void usage(void) { int main(int argc, char *argv[]) { int i, rc; - MDB_env *env; + MDBX_env *env; MDBX_txn *txn; - MDB_cursor *mc; - MDB_dbi dbi; + MDBX_cursor *mc; + MDBX_dbi dbi; char *envname; int envflags = 0, putflags = 0; @@ -328,13 +328,13 @@ int main(int argc, char *argv[]) { } break; case 'n': - envflags |= MDB_NOSUBDIR; + envflags |= MDBX_NOSUBDIR; break; case 's': subname = strdup(optarg); break; case 'N': - putflags = MDB_NOOVERWRITE | MDB_NODUPDATA; + putflags = MDBX_NOOVERWRITE | MDBX_NODUPDATA; break; case 'T': mode |= NOHDR | PRINT; @@ -369,9 +369,9 @@ int main(int argc, char *argv[]) { if (envinfo.me_mapsize) mdbx_env_set_mapsize(env, envinfo.me_mapsize); -#ifdef MDB_FIXEDMAP +#ifdef MDBX_FIXEDMAP if (info.me_mapaddr) - envflags |= MDB_FIXEDMAP; + envflags |= MDBX_FIXEDMAP; #endif rc = mdbx_env_open(env, envname, envflags, 0664); @@ -395,7 +395,7 @@ int main(int argc, char *argv[]) { goto env_close; } - rc = mdbx_dbi_open(txn, subname, dbi_flags | MDB_CREATE, &dbi); + rc = mdbx_dbi_open(txn, subname, dbi_flags | MDBX_CREATE, &dbi); if (rc) { fprintf(stderr, "mdbx_open failed, error %d %s\n", rc, mdbx_strerror(rc)); goto txn_abort; @@ -421,7 +421,7 @@ int main(int argc, char *argv[]) { } rc = 
mdbx_cursor_put(mc, &key, &data, putflags); - if (rc == MDB_KEYEXIST && putflags) + if (rc == MDBX_KEYEXIST && putflags) continue; if (rc) { fprintf(stderr, "mdbx_cursor_put failed, error %d %s\n", rc, diff --git a/src/tools/mdbx_stat.1 b/src/tools/mdbx_stat.1 index e62f288f..2056decb 100644 --- a/src/tools/mdbx_stat.1 +++ b/src/tools/mdbx_stat.1 @@ -2,9 +2,9 @@ .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDB_STAT 1 "2014/06/20" "LMDB 0.9.14" +.TH MDBX_STAT 1 "2014/06/20" "LMDB 0.9.14" .SH NAME -mdbx_stat \- LMDB environment status tool +mdbx_stat \- MDBX environment status tool .SH SYNOPSIS .B mdbx_stat [\c @@ -24,7 +24,7 @@ mdbx_stat \- LMDB environment status tool .SH DESCRIPTION The .B mdbx_stat -utility displays the status of an LMDB environment. +utility displays the status of an MDBX environment. .SH OPTIONS .TP .BR \-V @@ -39,7 +39,7 @@ If \fB\-ff\fP is given, summarize each freelist entry. If \fB\-fff\fP is given, display the full list of page IDs in the freelist. .TP .BR \-n -Display the status of an LMDB database which does not use subdirectories. +Display the status of an MDBX database which does not use subdirectories. .TP .BR \-r Display information about the environment reader table. 
diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index 1c163d5b..e3c53a9d 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -38,9 +38,9 @@ static void usage(char *prog) { int main(int argc, char *argv[]) { int i, rc; - MDB_env *env; + MDBX_env *env; MDBX_txn *txn; - MDB_dbi dbi; + MDBX_dbi dbi; MDBX_stat mst; MDBX_envinfo mei; char *prog = argv[0]; @@ -80,7 +80,7 @@ int main(int argc, char *argv[]) { freinfo++; break; case 'n': - envflags |= MDB_NOSUBDIR; + envflags |= MDBX_NOSUBDIR; break; case 'r': rdrinfo++; @@ -110,7 +110,7 @@ int main(int argc, char *argv[]) { mdbx_env_set_maxdbs(env, 4); } - rc = mdbx_env_open(env, envname, envflags | MDB_RDONLY, 0664); + rc = mdbx_env_open(env, envname, envflags | MDBX_RDONLY, 0664); if (rc) { fprintf(stderr, "mdbx_env_open failed, error %d %s\n", rc, mdbx_strerror(rc)); @@ -139,18 +139,18 @@ int main(int argc, char *argv[]) { if (rdrinfo) { printf("Reader Table Status\n"); - rc = mdbx_reader_list(env, (MDB_msg_func *)fputs, stdout); + rc = mdbx_reader_list(env, (MDBX_msg_func *)fputs, stdout); if (rdrinfo > 1) { int dead; mdbx_reader_check(env, &dead); printf(" %d stale readers cleared.\n", dead); - rc = mdbx_reader_list(env, (MDB_msg_func *)fputs, stdout); + rc = mdbx_reader_list(env, (MDBX_msg_func *)fputs, stdout); } if (!(subname || alldbs || freinfo)) goto env_close; } - rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); if (rc) { fprintf(stderr, "mdbx_txn_begin failed, error %d %s\n", rc, mdbx_strerror(rc)); @@ -158,7 +158,7 @@ int main(int argc, char *argv[]) { } if (freinfo) { - MDB_cursor *cursor; + MDBX_cursor *cursor; MDBX_val key, data; size_t pages = 0, *iptr; size_t reclaimable = 0; @@ -178,7 +178,7 @@ int main(int argc, char *argv[]) { goto txn_abort; } prstat(&mst); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) == 0) { iptr = data.iov_base; 
pages += *iptr; if (envinfo && mei.me_tail_txnid > *(size_t *)key.iov_base) @@ -262,7 +262,7 @@ int main(int argc, char *argv[]) { prstat(&mst); if (alldbs) { - MDB_cursor *cursor; + MDBX_cursor *cursor; MDBX_val key; rc = mdbx_cursor_open(txn, dbi, &cursor); @@ -271,16 +271,16 @@ int main(int argc, char *argv[]) { mdbx_strerror(rc)); goto txn_abort; } - while ((rc = mdbx_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { + while ((rc = mdbx_cursor_get(cursor, &key, NULL, MDBX_NEXT_NODUP)) == 0) { char *str; - MDB_dbi db2; + MDBX_dbi db2; if (memchr(key.iov_base, '\0', key.iov_len)) continue; str = malloc(key.iov_len + 1); memcpy(str, key.iov_base, key.iov_len); str[key.iov_len] = '\0'; rc = mdbx_dbi_open(txn, str, 0, &db2); - if (rc == MDB_SUCCESS) + if (rc == MDBX_SUCCESS) printf("Status of %s\n", str); free(str); if (rc) @@ -297,8 +297,8 @@ int main(int argc, char *argv[]) { mdbx_cursor_close(cursor); } - if (rc == MDB_NOTFOUND) - rc = MDB_SUCCESS; + if (rc == MDBX_NOTFOUND) + rc = MDBX_SUCCESS; mdbx_dbi_close(env, dbi); txn_abort: diff --git a/test/config.cc b/test/config.cc index d2e6dd12..c1d2a76d 100644 --- a/test/config.cc +++ b/test/config.cc @@ -226,21 +226,21 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, //----------------------------------------------------------------------------- const struct option_verb mode_bits[] = { - {"rdonly", MDB_RDONLY}, {"mapasync", MDB_MAPASYNC}, - {"utterly", MDBX_UTTERLY_NOSYNC}, {"nosubdir", MDB_NOSUBDIR}, - {"nosync", MDB_NOSYNC}, {"nometasync", MDB_NOMETASYNC}, - {"writemap", MDB_WRITEMAP}, {"notls", MDB_NOTLS}, - {"nordahead", MDB_NORDAHEAD}, {"nomeminit", MDB_NOMEMINIT}, + {"rdonly", MDBX_RDONLY}, {"mapasync", MDBX_MAPASYNC}, + {"utterly", MDBX_UTTERLY_NOSYNC}, {"nosubdir", MDBX_NOSUBDIR}, + {"nosync", MDBX_NOSYNC}, {"nometasync", MDBX_NOMETASYNC}, + {"writemap", MDBX_WRITEMAP}, {"notls", MDBX_NOTLS}, + {"nordahead", MDBX_NORDAHEAD}, {"nomeminit", MDBX_NOMEMINIT}, {"coasesce", 
MDBX_COALESCE}, {"lifo", MDBX_LIFORECLAIM}, {"parturb", MDBX_PAGEPERTURB}, {nullptr, 0}}; const struct option_verb table_bits[] = { - {"key.reverse", MDB_REVERSEKEY}, - {"key.integer", MDB_INTEGERKEY}, - {"data.integer", MDB_INTEGERDUP | MDB_DUPFIXED | MDB_DUPSORT}, - {"data.fixed", MDB_DUPFIXED | MDB_DUPSORT}, - {"data.reverse", MDB_REVERSEDUP | MDB_DUPSORT}, - {"data.dups", MDB_DUPSORT}, + {"key.reverse", MDBX_REVERSEKEY}, + {"key.integer", MDBX_INTEGERKEY}, + {"data.integer", MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT}, + {"data.fixed", MDBX_DUPFIXED | MDBX_DUPSORT}, + {"data.reverse", MDBX_REVERSEDUP | MDBX_DUPSORT}, + {"data.dups", MDBX_DUPSORT}, {nullptr, 0}}; static void dump_verbs(const char *caption, size_t bits, diff --git a/test/hill.cc b/test/hill.cc index daa6e04e..360eb29d 100644 --- a/test/hill.cc +++ b/test/hill.cc @@ -29,7 +29,7 @@ bool testcase_hill::run() { db_open(); txn_begin(false); - MDB_dbi dbi = db_table_open(true); + MDBX_dbi dbi = db_table_open(true); txn_end(false); /* LY: тест "холмиком": @@ -46,7 +46,7 @@ bool testcase_hill::run() { * итерирование ключей интервалами различной ширины, с тем чтобы * проверить различные варианты как разделения, так и слияния страниц * внутри движка. - * - при не-уникальных ключах (MDB_DUPSORT с подвариантами), для каждого + * - при не-уникальных ключах (MDBX_DUPSORT с подвариантами), для каждого * повтора внутри движка формируется вложенное btree-дерево, * соответственно требуется соблюдение аналогичных принципов * итерирования для значений. @@ -61,10 +61,11 @@ bool testcase_hill::run() { keygen::buffer b_key = keygen::alloc(config.params.keylen_max); keygen::buffer b_data = keygen::alloc(config.params.datalen_max); - const unsigned insert_flags = (config.params.table_flags & MDB_DUPSORT) - ? MDB_NODUPDATA - : MDB_NODUPDATA | MDB_NOOVERWRITE; - const unsigned update_flags = MDB_CURRENT | MDB_NODUPDATA | MDB_NOOVERWRITE; + const unsigned insert_flags = (config.params.table_flags & MDBX_DUPSORT) + ? 
MDBX_NODUPDATA + : MDBX_NODUPDATA | MDBX_NOOVERWRITE; + const unsigned update_flags = + MDBX_CURRENT | MDBX_NODUPDATA | MDBX_NOOVERWRITE; uint64_t serial_count = 0; unsigned txn_nops = 0; @@ -86,7 +87,7 @@ bool testcase_hill::run() { generate_pair(a_serial, a_key, a_data_1, age_shift); int rc = mdbx_put(txn_guard.get(), dbi, &a_key->value, &a_data_1->value, insert_flags); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_put(insert-a.1)", rc); if (++txn_nops >= config.params.batch_write) { @@ -99,7 +100,7 @@ bool testcase_hill::run() { generate_pair(b_serial, b_key, b_data, 0); rc = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value, insert_flags); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_put(insert-b)", rc); if (++txn_nops >= config.params.batch_write) { @@ -113,7 +114,7 @@ bool testcase_hill::run() { generate_pair(a_serial, a_key, a_data_0, 0); rc = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_0->value, &a_data_1->value, update_flags); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_put(update-a: 1->0)", rc); if (++txn_nops >= config.params.batch_write) { @@ -124,7 +125,7 @@ bool testcase_hill::run() { // удаляем вторую запись log_trace("uphill: delete-b %" PRIu64, b_serial); rc = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_del(b)", rc); if (++txn_nops >= config.params.batch_write) { @@ -159,7 +160,7 @@ bool testcase_hill::run() { log_trace("!!!"); int rc = mdbx_replace(txn_guard.get(), dbi, &a_key->value, &a_data_1->value, &a_data_0->value, update_flags); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_put(update-a: 0->1)", rc); if (++txn_nops >= config.params.batch_write) { @@ -172,7 +173,7 @@ bool testcase_hill::run() { generate_pair(b_serial, b_key, 
b_data, 0); rc = mdbx_put(txn_guard.get(), dbi, &b_key->value, &b_data->value, insert_flags); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_put(insert-b)", rc); if (++txn_nops >= config.params.batch_write) { @@ -184,7 +185,7 @@ bool testcase_hill::run() { log_trace("downhill: delete-a (age %" PRIu64 ") %" PRIu64, age_shift, a_serial); rc = mdbx_del(txn_guard.get(), dbi, &a_key->value, &a_data_1->value); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_del(a)", rc); if (++txn_nops >= config.params.batch_write) { @@ -195,7 +196,7 @@ bool testcase_hill::run() { // удаляем вторую запись log_trace("downhill: delete-b %" PRIu64, b_serial); rc = mdbx_del(txn_guard.get(), dbi, &b_key->value, &b_data->value); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_del(b)", rc); if (++txn_nops >= config.params.batch_write) { diff --git a/test/keygen.cc b/test/keygen.cc index 921b324e..6cfaed8c 100644 --- a/test/keygen.cc +++ b/test/keygen.cc @@ -73,8 +73,8 @@ void __hot maker::pair(serial_t serial, const buffer &key, buffer &value, assert(mapping.mesh <= mapping.width); assert(mapping.rotate <= mapping.width); assert(mapping.offset <= mask(mapping.width)); - assert(!(key_essentials.flags & (MDB_INTEGERDUP | MDB_REVERSEDUP))); - assert(!(value_essentials.flags & (MDB_INTEGERKEY | MDB_REVERSEKEY))); + assert(!(key_essentials.flags & (MDBX_INTEGERDUP | MDBX_REVERSEDUP))); + assert(!(value_essentials.flags & (MDBX_INTEGERKEY | MDBX_REVERSEKEY))); log_trace("keygen-pair: serial %" PRIu64 ", data-age %" PRIu64, serial, value_age); @@ -120,12 +120,13 @@ void __hot maker::pair(serial_t serial, const buffer &key, buffer &value, void maker::setup(const config::actor_params_pod &actor, unsigned thread_number) { - key_essentials.flags = actor.table_flags & (MDB_INTEGERKEY | MDB_REVERSEKEY); + key_essentials.flags = + actor.table_flags & (MDBX_INTEGERKEY | 
MDBX_REVERSEKEY); key_essentials.minlen = actor.keylen_min; key_essentials.maxlen = actor.keylen_max; value_essentials.flags = - actor.table_flags & (MDB_INTEGERDUP | MDB_REVERSEDUP); + actor.table_flags & (MDBX_INTEGERDUP | MDBX_REVERSEDUP); value_essentials.minlen = actor.datalen_min; value_essentials.maxlen = actor.datalen_max; @@ -196,14 +197,14 @@ void __hot maker::mk(const serial_t serial, const essentials ¶ms, out.value.iov_base = out.bytes; out.value.iov_len = params.minlen; - if (params.flags & (MDB_INTEGERKEY | MDB_INTEGERDUP)) { + if (params.flags & (MDBX_INTEGERKEY | MDBX_INTEGERDUP)) { assert(params.maxlen == params.minlen); assert(params.minlen == 4 || params.minlen == 8); if (is_byteorder_le() || params.minlen == 8) out.u64 = serial; else out.u32 = (uint32_t)serial; - } else if (params.flags & (MDB_REVERSEKEY | MDB_REVERSEDUP)) { + } else if (params.flags & (MDBX_REVERSEKEY | MDBX_REVERSEDUP)) { if (out.value.iov_len > 8) { memset(out.bytes, '\0', out.value.iov_len - 8); unaligned::store(out.bytes + out.value.iov_len - 8, htobe64(serial)); diff --git a/test/log.cc b/test/log.cc index 2254e3f0..3c0a4b11 100644 --- a/test/log.cc +++ b/test/log.cc @@ -101,9 +101,9 @@ bool output(const logging::loglevel priority, const char *format, va_list ap) { #ifdef _MSC_VER int rc = _localtime32_s(&tm, (const __time32_t *)&now.utc); #else - int rc = localtime_r(&time, &tm) ? MDB_SUCCESS : errno; + int rc = localtime_r(&time, &tm) ? 
MDBX_SUCCESS : errno; #endif - if (rc != MDB_SUCCESS) + if (rc != MDBX_SUCCESS) failure_perror("localtime_r()", rc); last = stdout; diff --git a/test/main.cc b/test/main.cc index 8ef6f2f2..009b3e01 100644 --- a/test/main.cc +++ b/test/main.cc @@ -37,9 +37,9 @@ void actor_params::set_defaults(void) { #else "test_tmpdb.mdbx"; #endif - mode_flags = MDB_NOSUBDIR | MDB_WRITEMAP | MDB_MAPASYNC | MDB_NORDAHEAD | - MDB_NOMEMINIT | MDBX_COALESCE | MDBX_LIFORECLAIM; - table_flags = MDB_DUPSORT; + mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NORDAHEAD | + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_LIFORECLAIM; + table_flags = MDBX_DUPSORT; size = 1024 * 1024; keygen.seed = 1; diff --git a/test/osal-windows.cc b/test/osal-windows.cc index c42513f5..2c540f44 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -20,7 +20,7 @@ static HANDLE hBarrierSemaphore, hBarrierEvent; static int waitstatus2errcode(DWORD result) { switch (result) { case WAIT_OBJECT_0: - return MDB_SUCCESS; + return MDBX_SUCCESS; case WAIT_FAILED: return GetLastError(); case WAIT_ABANDONED: diff --git a/test/test.cc b/test/test.cc index ad5141f6..d19d684c 100644 --- a/test/test.cc +++ b/test/test.cc @@ -90,7 +90,7 @@ static void mdbx_debug_logger(int type, const char *function, int line, abort(); } -int testcase::oom_callback(MDB_env *env, int pid, mdbx_tid_t tid, uint64_t txn, +int testcase::oom_callback(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, unsigned gap, int retry) { testcase *self = (testcase *)mdbx_env_get_userctx(env); @@ -121,32 +121,32 @@ void testcase::db_prepare() { int rc = mdbx_setup_debug(mdbx_dbg_opts, mdbx_debug_logger, MDBX_DBG_DNT); log_info("set mdbx debug-opts: 0x%02x", rc); - MDB_env *env = nullptr; + MDBX_env *env = nullptr; rc = mdbx_env_create(&env); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_env_create()", rc); assert(env != nullptr); db_guard.reset(env); rc = mdbx_env_set_userctx(env, this); - 
if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_env_set_userctx()", rc); rc = mdbx_env_set_maxreaders(env, config.params.max_readers); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_env_set_maxreaders()", rc); rc = mdbx_env_set_maxdbs(env, config.params.max_tables); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_env_set_maxdbs()", rc); rc = mdbx_env_set_oomfunc(env, testcase::oom_callback); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_env_set_oomfunc()", rc); rc = mdbx_env_set_mapsize(env, (size_t)config.params.size); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_env_set_mapsize()", rc); log_trace("<< db_prepare"); @@ -159,7 +159,7 @@ void testcase::db_open() { db_prepare(); int rc = mdbx_env_open(db_guard.get(), config.params.pathname_db.c_str(), (unsigned)config.params.mode_flags, 0640); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_env_open()", rc); log_trace("<< db_open"); @@ -179,8 +179,8 @@ void testcase::txn_begin(bool readonly) { MDBX_txn *txn = nullptr; int rc = - mdbx_txn_begin(db_guard.get(), nullptr, readonly ? MDB_RDONLY : 0, &txn); - if (unlikely(rc != MDB_SUCCESS)) + mdbx_txn_begin(db_guard.get(), nullptr, readonly ? 
MDBX_RDONLY : 0, &txn); + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_txn_begin()", rc); txn_guard.reset(txn); @@ -194,11 +194,11 @@ void testcase::txn_end(bool abort) { MDBX_txn *txn = txn_guard.release(); if (abort) { int rc = mdbx_txn_abort(txn); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_txn_abort()", rc); } else { int rc = mdbx_txn_commit(txn); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_txn_commit()", rc); } @@ -303,7 +303,7 @@ void testcase::fetch_canary() { log_trace(">> fetch_canary"); int rc = mdbx_canary_get(txn_guard.get(), &canary_now); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_canary_get()", rc); if (canary_now.v < last.canary.v) @@ -329,13 +329,13 @@ void testcase::update_canary(uint64_t increment) { canary_now.y += increment; int rc = mdbx_canary_put(txn_guard.get(), &canary_now); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_canary_put()", rc); log_trace("<< update_canary: sequence = %" PRIu64, canary_now.y); } -MDB_dbi testcase::db_table_open(bool create) { +MDBX_dbi testcase::db_table_open(bool create) { log_trace(">> testcase::db_table_create"); char tablename_buf[16]; @@ -349,23 +349,23 @@ MDB_dbi testcase::db_table_open(bool create) { } log_verbose("use %s table", tablename ? tablename : "MAINDB"); - MDB_dbi handle = 0; + MDBX_dbi handle = 0; int rc = mdbx_dbi_open(txn_guard.get(), tablename, - (create ? MDB_CREATE : 0) | config.params.table_flags, + (create ? 
MDBX_CREATE : 0) | config.params.table_flags, &handle); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_dbi_open()", rc); log_trace("<< testcase::db_table_create, handle %u", handle); return handle; } -void testcase::db_table_drop(MDB_dbi handle) { +void testcase::db_table_drop(MDBX_dbi handle) { log_trace(">> testcase::db_table_drop, handle %u", handle); if (config.params.drop_table) { int rc = mdbx_drop(txn_guard.get(), handle, true); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_drop()", rc); log_trace("<< testcase::db_table_drop"); } else { @@ -373,11 +373,11 @@ void testcase::db_table_drop(MDB_dbi handle) { } } -void testcase::db_table_close(MDB_dbi handle) { +void testcase::db_table_close(MDBX_dbi handle) { log_trace(">> testcase::db_table_close, handle %u", handle); assert(!txn_guard); int rc = mdbx_dbi_close(db_guard.get(), handle); - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_dbi_close()", rc); log_trace("<< testcase::db_table_close"); } diff --git a/test/test.h b/test/test.h index 441bc72c..7b43357b 100644 --- a/test/test.h +++ b/test/test.h @@ -54,8 +54,8 @@ extern bool failfast; //----------------------------------------------------------------------------- -struct db_deleter : public std::unary_function { - void operator()(MDB_env *env) const { mdbx_env_close(env); } +struct db_deleter : public std::unary_function { + void operator()(MDBX_env *env) const { mdbx_env_close(env); } }; struct txn_deleter : public std::unary_function { @@ -66,13 +66,13 @@ struct txn_deleter : public std::unary_function { } }; -struct cursor_deleter : public std::unary_function { - void operator()(MDB_cursor *cursor) const { mdbx_cursor_close(cursor); } +struct cursor_deleter : public std::unary_function { + void operator()(MDBX_cursor *cursor) const { mdbx_cursor_close(cursor); } }; -typedef std::unique_ptr scoped_db_guard; +typedef 
std::unique_ptr scoped_db_guard; typedef std::unique_ptr scoped_txn_guard; -typedef std::unique_ptr scoped_cursor_guard; +typedef std::unique_ptr scoped_cursor_guard; //----------------------------------------------------------------------------- @@ -96,7 +96,7 @@ protected: mdbx_canary canary; } last; - static int oom_callback(MDB_env *env, int pid, mdbx_tid_t tid, uint64_t txn, + static int oom_callback(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, unsigned gap, int retry); void db_prepare(); @@ -108,9 +108,9 @@ protected: void fetch_canary(); void update_canary(uint64_t increment); - MDB_dbi db_table_open(bool create); - void db_table_drop(MDB_dbi handle); - void db_table_close(MDB_dbi handle); + MDBX_dbi db_table_open(bool create); + void db_table_drop(MDBX_dbi handle); + void db_table_close(MDBX_dbi handle); bool wait4start(); void report(size_t nops_done); @@ -128,7 +128,7 @@ protected: } bool mode_readonly() const { - return (config.params.mode_flags & MDB_RDONLY) ? true : false; + return (config.params.mode_flags & MDBX_RDONLY) ? 
true : false; } public: diff --git a/test/test.vcxproj b/test/test.vcxproj index 047e6ae3..7afeb1c7 100644 --- a/test/test.vcxproj +++ b/test/test.vcxproj @@ -95,7 +95,7 @@ Use Level3 Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + WIN32;_DEBUG;_CONSOLE;MDBX_DEBUG=1;%(PreprocessorDefinitions) true test.h @@ -110,7 +110,7 @@ Use Level3 Disabled - _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + _DEBUG;_CONSOLE;MDBX_DEBUG=1;%(PreprocessorDefinitions) true test.h diff --git a/tutorial/sample-mdb.txt b/tutorial/sample-mdb.txt index bc7dc9eb..54b56f61 100644 --- a/tutorial/sample-mdb.txt +++ b/tutorial/sample-mdb.txt @@ -19,16 +19,16 @@ */ #include -#include "lmdb.h" +#include "mdbx.h" int main(int argc,char * argv[]) { int rc; - MDB_env *env; - MDB_dbi dbi; + MDBX_env *env; + MDBX_dbi dbi; MDBX_val key, data; MDBX_txn *txn; - MDB_cursor *cursor; + MDBX_cursor *cursor; char sval[32]; /* Note: Most error checking omitted for simplicity */ @@ -50,9 +50,9 @@ int main(int argc,char * argv[]) fprintf(stderr, "mdbx_txn_commit: (%d) %s\n", rc, mdbx_strerror(rc)); goto leave; } - rc = mdbx_txn_begin(env, NULL, MDB_RDONLY, &txn); + rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); rc = mdbx_cursor_open(txn, dbi, &cursor); - while ((rc = mdbx_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { + while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) == 0) { printf("key: %p %.*s, data: %p %.*s\n", key.iov_base, (int) key.iov_len, (char *) key.iov_base, data.iov_base, (int) data.iov_len, (char *) data.iov_base); From a6c8c1ad082e0c526ee0f089b95d3d7cf5754934 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 02:16:25 +0300 Subject: [PATCH 156/303] test: add simple progress indicator. 
--- test/base.h | 2 ++ test/config.cc | 2 ++ test/main.cc | 5 +++++ test/osal-unix.cc | 2 ++ test/osal-windows.cc | 2 ++ test/osal.h | 1 + test/test.cc | 32 +++++++++++++++++++++++++++++++- test/test.h | 2 ++ 8 files changed, 47 insertions(+), 1 deletion(-) diff --git a/test/base.h b/test/base.h index ad804813..39e2c357 100644 --- a/test/base.h +++ b/test/base.h @@ -35,9 +35,11 @@ #include #if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) +#include #else #include #include +#include #endif #ifdef _BSD_SOURCE diff --git a/test/config.cc b/test/config.cc index c1d2a76d..02a4f955 100644 --- a/test/config.cc +++ b/test/config.cc @@ -339,6 +339,8 @@ void dump(const char *title) { global::config::cleanup_after ? "Yes" : "No"); log_info("failfast: %s\n", global::config::failfast ? "Yes" : "No"); + log_info("progress indicator: %s\n", + global::config::progress_indicator ? "Yes" : "No"); } } /* namespace config */ diff --git a/test/main.cc b/test/main.cc index 009b3e01..8cc21e51 100644 --- a/test/main.cc +++ b/test/main.cc @@ -76,6 +76,7 @@ void actor_params::set_defaults(void) { global::config::cleanup_before = true; global::config::cleanup_after = true; global::config::failfast = true; + global::config::progress_indicator = osal_istty(STDERR_FILENO); } namespace global { @@ -95,6 +96,7 @@ bool dump_config; bool cleanup_before; bool cleanup_after; bool failfast; +bool progress_indicator; } /* namespace config */ } /* namespace global */ @@ -272,6 +274,9 @@ int main(int argc, char *const argv[]) { if (config::parse_option(argc, argv, narg, "failfast", global::config::failfast)) continue; + if (config::parse_option(argc, argv, narg, "progress", + global::config::progress_indicator)) + continue; if (*argv[narg] != '-') testcase_setup(argv[narg], params, last_space_id); diff --git a/test/osal-unix.cc b/test/osal-unix.cc index c44ade47..88a10f11 100644 --- a/test/osal-unix.cc +++ b/test/osal-unix.cc @@ -270,3 +270,5 @@ void osal_udelay(unsigned us) { now = 
chrono::now_motonic(); } while (until.fixedpoint > now.fixedpoint); } + +bool osal_istty(int fd) { return isatty(fd) == 1; } diff --git a/test/osal-windows.cc b/test/osal-windows.cc index 2c540f44..57f7f547 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.cc @@ -303,3 +303,5 @@ void osal_udelay(unsigned us) { now = chrono::now_motonic(); } while (now.fixedpoint < until.fixedpoint); } + +bool osal_istty(int fd) { return _isatty(fd) != 0; } diff --git a/test/osal.h b/test/osal.h index 7eac2ad8..29d52219 100644 --- a/test/osal.h +++ b/test/osal.h @@ -30,3 +30,4 @@ mdbx_pid_t osal_getpid(void); int osal_delay(unsigned seconds); void osal_udelay(unsigned us); void osal_yield(void); +bool osal_istty(int fd); diff --git a/test/test.cc b/test/test.cc index d19d684c..cded3e2a 100644 --- a/test/test.cc +++ b/test/test.cc @@ -101,7 +101,7 @@ int testcase::oom_callback(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, pid, (size_t)tid, txn, gap); if (self->should_continue()) { - osal_yield(); + /* osal_yield(); */ osal_udelay(retry * 100); return 1 /* always retry */; } @@ -239,11 +239,38 @@ bool testcase::wait4start() { return true; } +void testcase::kick_progress(bool active) const { + static chrono::time last; + chrono::time now = chrono::now_motonic(); + + if (active) { + static int last_point = -1; + int point = (now.fixedpoint >> 29) & 3; + if (point != last_point) { + last = now; + fprintf(stderr, "%c\b", "-\\|/"[last_point = point]); + fflush(stderr); + } + } else if (now.fixedpoint - last.fixedpoint > + chrono::from_seconds(2).fixedpoint) { + last = now; + fprintf(stderr, "%c\b", "@*"[now.utc & 1]); + fflush(stderr); + } +} + void testcase::report(size_t nops_done) { + assert(nops_done > 0); + if (!nops_done) + return; + nops_completed += nops_done; log_verbose("== complete +%" PRIuPTR " iteration, total %" PRIuPTR " done", nops_done, nops_completed); + if (global::config::progress_indicator) + kick_progress(true); + if (config.signal_nops && !signalled 
&& config.signal_nops <= nops_completed) { log_trace(">> signal(n-ops %" PRIuPTR ")", nops_completed); @@ -295,6 +322,9 @@ bool testcase::should_continue() const { if (config.params.test_nops && nops_completed >= config.params.test_nops) result = false; + if (result && global::config::progress_indicator) + kick_progress(false); + return result; } diff --git a/test/test.h b/test/test.h index 7b43357b..34083805 100644 --- a/test/test.h +++ b/test/test.h @@ -48,6 +48,7 @@ extern bool dump_config; extern bool cleanup_before; extern bool cleanup_after; extern bool failfast; +extern bool progress_indicator; } /* namespace config */ } /* namespace global */ @@ -107,6 +108,7 @@ protected: void txn_restart(bool abort, bool readonly); void fetch_canary(); void update_canary(uint64_t increment); + void kick_progress(bool active) const; MDBX_dbi db_table_open(bool create); void db_table_drop(MDBX_dbi handle); From 38deb14ee4f2a22b0e72f4f548872bd42335e9b8 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 02:43:43 +0300 Subject: [PATCH 157/303] test: fix STDERR_FILENO for MSVC. --- test/osal.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/osal.h b/test/osal.h index 29d52219..8f162635 100644 --- a/test/osal.h +++ b/test/osal.h @@ -31,3 +31,15 @@ int osal_delay(unsigned seconds); void osal_udelay(unsigned us); void osal_yield(void); bool osal_istty(int fd); + +#ifdef _MSC_VER +#ifndef STDIN_FILENO +#define STDIN_FILENO _fileno(stdin) +#endif +#ifndef STDOUT_FILENO +#define STDOUT_FILENO _fileno(stdout) +#endif +#ifndef STDERR_FILENO +#define STDERR_FILENO _fileno(stderr) +#endif +#endif /* _MSC_VER */ From 8af6291d50715b63f575535932d81ca07eccab8d Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 02:46:51 +0300 Subject: [PATCH 158/303] test: set default db-size to 4M. 
--- test/main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/main.cc b/test/main.cc index 8cc21e51..2c4f9b09 100644 --- a/test/main.cc +++ b/test/main.cc @@ -40,7 +40,7 @@ void actor_params::set_defaults(void) { mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_MAPASYNC | MDBX_NORDAHEAD | MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_LIFORECLAIM; table_flags = MDBX_DUPSORT; - size = 1024 * 1024; + size = 1024 * 1024 * 4; keygen.seed = 1; keygen.keycase = kc_random; From dd7855c30d052f5f43cb54731b69fb77f09b1e58 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 13:37:06 +0300 Subject: [PATCH 159/303] mdbx: cleanup tabs (minor). --- src/bits.h | 109 ++++++++-------- src/defs.h | 363 ++++++++++++++++++++++++++--------------------------- 2 files changed, 235 insertions(+), 237 deletions(-) diff --git a/src/bits.h b/src/bits.h index cc2e6ab6..897e1eb6 100644 --- a/src/bits.h +++ b/src/bits.h @@ -16,18 +16,17 @@ /* clang-format off */ #ifndef _FILE_OFFSET_BITS -# define _FILE_OFFSET_BITS 64 +# define _FILE_OFFSET_BITS 64 #endif #if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS) -# define _CRT_SECURE_NO_WARNINGS +# define _CRT_SECURE_NO_WARNINGS #endif #ifdef _MSC_VER #pragma warning(disable : 4464) /* C4464: relative include path contains '..' 
*/ #pragma warning(disable : 4710) /* C4710: 'xyz': function not inlined */ #pragma warning(disable : 4711) /* C4711: function 'xyz' selected for automatic inline expansion */ -//#pragma warning(disable : 4061) /* C4061: enumerator 'abc' in switch of enum 'xyz' is not explicitly handled by a case label */ #pragma warning(disable : 4201) /* C4201: nonstandard extension used : nameless struct / union */ #pragma warning(disable : 4706) /* C4706: assignment within conditional expression */ #pragma warning(disable : 4127) /* C4127: conditional expression is constant */ @@ -37,102 +36,102 @@ #include "./defs.h" #if defined(USE_VALGRIND) -# include -# ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE -/* LY: available since Valgrind 3.10 */ -# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# endif +# include +# ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE + /* LY: available since Valgrind 3.10 */ +# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# endif #else -# define VALGRIND_CREATE_MEMPOOL(h,r,z) -# define VALGRIND_DESTROY_MEMPOOL(h) -# define VALGRIND_MEMPOOL_TRIM(h,a,s) -# define VALGRIND_MEMPOOL_ALLOC(h,a,s) -# define VALGRIND_MEMPOOL_FREE(h,a) -# define VALGRIND_MEMPOOL_CHANGE(h,a,b,s) -# define VALGRIND_MAKE_MEM_NOACCESS(a,s) -# define VALGRIND_MAKE_MEM_DEFINED(a,s) -# define VALGRIND_MAKE_MEM_UNDEFINED(a,s) -# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0) -# define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0) +# define VALGRIND_CREATE_MEMPOOL(h,r,z) +# define VALGRIND_DESTROY_MEMPOOL(h) +# define VALGRIND_MEMPOOL_TRIM(h,a,s) +# define VALGRIND_MEMPOOL_ALLOC(h,a,s) +# define VALGRIND_MEMPOOL_FREE(h,a) +# define VALGRIND_MEMPOOL_CHANGE(h,a,b,s) +# define VALGRIND_MAKE_MEM_NOACCESS(a,s) +# 
define VALGRIND_MAKE_MEM_DEFINED(a,s) +# define VALGRIND_MAKE_MEM_UNDEFINED(a,s) +# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0) +# define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0) #endif /* USE_VALGRIND */ #ifdef __SANITIZE_ADDRESS__ -# include +# include #else -# define ASAN_POISON_MEMORY_REGION(addr, size) \ - ((void)(addr), (void)(size)) -# define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ - ((void)(addr), (void)(size)) +# define ASAN_POISON_MEMORY_REGION(addr, size) \ + ((void)(addr), (void)(size)) +# define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + ((void)(addr), (void)(size)) #endif /* __SANITIZE_ADDRESS__ */ #include "./osal.h" #ifndef MDBX_DEBUG -# define MDBX_DEBUG 0 +# define MDBX_DEBUG 0 #endif #if MDBX_DEBUG -# undef NDEBUG +# undef NDEBUG #endif #if defined(__GNUC__) && !__GNUC_PREREQ(4,2) - /* Actualy libmdbx was not tested with compilers older than GCC from RHEL6. - * But you could remove this #error and try to continue at your own risk. - * In such case please don't rise up an issues related ONLY to old compilers. - */ -# warning "libmdbx required at least GCC 4.2 compatible C/C++ compiler." + /* Actualy libmdbx was not tested with compilers older than GCC from RHEL6. + * But you could remove this #error and try to continue at your own risk. + * In such case please don't rise up an issues related ONLY to old compilers. + */ +# warning "libmdbx required at least GCC 4.2 compatible C/C++ compiler." #endif #if defined(__GLIBC__) && !__GLIBC_PREREQ(2,12) - /* Actualy libmdbx was not tested with something older than glibc 2.12 (from RHEL6). - * But you could remove this #error and try to continue at your own risk. - * In such case please don't rise up an issues related ONLY to old systems. - */ -# warning "libmdbx required at least GLIBC 2.12." + /* Actualy libmdbx was not tested with something older than glibc 2.12 (from RHEL6). 
+ * But you could remove this #error and try to continue at your own risk. + * In such case please don't rise up an issues related ONLY to old systems. + */ +# warning "libmdbx required at least GLIBC 2.12." #endif #if defined(__i386) || defined(__x86_64) || defined(_M_IX86) -# define UNALIGNED_OK 1 /* TODO */ +# define UNALIGNED_OK 1 /* TODO */ #endif #ifndef UNALIGNED_OK -# define UNALIGNED_OK 0 +# define UNALIGNED_OK 0 #endif /* UNALIGNED_OK */ #if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF -# error "Sanity checking failed: Two's complement, reasonably sized integer types" +# error "Sanity checking failed: Two's complement, reasonably sized integer types" #endif /*----------------------------------------------------------------------------*/ #ifndef ARRAY_LENGTH -# ifdef __cplusplus - template - char (&__ArraySizeHelper(T (&array)[N]))[N]; -# define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array))) -# else -# define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0])) -# endif +# ifdef __cplusplus + template + char (&__ArraySizeHelper(T (&array)[N]))[N]; +# define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array))) +# else +# define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0])) +# endif #endif /* ARRAY_LENGTH */ #ifndef ARRAY_END -# define ARRAY_END(array) (&array[ARRAY_LENGTH(array)]) +# define ARRAY_END(array) (&array[ARRAY_LENGTH(array)]) #endif /* ARRAY_END */ #ifndef STRINGIFY -# define STRINGIFY_HELPER(x) #x -# define STRINGIFY(x) STRINGIFY_HELPER(x) +# define STRINGIFY_HELPER(x) #x +# define STRINGIFY(x) STRINGIFY_HELPER(x) #endif /* STRINGIFY */ #ifndef offsetof -# define offsetof(type, member) __builtin_offsetof(type, member) +# define offsetof(type, member) __builtin_offsetof(type, member) #endif /* offsetof */ #ifndef container_of -# define container_of(ptr, type, member) \ - ((type *)((char *)(ptr) - offsetof(type, member))) +# define container_of(ptr, type, member) \ + ((type *)((char *)(ptr) - 
offsetof(type, member))) #endif /* container_of */ /* *INDENT-ON* */ diff --git a/src/defs.h b/src/defs.h index 24c67cc2..6cae9714 100644 --- a/src/defs.h +++ b/src/defs.h @@ -17,293 +17,292 @@ /* clang-format off */ #ifndef __GNUC_PREREQ -# if defined(__GNUC__) && defined(__GNUC_MINOR__) -# define __GNUC_PREREQ(maj, min) \ - ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) -# else -# define __GNUC_PREREQ(maj, min) (0) -# endif +# if defined(__GNUC__) && defined(__GNUC_MINOR__) +# define __GNUC_PREREQ(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +# else +# define __GNUC_PREREQ(maj, min) (0) +# endif #endif /* __GNUC_PREREQ */ #ifndef __CLANG_PREREQ -# ifdef __clang__ -# define __CLANG_PREREQ(maj,min) \ - ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min)) -# else -# define __CLANG_PREREQ(maj,min) (0) -# endif +# ifdef __clang__ +# define __CLANG_PREREQ(maj,min) \ + ((__clang_major__ << 16) + __clang_minor__ >= ((maj) << 16) + (min)) +# else +# define __CLANG_PREREQ(maj,min) (0) +# endif #endif /* __CLANG_PREREQ */ #ifndef __GLIBC_PREREQ -# if defined(__GLIBC__) && defined(__GLIBC_MINOR__) -# define __GLIBC_PREREQ(maj, min) \ - ((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min)) -# else -# define __GLIBC_PREREQ(maj, min) (0) -# endif +# if defined(__GLIBC__) && defined(__GLIBC_MINOR__) +# define __GLIBC_PREREQ(maj, min) \ + ((__GLIBC__ << 16) + __GLIBC_MINOR__ >= ((maj) << 16) + (min)) +# else +# define __GLIBC_PREREQ(maj, min) (0) +# endif #endif /* __GLIBC_PREREQ */ #ifndef __has_attribute -# define __has_attribute(x) (0) +# define __has_attribute(x) (0) #endif #ifndef __has_feature -# define __has_feature(x) (0) +# define __has_feature(x) (0) #endif #ifndef __has_extension -# define __has_extension(x) (0) +# define __has_extension(x) (0) #endif #ifndef __has_builtin -# define __has_builtin(x) (0) +# define __has_builtin(x) (0) #endif #if __has_feature(thread_sanitizer) -# define 
__SANITIZE_THREAD__ 1 +# define __SANITIZE_THREAD__ 1 #endif #if __has_feature(address_sanitizer) -# define __SANITIZE_ADDRESS__ 1 +# define __SANITIZE_ADDRESS__ 1 #endif /*----------------------------------------------------------------------------*/ #ifndef __extern_C -# ifdef __cplusplus -# define __extern_C extern "C" -# else -# define __extern_C -# endif +# ifdef __cplusplus +# define __extern_C extern "C" +# else +# define __extern_C +# endif #endif /* __extern_C */ #ifndef __cplusplus -# ifndef bool -# define bool _Bool -# endif -# ifndef true -# define true (1) -# endif -# ifndef false -# define false (0) -# endif +# ifndef bool +# define bool _Bool +# endif +# ifndef true +# define true (1) +# endif +# ifndef false +# define false (0) +# endif #endif #if !defined(nullptr) && !defined(__cplusplus) || (__cplusplus < 201103L && !defined(_MSC_VER)) -# define nullptr NULL +# define nullptr NULL #endif /*----------------------------------------------------------------------------*/ #if !defined(__thread) && (defined(_MSC_VER) || defined(__DMC__)) -# define __thread __declspec(thread) +# define __thread __declspec(thread) #endif /* __thread */ #ifndef __alwaysinline -# if defined(__GNUC__) || __has_attribute(always_inline) -# define __alwaysinline __inline __attribute__((always_inline)) -# elif defined(_MSC_VER) -# define __alwaysinline __forceinline -# else -# define __alwaysinline -# endif +# if defined(__GNUC__) || __has_attribute(always_inline) +# define __alwaysinline __inline __attribute__((always_inline)) +# elif defined(_MSC_VER) +# define __alwaysinline __forceinline +# else +# define __alwaysinline +# endif #endif /* __alwaysinline */ #ifndef __noinline -# if defined(__GNUC__) || __has_attribute(noinline) -# define __noinline __attribute__((noinline)) -# elif defined(_MSC_VER) -# define __noinline __declspec(noinline) -# elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) -# define __noinline inline -# elif !defined(__INTEL_COMPILER) -# define 
__noinline /* FIXME ? */ -# endif +# if defined(__GNUC__) || __has_attribute(noinline) +# define __noinline __attribute__((noinline)) +# elif defined(_MSC_VER) +# define __noinline __declspec(noinline) +# elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) +# define __noinline inline +# elif !defined(__INTEL_COMPILER) +# define __noinline /* FIXME ? */ +# endif #endif /* __noinline */ #ifndef __must_check_result -# if defined(__GNUC__) || __has_attribute(warn_unused_result) -# define __must_check_result __attribute__((warn_unused_result)) -# else -# define __must_check_result -# endif +# if defined(__GNUC__) || __has_attribute(warn_unused_result) +# define __must_check_result __attribute__((warn_unused_result)) +# else +# define __must_check_result +# endif #endif /* __must_check_result */ #ifndef __deprecated -# if defined(__GNUC__) || __has_attribute(deprecated) -# define __deprecated __attribute__((deprecated)) -# elif defined(_MSC_VER) -# define __deprecated __declspec(deprecated) -# else -# define __deprecated -# endif +# if defined(__GNUC__) || __has_attribute(deprecated) +# define __deprecated __attribute__((deprecated)) +# elif defined(_MSC_VER) +# define __deprecated __declspec(deprecated) +# else +# define __deprecated +# endif #endif /* __deprecated */ #ifndef __packed -# if defined(__GNUC__) || __has_attribute(packed) -# define __packed __attribute__((packed)) -# else -# define __packed -# endif +# if defined(__GNUC__) || __has_attribute(packed) +# define __packed __attribute__((packed)) +# else +# define __packed +# endif #endif /* __packed */ #ifndef __aligned -# if defined(__GNUC__) || __has_attribute(aligned) -# define __aligned(N) __attribute__((aligned(N))) -# elif defined(_MSC_VER) -# define __aligned(N) __declspec(align(N)) -# else -# define __aligned(N) -# endif +# if defined(__GNUC__) || __has_attribute(aligned) +# define __aligned(N) __attribute__((aligned(N))) +# elif defined(_MSC_VER) +# define __aligned(N) __declspec(align(N)) +# 
else +# define __aligned(N) +# endif #endif /* __aligned */ #ifndef __noreturn -# if defined(__GNUC__) || __has_attribute(noreturn) -# define __noreturn __attribute__((noreturn)) -# elif defined(_MSC_VER) -# define __noreturn __declspec(noreturn) -# else -# define __noreturn -# endif +# if defined(__GNUC__) || __has_attribute(noreturn) +# define __noreturn __attribute__((noreturn)) +# elif defined(_MSC_VER) +# define __noreturn __declspec(noreturn) +# else +# define __noreturn +# endif #endif /* __noreturn */ #ifndef __nothrow -# if defined(__GNUC__) || __has_attribute(nothrow) -# define __nothrow __attribute__((nothrow)) -# elif defined(_MSC_VER) && defined(__cplusplus) -# define __nothrow __declspec(nothrow) -# else -# define __nothrow -# endif +# if defined(__GNUC__) || __has_attribute(nothrow) +# define __nothrow __attribute__((nothrow)) +# elif defined(_MSC_VER) && defined(__cplusplus) +# define __nothrow __declspec(nothrow) +# else +# define __nothrow +# endif #endif /* __nothrow */ #ifndef __pure_function - /* Many functions have no effects except the return value and their - * return value depends only on the parameters and/or global variables. - * Such a function can be subject to common subexpression elimination - * and loop optimization just as an arithmetic operator would be. - * These functions should be declared with the attribute pure. */ -# if defined(__GNUC__) || __has_attribute(pure) -# define __pure_function __attribute__((pure)) -# else -# define __pure_function -# endif + /* Many functions have no effects except the return value and their + * return value depends only on the parameters and/or global variables. + * Such a function can be subject to common subexpression elimination + * and loop optimization just as an arithmetic operator would be. + * These functions should be declared with the attribute pure. 
*/ +# if defined(__GNUC__) || __has_attribute(pure) +# define __pure_function __attribute__((pure)) +# else +# define __pure_function +# endif #endif /* __pure_function */ #ifndef __const_function - /* Many functions do not examine any values except their arguments, - * and have no effects except the return value. Basically this is just - * slightly more strict class than the PURE attribute, since function - * is not allowed to read global memory. - * - * Note that a function that has pointer arguments and examines the - * data pointed to must not be declared const. Likewise, a function - * that calls a non-const function usually must not be const. - * It does not make sense for a const function to return void. */ -# if defined(__GNUC__) || __has_attribute(const) -# define __const_function __attribute__((const)) -# else -# define __const_function -# endif + /* Many functions do not examine any values except their arguments, + * and have no effects except the return value. Basically this is just + * slightly more strict class than the PURE attribute, since function + * is not allowed to read global memory. + * + * Note that a function that has pointer arguments and examines the + * data pointed to must not be declared const. Likewise, a function + * that calls a non-const function usually must not be const. + * It does not make sense for a const function to return void. 
*/ +# if defined(__GNUC__) || __has_attribute(const) +# define __const_function __attribute__((const)) +# else +# define __const_function +# endif #endif /* __const_function */ #ifndef __dll_hidden -# if defined(__GNUC__) || __has_attribute(visibility) -# define __hidden __attribute__((visibility("hidden"))) -# else -# define __hidden -# endif +# if defined(__GNUC__) || __has_attribute(visibility) +# define __hidden __attribute__((visibility("hidden"))) +# else +# define __hidden +# endif #endif /* __dll_hidden */ #ifndef __optimize -# if defined(__OPTIMIZE__) -# if defined(__clang__) && !__has_attribute(optimize) -# define __optimize(ops) -# elif defined(__GNUC__) || __has_attribute(optimize) -# define __optimize(ops) __attribute__((optimize(ops))) -# else -# define __optimize(ops) -# endif -# else -# define __optimize(ops) -# endif +# if defined(__OPTIMIZE__) +# if defined(__clang__) && !__has_attribute(optimize) +# define __optimize(ops) +# elif defined(__GNUC__) || __has_attribute(optimize) +# define __optimize(ops) __attribute__((optimize(ops))) +# else +# define __optimize(ops) +# endif +# else +# define __optimize(ops) +# endif #endif /* __optimize */ #ifndef __hot -# if defined(__OPTIMIZE__) -# if defined(__clang__) && !__has_attribute(hot) - /* just put frequently used functions in separate section */ -# define __hot __attribute__((section("text.hot"))) __optimize("O3") -# elif defined(__GNUC__) || __has_attribute(hot) -# define __hot __attribute__((hot)) __optimize("O3") -# else -# define __hot __optimize("O3") -# endif -# else -# define __hot -# endif +# if defined(__OPTIMIZE__) +# if defined(__clang__) && !__has_attribute(hot) + /* just put frequently used functions in separate section */ +# define __hot __attribute__((section("text.hot"))) __optimize("O3") +# elif defined(__GNUC__) || __has_attribute(hot) +# define __hot __attribute__((hot)) __optimize("O3") +# else +# define __hot __optimize("O3") +# endif +# else +# define __hot +# endif #endif /* 
__hot */ #ifndef __cold -# if defined(__OPTIMIZE__) -# if defined(__clang__) && !__has_attribute(cold) - /* just put infrequently used functions in separate section */ -# define __cold __attribute__((section("text.unlikely"))) __optimize("Os") -# elif defined(__GNUC__) || __has_attribute(cold) -# define __cold __attribute__((cold)) __optimize("Os") -# else -# define __cold __optimize("Os") -# endif -# else -# define __cold -# endif +# if defined(__OPTIMIZE__) +# if defined(__clang__) && !__has_attribute(cold) + /* just put infrequently used functions in separate section */ +# define __cold __attribute__((section("text.unlikely"))) __optimize("Os") +# elif defined(__GNUC__) || __has_attribute(cold) +# define __cold __attribute__((cold)) __optimize("Os") +# else +# define __cold __optimize("Os") +# endif +# else +# define __cold +# endif #endif /* __cold */ #ifndef __flatten -# if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(flatten)) -# define __flatten __attribute__((flatten)) -# else -# define __flatten -# endif +# if defined(__OPTIMIZE__) && (defined(__GNUC__) || __has_attribute(flatten)) +# define __flatten __attribute__((flatten)) +# else +# define __flatten +# endif #endif /* __flatten */ #ifndef likely -# if defined(__GNUC__) || defined(__clang__) -# define likely(cond) __builtin_expect(!!(cond), 1) -# else -# define likely(x) (x) -# endif +# if defined(__GNUC__) || defined(__clang__) +# define likely(cond) __builtin_expect(!!(cond), 1) +# else +# define likely(x) (x) +# endif #endif /* likely */ #ifndef unlikely -# if defined(__GNUC__) || defined(__clang__) -# define unlikely(cond) __builtin_expect(!!(cond), 0) -# else -# define unlikely(x) (x) -# endif +# if defined(__GNUC__) || defined(__clang__) +# define unlikely(cond) __builtin_expect(!!(cond), 0) +# else +# define unlikely(x) (x) +# endif #endif /* unlikely */ #if !defined(__noop) && !defined(_MSC_VER) - static __inline int __do_noop(void* crutch, ...) 
{ - (void) crutch; return 0; - } -# define __noop(...) __do_noop(0, __VA_ARGS__) + static __inline int __do_noop(void* crutch, ...) { + (void) crutch; return 0; + } +# define __noop(...) __do_noop(0, __VA_ARGS__) #endif /* __noop */ /*----------------------------------------------------------------------------*/ /* Wrapper around __func__, which is a C99 feature */ #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L -# define mdbx_func_ __func__ +# define mdbx_func_ __func__ #elif (defined(__GNUC__) && __GNUC__ >= 2) || defined(__clang__) || defined(_MSC_VER) -# define mdbx_func_ __FUNCTION__ +# define mdbx_func_ __FUNCTION__ #else -# define mdbx_func_ "" +# define mdbx_func_ "" #endif /* *INDENT-ON* */ /* clang-format on */ #define MDBX_TETRAD(a, b, c, d) \ - ((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 | \ - (uint32_t)(d)) + ((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 | (d)) From 924e81ed92e651bd5b4bd452920704c2be6884a4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 13:59:50 +0300 Subject: [PATCH 160/303] mdbx: refine includes, drop midl.h and mdbx_osal.h --- Makefile | 35 ++- dll.vcxproj | 3 +- libmdbx.files | 2 - mdbx.h | 108 ++++++++- mdbx_osal.h | 131 ----------- src/bits.h | 525 +++++++++++++++++++++++++++++++++---------- src/defs.h | 72 +++++- src/mdbx.c | 338 +--------------------------- src/midl.h | 38 ---- src/osal.h | 114 ++++++---- src/tools/mdbx_chk.c | 1 - test/test.vcxproj | 1 + 12 files changed, 672 insertions(+), 696 deletions(-) delete mode 100644 mdbx_osal.h delete mode 100644 src/midl.h diff --git a/Makefile b/Makefile index e1fd41f4..a8bae342 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,7 @@ XCFLAGS ?= -DNDEBUG=1 -DMDBX_DEBUG=0 -DLIBMDBX_EXPORTS=1 CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden CFLAGS += -D_GNU_SOURCE=1 -std=gnu99 -pthread $(XCFLAGS) CXXFLAGS = -std=c++11 $(filter-out -std=gnu99,$(CFLAGS)) -TESTDB ?= /tmp/mdbx-check.db 
+TESTDB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-check.db # LY: '--no-as-needed,-lrt' for ability to built with modern glibc, but then run with the old LDFLAGS ?= -Wl,--gc-sections,-z,relro,-O,--no-as-needed,-lrt @@ -44,6 +44,13 @@ TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 SHELL := /bin/bash +CORE_SRC := $(filter-out src/lck-windows.c, $(wildcard src/*.c)) +CORE_INC := $(wildcard src/*.h) +CORE_OBJ := $(patsubst %.c,%.o,$(CORE_SRC)) +TEST_SRC := $(filter-out test/osal-windows.cc, $(wildcard test/*.cc)) +TEST_INC := $(wildcard test/*.h) +TEST_OBJ := $(patsubst %.cc,%.o,$(TEST_SRC)) + .PHONY: mdbx all install clean check coverage all: $(LIBRARIES) $(TOOLS) test/test @@ -63,27 +70,35 @@ install: $(LIBRARIES) $(TOOLS) $(HEADERS) && cp -t $(SANDBOX)$(mandir)/man1 $(MANPAGES) clean: - rm -rf $(TOOLS) test/test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err + rm -rf $(TOOLS) test/test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err src/*.o test/*.o check: test/test rm -f $(TESTDB) && (set -o pipefail; test/test --pathname=$(TESTDB) --dont-cleanup-after basic | tee test.log | tail -n 42) && ./mdbx_chk -vn $(TESTDB) -src/%.o: src/%.c mdbx.h mdbx_osal.h $(addprefix src/, defs.h bits.h osal.h midl.h) Makefile - $(CC) $(CFLAGS) -c $(filter %.c, $^) -o $@ +define core-rule +$(patsubst %.c,%.o,$(1)): $(1) $(CORE_INC) mdbx.h Makefile + $(CC) $(CFLAGS) -c $(1) -o $$@ -libmdbx.a: $(addprefix src/, mdbx.o osal.o lck-posix.o version.o) +endef +$(foreach file,$(CORE_SRC),$(eval $(call core-rule,$(file)))) + +define test-rule +$(patsubst %.cc,%.o,$(1)): $(1) $(TEST_INC) mdbx.h Makefile + $(CXX) $(CXXFLAGS) -c $(1) -o $$@ + +endef +$(foreach file,$(TEST_SRC),$(eval $(call test-rule,$(file)))) + +libmdbx.a: $(CORE_OBJ) $(AR) rs $@ $? 
-libmdbx.so: libmdbx.a +libmdbx.so: $(CORE_OBJ) $(CC) $(CFLAGS) -save-temps $^ -pthread -shared $(LDFLAGS) -o $@ mdbx_%: src/tools/mdbx_%.c libmdbx.a $(CC) $(CFLAGS) $^ $(LDFLAGS) -o $@ -test/%.o: test/%.cc $(wildcard test/*.h) Makefile - $(CXX) $(CXXFLAGS) -Isrc -c $(filter %.cc, $^) -o $@ - -test/test: $(patsubst %.cc,%.o,$(filter-out test/osal-windows.cc, $(wildcard test/*.cc))) libmdbx.a +test/test: $(TEST_OBJ) libmdbx.a $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ ifneq ($(wildcard $(IOARENA)),) diff --git a/dll.vcxproj b/dll.vcxproj index 8c179768..44c71646 100644 --- a/dll.vcxproj +++ b/dll.vcxproj @@ -146,14 +146,13 @@ + - - diff --git a/libmdbx.files b/libmdbx.files index 63bee3c9..3f51a9b5 100644 --- a/libmdbx.files +++ b/libmdbx.files @@ -3,13 +3,11 @@ LICENSE Makefile README.md mdbx.h -mdbx_osal.h src/bits.h src/defs.h src/lck-posix.c src/lck-windows.c src/mdbx.c -src/midl.h src/osal.c src/osal.h src/tools/mdbx_chk.c diff --git a/mdbx.h b/mdbx.h index eb2f4cfd..44802f30 100644 --- a/mdbx.h +++ b/mdbx.h @@ -49,14 +49,108 @@ #ifndef LIBMDBX_H #define LIBMDBX_H +/*--------------------------------------------------------------------------*/ + +#ifdef _MSC_VER +#pragma warning(push, 1) +#pragma warning(disable : 4530) /* C++ exception handler used, but unwind \ + * semantics are not enabled. Specify /EHsc */ +#pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \ + * mode specified; termination on exception is \ + * not guaranteed. 
Specify /EHsc */ +#endif /* _MSC_VER (warnings) */ + +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) + +#include +#include +typedef unsigned mode_t; +typedef HANDLE mdbx_filehandle_t; +typedef DWORD mdbx_pid_t; +typedef DWORD mdbx_tid_t; +#define MDBX_ENODATA ERROR_HANDLE_EOF +#define MDBX_EINVAL ERROR_INVALID_PARAMETER +#define MDBX_EACCESS ERROR_ACCESS_DENIED +#define MDBX_ENOMEM ERROR_OUTOFMEMORY +#define MDBX_EROFS ERROR_FILE_READ_ONLY +#define MDBX_ENOSYS ERROR_NOT_SUPPORTED +#define MDBX_EIO ERROR_WRITE_FAULT + +#else + +#include /* for error codes */ +#include /* for pthread_t */ +#include /* for pid_t */ +#include /* for truct iovec */ +#define HAVE_STRUCT_IOVEC 1 +typedef int mdbx_filehandle_t; +typedef pid_t mdbx_pid_t; +typedef pthread_t mdbx_tid_t; +#define MDBX_ENODATA ENODATA +#define MDBX_EINVAL EINVAL +#define MDBX_EACCESS EACCES +#define MDBX_ENOMEM ENOMEM +#define MDBX_EROFS EROFS +#define MDBX_ENOSYS ENOSYS +#define MDBX_EIO EIO +#endif + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/*--------------------------------------------------------------------------*/ + +#ifndef __has_attribute +#define __has_attribute(x) (0) +#endif + +#ifndef __dll_export +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(__GNUC__) || __has_attribute(dllexport) +#define __dll_export __attribute__((dllexport)) +#elif defined(_MSC_VER) +#define __dll_export __declspec(dllexport) +#else +#define __dll_export +#endif +#elif defined(__GNUC__) || __has_attribute(visibility) +#define __dll_export __attribute__((visibility("default"))) +#else +#define __dll_export +#endif +#endif /* __dll_export */ + +#ifndef __dll_import +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(__GNUC__) || __has_attribute(dllimport) +#define __dll_import __attribute__((dllimport)) +#elif defined(_MSC_VER) +#define __dll_import __declspec(dllimport) +#else +#define __dll_import +#endif +#else +#define __dll_import +#endif +#endif /* __dll_import */ + 
+/*--------------------------------------------------------------------------*/ + #define MDBX_VERSION_MAJOR 0 #define MDBX_VERSION_MINOR 0 -#ifdef _MSC_VER -#pragma warning(push) -#endif - -#include "mdbx_osal.h" +#if defined(LIBMDBX_EXPORTS) +#define LIBMDBX_API __dll_export +#elif defined(LIBMDBX_IMPORTS) +#define LIBMDBX_API __dll_import +#else +#define LIBMDBX_API +#endif /* LIBMDBX_API */ #ifdef __cplusplus extern "C" { @@ -1511,8 +1605,4 @@ LIBMDBX_API int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, } #endif -#ifdef _MSC_VER -#pragma warning(pop) -#endif - #endif /* LIBMDBX_H */ diff --git a/mdbx_osal.h b/mdbx_osal.h deleted file mode 100644 index 10237c5f..00000000 --- a/mdbx_osal.h +++ /dev/null @@ -1,131 +0,0 @@ -/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ - -/* - * Copyright 2015-2017 Leonid Yuriev - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . 
- */ - -#pragma once - -#ifndef __has_attribute -#define __has_attribute(x) (0) -#endif - -#ifndef __dll_export -#if defined(_WIN32) || defined(__CYGWIN__) -#if defined(__GNUC__) || __has_attribute(dllexport) -#define __dll_export __attribute__((dllexport)) -#elif defined(_MSC_VER) -#define __dll_export __declspec(dllexport) -#else -#define __dll_export -#endif -#elif defined(__GNUC__) || __has_attribute(visibility) -#define __dll_export __attribute__((visibility("default"))) -#else -#define __dll_export -#endif -#endif /* __dll_export */ - -#ifndef __dll_import -#if defined(_WIN32) || defined(__CYGWIN__) -#if defined(__GNUC__) || __has_attribute(dllimport) -#define __dll_import __attribute__((dllimport)) -#elif defined(_MSC_VER) -#define __dll_import __declspec(dllimport) -#else -#define __dll_import -#endif -#else -#define __dll_import -#endif -#endif /* __dll_import */ - -/*--------------------------------------------------------------------------*/ - -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4514) /* 'xyz': unreferenced inline function \ - has been removed */ -#pragma warning(disable : 4710) /* 'xyz': function not inlined */ -#pragma warning(disable : 4711) /* function 'xyz' selected for \ - automatic inline expansion */ -#pragma warning(disable : 4061) /* enumerator 'abc' in switch of enum \ - 'xyz' is not explicitly handled by a case \ - label */ -#pragma warning(disable : 4201) /* nonstandard extension used : \ - nameless struct / union */ -#pragma warning(disable : 4127) /* conditional expression is constant \ - */ - -#pragma warning(push, 1) -#pragma warning(disable : 4530) /* C++ exception handler used, but \ - unwind semantics are not enabled. Specify \ - /EHsc */ -#pragma warning(disable : 4577) /* 'noexcept' used with no exception \ - handling mode specified; termination on \ - exception is not guaranteed. 
Specify /EHsc \ - */ -#endif /* _MSC_VER (warnings) */ - -#include -#include -#include - -#if defined(_WIN32) || defined(_WIN64) - -#include -#include -typedef unsigned mode_t; -typedef HANDLE mdbx_filehandle_t; -typedef DWORD mdbx_pid_t; -typedef DWORD mdbx_tid_t; -#define MDBX_ENODATA ERROR_HANDLE_EOF -#define MDBX_EINVAL ERROR_INVALID_PARAMETER -#define MDBX_EACCESS ERROR_ACCESS_DENIED -#define MDBX_ENOMEM ERROR_OUTOFMEMORY -#define MDBX_EROFS ERROR_FILE_READ_ONLY -#define MDBX_ENOSYS ERROR_NOT_SUPPORTED -#define MDBX_EIO ERROR_WRITE_FAULT - -#else - -#include /* for error codes */ -#include /* for pthread_t */ -#include /* for pid_t */ -#include /* for truct iovec */ -#define HAVE_STRUCT_IOVEC 1 -typedef int mdbx_filehandle_t; -typedef pid_t mdbx_pid_t; -typedef pthread_t mdbx_tid_t; -#define MDBX_ENODATA ENODATA -#define MDBX_EINVAL EINVAL -#define MDBX_EACCESS EACCES -#define MDBX_ENOMEM ENOMEM -#define MDBX_EROFS EROFS -#define MDBX_ENOSYS ENOSYS -#define MDBX_EIO EIO -#endif - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -/*--------------------------------------------------------------------------*/ - -#if defined(LIBMDBX_EXPORTS) -#define LIBMDBX_API __dll_export -#elif defined(LIBMDBX_IMPORTS) -#define LIBMDBX_API __dll_import -#else -#define LIBMDBX_API -#endif /* LIBMDBX_API */ diff --git a/src/bits.h b/src/bits.h index 897e1eb6..1ef6a799 100644 --- a/src/bits.h +++ b/src/bits.h @@ -15,60 +15,6 @@ /* *INDENT-OFF* */ /* clang-format off */ -#ifndef _FILE_OFFSET_BITS -# define _FILE_OFFSET_BITS 64 -#endif - -#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS) -# define _CRT_SECURE_NO_WARNINGS -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4464) /* C4464: relative include path contains '..' 
*/ -#pragma warning(disable : 4710) /* C4710: 'xyz': function not inlined */ -#pragma warning(disable : 4711) /* C4711: function 'xyz' selected for automatic inline expansion */ -#pragma warning(disable : 4201) /* C4201: nonstandard extension used : nameless struct / union */ -#pragma warning(disable : 4706) /* C4706: assignment within conditional expression */ -#pragma warning(disable : 4127) /* C4127: conditional expression is constant */ -#endif /* _MSC_VER (warnings) */ - -#include "../mdbx.h" -#include "./defs.h" - -#if defined(USE_VALGRIND) -# include -# ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE - /* LY: available since Valgrind 3.10 */ -# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# endif -#else -# define VALGRIND_CREATE_MEMPOOL(h,r,z) -# define VALGRIND_DESTROY_MEMPOOL(h) -# define VALGRIND_MEMPOOL_TRIM(h,a,s) -# define VALGRIND_MEMPOOL_ALLOC(h,a,s) -# define VALGRIND_MEMPOOL_FREE(h,a) -# define VALGRIND_MEMPOOL_CHANGE(h,a,b,s) -# define VALGRIND_MAKE_MEM_NOACCESS(a,s) -# define VALGRIND_MAKE_MEM_DEFINED(a,s) -# define VALGRIND_MAKE_MEM_UNDEFINED(a,s) -# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0) -# define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0) -#endif /* USE_VALGRIND */ - -#ifdef __SANITIZE_ADDRESS__ -# include -#else -# define ASAN_POISON_MEMORY_REGION(addr, size) \ - ((void)(addr), (void)(size)) -# define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ - ((void)(addr), (void)(size)) -#endif /* __SANITIZE_ADDRESS__ */ - -#include "./osal.h" - #ifndef MDBX_DEBUG # define MDBX_DEBUG 0 #endif @@ -77,6 +23,14 @@ # undef NDEBUG #endif +/* Features under development */ +#ifndef MDBX_DEVEL +# define MDBX_DEVEL 0 +#endif + +#include "../mdbx.h" +#include "./defs.h" + #if defined(__GNUC__) && !__GNUC_PREREQ(4,2) /* Actualy libmdbx was not tested with 
compilers older than GCC from RHEL6. * But you could remove this #error and try to continue at your own risk. @@ -93,53 +47,66 @@ # warning "libmdbx required at least GLIBC 2.12." #endif -#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) -# define UNALIGNED_OK 1 /* TODO */ -#endif -#ifndef UNALIGNED_OK -# define UNALIGNED_OK 0 -#endif /* UNALIGNED_OK */ +#ifdef __SANITIZE_THREAD__ +# warning "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues." +#endif /* __SANITIZE_THREAD__ */ -#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF -# error "Sanity checking failed: Two's complement, reasonably sized integer types" -#endif +#ifdef _MSC_VER +#pragma warning(disable : 4464) /* C4464: relative include path contains '..' */ +#pragma warning(disable : 4710) /* C4710: 'xyz': function not inlined */ +#pragma warning(disable : 4711) /* C4711: function 'xyz' selected for automatic inline expansion */ +#pragma warning(disable : 4201) /* C4201: nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4706) /* C4706: assignment within conditional expression */ +#pragma warning(disable : 4127) /* C4127: conditional expression is constant */ +#endif /* _MSC_VER (warnings) */ -/*----------------------------------------------------------------------------*/ - -#ifndef ARRAY_LENGTH -# ifdef __cplusplus - template - char (&__ArraySizeHelper(T (&array)[N]))[N]; -# define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array))) -# else -# define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0])) -# endif -#endif /* ARRAY_LENGTH */ - -#ifndef ARRAY_END -# define ARRAY_END(array) (&array[ARRAY_LENGTH(array)]) -#endif /* ARRAY_END */ - -#ifndef STRINGIFY -# define STRINGIFY_HELPER(x) #x -# define STRINGIFY(x) STRINGIFY_HELPER(x) -#endif /* STRINGIFY */ - -#ifndef offsetof -# define offsetof(type, member) __builtin_offsetof(type, member) -#endif /* offsetof */ - -#ifndef container_of -# 
define container_of(ptr, type, member) \ - ((type *)((char *)(ptr) - offsetof(type, member))) -#endif /* container_of */ +#include "./osal.h" /* *INDENT-ON* */ /* clang-format on */ -#define FIXME "FIXME: " __FILE__ ", " STRINGIFY(__LINE__) - /*----------------------------------------------------------------------------*/ +/* Basic constants and types */ + +/* The maximum size of a database page. + * + * It is 32k or 64k, since value-PAGEBASE must fit in + * MDBX_page.mp_upper. + * + * MDBX will use database pages < OS pages if needed. + * That causes more I/O in write transactions: The OS must + * know (read) the whole page before writing a partial page. + * + * Note that we don't currently support Huge pages. On Linux, + * regular data files cannot use Huge pages, and in general + * Huge pages aren't actually pageable. We rely on the OS + * demand-pager to read our data and page it out when memory + * pressure from other processes is high. So until OSs have + * actual paging support for Huge pages, they're not viable. */ +#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) + +/* The minimum number of keys required in a database page. + * Setting this to a larger value will place a smaller bound on the + * maximum size of a data item. Data items larger than this size will + * be pushed into overflow pages instead of being stored directly in + * the B-tree node. This value used to default to 4. With a page size + * of 4096 bytes that meant that any item larger than 1024 bytes would + * go into an overflow page. That also meant that on average 2-3KB of + * each overflow page was wasted space. The value cannot be lower than + * 2 because then there would no longer be a tree structure. With this + * value, items larger than 2KB will go into overflow pages, and on + * average only 1KB will be wasted. */ +#define MDBX_MINKEYS 2 + +/* A stamp that identifies a file as an MDBX file. 
+ * There's nothing special about this value other than that it is easily + * recognizable, and it will reflect any byte order mismatches. */ +#define MDBX_MAGIC 0xBEEFC0DE + +/* The version number for a database's datafile format. */ +#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 999 : 1) +/* The version number for a database's lockfile format. */ +#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 999 : 1) /* handle for the DB used to track free pages. */ #define FREE_DBI 0 @@ -162,32 +129,53 @@ typedef uint32_t pgno_t; typedef uint64_t txnid_t; #define PRIaTXN PRIi64 -/* An IDL is an ID List, a sorted array of IDs. The first - * element of the array is a counter for how many actual - * IDs are in the list. In the original back-bdb code, IDLs are - * sorted in ascending order. For libmdb IDLs are sorted in - * descending order. */ -typedef pgno_t *MDBX_IDL; - -/* An ID2 is an ID/pointer pair. */ -typedef struct MDBX_ID2 { - pgno_t mid; /* The ID */ - void *mptr; /* The pointer */ -} MDBX_ID2; - -/* An ID2L is an ID2 List, a sorted array of ID2s. - * The first element's mid member is a count of how many actual - * elements are in the array. The mptr member of the first element is - * unused. The array is sorted in ascending order by mid. */ -typedef MDBX_ID2 *MDBX_ID2L; - /* Used for offsets within a single page. * Since memory pages are typically 4 or 8KB in size, 12-13 bits, * this is plenty. */ typedef uint16_t indx_t; +/*----------------------------------------------------------------------------*/ +/* Core structures for database and shared memory (i.e. format definition) */ #pragma pack(push, 1) +/* Reader Lock Table + * + * Readers don't acquire any locks for their data access. Instead, they + * simply record their transaction ID in the reader table. The reader + * mutex is needed just to find an empty slot in the reader table. 
The + * slot's address is saved in thread-specific data so that subsequent + * read transactions started by the same thread need no further locking to + * proceed. + * + * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. + * No reader table is used if the database is on a read-only filesystem. + * + * Since the database uses multi-version concurrency control, readers don't + * actually need any locking. This table is used to keep track of which + * readers are using data from which old transactions, so that we'll know + * when a particular old transaction is no longer in use. Old transactions + * that have discarded any data pages can then have those pages reclaimed + * for use by a later write transaction. + * + * The lock table is constructed such that reader slots are aligned with the + * processor's cache line size. Any slot is only ever used by one thread. + * This alignment guarantees that there will be no contention or cache + * thrashing as threads update their own slot info, and also eliminates + * any need for locking when accessing a slot. + * + * A writer thread will scan every slot in the table to determine the oldest + * outstanding reader transaction. Any freed pages older than this will be + * reclaimed by the writer. The writer doesn't use any locks when scanning + * this table. This means that there's no guarantee that the writer will + * see the most up-to-date reader info, but that's not required for correct + * operation - all we need is to know the upper bound on the oldest reader, + * we don't care at all about the newest reader. So the only consequence of + * reading stale information here is that old pages might hang around a + * while longer before being reclaimed. That's actually good anyway, because + * the longer we delay reclaiming old pages, the more likely it is that a + * string of contiguous pages can be found after coalescing old pages from + * many old transactions together. 
*/ + /* The actual reader record, with cacheline padding. */ typedef struct MDBX_reader { /* Current Transaction ID when this transaction began, or (txnid_t)-1. @@ -343,6 +331,54 @@ typedef struct MDBX_lockinfo { } MDBX_lockinfo; #pragma pack(pop) +/*----------------------------------------------------------------------------*/ +/* Two kind lists of pages (aka IDL) */ + +/* An IDL is an ID List, a sorted array of IDs. The first + * element of the array is a counter for how many actual + * IDs are in the list. In the libmdbx IDLs are sorted in + * descending order. */ +typedef pgno_t *MDBX_IDL; + +/* An ID2 is an ID/pointer pair. */ +typedef struct MDBX_ID2 { + pgno_t mid; /* The ID */ + void *mptr; /* The pointer */ +} MDBX_ID2; + +/* An ID2L is an ID2 List, a sorted array of ID2s. + * The first element's mid member is a count of how many actual + * elements are in the array. The mptr member of the first element is + * unused. The array is sorted in ascending order by mid. */ +typedef MDBX_ID2 *MDBX_ID2L; + +/* IDL sizes - likely should be even bigger + * limiting factors: sizeof(pgno_t), thread stack size */ +#define MDBX_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ +#define MDBX_IDL_DB_SIZE (1 << MDBX_IDL_LOGN) +#define MDBX_IDL_UM_SIZE (1 << (MDBX_IDL_LOGN + 1)) + +#define MDBX_IDL_DB_MAX (MDBX_IDL_DB_SIZE - 1) +#define MDBX_IDL_UM_MAX (MDBX_IDL_UM_SIZE - 1) + +#define MDBX_IDL_SIZEOF(ids) (((ids)[0] + 1) * sizeof(pgno_t)) +#define MDBX_IDL_IS_ZERO(ids) ((ids)[0] == 0) +#define MDBX_IDL_CPY(dst, src) (memcpy(dst, src, MDBX_IDL_SIZEOF(src))) +#define MDBX_IDL_FIRST(ids) ((ids)[1]) +#define MDBX_IDL_LAST(ids) ((ids)[(ids)[0]]) + +/* Current max length of an mdbx_midl_alloc()ed IDL */ +#define MDBX_IDL_ALLOCLEN(ids) ((ids)[-1]) + +/* Append ID to IDL. The IDL must be big enough. 
*/ +#define mdbx_midl_xappend(idl, id) \ + do { \ + pgno_t *xidl = (idl), xlen = ++(xidl[0]); \ + xidl[xlen] = (id); \ + } while (0) + +/*----------------------------------------------------------------------------*/ +/* Internal structures */ /* Auxiliary DB info. * The information here is mostly static/read-only. There is @@ -598,6 +634,7 @@ typedef struct MDBX_ntxn { } MDBX_ntxn; /*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ extern int mdbx_runtime_flags; extern MDBX_debug_func *mdbx_debug_logger; @@ -638,8 +675,6 @@ void mdbx_panic(const char *fmt, ...) #define mdbx_print(fmt, ...) \ mdbx_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) -/*----------------------------------------------------------------------------*/ - #define mdbx_trace(fmt, ...) \ do { \ if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ @@ -689,8 +724,6 @@ void mdbx_panic(const char *fmt, ...) fmt "\n", ##__VA_ARGS__); \ } while (0) -/*----------------------------------------------------------------------------*/ - #define mdbx_debug(fmt, ...) \ do { \ if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ @@ -738,8 +771,6 @@ void mdbx_panic(const char *fmt, ...) 
/* assert(3) variant in transaction context */ #define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) -/*----------------------------------------------------------------------------*/ - static __inline void mdbx_jitter4testing(bool tiny) { #ifndef NDEBUG mdbx_osal_jitter(tiny); @@ -748,6 +779,9 @@ static __inline void mdbx_jitter4testing(bool tiny) { #endif } +/*----------------------------------------------------------------------------*/ +/* Internal prototypes and inlines */ + int mdbx_reader_check0(MDBX_env *env, int rlocked, int *dead); #define METAPAGE_1(env) (&((MDBX_metabuf *)(env)->me_map)->mb_metabuf.mm_meta) @@ -782,3 +816,256 @@ static __inline size_t roundup2(size_t value, size_t granularity) { #define MDBX_IS_ERROR(rc) \ ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) + +/* Internal error codes, not exposed outside libmdbx */ +#define MDBX_NO_ROOT (MDBX_LAST_ERRCODE + 10) + +/* Debuging output value of a cursor DBI: Negative in a sub-cursor. */ +#define DDBI(mc) \ + (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) + +/* Key size which fits in a DKBUF. */ +#define DKBUF_MAXKEYSIZE 511 /* FIXME */ + +#if MDBX_DEBUG +#define DKBUF char _kbuf[DKBUF_MAXKEYSIZE * 4 + 2] +#define DKEY(x) mdbx_dkey(x, _kbuf, DKBUF_MAXKEYSIZE * 2 + 1) +#define DVAL(x) \ + mdbx_dkey(x, _kbuf + DKBUF_MAXKEYSIZE * 2 + 1, DKBUF_MAXKEYSIZE * 2 + 1) +#else +#define DKBUF ((void)(0)) +#define DKEY(x) ("-") +#define DVAL(x) ("-") +#endif + +/* An invalid page number. + * Mainly used to denote an empty tree. */ +#define P_INVALID (~(pgno_t)0) + +/* Test if the flags f are set in a flag word w. */ +#define F_ISSET(w, f) (((w) & (f)) == (f)) + +/* Round n up to an even number. */ +#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ + +/* Default size of memory map. + * This is certainly too small for any actual applications. Apps should + * always set the size explicitly using mdbx_env_set_mapsize(). 
*/ +#define DEFAULT_MAPSIZE 1048576 + +/* Number of slots in the reader table. + * This value was chosen somewhat arbitrarily. The 61 is a prime number, + * and such readers plus a couple mutexes fit into single 4KB page. + * Applications should set the table size using mdbx_env_set_maxreaders(). */ +#define DEFAULT_READERS 61 + +/* Address of first usable data byte in a page, after the header */ +#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) + +/* ITS#7713, change PAGEBASE to handle 65536 byte pages */ +#define PAGEBASE ((MDBX_DEVEL) ? PAGEHDRSZ : 0) + +/* Number of nodes on a page */ +#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1) + +/* The amount of space remaining in the page */ +#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) + +/* The percentage of space used in the page, in tenths of a percent. */ +#define PAGEFILL(env, p) \ + (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ + ((env)->me_psize - PAGEHDRSZ)) +/* The minimum page fill factor, in tenths of a percent. + * Pages emptier than this are candidates for merging. */ +#define FILL_THRESHOLD 250 + +/* Test if a page is a leaf page */ +#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) +/* Test if a page is a LEAF2 page */ +#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) +/* Test if a page is a branch page */ +#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) +/* Test if a page is an overflow page */ +#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) +/* Test if a page is a sub page */ +#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) + +/* The number of overflow pages needed to store the given size. */ +#define OVPAGES(size, psize) ((PAGEHDRSZ - 1 + (size)) / (psize) + 1) + +/* Link in MDBX_txn.mt_loose_pages list. + * Kept outside the page header, which is needed when reusing the page. */ +#define NEXT_LOOSE_PAGE(p) (*(MDBX_page **)((p) + 2)) + +/* Header for a single key/data pair within a page. 
+ * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. + * We guarantee 2-byte alignment for 'MDBX_node's. + * + * mn_lo and mn_hi are used for data size on leaf nodes, and for child + * pgno on branch nodes. On 64 bit platforms, mn_flags is also used + * for pgno. (Branch nodes have no flags). Lo and hi are in host byte + * order in case some accesses can be optimized to 32-bit word access. + * + * Leaf node flags describe node contents. F_BIGDATA says the node's + * data part is the page number of an overflow page with actual data. + * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in + * a sub-page/sub-database, and named databases (just F_SUBDATA). */ +typedef struct MDBX_node { + union { + struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + union { + struct { + uint16_t mn_lo, mn_hi; /* part of data size or pgno */ + }; + uint32_t mn_dsize; + }; + uint16_t mn_flags; /* see mdbx_node */ + uint16_t mn_ksize; /* key size */ +#else + uint16_t mn_ksize; /* key size */ + uint16_t mn_flags; /* see mdbx_node */ + union { + struct { + uint16_t mn_hi, mn_lo; /* part of data size or pgno */ + }; + uint32_t mn_dsize; + }; +#endif + }; + pgno_t mn_ksize_and_pgno; + }; + +/* mdbx_node Flags */ +#define F_BIGDATA 0x01 /* data put on overflow page */ +#define F_SUBDATA 0x02 /* data is a sub-database */ +#define F_DUPDATA 0x04 /* data has duplicates */ + +/* valid flags for mdbx_node_add() */ +#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND) + uint8_t mn_data[1]; /* key and data are appended here */ +} MDBX_node; + +/* Size of the node header, excluding dynamic data at the end */ +#define NODESIZE offsetof(MDBX_node, mn_data) + +/* Bit position of top word in page number, for shifting mn_flags */ +#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) + +/* Size of a node in a branch page with a given key. + * This is just the node header plus the key, there is no data. */ +#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 
0 : (k)->iov_len)) + +/* Size of a node in a leaf page with a given key and data. + * This is node header plus key plus data size. */ +#define LEAFSIZE(k, d) (NODESIZE + (k)->iov_len + (d)->iov_len) + +/* Address of node i in page p */ +static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { + assert(NUMKEYS(p) > (unsigned)(i)); + return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); +} + +/* Address of the key for the node */ +#define NODEKEY(node) (void *)((node)->mn_data) + +/* Address of the data for a node */ +#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) + +/* Get the page number pointed to by a branch node */ +static __inline pgno_t NODEPGNO(const MDBX_node *node) { + pgno_t pgno; + if (UNALIGNED_OK) { + pgno = node->mn_ksize_and_pgno; + if (sizeof(pgno_t) > 4) + pgno &= UINT64_C(0xffffFFFFffff); + } else { + pgno = node->mn_lo | ((pgno_t)node->mn_hi << 16); + if (sizeof(pgno_t) > 4) + pgno |= ((uint64_t)node->mn_flags) << 32; + } + return pgno; +} + +/* Set the page number in a branch node */ +static __inline void SETPGNO(MDBX_node *node, pgno_t pgno) { + assert(pgno <= (pgno_t)UINT64_C(0xffffFFFFffff)); + + if (UNALIGNED_OK) { + if (sizeof(pgno_t) > 4) + pgno |= ((uint64_t)node->mn_ksize) << 48; + node->mn_ksize_and_pgno = pgno; + } else { + node->mn_lo = (uint16_t)pgno; + node->mn_hi = (uint16_t)(pgno >> 16); + if (sizeof(pgno_t) > 4) + node->mn_flags = (uint16_t)((uint64_t)pgno >> 32); + } +} + +/* Get the size of the data in a leaf node */ +static __inline size_t NODEDSZ(const MDBX_node *node) { + size_t size; + if (UNALIGNED_OK) { + size = node->mn_dsize; + } else { + size = node->mn_lo | ((size_t)node->mn_hi << 16); + } + return size; +} + +/* Set the size of the data for a leaf node */ +static __inline void SETDSZ(MDBX_node *node, unsigned size) { + if (UNALIGNED_OK) { + node->mn_dsize = size; + } else { + node->mn_lo = (uint16_t)size; + node->mn_hi = (uint16_t)(size >> 16); + } +} + +/* The size of a
key in a node */ +#define NODEKSZ(node) ((node)->mn_ksize) + +/* The address of a key in a LEAF2 page. + * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs. + * There are no node headers, keys are stored contiguously. */ +#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i) * (ks))) + +/* Set the node's key into keyptr, if requested. */ +#define MDBX_GET_KEY(node, keyptr) \ + do { \ + if ((keyptr) != NULL) { \ + (keyptr)->iov_len = NODEKSZ(node); \ + (keyptr)->iov_base = NODEKEY(node); \ + } \ + } while (0) + +/* Set the node's key into key. */ +#define MDBX_GET_KEY2(node, key) \ + do { \ + key.iov_len = NODEKSZ(node); \ + key.iov_base = NODEKEY(node); \ + } while (0) + +#define MDBX_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define PERSISTENT_FLAGS (0xffff & ~(MDBX_VALID)) +/* mdbx_dbi_open() flags */ +#define VALID_FLAGS \ + (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED | \ + MDBX_INTEGERDUP | MDBX_REVERSEDUP | MDBX_CREATE) + +/* max number of pages to commit in one writev() call */ +#define MDBX_COMMIT_PAGES 64 +#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ +#undef MDBX_COMMIT_PAGES +#define MDBX_COMMIT_PAGES IOV_MAX +#endif + +/* Check txn and dbi arguments to a function */ +#define TXN_DBI_EXIST(txn, dbi, validity) \ + ((dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) + +/* Check for misused dbi handles */ +#define TXN_DBI_CHANGED(txn, dbi) \ + ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) diff --git a/src/defs.h b/src/defs.h index 6cae9714..e4a4c49f 100644 --- a/src/defs.h +++ b/src/defs.h @@ -290,8 +290,6 @@ # define __noop(...) 
__do_noop(0, __VA_ARGS__) #endif /* __noop */ -/*----------------------------------------------------------------------------*/ - /* Wrapper around __func__, which is a C99 feature */ #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L # define mdbx_func_ __func__ @@ -301,8 +299,74 @@ # define mdbx_func_ "" #endif -/* *INDENT-ON* */ -/* clang-format on */ +/*----------------------------------------------------------------------------*/ + +#if defined(USE_VALGRIND) +# include +# ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE + /* LY: available since Valgrind 3.10 */ +# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# endif +#else +# define VALGRIND_CREATE_MEMPOOL(h,r,z) +# define VALGRIND_DESTROY_MEMPOOL(h) +# define VALGRIND_MEMPOOL_TRIM(h,a,s) +# define VALGRIND_MEMPOOL_ALLOC(h,a,s) +# define VALGRIND_MEMPOOL_FREE(h,a) +# define VALGRIND_MEMPOOL_CHANGE(h,a,b,s) +# define VALGRIND_MAKE_MEM_NOACCESS(a,s) +# define VALGRIND_MAKE_MEM_DEFINED(a,s) +# define VALGRIND_MAKE_MEM_UNDEFINED(a,s) +# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0) +# define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0) +#endif /* USE_VALGRIND */ + +#ifdef __SANITIZE_ADDRESS__ +# include +#else +# define ASAN_POISON_MEMORY_REGION(addr, size) \ + ((void)(addr), (void)(size)) +# define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + ((void)(addr), (void)(size)) +#endif /* __SANITIZE_ADDRESS__ */ + +/*----------------------------------------------------------------------------*/ + +#ifndef ARRAY_LENGTH +# ifdef __cplusplus + template + char (&__ArraySizeHelper(T (&array)[N]))[N]; +# define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array))) +# else +# define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0])) +# endif +#endif /* ARRAY_LENGTH */ + +#ifndef ARRAY_END +# define 
ARRAY_END(array) (&array[ARRAY_LENGTH(array)]) +#endif /* ARRAY_END */ + +#ifndef STRINGIFY +# define STRINGIFY_HELPER(x) #x +# define STRINGIFY(x) STRINGIFY_HELPER(x) +#endif /* STRINGIFY */ + +#ifndef offsetof +# define offsetof(type, member) __builtin_offsetof(type, member) +#endif /* offsetof */ + +#ifndef container_of +# define container_of(ptr, type, member) \ + ((type *)((char *)(ptr) - offsetof(type, member))) +#endif /* container_of */ #define MDBX_TETRAD(a, b, c, d) \ ((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 | (d)) + +#define FIXME "FIXME: " __FILE__ ", " STRINGIFY(__LINE__) + +/* *INDENT-ON* */ +/* clang-format on */ diff --git a/src/mdbx.c b/src/mdbx.c index 6e71ea2a..d829bb60 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -36,7 +36,6 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "./bits.h" -#include "./midl.h" /*----------------------------------------------------------------------------*/ /* rthc (tls keys and destructors) */ @@ -255,342 +254,6 @@ int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn); txnid_t mdbx_debug_edge; #endif -/* Features under development */ -#ifndef MDBX_DEVEL -#define MDBX_DEVEL 0 -#endif - -/* Internal error codes, not exposed outside libmdbx */ -#define MDBX_NO_ROOT (MDBX_LAST_ERRCODE + 10) - -/* Debuging output value of a cursor DBI: Negative in a sub-cursor. */ -#define DDBI(mc) \ - (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) - -/* The maximum size of a database page. - * - * It is 32k or 64k, since value-PAGEBASE must fit in - * MDBX_page.mp_upper. - * - * MDBX will use database pages < OS pages if needed. - * That causes more I/O in write transactions: The OS must - * know (read) the whole page before writing a partial page. - * - * Note that we don't currently support Huge pages. On Linux, - * regular data files cannot use Huge pages, and in general - * Huge pages aren't actually pageable. 
We rely on the OS - * demand-pager to read our data and page it out when memory - * pressure from other processes is high. So until OSs have - * actual paging support for Huge pages, they're not viable. */ -#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) - -/* The minimum number of keys required in a database page. - * Setting this to a larger value will place a smaller bound on the - * maximum size of a data item. Data items larger than this size will - * be pushed into overflow pages instead of being stored directly in - * the B-tree node. This value used to default to 4. With a page size - * of 4096 bytes that meant that any item larger than 1024 bytes would - * go into an overflow page. That also meant that on average 2-3KB of - * each overflow page was wasted space. The value cannot be lower than - * 2 because then there would no longer be a tree structure. With this - * value, items larger than 2KB will go into overflow pages, and on - * average only 1KB will be wasted. */ -#define MDBX_MINKEYS 2 - -/* A stamp that identifies a file as an MDBX file. - * There's nothing special about this value other than that it is easily - * recognizable, and it will reflect any byte order mismatches. */ -#define MDBX_MAGIC 0xBEEFC0DE - -/* The version number for a database's datafile format. */ -#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 999 : 1) -/* The version number for a database's lockfile format. */ -#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 999 : 1) - -/* Key size which fits in a DKBUF. */ -#define DKBUF_MAXKEYSIZE 511 /* FIXME */ - -#if MDBX_DEBUG -#define DKBUF char _kbuf[DKBUF_MAXKEYSIZE * 4 + 2] -#define DKEY(x) mdbx_dkey(x, _kbuf, DKBUF_MAXKEYSIZE * 2 + 1) -#define DVAL(x) \ - mdbx_dkey(x, _kbuf + DKBUF_MAXKEYSIZE * 2 + 1, DKBUF_MAXKEYSIZE * 2 + 1) -#else -#define DKBUF ((void)(0)) -#define DKEY(x) ("-") -#define DVAL(x) ("-") -#endif - -/* An invalid page number. - * Mainly used to denote an empty tree. 
*/ -#define P_INVALID (~(pgno_t)0) - -/* Test if the flags f are set in a flag word w. */ -#define F_ISSET(w, f) (((w) & (f)) == (f)) - -/* Round n up to an even number. */ -#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ - -/* Default size of memory map. - * This is certainly too small for any actual applications. Apps should - * always set the size explicitly using mdbx_env_set_mapsize(). */ -#define DEFAULT_MAPSIZE 1048576 - -/* Reader Lock Table - * - * Readers don't acquire any locks for their data access. Instead, they - * simply record their transaction ID in the reader table. The reader - * mutex is needed just to find an empty slot in the reader table. The - * slot's address is saved in thread-specific data so that subsequent - * read transactions started by the same thread need no further locking to - * proceed. - * - * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. - * No reader table is used if the database is on a read-only filesystem. - * - * Since the database uses multi-version concurrency control, readers don't - * actually need any locking. This table is used to keep track of which - * readers are using data from which old transactions, so that we'll know - * when a particular old transaction is no longer in use. Old transactions - * that have discarded any data pages can then have those pages reclaimed - * for use by a later write transaction. - * - * The lock table is constructed such that reader slots are aligned with the - * processor's cache line size. Any slot is only ever used by one thread. - * This alignment guarantees that there will be no contention or cache - * thrashing as threads update their own slot info, and also eliminates - * any need for locking when accessing a slot. - * - * A writer thread will scan every slot in the table to determine the oldest - * outstanding reader transaction. Any freed pages older than this will be - * reclaimed by the writer. 
The writer doesn't use any locks when scanning - * this table. This means that there's no guarantee that the writer will - * see the most up-to-date reader info, but that's not required for correct - * operation - all we need is to know the upper bound on the oldest reader, - * we don't care at all about the newest reader. So the only consequence of - * reading stale information here is that old pages might hang around a - * while longer before being reclaimed. That's actually good anyway, because - * the longer we delay reclaiming old pages, the more likely it is that a - * string of contiguous pages can be found after coalescing old pages from - * many old transactions together. */ - -/* Number of slots in the reader table. - * This value was chosen somewhat arbitrarily. The 61 is a prime number, - * and such readers plus a couple mutexes fit into single 4KB page. - * Applications should set the table size using mdbx_env_set_maxreaders(). */ -#define DEFAULT_READERS 61 - -/* Address of first usable data byte in a page, after the header */ -#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) - -/* ITS#7713, change PAGEBASE to handle 65536 byte pages */ -#define PAGEBASE ((MDBX_DEVEL) ? PAGEHDRSZ : 0) - -/* Number of nodes on a page */ -#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1) - -/* The amount of space remaining in the page */ -#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) - -/* The percentage of space used in the page, in tenths of a percent. */ -#define PAGEFILL(env, p) \ - (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ - ((env)->me_psize - PAGEHDRSZ)) -/* The minimum page fill factor, in tenths of a percent. - * Pages emptier than this are candidates for merging. 
*/ -#define FILL_THRESHOLD 250 - -/* Test if a page is a leaf page */ -#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) -/* Test if a page is a LEAF2 page */ -#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) -/* Test if a page is a branch page */ -#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) -/* Test if a page is an overflow page */ -#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) -/* Test if a page is a sub page */ -#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) - -/* The number of overflow pages needed to store the given size. */ -#define OVPAGES(size, psize) ((PAGEHDRSZ - 1 + (size)) / (psize) + 1) - -/* Link in MDBX_txn.mt_loose_pages list. - * Kept outside the page header, which is needed when reusing the page. */ -#define NEXT_LOOSE_PAGE(p) (*(MDBX_page **)((p) + 2)) - -/* Header for a single key/data pair within a page. - * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. - * We guarantee 2-byte alignment for 'MDBX_node's. - * - * mn_lo and mn_hi are used for data size on leaf nodes, and for child - * pgno on branch nodes. On 64 bit platforms, mn_flags is also used - * for pgno. (Branch nodes have no flags). Lo and hi are in host byte - * order in case some accesses can be optimized to 32-bit word access. - * - * Leaf node flags describe node contents. F_BIGDATA says the node's - * data part is the page number of an overflow page with actual data. - * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in - * a sub-page/sub-database, and named databases (just F_SUBDATA). 
*/ -typedef struct MDBX_node { - union { - struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - union { - struct { - uint16_t mn_lo, mn_hi; /* part of data size or pgno */ - }; - uint32_t mn_dsize; - }; - uint16_t mn_flags; /* see mdbx_node */ - uint16_t mn_ksize; /* key size */ -#else - uint16_t mn_ksize; /* key size */ - uint16_t mn_flags; /* see mdbx_node */ - union { - struct { - uint16_t mn_hi, mn_lo; /* part of data size or pgno */ - }; - uint32_t mn_dsize; - }; -#endif - }; - pgno_t mn_ksize_and_pgno; - }; - -/* mdbx_node Flags */ -#define F_BIGDATA 0x01 /* data put on overflow page */ -#define F_SUBDATA 0x02 /* data is a sub-database */ -#define F_DUPDATA 0x04 /* data has duplicates */ - -/* valid flags for mdbx_node_add() */ -#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND) - uint8_t mn_data[1]; /* key and data are appended here */ -} MDBX_node; - -/* Size of the node header, excluding dynamic data at the end */ -#define NODESIZE offsetof(MDBX_node, mn_data) - -/* Bit position of top word in page number, for shifting mn_flags */ -#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) - -/* Size of a node in a branch page with a given key. - * This is just the node header plus the key, there is no data. */ -#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->iov_len)) - -/* Size of a node in a leaf page with a given key and data. - * This is node header plus key plus data size. 
*/ -#define LEAFSIZE(k, d) (NODESIZE + (k)->iov_len + (d)->iov_len) - -/* Address of node i in page p */ -static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { - assert(NUMKEYS(p) > (unsigned)(i)); - return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); -} - -/* Address of the key for the node */ -#define NODEKEY(node) (void *)((node)->mn_data) - -/* Address of the data for a node */ -#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) - -/* Get the page number pointed to by a branch node */ -static __inline pgno_t NODEPGNO(const MDBX_node *node) { - pgno_t pgno; - if (UNALIGNED_OK) { - pgno = node->mn_ksize_and_pgno; - if (sizeof(pgno_t) > 4) - pgno &= UINT64_C(0xffffFFFFffff); - } else { - pgno = node->mn_lo | ((pgno_t)node->mn_lo << 16); - if (sizeof(pgno_t) > 4) - pgno |= ((uint64_t)node->mn_flags) << 32; - } - return pgno; -} - -/* Set the page number in a branch node */ -static __inline void SETPGNO(MDBX_node *node, pgno_t pgno) { - assert(pgno <= (pgno_t)UINT64_C(0xffffFFFFffff)); - - if (UNALIGNED_OK) { - if (sizeof(pgno_t) > 4) - pgno |= ((uint64_t)node->mn_ksize) << 48; - node->mn_ksize_and_pgno = pgno; - } else { - node->mn_lo = (uint16_t)pgno; - node->mn_hi = (uint16_t)(pgno >> 16); - if (sizeof(pgno_t) > 4) - node->mn_flags = (uint16_t)((uint64_t)pgno >> 32); - } -} - -/* Get the size of the data in a leaf node */ -static __inline size_t NODEDSZ(const MDBX_node *node) { - size_t size; - if (UNALIGNED_OK) { - size = node->mn_dsize; - } else { - size = node->mn_lo | ((size_t)node->mn_hi << 16); - } - return size; -} - -/* Set the size of the data for a leaf node */ -static __inline void SETDSZ(MDBX_node *node, unsigned size) { - if (UNALIGNED_OK) { - node->mn_dsize = size; - } else { - node->mn_lo = (uint16_t)size; - node->mn_hi = (uint16_t)(size >> 16); - } -} - -/* The size of a key in a node */ -#define NODEKSZ(node) ((node)->mn_ksize) - -/* The address of a key in a LEAF2 page. 
- * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs. - * There are no node headers, keys are stored contiguously. */ -#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i) * (ks))) - -/* Set the node's key into keyptr, if requested. */ -#define MDBX_GET_KEY(node, keyptr) \ - do { \ - if ((keyptr) != NULL) { \ - (keyptr)->iov_len = NODEKSZ(node); \ - (keyptr)->iov_base = NODEKEY(node); \ - } \ - } while (0) - -/* Set the node's key into key. */ -#define MDBX_GET_KEY2(node, key) \ - do { \ - key.iov_len = NODEKSZ(node); \ - key.iov_base = NODEKEY(node); \ - } while (0) - -#define MDBX_VALID 0x8000 /* DB handle is valid, for me_dbflags */ -#define PERSISTENT_FLAGS (0xffff & ~(MDBX_VALID)) -/* mdbx_dbi_open() flags */ -#define VALID_FLAGS \ - (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED | \ - MDBX_INTEGERDUP | MDBX_REVERSEDUP | MDBX_CREATE) - -/* max number of pages to commit in one writev() call */ -#define MDBX_COMMIT_PAGES 64 -#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ -#undef MDBX_COMMIT_PAGES -#define MDBX_COMMIT_PAGES IOV_MAX -#endif - -/* Check txn and dbi arguments to a function */ -#define TXN_DBI_EXIST(txn, dbi, validity) \ - ((dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) - -/* Check for misused dbi handles */ -#define TXN_DBI_CHANGED(txn, dbi) \ - ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) - static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, int flags); static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, int num, MDBX_page **mp); @@ -622,6 +285,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode); static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **mp, int *lvl); static int mdbx_page_search_root(MDBX_cursor *mc, MDBX_val *key, int modify); + #define MDBX_PS_MODIFY 1 #define MDBX_PS_ROOTONLY 2 #define MDBX_PS_FIRST 4 diff --git a/src/midl.h b/src/midl.h deleted file mode 100644 index 8c983c24..00000000 
--- a/src/midl.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2015-2017 Leonid Yuriev - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -/* IDL sizes - likely should be even bigger - * limiting factors: sizeof(pgno_t), thread stack size */ -#define MDBX_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ -#define MDBX_IDL_DB_SIZE (1 << MDBX_IDL_LOGN) -#define MDBX_IDL_UM_SIZE (1 << (MDBX_IDL_LOGN + 1)) - -#define MDBX_IDL_DB_MAX (MDBX_IDL_DB_SIZE - 1) -#define MDBX_IDL_UM_MAX (MDBX_IDL_UM_SIZE - 1) - -#define MDBX_IDL_SIZEOF(ids) (((ids)[0] + 1) * sizeof(pgno_t)) -#define MDBX_IDL_IS_ZERO(ids) ((ids)[0] == 0) -#define MDBX_IDL_CPY(dst, src) (memcpy(dst, src, MDBX_IDL_SIZEOF(src))) -#define MDBX_IDL_FIRST(ids) ((ids)[1]) -#define MDBX_IDL_LAST(ids) ((ids)[(ids)[0]]) - -/* Current max length of an #mdbx_midl_alloc()ed IDL */ -#define MDBX_IDL_ALLOCLEN(ids) ((ids)[-1]) - -/* Append ID to IDL. The IDL must be big enough. */ -#define mdbx_midl_xappend(idl, id) \ - do { \ - pgno_t *xidl = (idl), xlen = ++(xidl[0]); \ - xidl[xlen] = (id); \ - } while (0) diff --git a/src/osal.h b/src/osal.h index 16a12302..846e5673 100644 --- a/src/osal.h +++ b/src/osal.h @@ -16,16 +16,27 @@ #pragma once +/*----------------------------------------------------------------------------*/ +/* Microsoft compiler generates a lot of warning for self includes... */ + #ifdef _MSC_VER #pragma warning(push, 1) -#pragma warning(disable : 4530) /* C++ exception handler used, but \ - unwind semantics are not enabled. 
Specify \ - /EHsc */ -#pragma warning(disable : 4577) /* 'noexcept' used with no exception \ - handling mode specified; termination on \ - exception is not guaranteed. Specify /EHsc \ - */ -#endif /* _MSC_VER (warnings) */ +#pragma warning(disable : 4530) /* C++ exception handler used, but unwind \ + * semantics are not enabled. Specify /EHsc */ +#pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \ + * mode specified; termination on exception is \ + * not guaranteed. Specify /EHsc */ +#if !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#endif +#endif /* _MSC_VER (warnings) */ + +/*----------------------------------------------------------------------------*/ +/* C99 includes */ + +#ifndef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 64 +#endif #include #include @@ -52,6 +63,9 @@ #define _XOPEN_SOURCE 0 #endif +/*----------------------------------------------------------------------------*/ +/* Systems includes */ + #if defined(_WIN32) || defined(_WIN64) #include #include @@ -103,7 +117,20 @@ typedef pthread_mutex_t mdbx_fastmutex_t; #include #endif +#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) +#define UNALIGNED_OK 1 /* TODO */ +#endif +#ifndef UNALIGNED_OK +#define UNALIGNED_OK 0 +#endif /* UNALIGNED_OK */ + +#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF +#error \ + "Sanity checking failed: Two's complement, reasonably sized integer types" +#endif + /*----------------------------------------------------------------------------*/ +/* Compiler's includes for builtins/intrinsics */ #ifdef _MSC_VER @@ -162,10 +189,6 @@ typedef pthread_mutex_t mdbx_fastmutex_t; #include /* defines BYTE_ORDER on HPUX and Solaris */ #endif -#ifdef _MSC_VER -#pragma warning(pop) -#endif - #if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) #define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN #define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN @@ -197,36 +220,7 @@ typedef 
pthread_mutex_t mdbx_fastmutex_t; #endif /*----------------------------------------------------------------------------*/ -/* Cache coherence */ - -#if defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64) || \ - defined(_M_IX86) || defined(__i386) || defined(__amd64) || \ - defined(i386) || defined(__x86_64) || defined(_AMD64_) || defined(_M_X64) -#define MDBX_CACHE_IS_COHERENT 1 -#elif defined(__hppa) || defined(__hppa__) -#define MDBX_CACHE_IS_COHERENT 1 -#endif - -#ifndef MDBX_CACHE_IS_COHERENT -#define MDBX_CACHE_IS_COHERENT 0 -#endif - -#ifndef MDBX_CACHELINE_SIZE -#if defined(SYSTEM_CACHE_ALIGNMENT_SIZE) -#define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE -#elif defined(__ia64__) || defined(__ia64) || defined(_M_IA64) -#define MDBX_CACHELINE_SIZE 128 -#else -#define MDBX_CACHELINE_SIZE 64 -#endif -#endif /* MDBX_CACHELINE_SIZE */ - -#ifndef __cache_aligned -#define __cache_aligned __aligned(MDBX_CACHELINE_SIZE) -#endif - -/*----------------------------------------------------------------------------*/ -/* Memory/Compiler barriers */ +/* Memory/Compiler barriers, cache coherence */ static __inline void mdbx_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) @@ -286,6 +280,35 @@ static __inline void mdbx_memory_barrier(void) { #endif } +/*----------------------------------------------------------------------------*/ +/* Cache coherence and invalidation */ + +#if defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64) || \ + defined(_M_IX86) || defined(__i386) || defined(__amd64) || \ + defined(i386) || defined(__x86_64) || defined(_AMD64_) || defined(_M_X64) +#define MDBX_CACHE_IS_COHERENT 1 +#elif defined(__hppa) || defined(__hppa__) +#define MDBX_CACHE_IS_COHERENT 1 +#endif + +#ifndef MDBX_CACHE_IS_COHERENT +#define MDBX_CACHE_IS_COHERENT 0 +#endif + +#ifndef MDBX_CACHELINE_SIZE +#if defined(SYSTEM_CACHE_ALIGNMENT_SIZE) +#define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE +#elif defined(__ia64__) || defined(__ia64) || 
defined(_M_IA64) +#define MDBX_CACHELINE_SIZE 128 +#else +#define MDBX_CACHELINE_SIZE 64 +#endif +#endif /* MDBX_CACHELINE_SIZE */ + +#ifndef __cache_aligned +#define __cache_aligned __aligned(MDBX_CACHELINE_SIZE) +#endif + #if MDBX_CACHE_IS_COHERENT #define mdbx_coherent_barrier() mdbx_compiler_barrier() #else @@ -313,6 +336,7 @@ static __inline void mdbx_invalidate_cache(void *addr, size_t nbytes) { } /*----------------------------------------------------------------------------*/ +/* libc compatibility stuff */ #ifndef mdbx_assert_fail void mdbx_assert_fail(MDBX_env *env, const char *msg, const char *func, @@ -338,6 +362,7 @@ int mdbx_asprintf(char **strp, const char *fmt, ...); #endif /* _MSC_VER */ /*----------------------------------------------------------------------------*/ +/* OS abstraction layer stuff */ /* max bytes to write in one call */ #define MAX_WRITE UINT32_C(0x3fff0000) @@ -444,6 +469,7 @@ static __inline mdbx_pid_t mdbx_getpid(void) { void mdbx_osal_jitter(bool tiny); /*----------------------------------------------------------------------------*/ +/* lck stuff */ #if defined(_WIN32) || defined(_WIN64) #undef MDBX_OSAL_LOCK @@ -477,6 +503,7 @@ int mdbx_rpid_clear(MDBX_env *env); int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid); /*----------------------------------------------------------------------------*/ +/* Atomics */ #if (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) && \ (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ @@ -485,7 +512,6 @@ int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid); #elif defined(__GNUC__) || defined(__clang__) /* LY: nothing required */ #elif defined(_MSC_VER) -#pragma warning(push) #pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ #pragma warning(disable : 4133) /* 'function': incompatible types - from \ 'size_t' to 'LONGLONG' */ @@ -558,6 +584,8 @@ static __inline bool mdbx_atomic_compare_and_swap(volatile size_t *p, size_t c, #endif } 
+/*----------------------------------------------------------------------------*/ + #ifdef _MSC_VER #pragma warning(pop) #endif diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 930f8b58..71e8e103 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -27,7 +27,6 @@ #include "../../mdbx.h" #include "../bits.h" -#include "../midl.h" typedef struct flagbit { int bit; diff --git a/test/test.vcxproj b/test/test.vcxproj index 7afeb1c7..6676ffc0 100644 --- a/test/test.vcxproj +++ b/test/test.vcxproj @@ -164,6 +164,7 @@ + From 96b9af0b4dd9672d5625a5c963736c5baa0c2f5b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 15:50:03 +0300 Subject: [PATCH 161/303] mdbx: minor refine idl functions. --- src/bits.h | 21 +- src/mdbx.c | 573 +++++++++++++++++++++++++---------------------------- 2 files changed, 286 insertions(+), 308 deletions(-) diff --git a/src/bits.h b/src/bits.h index 1ef6a799..0a999c02 100644 --- a/src/bits.h +++ b/src/bits.h @@ -370,13 +370,6 @@ typedef MDBX_ID2 *MDBX_ID2L; /* Current max length of an mdbx_midl_alloc()ed IDL */ #define MDBX_IDL_ALLOCLEN(ids) ((ids)[-1]) -/* Append ID to IDL. The IDL must be big enough. */ -#define mdbx_midl_xappend(idl, id) \ - do { \ - pgno_t *xidl = (idl), xlen = ++(xidl[0]); \ - xidl[xlen] = (id); \ - } while (0) - /*----------------------------------------------------------------------------*/ /* Internal structures */ @@ -1069,3 +1062,17 @@ static __inline void SETDSZ(MDBX_node *node, unsigned size) { /* Check for misused dbi handles */ #define TXN_DBI_CHANGED(txn, dbi) \ ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) + +/* LY: fast enough on most systems + * + * / + * | -1, a < b + * cmp2int(a,b) = < 0, a == b + * | 1, a > b + * \ + */ +#if 1 +#define mdbx_cmp2int(a, b) (((b) > (a)) ? 
-1 : (a) > (b)) +#else +#define mdbx_cmp2int(a, b) (((a) > (b)) - ((b) > (a))) +#endif diff --git a/src/mdbx.c b/src/mdbx.c index d829bb60..5574cca0 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -156,78 +156,332 @@ __cold void mdbx_rthc_remove(mdbx_thread_key_t key) { /*----------------------------------------------------------------------------*/ +/* Allocate an IDL. + * Allocates memory for an IDL of the given size. + * Returns IDL on success, NULL on failure. */ +static MDBX_IDL mdbx_midl_alloc(unsigned size) { + MDBX_IDL ids = malloc((size + 2) * sizeof(pgno_t)); + if (likely(ids)) { + *ids++ = size; + *ids = 0; + } + return ids; +} + +/* Free an IDL. + * [in] ids The IDL to free. */ +static void mdbx_midl_free(MDBX_IDL ids) { + if (ids) + free(ids - 1); +} + +/* Append ID to IDL. The IDL must be big enough. */ +static __inline void mdbx_midl_xappend(MDBX_IDL idl, pgno_t id) { + assert(idl[0] + (size_t)1 < MDBX_IDL_ALLOCLEN(idl)); + idl[idl[0] += 1] = id; +} + /* Search for an ID in an IDL. * [in] ids The IDL to search. * [in] id The ID to search for. * Returns The index of the first ID greater than or equal to id. */ -static unsigned mdbx_midl_search(MDBX_IDL ids, pgno_t id); +static unsigned __hot mdbx_midl_search(MDBX_IDL ids, pgno_t id) { + /* binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = ids[0]; -/* Allocate an IDL. - * Allocates memory for an IDL of the given size. - * Returns IDL on success, NULL on failure. */ -static MDBX_IDL mdbx_midl_alloc(int num); + while (n > 0) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = mdbx_cmp2int(ids[cursor], id); -/* Free an IDL. - * [in] ids The IDL to free. 
*/ -static void mdbx_midl_free(MDBX_IDL ids); + if (val < 0) { + n = pivot; + } else if (val > 0) { + base = cursor; + n -= pivot + 1; + } else { + return cursor; + } + } + + if (val > 0) + ++cursor; + + return cursor; +} /* Shrink an IDL. * Return the IDL to the default size if it has grown larger. * [in,out] idp Address of the IDL to shrink. */ -static void mdbx_midl_shrink(MDBX_IDL *idp); +static void mdbx_midl_shrink(MDBX_IDL *idp) { + MDBX_IDL ids = *idp - 1; + if (unlikely(*ids > MDBX_IDL_UM_MAX)) { + /* shrink to MDBX_IDL_UM_MAX */ + ids = realloc(ids, (MDBX_IDL_UM_MAX + 2) * sizeof(pgno_t)); + if (likely(ids)) { + *ids++ = MDBX_IDL_UM_MAX; + *idp = ids; + } + } +} + +/* Grow an IDL. + * Return the IDL to the size growed by given number. + * [in,out] idp Address of the IDL to grow. */ +static int mdbx_midl_grow(MDBX_IDL *idp, unsigned num) { + MDBX_IDL idn = *idp - 1; + /* grow it */ + idn = realloc(idn, (*idn + num + 2) * sizeof(pgno_t)); + if (unlikely(!idn)) + return MDBX_ENOMEM; + *idn++ += num; + *idp = idn; + return 0; +} /* Make room for num additional elements in an IDL. * [in,out] idp Address of the IDL. * [in] num Number of elements to make room for. * Returns 0 on success, MDBX_ENOMEM on failure. */ -static int mdbx_midl_need(MDBX_IDL *idp, unsigned num); +static int mdbx_midl_need(MDBX_IDL *idp, unsigned num) { + MDBX_IDL ids = *idp; + num += ids[0]; + if (num > ids[-1]) { + num = (num + num / 4 + (256 + 2)) & -256; + ids = realloc(ids - 1, num * sizeof(pgno_t)); + if (unlikely(!ids)) + return MDBX_ENOMEM; + *ids++ = num - 2; + *idp = ids; + } + return 0; +} /* Append an ID onto an IDL. * [in,out] idp Address of the IDL to append to. * [in] id The ID to append. * Returns 0 on success, MDBX_ENOMEM if the IDL is too large. */ -static int mdbx_midl_append(MDBX_IDL *idp, pgno_t id); +static int mdbx_midl_append(MDBX_IDL *idp, pgno_t id) { + MDBX_IDL ids = *idp; + /* Too big? 
*/ + if (ids[0] >= ids[-1]) { + if (mdbx_midl_grow(idp, MDBX_IDL_UM_MAX)) + return MDBX_ENOMEM; + ids = *idp; + } + ids[0]++; + ids[ids[0]] = id; + return 0; +} /* Append an IDL onto an IDL. * [in,out] idp Address of the IDL to append to. * [in] app The IDL to append. * Returns 0 on success, MDBX_ENOMEM if the IDL is too large. */ -static int mdbx_midl_append_list(MDBX_IDL *idp, MDBX_IDL app); +static int mdbx_midl_append_list(MDBX_IDL *idp, MDBX_IDL app) { + MDBX_IDL ids = *idp; + /* Too big? */ + if (ids[0] + app[0] >= ids[-1]) { + if (mdbx_midl_grow(idp, app[0])) + return MDBX_ENOMEM; + ids = *idp; + } + memcpy(&ids[ids[0] + 1], &app[1], app[0] * sizeof(pgno_t)); + ids[0] += app[0]; + return 0; +} /* Append an ID range onto an IDL. * [in,out] idp Address of the IDL to append to. * [in] id The lowest ID to append. * [in] n Number of IDs to append. * Returns 0 on success, MDBX_ENOMEM if the IDL is too large. */ -static int mdbx_midl_append_range(MDBX_IDL *idp, pgno_t id, unsigned n); +static int mdbx_midl_append_range(MDBX_IDL *idp, pgno_t id, unsigned n) { + pgno_t *ids = *idp, len = ids[0]; + /* Too big? */ + if (len + n > ids[-1]) { + if (mdbx_midl_grow(idp, n | MDBX_IDL_UM_MAX)) + return MDBX_ENOMEM; + ids = *idp; + } + ids[0] = len + n; + ids += len; + while (n) + ids[n--] = id++; + return 0; +} /* Merge an IDL onto an IDL. The destination IDL must be big enough. * [in] idl The IDL to merge into. * [in] merge The IDL to merge. */ -static void mdbx_midl_xmerge(MDBX_IDL idl, MDBX_IDL merge); +static void __hot mdbx_midl_xmerge(MDBX_IDL idl, MDBX_IDL merge) { + pgno_t old_id, merge_id, i = merge[0], j = idl[0], k = i + j, total = k; + idl[0] = ~(pgno_t)0; /* delimiter for idl scan below */ + old_id = idl[j]; + while (i) { + merge_id = merge[i--]; + for (; old_id < merge_id; old_id = idl[--j]) + idl[k--] = old_id; + idl[k--] = merge_id; + } + idl[0] = total; +} /* Sort an IDL. * [in,out] ids The IDL to sort. 
*/ -static void mdbx_midl_sort(MDBX_IDL ids); +static void __hot mdbx_midl_sort(MDBX_IDL ids) { + /* Max possible depth of int-indexed tree * 2 items/level */ + int istack[sizeof(int) * CHAR_BIT * 2]; + int i, j, k, l, ir, jstack; + pgno_t a; + +/* Quicksort + Insertion sort for small arrays */ +#define MIDL_SMALL 8 +#define MIDL_SWAP(a, b) \ + do { \ + pgno_t tmp_pgno = (a); \ + (a) = (b); \ + (b) = tmp_pgno; \ + } while (0) + + ir = (int)ids[0]; + l = 1; + jstack = 0; + for (;;) { + if (ir - l < MIDL_SMALL) { /* Insertion sort */ + for (j = l + 1; j <= ir; j++) { + a = ids[j]; + for (i = j - 1; i >= 1; i--) { + if (ids[i] >= a) + break; + ids[i + 1] = ids[i]; + } + ids[i + 1] = a; + } + if (jstack == 0) + break; + ir = istack[jstack--]; + l = istack[jstack--]; + } else { + k = (l + ir) >> 1; /* Choose median of left, center, right */ + MIDL_SWAP(ids[k], ids[l + 1]); + if (ids[l] < ids[ir]) + MIDL_SWAP(ids[l], ids[ir]); + + if (ids[l + 1] < ids[ir]) + MIDL_SWAP(ids[l + 1], ids[ir]); + + if (ids[l] < ids[l + 1]) + MIDL_SWAP(ids[l], ids[l + 1]); + + i = l + 1; + j = ir; + a = ids[l + 1]; + for (;;) { + do + i++; + while (ids[i] > a); + do + j--; + while (ids[j] < a); + if (j < i) + break; + MIDL_SWAP(ids[i], ids[j]); + } + ids[l + 1] = ids[j]; + ids[j] = a; + jstack += 2; + if (ir - i + 1 >= j - l) { + istack[jstack] = ir; + istack[jstack - 1] = i; + ir = j - 1; + } else { + istack[jstack] = j - 1; + istack[jstack - 1] = l; + l = i; + } + } + } +#undef MIDL_SMALL +#undef MIDL_SWAP +} /* Search for an ID in an ID2L. * [in] ids The ID2L to search. * [in] id The ID to search for. * Returns The index of the first ID2 whose mid member is greater than * or equal to id. 
*/ -static unsigned mdbx_mid2l_search(MDBX_ID2L ids, pgno_t id); +static unsigned __hot mdbx_mid2l_search(MDBX_ID2L ids, pgno_t id) { + /* binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = (unsigned)ids[0].mid; + + while (n > 0) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = mdbx_cmp2int(id, ids[cursor].mid); + + if (val < 0) { + n = pivot; + } else if (val > 0) { + base = cursor; + n -= pivot + 1; + } else { + return cursor; + } + } + + if (val > 0) + ++cursor; + + return cursor; +} /* Insert an ID2 into a ID2L. * [in,out] ids The ID2L to insert into. * [in] id The ID2 to insert. * Returns 0 on success, -1 if the ID was already present in the ID2L. */ -static int mdbx_mid2l_insert(MDBX_ID2L ids, MDBX_ID2 *id); +static int mdbx_mid2l_insert(MDBX_ID2L ids, MDBX_ID2 *id) { + unsigned x = mdbx_mid2l_search(ids, id->mid); + if (unlikely(x < 1)) + return /* internal error */ -2; + + if (x <= ids[0].mid && ids[x].mid == id->mid) + return /* duplicate */ -1; + + if (unlikely(ids[0].mid >= MDBX_IDL_UM_MAX)) + return /* too big */ -2; + + /* insert id */ + ids[0].mid++; + for (unsigned i = (unsigned)ids[0].mid; i > x; i--) + ids[i] = ids[i - 1]; + ids[x] = *id; + return 0; +} /* Append an ID2 into a ID2L. * [in,out] ids The ID2L to append into. * [in] id The ID2 to append. * Returns 0 on success, -2 if the ID2L is too big. */ -static int mdbx_mid2l_append(MDBX_ID2L ids, MDBX_ID2 *id); +static int mdbx_mid2l_append(MDBX_ID2L ids, MDBX_ID2 *id) { + /* Too big? 
*/ + if (unlikely(ids[0].mid >= MDBX_IDL_UM_MAX)) + return -2; + + ids[0].mid++; + ids[ids[0].mid] = *id; + return 0; +} /*----------------------------------------------------------------------------*/ @@ -3885,20 +4139,6 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { void __cold mdbx_env_close(MDBX_env *env) { mdbx_env_close_ex(env, 0); } -/* LY: fast enough on most arches - * - * / - * | -1, a < b - * cmp2int(a,b) = < 0, a == b - * | 1, a > b - * \ - */ -#if 1 -#define mdbx_cmp2int(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -#define mdbx_cmp2int(a, b) (((a) > (b)) - ((b) > (a))) -#endif - /* Compare two items pointing at aligned unsigned int's. */ static int __hot mdbx_cmp_int_ai(const MDBX_val *a, const MDBX_val *b) { mdbx_assert(NULL, a->iov_len == b->iov_len); @@ -8951,275 +9191,6 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { return rc; } -static unsigned __hot mdbx_midl_search(MDBX_IDL ids, pgno_t id) { - /* binary search of id in ids - * if found, returns position of id - * if not found, returns first position greater than id */ - unsigned base = 0; - unsigned cursor = 1; - int val = 0; - unsigned n = ids[0]; - - while (n > 0) { - unsigned pivot = n >> 1; - cursor = base + pivot + 1; - val = mdbx_cmp2int(ids[cursor], id); - - if (val < 0) { - n = pivot; - } else if (val > 0) { - base = cursor; - n -= pivot + 1; - } else { - return cursor; - } - } - - if (val > 0) - ++cursor; - - return cursor; -} - -static MDBX_IDL mdbx_midl_alloc(int num) { - MDBX_IDL ids = malloc((num + 2) * sizeof(pgno_t)); - if (likely(ids)) { - *ids++ = num; - *ids = 0; - } - return ids; -} - -static void mdbx_midl_free(MDBX_IDL ids) { - if (ids) - free(ids - 1); -} - -static void mdbx_midl_shrink(MDBX_IDL *idp) { - MDBX_IDL ids = *idp - 1; - if (unlikely(*ids > MDBX_IDL_UM_MAX)) { - /* shrink to MDBX_IDL_UM_MAX */ - ids = realloc(ids, (MDBX_IDL_UM_MAX + 2) * sizeof(pgno_t)); - if (likely(ids)) { - *ids++ = MDBX_IDL_UM_MAX; - *idp = ids; 
- } - } -} - -static int mdbx_midl_grow(MDBX_IDL *idp, int num) { - MDBX_IDL idn = *idp - 1; - /* grow it */ - idn = realloc(idn, (*idn + num + 2) * sizeof(pgno_t)); - if (unlikely(!idn)) - return MDBX_ENOMEM; - *idn++ += num; - *idp = idn; - return 0; -} - -static int mdbx_midl_need(MDBX_IDL *idp, unsigned num) { - MDBX_IDL ids = *idp; - num += ids[0]; - if (num > ids[-1]) { - num = (num + num / 4 + (256 + 2)) & -256; - ids = realloc(ids - 1, num * sizeof(pgno_t)); - if (unlikely(!ids)) - return MDBX_ENOMEM; - *ids++ = num - 2; - *idp = ids; - } - return 0; -} - -static int mdbx_midl_append(MDBX_IDL *idp, pgno_t id) { - MDBX_IDL ids = *idp; - /* Too big? */ - if (ids[0] >= ids[-1]) { - if (mdbx_midl_grow(idp, MDBX_IDL_UM_MAX)) - return MDBX_ENOMEM; - ids = *idp; - } - ids[0]++; - ids[ids[0]] = id; - return 0; -} - -static int mdbx_midl_append_list(MDBX_IDL *idp, MDBX_IDL app) { - MDBX_IDL ids = *idp; - /* Too big? */ - if (ids[0] + app[0] >= ids[-1]) { - if (mdbx_midl_grow(idp, app[0])) - return MDBX_ENOMEM; - ids = *idp; - } - memcpy(&ids[ids[0] + 1], &app[1], app[0] * sizeof(pgno_t)); - ids[0] += app[0]; - return 0; -} - -static int mdbx_midl_append_range(MDBX_IDL *idp, pgno_t id, unsigned n) { - pgno_t *ids = *idp, len = ids[0]; - /* Too big? 
*/ - if (len + n > ids[-1]) { - if (mdbx_midl_grow(idp, n | MDBX_IDL_UM_MAX)) - return MDBX_ENOMEM; - ids = *idp; - } - ids[0] = len + n; - ids += len; - while (n) - ids[n--] = id++; - return 0; -} - -static void __hot mdbx_midl_xmerge(MDBX_IDL idl, MDBX_IDL merge) { - pgno_t old_id, merge_id, i = merge[0], j = idl[0], k = i + j, total = k; - idl[0] = ~(pgno_t)0; /* delimiter for idl scan below */ - old_id = idl[j]; - while (i) { - merge_id = merge[i--]; - for (; old_id < merge_id; old_id = idl[--j]) - idl[k--] = old_id; - idl[k--] = merge_id; - } - idl[0] = total; -} - -/* Quicksort + Insertion sort for small arrays */ -#define SMALL 8 -#define MIDL_SWAP(a, b) \ - do { \ - pgno_t tmp_pgno = (a); \ - (a) = (b); \ - (b) = tmp_pgno; \ - } while (0) - -static void __hot mdbx_midl_sort(MDBX_IDL ids) { - /* Max possible depth of int-indexed tree * 2 items/level */ - int istack[sizeof(int) * CHAR_BIT * 2]; - int i, j, k, l, ir, jstack; - pgno_t a; - - ir = (int)ids[0]; - l = 1; - jstack = 0; - for (;;) { - if (ir - l < SMALL) { /* Insertion sort */ - for (j = l + 1; j <= ir; j++) { - a = ids[j]; - for (i = j - 1; i >= 1; i--) { - if (ids[i] >= a) - break; - ids[i + 1] = ids[i]; - } - ids[i + 1] = a; - } - if (jstack == 0) - break; - ir = istack[jstack--]; - l = istack[jstack--]; - } else { - k = (l + ir) >> 1; /* Choose median of left, center, right */ - MIDL_SWAP(ids[k], ids[l + 1]); - if (ids[l] < ids[ir]) - MIDL_SWAP(ids[l], ids[ir]); - - if (ids[l + 1] < ids[ir]) - MIDL_SWAP(ids[l + 1], ids[ir]); - - if (ids[l] < ids[l + 1]) - MIDL_SWAP(ids[l], ids[l + 1]); - - i = l + 1; - j = ir; - a = ids[l + 1]; - for (;;) { - do - i++; - while (ids[i] > a); - do - j--; - while (ids[j] < a); - if (j < i) - break; - MIDL_SWAP(ids[i], ids[j]); - } - ids[l + 1] = ids[j]; - ids[j] = a; - jstack += 2; - if (ir - i + 1 >= j - l) { - istack[jstack] = ir; - istack[jstack - 1] = i; - ir = j - 1; - } else { - istack[jstack] = j - 1; - istack[jstack - 1] = l; - l = i; - } - } - } -} - 
-static unsigned __hot mdbx_mid2l_search(MDBX_ID2L ids, pgno_t id) { - /* binary search of id in ids - * if found, returns position of id - * if not found, returns first position greater than id */ - unsigned base = 0; - unsigned cursor = 1; - int val = 0; - unsigned n = (unsigned)ids[0].mid; - - while (n > 0) { - unsigned pivot = n >> 1; - cursor = base + pivot + 1; - val = mdbx_cmp2int(id, ids[cursor].mid); - - if (val < 0) { - n = pivot; - } else if (val > 0) { - base = cursor; - n -= pivot + 1; - } else { - return cursor; - } - } - - if (val > 0) - ++cursor; - - return cursor; -} - -static int mdbx_mid2l_insert(MDBX_ID2L ids, MDBX_ID2 *id) { - unsigned x = mdbx_mid2l_search(ids, id->mid); - if (unlikely(x < 1)) - return /* internal error */ -2; - - if (x <= ids[0].mid && ids[x].mid == id->mid) - return /* duplicate */ -1; - - if (unlikely(ids[0].mid >= MDBX_IDL_UM_MAX)) - return /* too big */ -2; - - /* insert id */ - ids[0].mid++; - for (unsigned i = (unsigned)ids[0].mid; i > x; i--) - ids[i] = ids[i - 1]; - ids[x] = *id; - return 0; -} - -static int mdbx_mid2l_append(MDBX_ID2L ids, MDBX_ID2 *id) { - /* Too big? */ - if (unlikely(ids[0].mid >= MDBX_IDL_UM_MAX)) - return -2; - - ids[0].mid++; - ids[ids[0].mid] = *id; - return 0; -} - int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn) { unsigned ret = mdbx_runtime_flags; if (flags != (int)MDBX_DBG_DNT) From 7ef7e700121d92d452a3ea88280844a324aee4e3 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 18:42:13 +0300 Subject: [PATCH 162/303] mdbx: add STATIC_ASSERT. 
--- src/defs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/defs.h b/src/defs.h index e4a4c49f..290a854d 100644 --- a/src/defs.h +++ b/src/defs.h @@ -368,5 +368,15 @@ #define FIXME "FIXME: " __FILE__ ", " STRINGIFY(__LINE__) +#ifndef STATIC_ASSERT +# if __STDC_VERSION__ >= 201112L +# define STATIC_ASSERT(expr, msg) _Static_assert(expr, msg) +# elif defined(static_assert) +# define STATIC_ASSERT(expr, msg) static_assert(expr, msg) +# else +# define STATIC_ASSERT(expr, msg) switch (0) {case 0:case (expr):;} +# endif +#endif /* STATIC_ASSERT */ + /* *INDENT-ON* */ /* clang-format on */ From af7b468e634ef6ce1d65c40338a0a38fcf0609fa Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 18:50:24 +0300 Subject: [PATCH 163/303] mdbx: use uint64_t or size_t instead of off_t (buggy on Windows). --- src/bits.h | 11 +++++++++++ src/lck-windows.c | 5 +++-- src/mdbx.c | 16 ++++++++-------- src/osal.c | 23 ++++++++++++++++------- src/osal.h | 14 +++++--------- test/config.cc | 2 +- test/config.h | 6 +++--- 7 files changed, 47 insertions(+), 30 deletions(-) diff --git a/src/bits.h b/src/bits.h index 0a999c02..6d9e1a37 100644 --- a/src/bits.h +++ b/src/bits.h @@ -28,6 +28,17 @@ # define MDBX_DEVEL 0 #endif +/*----------------------------------------------------------------------------*/ + +/* Should be defined before any includes */ +#ifndef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 64 +#endif + +#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#endif + #include "../mdbx.h" #include "./defs.h" diff --git a/src/lck-windows.c b/src/lck-windows.c index 5b6551d6..898f7ecd 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -98,7 +98,7 @@ void mdbx_rthc_unlock(void) { LeaveCriticalSection(&rthc_critical_section); } #define LCK_WAITFOR 0 #define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY -static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, off_t offset, +static __inline BOOL flock(mdbx_filehandle_t
fd, DWORD flags, uint64_t offset, size_t bytes) { OVERLAPPED ov; ov.hEvent = 0; @@ -107,7 +107,8 @@ static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, off_t offset, return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov); } -static __inline BOOL funlock(mdbx_filehandle_t fd, off_t offset, size_t bytes) { +static __inline BOOL funlock(mdbx_filehandle_t fd, uint64_t offset, + size_t bytes) { return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, HIGH_DWORD(bytes)); } diff --git a/src/mdbx.c b/src/mdbx.c index 5574cca0..ef93eeaf 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3368,14 +3368,14 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, (pending->mm_txnid == head->mm_txnid || META_IS_WEAK(head)) ? head : mdbx_env_meta_flipflop(env, head); - off_t offset = (char *)target - env->me_map; + size_t offset = (char *)target - env->me_map; MDBX_meta *stay = mdbx_env_meta_flipflop(env, (MDBX_meta *)target); mdbx_debug( "writing meta %d (%s, was %" PRIaTXN "/%s, stay %s %" PRIaTXN "/%s), root %" PRIaPGNO ", " "txn_id %" PRIaTXN ", %s", - offset >= (off_t)env->me_psize, target == head ? "head" : "tail", + offset >= env->me_psize, target == head ? "head" : "tail", target->mm_txnid, META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" : "Legacy", @@ -3716,12 +3716,12 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { if (unlikely(err != MDBX_SUCCESS)) return err; } else { - off_t size; + uint64_t size; err = mdbx_filesize(env->me_fd, &size); if (unlikely(err != MDBX_SUCCESS)) return err; - if (size != (off_t)env->me_mapsize) { + if (size != env->me_mapsize) { mdbx_trace("filesize mismatch"); if ((env->me_flags & MDBX_RDONLY) || lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) @@ -3797,15 +3797,15 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { mdbx_debug("lck-setup: %s ", (rc == MDBX_RESULT_TRUE) ? 
"exclusive" : "shared"); - off_t size; + uint64_t size; err = mdbx_filesize(env->me_lfd, &size); if (unlikely(err != MDBX_SUCCESS)) return err; if (rc == MDBX_RESULT_TRUE) { - off_t wanna = roundup2((env->me_maxreaders - 1) * sizeof(MDBX_reader) + - sizeof(MDBX_lockinfo), - env->me_os_psize); + uint64_t wanna = roundup2((env->me_maxreaders - 1) * sizeof(MDBX_reader) + + sizeof(MDBX_lockinfo), + env->me_os_psize); #ifndef NDEBUG err = mdbx_ftruncate(env->me_lfd, size = 0); if (unlikely(err != MDBX_SUCCESS)) diff --git a/src/osal.c b/src/osal.c index a8550fef..d2ac9ebb 100644 --- a/src/osal.c +++ b/src/osal.c @@ -391,7 +391,7 @@ int mdbx_closefile(mdbx_filehandle_t fd) { #endif } -int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) { +int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, uint64_t offset) { if (bytes > MAX_WRITE) return MDBX_EINVAL; #if defined(_WIN32) || defined(_WIN64) @@ -407,6 +407,8 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) { return (rc == MDBX_SUCCESS) ? 
/* paranoia */ ERROR_READ_FAULT : rc; } #else + STATIC_ASSERT(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); ssize_t read = pread(fd, buf, bytes, offset); if (read < 0) { int rc = errno; @@ -417,7 +419,7 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) { } int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, - off_t offset) { + uint64_t offset) { #if defined(_WIN32) || defined(_WIN64) if (bytes > MAX_WRITE) return ERROR_INVALID_PARAMETER; @@ -435,6 +437,8 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, int rc; ssize_t written; do { + STATIC_ASSERT(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); written = pwrite(fd, buf, bytes, offset); if (likely(bytes == (size_t)written)) return MDBX_SUCCESS; @@ -445,7 +449,7 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, } int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, - off_t offset, size_t expected_written) { + uint64_t offset, size_t expected_written) { #if defined(_WIN32) || defined(_WIN64) size_t written = 0; for (int i = 0; i < iovcnt; ++i) { @@ -461,6 +465,8 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, int rc; ssize_t written; do { + STATIC_ASSERT(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); written = pwritev(fd, iov, iovcnt, offset); if (likely(expected_written == (size_t)written)) return MDBX_SUCCESS; @@ -494,8 +500,7 @@ int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { #ifdef SIGPIPE if (rc == EPIPE) { /* Collect the pending SIGPIPE, otherwise at least OS X - * gives it to the process on thread-exit (ITS#8504). - */ + * gives it to the process on thread-exit (ITS#8504). 
*/ int tmp; sigwait(&set, &tmp); written = 0; @@ -542,7 +547,7 @@ int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync) { #endif } -int mdbx_filesize(mdbx_filehandle_t fd, off_t *length) { +int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { #if defined(_WIN32) || defined(_WIN64) BY_HANDLE_FILE_INFORMATION info; if (!GetFileInformationByHandle(fd, &info)) @@ -551,6 +556,8 @@ int mdbx_filesize(mdbx_filehandle_t fd, off_t *length) { #else struct stat st; + STATIC_ASSERT(sizeof(off_t) <= sizeof(uint64_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); if (fstat(fd, &st)) return errno; @@ -559,7 +566,7 @@ int mdbx_filesize(mdbx_filehandle_t fd, off_t *length) { return MDBX_SUCCESS; } -int mdbx_ftruncate(mdbx_filehandle_t fd, off_t length) { +int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { #if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER li; li.QuadPart = length; @@ -567,6 +574,8 @@ int mdbx_ftruncate(mdbx_filehandle_t fd, off_t length) { ? MDBX_SUCCESS : mdbx_get_errno_checked(); #else + STATIC_ASSERT(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); return ftruncate(fd, length) == 0 ? 
MDBX_SUCCESS : errno; #endif } diff --git a/src/osal.h b/src/osal.h index 846e5673..fd050719 100644 --- a/src/osal.h +++ b/src/osal.h @@ -34,10 +34,6 @@ /*----------------------------------------------------------------------------*/ /* C99 includes */ -#ifndef _FILE_OFFSET_BITS -#define _FILE_OFFSET_BITS 64 -#endif - #include #include #include @@ -428,10 +424,10 @@ int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, - off_t offset, size_t expected_written); -int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, off_t offset); + uint64_t offset, size_t expected_written); +int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, - off_t offset); + uint64_t offset); int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t count); int mdbx_msync(void *addr, size_t length, int async); @@ -447,8 +443,8 @@ void *mdbx_thread_rthc_get(mdbx_thread_key_t key); void mdbx_thread_rthc_set(mdbx_thread_key_t key, const void *value); int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync); -int mdbx_ftruncate(mdbx_filehandle_t fd, off_t length); -int mdbx_filesize(mdbx_filehandle_t fd, off_t *length); +int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); +int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); int mdbx_openfile(const char *pathname, int flags, mode_t mode, mdbx_filehandle_t *fd); int mdbx_closefile(mdbx_filehandle_t fd); diff --git a/test/config.cc b/test/config.cc index 02a4f955..3c6ac224 100644 --- a/test/config.cc +++ b/test/config.cc @@ -70,7 +70,7 @@ bool parse_option(int argc, char *const argv[], int &narg, const char *option, } bool parse_option(int argc, char *const argv[], int &narg, const char *option, - size_t &mask, const option_verb *verbs) { + unsigned &mask, const option_verb *verbs) { const char *list; if 
(!parse_option(argc, argv, narg, option, &list)) return false; diff --git a/test/config.h b/test/config.h index 91ea4a24..483fe9b5 100644 --- a/test/config.h +++ b/test/config.h @@ -64,7 +64,7 @@ struct option_verb { }; bool parse_option(int argc, char *const argv[], int &narg, const char *option, - size_t &mask, const option_verb *verbs); + unsigned &mask, const option_verb *verbs); bool parse_option(int argc, char *const argv[], int &narg, const char *option, uint64_t &value, const scale_mode scale, @@ -194,8 +194,8 @@ struct keygen_params_pod { struct actor_params_pod { unsigned loglevel; - size_t mode_flags; - size_t table_flags; + unsigned mode_flags; + unsigned table_flags; uint64_t size; unsigned test_duration; From aa9aa79e11e7dc29dcd797ca8a792a8422601c11 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 19:22:48 +0300 Subject: [PATCH 164/303] test: fix oom_callback(). --- test/test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test.cc b/test/test.cc index cded3e2a..6c296ad0 100644 --- a/test/test.cc +++ b/test/test.cc @@ -100,8 +100,8 @@ int testcase::oom_callback(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, ", txn #%" PRIu64 ", gap %d", pid, (size_t)tid, txn, gap); - if (self->should_continue()) { - /* osal_yield(); */ + if (retry > 0 && self->should_continue()) { + osal_yield(); osal_udelay(retry * 100); return 1 /* always retry */; } From c335b16c810092bc9d73037373af149c4c316a40 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 25 May 2017 08:57:58 +0300 Subject: [PATCH 165/303] mdbx: rework mapsize's stuff inside mdbx_setup_dxb(). 
--- src/mdbx.c | 90 +++++++++++++++++++++++++++++------------------------- 1 file changed, 49 insertions(+), 41 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index ef93eeaf..8c625206 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3683,61 +3683,55 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { mdbx_debug("create new database"); rc = /* new database */ MDBX_RESULT_TRUE; - env->me_psize = env->me_os_psize; + if (!env->me_psize) + env->me_psize = env->me_os_psize; if (env->me_psize > MAX_PAGESIZE) env->me_psize = MAX_PAGESIZE; + env->me_mapsize = roundup2( + env->me_mapsize ? env->me_mapsize : DEFAULT_MAPSIZE, env->me_os_psize); mdbx_meta_model(env, meta); - meta->mm_mapsize = DEFAULT_MAPSIZE; - } else { - env->me_psize = meta->mm_psize; - } - - /* Was a mapsize configured? */ - if (!env->me_mapsize) - env->me_mapsize = meta->mm_mapsize; - else { - /* Make sure mapsize >= committed data size. Even when using - * mm_mapsize, which could be broken in old files (ITS#7789). */ - size_t usedsize = (meta->mm_last_pg + 1) * meta->mm_psize; - if (env->me_mapsize < usedsize) - env->me_mapsize = usedsize; - - meta->mm_mapsize = env->me_mapsize; - } - - if (rc == MDBX_RESULT_TRUE) { - /* mdbx_env_map() may grow the datafile. Write the metapages - * first, so the file will be valid if initialization fails. */ err = mdbx_env_init_metas(env, meta); if (unlikely(err != MDBX_SUCCESS)) return err; + } else { + env->me_psize = meta->mm_psize; + + /* Make sure mapsize >= committed data size. Even when using + * mm_mapsize, which could be broken in old files (ITS#7789). */ + const size_t usedsize = + roundup2((meta->mm_last_pg + 1) * meta->mm_psize, env->me_os_psize); + if (meta->mm_mapsize < usedsize) + meta->mm_mapsize = usedsize; + + /* Was a mapsize configured? 
*/ + if (!env->me_mapsize || (env->me_flags & MDBX_RDONLY) || + lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) + env->me_mapsize = meta->mm_mapsize; + else if (env->me_mapsize < usedsize) + env->me_mapsize = usedsize; + } + + uint64_t size; + err = mdbx_filesize(env->me_fd, &size); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + if (size != env->me_mapsize) { + mdbx_trace("filesize mismatch"); + if ((env->me_flags & MDBX_RDONLY) || + lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) + return MDBX_WANNA_RECOVERY /* LY: could not mdbx_ftruncate */; err = mdbx_ftruncate(env->me_fd, env->me_mapsize); if (unlikely(err != MDBX_SUCCESS)) return err; - } else { - uint64_t size; - err = mdbx_filesize(env->me_fd, &size); - if (unlikely(err != MDBX_SUCCESS)) - return err; - - if (size != env->me_mapsize) { - mdbx_trace("filesize mismatch"); - if ((env->me_flags & MDBX_RDONLY) || - lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) - return MDBX_WANNA_RECOVERY /* LY: could not mdbx_ftruncate */; - - err = mdbx_ftruncate(env->me_fd, env->me_mapsize); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } } err = mdbx_env_map(env, NULL, env->me_mapsize); if (err) return err; - MDBX_meta *const head = mdbx_meta_head(env); + const MDBX_meta *head = mdbx_meta_head(env); if (head->mm_txnid != meta->mm_txnid) { mdbx_trace("head->mm_txnid (%" PRIaTXN ") != (%" PRIaTXN ") meta->mm_txnid", head->mm_txnid, meta->mm_txnid); @@ -3751,8 +3745,6 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { /* LY: rollback weak checkpoint */ MDBX_meta rollback = *head; rollback.mm_txnid = 0; - if (rollback.mm_txnid == meta->mm_txnid) - rollback = *meta; err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta), (uint8_t *)head - (uint8_t *)env->me_map); if (err) @@ -3769,6 +3761,22 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { } } + head = mdbx_meta_head(env); + if (head->mm_mapsize != env->me_mapsize) { + 
mdbx_trace("head->mm_mapsize (%" PRIu64 ") != (%" PRIu64 + ") env->mm_mapsize", + head->mm_mapsize, env->me_mapsize); + if ((env->me_flags & MDBX_RDONLY) || + lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) + return MDBX_MAP_RESIZED; + + *meta = *head; + meta->mm_mapsize = env->me_mapsize; + err = mdbx_env_sync_locked(env, env->me_flags & MDBX_WRITEMAP, meta); + if (err) + return err; + } + mdbx_env_setup_limits(env, env->me_psize); return rc; } From 15e2a454250962b6e155c07d0a5952f2825ab74f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 25 May 2017 09:20:04 +0300 Subject: [PATCH 166/303] mdbx: check size of lck-file. --- src/mdbx.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 8c625206..a821f2b3 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3828,7 +3828,19 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { size = wanna; } } - env->me_maxreaders = (size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader) + 1; + + if (size & (env->me_os_psize - 1) || size < env->me_os_psize) { + mdbx_notice("lck-file has invalid size %" PRIu64 " bytes", size); + return MDBX_PROBLEM; + } + + const uint64_t maxreaders = + (size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader) + 1; + if (maxreaders > UINT16_MAX) { + mdbx_notice("lck-size too big (up to %" PRIu64 " readers)", maxreaders); + return MDBX_PROBLEM; + } + env->me_maxreaders = (unsigned)maxreaders; void *addr = NULL; err = mdbx_mmap(&addr, size, true, env->me_lfd); From 5519d568f047026d5cf6f69eaa59e486eb2ea8cf Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 25 May 2017 09:26:03 +0300 Subject: [PATCH 167/303] mdbx: fix mdbx_filesize() for Windows. 
--- src/osal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/osal.c b/src/osal.c index d2ac9ebb..458e50e3 100644 --- a/src/osal.c +++ b/src/osal.c @@ -552,7 +552,7 @@ int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { BY_HANDLE_FILE_INFORMATION info; if (!GetFileInformationByHandle(fd, &info)) return mdbx_get_errno_checked(); - *length = info.nFileSizeLow | (uint64_t)info.nFileIndexHigh << 32; + *length = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32; #else struct stat st; From 89b5b53193952df837629b841bf651ce931a5d8d Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 25 May 2017 09:51:26 +0300 Subject: [PATCH 168/303] test: refine oom-callback. --- test/test.cc | 10 ++++++---- test/test.h | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/test/test.cc b/test/test.cc index 6c296ad0..44d59209 100644 --- a/test/test.cc +++ b/test/test.cc @@ -100,9 +100,10 @@ int testcase::oom_callback(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, ", txn #%" PRIu64 ", gap %d", pid, (size_t)tid, txn, gap); - if (retry > 0 && self->should_continue()) { + if (self->should_continue(true)) { osal_yield(); - osal_udelay(retry * 100); + if (retry > 0) + osal_udelay(retry * 100); return 1 /* always retry */; } @@ -308,7 +309,7 @@ bool testcase::teardown() { return true; } -bool testcase::should_continue() const { +bool testcase::should_continue(bool check_timeout_only) const { bool result = true; if (config.params.test_duration) { @@ -319,7 +320,8 @@ bool testcase::should_continue() const { result = false; } - if (config.params.test_nops && nops_completed >= config.params.test_nops) + if (!check_timeout_only && config.params.test_nops && + nops_completed >= config.params.test_nops) result = false; if (result && global::config::progress_indicator) diff --git a/test/test.h b/test/test.h index 34083805..939fc8b3 100644 --- a/test/test.h +++ b/test/test.h @@ -117,7 +117,7 @@ protected: bool wait4start(); void report(size_t 
nops_done); void signal(); - bool should_continue() const; + bool should_continue(bool check_timeout_only = false) const; void generate_pair(const keygen::serial_t serial, keygen::buffer &key, keygen::buffer &value, keygen::serial_t data_age = 0) { From b94e761d73c2892a3df892074c93ff0ab313b8aa Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 25 May 2017 14:56:07 +0300 Subject: [PATCH 169/303] mdbx: fix 'magic' bug. --- src/bits.h | 2 +- src/osal.h | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/bits.h b/src/bits.h index 6d9e1a37..ca88f686 100644 --- a/src/bits.h +++ b/src/bits.h @@ -984,7 +984,7 @@ static __inline pgno_t NODEPGNO(const MDBX_node *node) { if (sizeof(pgno_t) > 4) pgno &= UINT64_C(0xffffFFFFffff); } else { - pgno = node->mn_lo | ((pgno_t)node->mn_lo << 16); + pgno = node->mn_lo | ((pgno_t)node->mn_hi << 16); if (sizeof(pgno_t) > 4) pgno |= ((uint64_t)node->mn_flags) << 32; } diff --git a/src/osal.h b/src/osal.h index fd050719..daea002c 100644 --- a/src/osal.h +++ b/src/osal.h @@ -113,11 +113,14 @@ typedef pthread_mutex_t mdbx_fastmutex_t; #include #endif -#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) -#define UNALIGNED_OK 1 /* TODO */ -#endif -#ifndef UNALIGNED_OK +#if !defined(UNALIGNED_OK) +#if defined(__i386) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) || defined(i386) || defined(_X86_) || defined(__i386__) || \ + defined(_X86_64_) +#define UNALIGNED_OK 1 +#else #define UNALIGNED_OK 0 +#endif #endif /* UNALIGNED_OK */ #if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF From 3a166e19703780dff88bd12a95e934bae849a44c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 25 May 2017 03:41:42 +0300 Subject: [PATCH 170/303] test: temporary crutch. 
--- src/bits.h | 8 ++++++++ src/mdbx.c | 2 ++ 2 files changed, 10 insertions(+) diff --git a/src/bits.h b/src/bits.h index ca88f686..3a566cab 100644 --- a/src/bits.h +++ b/src/bits.h @@ -252,8 +252,16 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u volatile uint64_t mm_datasync_sign; + +#define MDBX_TEMPORARY_CRUTCH FIXME +#ifndef MDBX_TEMPORARY_CRUTCH #define SIGN_IS_WEAK(sign) ((sign) == MDBX_DATASIGN_WEAK) #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) +#else +#define SIGN_IS_WEAK(sign) (false && (sign) == MDBX_DATASIGN_WEAK) +#define SIGN_IS_STEADY(sign) (true || (sign) > MDBX_DATASIGN_WEAK) +#endif /* FIXME: MDBX_TEMPORARY_CRUTCH */ + #define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign) #define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) volatile mdbx_canary mm_canary; diff --git a/src/mdbx.c b/src/mdbx.c index a821f2b3..82d6a1ee 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3223,11 +3223,13 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { return MDBX_VERSION_MISMATCH; } +#ifndef MDBX_TEMPORARY_CRUTCH /* LY: check signature as a checksum */ if (META_IS_STEADY(m) && m->mm_datasync_sign != mdbx_meta_sign(m)) { mdbx_debug("steady-meta[%u] has invalid checksum", offset); continue; } +#endif /* FIXME: MDBX_TEMPORARY_CRUTCH */ if (mdbx_meta_lt(meta, m)) { *meta = *m; From 96de36baef1eb76b4c255ed341623c0fe6027531 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 25 May 2017 16:21:29 +0300 Subject: [PATCH 171/303] mdbx: refine mdbx_cursor_count() API. --- mdbx.h | 2 +- src/mdbx.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mdbx.h b/mdbx.h index 44802f30..625cca46 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1415,7 +1415,7 @@ LIBMDBX_API int mdbx_cursor_del(MDBX_cursor *cursor, unsigned flags); * possible errors are: * - MDBX_EINVAL - cursor is not initialized, or an invalid parameter * was specified. 
*/ -LIBMDBX_API int mdbx_cursor_count(MDBX_cursor *cursor, uint64_t *countp); +LIBMDBX_API int mdbx_cursor_count(MDBX_cursor *cursor, size_t *countp); /* Compare two data items according to a particular database. * diff --git a/src/mdbx.c b/src/mdbx.c index 82d6a1ee..a2beaea2 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -6725,7 +6725,7 @@ int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) { } /* Return the count of duplicate data items for the current key */ -int mdbx_cursor_count(MDBX_cursor *mc, uint64_t *countp) { +int mdbx_cursor_count(MDBX_cursor *mc, size_t *countp) { if (unlikely(mc == NULL || countp == NULL)) return MDBX_EINVAL; @@ -6755,7 +6755,9 @@ int mdbx_cursor_count(MDBX_cursor *mc, uint64_t *countp) { if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); - *countp = mc->mc_xcursor->mx_db.md_entries; + *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > INT_MAX) + ? INT_MAX + : mc->mc_xcursor->mx_db.md_entries; } } return MDBX_SUCCESS; From b9dbe7c577fdcb1e8c5477596a99122137a69697 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 25 May 2017 18:37:47 +0300 Subject: [PATCH 172/303] mdbx: fix segfault and double-free (when the same write-txn is aborted twice).
*/ static void mdbx_dbis_update(MDBX_txn *txn, int keep) { MDBX_dbi n = txn->mt_numdbs; - MDBX_env *env = txn->mt_env; - uint8_t *tdbflags = txn->mt_dbflags; + if (n) { + MDBX_env *env = txn->mt_env; + uint8_t *tdbflags = txn->mt_dbflags; - for (unsigned i = n; --i >= CORE_DBS;) { - if (tdbflags[i] & DB_NEW) { - if (keep) { - env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDBX_VALID; - } else { - char *ptr = env->me_dbxs[i].md_name.iov_base; - if (ptr) { - env->me_dbxs[i].md_name.iov_base = NULL; - env->me_dbxs[i].md_name.iov_len = 0; - env->me_dbflags[i] = 0; - env->me_dbiseqs[i]++; - free(ptr); + for (unsigned i = n; --i >= CORE_DBS;) { + if (tdbflags[i] & DB_NEW) { + if (keep) { + env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDBX_VALID; + } else { + char *ptr = env->me_dbxs[i].md_name.iov_base; + if (ptr) { + env->me_dbxs[i].md_name.iov_base = NULL; + env->me_dbxs[i].md_name.iov_len = 0; + env->me_dbflags[i] = 0; + env->me_dbiseqs[i]++; + free(ptr); + } } } } + if (keep && env->me_numdbs < n) + env->me_numdbs = n; } - if (keep && env->me_numdbs < n) - env->me_numdbs = n; } /* End a transaction, except successful commit of a nested transaction. @@ -2447,7 +2449,8 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { if (mode & MDBX_END_FREE) { txn->mt_signature = 0; - free(txn); + if (txn != env->me_txn0) + free(txn); } return MDBX_SUCCESS; From 7315b99b9ddba9f952abd6363343b23e26edec4f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 19:02:24 +0300 Subject: [PATCH 173/303] mdbx: fix MSVC/Windows warnings. 
--- src/mdbx.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 54c31be5..5ca45782 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -651,12 +651,13 @@ static const char *__mdbx_strerr(int errnum) { const char *__cold mdbx_strerror_r(int errnum, char *buf, size_t buflen) { const char *msg = __mdbx_strerr(errnum); if (!msg) { - if (!buflen) + if (!buflen || buflen > INT_MAX) return NULL; #ifdef _MSC_VER size_t size = FormatMessageA( FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, - errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, buflen, NULL); + errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, + NULL); return size ? buf : NULL; #elif defined(_GNU_SOURCE) /* GNU-specific */ @@ -1085,7 +1086,7 @@ static int mdbx_page_loose(MDBX_cursor *mc, MDBX_page *mp) { * [in] all No shortcuts. Needed except after a full mdbx_page_flush(). * * Returns 0 on success, non-zero on failure. */ -static int mdbx_pages_xkeep(MDBX_cursor *mc, unsigned pflags, int all) { +static int mdbx_pages_xkeep(MDBX_cursor *mc, unsigned pflags, bool all) { const unsigned Mask = P_SUBP | P_DIRTY | P_LOOSE | P_KEEP; MDBX_txn *txn = mc->mc_txn; MDBX_cursor *m3, *m0 = mc; @@ -1142,7 +1143,7 @@ mark_done: return rc; } -static int mdbx_page_flush(MDBX_txn *txn, int keep); +static int mdbx_page_flush(MDBX_txn *txn, size_t keep); /* Spill pages from the dirty list back to disk. * This is intended to prevent running into MDBX_TXN_FULL situations, @@ -1180,16 +1181,13 @@ static int mdbx_page_flush(MDBX_txn *txn, int keep); * Returns 0 on success, non-zero on failure. 
*/ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { MDBX_txn *txn = m0->mc_txn; - MDBX_page *dp; MDBX_ID2L dl = txn->mt_rw_dirtylist; - unsigned i, j, need; - int rc; if (m0->mc_flags & C_SUB) return MDBX_SUCCESS; /* Estimate how much space this op will take */ - i = m0->mc_db->md_depth; + size_t i = m0->mc_db->md_depth; /* Named DBs also dirty the main DB */ if (m0->mc_dbi >= CORE_DBS) i += txn->mt_dbs[MAIN_DBI].md_depth; @@ -1197,7 +1195,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { if (key) i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; i += i; /* double it for good measure */ - need = i; + size_t need = i; if (txn->mt_dirtyroom > i) return MDBX_SUCCESS; @@ -1209,8 +1207,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { } else { /* purge deleted slots */ MDBX_IDL sl = txn->mt_spill_pages; - unsigned num = sl[0]; - j = 0; + unsigned num = sl[0], j = 0; for (i = 1; i <= num; i++) { if (!(sl[i] & 1)) sl[++j] = sl[i]; @@ -1219,7 +1216,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { } /* Preserve pages which may soon be dirtied again */ - rc = mdbx_pages_xkeep(m0, P_DIRTY, 1); + int rc = mdbx_pages_xkeep(m0, P_DIRTY, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -1236,7 +1233,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { /* flush from the tail forward, this saves a lot of shifting later on. 
*/ for (i = dl[0].mid; i && need; i--) { pgno_t pn = dl[i].mid << 1; - dp = dl[i].mptr; + MDBX_page *dp = dl[i].mptr; if (dp->mp_flags & (P_LOOSE | P_KEEP)) continue; /* Can't spill twice, @@ -1245,7 +1242,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { MDBX_txn *tx2; for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { if (tx2->mt_spill_pages) { - j = mdbx_midl_search(tx2->mt_spill_pages, pn); + unsigned j = mdbx_midl_search(tx2->mt_spill_pages, pn); if (j <= tx2->mt_spill_pages[0] && tx2->mt_spill_pages[j] == pn) { dp->mp_flags |= P_KEEP; break; @@ -1268,7 +1265,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { goto bailout; /* Reset any dirty pages we kept that page_flush didn't see */ - rc = mdbx_pages_xkeep(m0, P_DIRTY | P_KEEP, i); + rc = mdbx_pages_xkeep(m0, P_DIRTY | P_KEEP, i != 0); bailout: txn->mt_flags |= rc ? MDBX_TXN_ERROR : MDBX_TXN_SPILLS; @@ -2839,7 +2836,7 @@ bailout: * [in] txn the transaction that's being committed * [in] keep number of initial pages in dirtylist to keep dirty. * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_flush(MDBX_txn *txn, int keep) { +static int mdbx_page_flush(MDBX_txn *txn, size_t keep) { MDBX_env *env = txn->mt_env; MDBX_ID2L dl = txn->mt_rw_dirtylist; unsigned psize = env->me_psize, j; From 141306644d5863bf3e71263223750b8f2f755018 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 25 May 2017 17:51:15 +0300 Subject: [PATCH 174/303] mdbx: fix mdbx_env_copy_asis(). 
--- src/mdbx.c | 73 +++++++++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 5ca45782..4c93522e 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -8173,10 +8173,10 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { mc.mc_txn = my->mc_txn; rc = mdbx_page_get(&mc, *pg, &mc.mc_pg[0], NULL); - if (rc) + if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = mdbx_page_search_root(&mc, NULL, MDBX_PS_FIRST); - if (rc) + if (unlikely(rc != MDBX_SUCCESS)) return rc; /* Make cursor pages writable */ @@ -8218,11 +8218,11 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { memcpy(&pgno, NODEDATA(ni), sizeof(pgno)); memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); rc = mdbx_page_get(&mc, pgno, &omp, NULL); - if (rc) + if (unlikely(rc != MDBX_SUCCESS)) goto done; if (my->mc_wlen[toggle] >= MDBX_WBUF) { rc = mdbx_env_cthr_toggle(my, 1); - if (rc) + if (unlikely(rc != MDBX_SUCCESS)) goto done; toggle = my->mc_toggle; } @@ -8235,7 +8235,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; rc = mdbx_env_cthr_toggle(my, 1); - if (rc) + if (unlikely(rc != MDBX_SUCCESS)) goto done; toggle = my->mc_toggle; } @@ -8268,7 +8268,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); pgno = NODEPGNO(ni); rc = mdbx_page_get(&mc, pgno, &mp, NULL); - if (rc) + if (unlikely(rc != MDBX_SUCCESS)) goto done; mc.mc_top++; mc.mc_snum++; @@ -8285,7 +8285,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { } if (my->mc_wlen[toggle] >= MDBX_WBUF) { rc = mdbx_env_cthr_toggle(my, 1); - if (rc) + if (unlikely(rc != MDBX_SUCCESS)) goto done; toggle = my->mc_toggle; } @@ -8311,20 +8311,17 @@ done: /* Copy environment with compaction. 
*/ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { - MDBX_meta *mm; - MDBX_page *mp; - mdbx_copy my; MDBX_txn *txn = NULL; mdbx_thread_t thr; - pgno_t root, new_root; - int rc; - + mdbx_copy my; memset(&my, 0, sizeof(my)); - if ((rc = mdbx_condmutex_init(&my.mc_condmutex)) != 0) + + int rc = mdbx_condmutex_init(&my.mc_condmutex); + if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = mdbx_memalign_alloc(env->me_os_psize, MDBX_WBUF * 2, (void **)&my.mc_wbuf[0]); - if (rc != MDBX_SUCCESS) + if (unlikely(rc != MDBX_SUCCESS)) goto done; memset(my.mc_wbuf[0], 0, MDBX_WBUF * 2); @@ -8333,18 +8330,18 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { my.mc_env = env; my.mc_fd = fd; rc = mdbx_thread_create(&thr, mdbx_env_copythr, &my); - if (rc) + if (unlikely(rc != MDBX_SUCCESS)) goto done; rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); - if (rc) + if (unlikely(rc != MDBX_SUCCESS)) goto finish; - mp = (MDBX_page *)my.mc_wbuf[0]; + MDBX_page* mp = (MDBX_page *)my.mc_wbuf[0]; memset(mp, 0, NUM_METAS * env->me_psize); mp->mp_pgno = 0; mp->mp_flags = P_META; - mm = (MDBX_meta *)PAGEDATA(mp); + MDBX_meta* mm = (MDBX_meta *)PAGEDATA(mp); mdbx_meta_model(env, mm); mp = (MDBX_page *)(my.mc_wbuf[0] + env->me_psize); @@ -8354,18 +8351,20 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { mm = (MDBX_meta *)PAGEDATA(mp); /* Set metapage 1 with current main DB */ - root = new_root = txn->mt_dbs[MAIN_DBI].md_root; - if (root != P_INVALID) { + pgno_t new_root, root = txn->mt_dbs[MAIN_DBI].md_root; + if ((new_root = root) != P_INVALID) { /* Count free pages + freeDB pages. Subtract from last_pg * to find the new last_pg, which also becomes the new root. 
*/ pgno_t freecount = 0; MDBX_cursor mc; MDBX_val key, data; + mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdbx_cursor_get(&mc, &key, &data, MDBX_NEXT)) == 0) freecount += *(pgno_t *)data.iov_base; - if (rc != MDBX_NOTFOUND) + if (unlikely(rc != MDBX_NOTFOUND)) goto finish; + freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + txn->mt_dbs[FREE_DBI].md_leaf_pages + txn->mt_dbs[FREE_DBI].md_overflow_pages; @@ -8386,12 +8385,11 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_txn = txn; rc = mdbx_env_cwalk(&my, &root, 0); - if (rc == MDBX_SUCCESS && root != new_root) { - rc = MDBX_INCOMPATIBLE; /* page leak or corrupt DB */ - } + if (rc == MDBX_SUCCESS && root != new_root) + rc = MDBX_PROBLEM; /* page leak or corrupt DB */ finish: - if (rc) + if (rc != MDBX_SUCCESS) my.mc_error = rc; mdbx_env_cthr_toggle(&my, 1 | MDBX_EOF); rc = mdbx_thread_join(thr); @@ -8406,26 +8404,25 @@ done: /* Copy environment as-is. */ static int __cold mdbx_env_copy_asis(MDBX_env *env, mdbx_filehandle_t fd) { MDBX_txn *txn = NULL; - int rc; /* Do the lock/unlock of the reader mutex before starting the * write txn. Otherwise other read txns could block writers. */ - rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); - if (unlikely(rc)) + int rc = mdbx_txn_begin(env, NULL, MDBX_RDONLY, &txn); + if (unlikely(rc != MDBX_SUCCESS)) return rc; /* We must start the actual read txn after blocking writers */ rc = mdbx_txn_end(txn, MDBX_END_RESET_TMP); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* FIXME: or just return? 
*/ /* Temporarily block writers until we snapshot the meta pages */ rc = mdbx_txn_lock(env); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; rc = mdbx_txn_renew0(txn, MDBX_RDONLY); - if (rc) { + if (unlikely(rc != MDBX_SUCCESS)) { mdbx_txn_unlock(env); goto bailout; } @@ -8433,8 +8430,12 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, mdbx_filehandle_t fd) { rc = mdbx_write(fd, env->me_map, env->me_psize * NUM_METAS); mdbx_txn_unlock(env); - if (rc == MDBX_SUCCESS) - rc = mdbx_ftruncate(fd, txn->mt_next_pgno * env->me_psize); + if (likely(rc == MDBX_SUCCESS)) + rc = mdbx_write(fd, env->me_map + env->me_psize * NUM_METAS, + (txn->mt_next_pgno - NUM_METAS) * env->me_psize); + + if (likely(rc == MDBX_SUCCESS)) + rc = mdbx_ftruncate(fd, env->me_mapsize); bailout: mdbx_txn_abort(txn); @@ -8445,8 +8446,8 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, unsigned flags) { if (flags & MDBX_CP_COMPACT) return mdbx_env_compact(env, fd); - else - return mdbx_env_copy_asis(env, fd); + + return mdbx_env_copy_asis(env, fd); } int __cold mdbx_env_copy(MDBX_env *env, const char *path, unsigned flags) { From 9eeb00f448d21cdaa8a702c26f9dae8ea6085389 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 21:43:29 +0300 Subject: [PATCH 175/303] mdbx: trinity of meta-pages. --- TODO.md | 1 + mdbx.h | 17 +- src/bits.h | 53 +--- src/mdbx.c | 565 ++++++++++++++++++++++++++---------------- src/osal.c | 4 +- src/osal.h | 2 +- src/tools/mdbx_chk.c | 222 ++++++++++++----- src/tools/mdbx_stat.c | 19 +- 8 files changed, 537 insertions(+), 346 deletions(-) diff --git a/TODO.md b/TODO.md index ab59b4c3..92dd691c 100644 --- a/TODO.md +++ b/TODO.md @@ -15,3 +15,4 @@ - [ ] актуализация README.md - [ ] возможность хранения ключей внутри data (libfptu) - [ ] асинхронная фиксация (https://github.com/ReOpen/libmdbx/issues/5) +- [ ] (пере)выделять память под IDL-списки с учетом реального кол-ва страниц, т.е. 
max(MDB_IDL_UM_MAX/MDB_IDL_UM_MAX, npages) diff --git a/mdbx.h b/mdbx.h index 625cca46..45668cf6 100644 --- a/mdbx.h +++ b/mdbx.h @@ -427,13 +427,14 @@ typedef struct MDBX_stat { /* Information about the environment */ typedef struct MDBX_envinfo { - void *me_mapaddr; /* Address of map, if fixed */ - uint64_t me_mapsize; /* Size of the data memory map */ - uint64_t me_last_pgno; /* ID of the last used page */ - uint64_t me_last_txnid; /* ID of the last committed transaction */ - uint32_t me_maxreaders; /* max reader slots in the environment */ - uint32_t me_numreaders; /* max reader slots used in the environment */ - uint64_t me_tail_txnid; /* ID of the last reader transaction */ + void *me_mapaddr; /* Address of map, if fixed */ + uint64_t me_mapsize; /* Size of the data memory map */ + uint64_t me_recent_pgno; /* ID of the last used page */ + uint64_t me_recent_txnid; /* ID of the last committed transaction */ + uint32_t me_maxreaders; /* max reader slots in the environment */ + uint32_t me_numreaders; /* max reader slots used in the environment */ + uint64_t me_latter_reader_txnid; /* ID of the last reader transaction */ + uint64_t me_meta0_txnid, me_meta0_sign; uint64_t me_meta1_txnid, me_meta1_sign; uint64_t me_meta2_txnid, me_meta2_sign; } MDBX_envinfo; @@ -868,7 +869,7 @@ LIBMDBX_API void *mdbx_env_get_userctx(MDBX_env *env); * * [in] env An environment handle returned by mdbx_env_create(). * [in] msg The assertion message, not including newline. */ -typedef void MDBX_assert_func(MDBX_env *env, const char *msg, +typedef void MDBX_assert_func(const MDBX_env *env, const char *msg, const char *function, unsigned line); /* Set or reset the assert() callback of the environment. diff --git a/src/bits.h b/src/bits.h index 3a566cab..cbd2cc09 100644 --- a/src/bits.h +++ b/src/bits.h @@ -95,6 +95,7 @@ * pressure from other processes is high. So until OSs have * actual paging support for Huge pages, they're not viable. */ #define MAX_PAGESIZE (PAGEBASE ? 
0x10000 : 0x8000) +#define MIN_PAGESIZE 1024 /* The minimum number of keys required in a database page. * Setting this to a larger value will place a smaller bound on the @@ -127,14 +128,14 @@ #define CORE_DBS 2 /* Number of meta pages - also hardcoded elsewhere */ -#define NUM_METAS 2 +#define NUM_METAS 3 /* A page number in the database. * * MDBX uses 32 bit for page numbers. This limits database * size up to 2^44 bytes, in case of 4K pages. */ -typedef uint32_t pgno_t; -#define PRIaPGNO PRIu32 +typedef uint64_t pgno_t; +#define PRIaPGNO PRIu64 /* TODO */ /* A transaction ID. */ typedef uint64_t txnid_t; @@ -253,18 +254,12 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_WEAK 1u volatile uint64_t mm_datasync_sign; -#define MDBX_TEMPORARY_CRUTCH FIXME -#ifndef MDBX_TEMPORARY_CRUTCH #define SIGN_IS_WEAK(sign) ((sign) == MDBX_DATASIGN_WEAK) #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) -#else -#define SIGN_IS_WEAK(sign) (false && (sign) == MDBX_DATASIGN_WEAK) -#define SIGN_IS_STEADY(sign) (true || (sign) > MDBX_DATASIGN_WEAK) -#endif /* FIXME: MDBX_TEMPORARY_CRUTCH */ #define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign) #define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - volatile mdbx_canary mm_canary; + mdbx_canary mm_canary; } MDBX_meta; /* Common header for all page types. The page type depends on mp_flags. @@ -307,23 +302,17 @@ typedef struct MDBX_page { }; uint32_t mp_pages; /* number of overflow pages */ }; - indx_t mp_ptrs[1]; /* dynamic size */ + + /* dynamic size */ + union { + indx_t mp_ptrs[1]; + MDBX_meta mp_meta; + uint8_t mp_data[1]; + }; } MDBX_page; /* Size of the page header, excluding dynamic data at the end */ -#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) - -/* Buffer for a stack-allocated meta page. - * The members define size and alignment, and silence type - * aliasing warnings. They are not used directly; that could - * mean incorrectly using several union members in parallel. 
*/ -typedef union MDBX_metabuf { - MDBX_page mb_page; - struct { - char mm_pad[PAGEHDRSZ]; - MDBX_meta mm_meta; - } mb_metabuf; -} MDBX_metabuf; +#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data)) /* The header for the reader table (a memory-mapped lock file). */ typedef struct MDBX_lockinfo { @@ -795,22 +784,6 @@ static __inline void mdbx_jitter4testing(bool tiny) { /* Internal prototypes and inlines */ int mdbx_reader_check0(MDBX_env *env, int rlocked, int *dead); - -#define METAPAGE_1(env) (&((MDBX_metabuf *)(env)->me_map)->mb_metabuf.mm_meta) - -#define METAPAGE_2(env) \ - (&((MDBX_metabuf *)((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) - -static __inline MDBX_meta *mdbx_meta_head(MDBX_env *env) { - mdbx_jitter4testing(true); - MDBX_meta *a = METAPAGE_1(env); - mdbx_jitter4testing(true); - MDBX_meta *b = METAPAGE_2(env); - mdbx_jitter4testing(true); - - return (a->mm_txnid > b->mm_txnid) ? a : b; -} - void mdbx_rthc_dtor(void *rthc); void mdbx_rthc_lock(void); void mdbx_rthc_unlock(void); diff --git a/src/mdbx.c b/src/mdbx.c index 4c93522e..083e300f 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -553,7 +553,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, static int mdbx_read_header(MDBX_env *env, MDBX_meta *meta); static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *pending); + MDBX_meta *const pending); static void mdbx_env_close0(MDBX_env *env); static MDBX_node *mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, int *exactp); @@ -1272,7 +1272,7 @@ bailout: return rc; } -static __inline uint64_t mdbx_meta_sign(MDBX_meta *meta) { +static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) { uint64_t sign = MDBX_DATASIGN_NONE; #if 0 /* TODO */ sign = hippeus_hash64(&meta->mm_mapsize, @@ -1285,22 +1285,98 @@ static __inline uint64_t mdbx_meta_sign(MDBX_meta *meta) { return (sign > MDBX_DATASIGN_WEAK) ? 
sign : ~sign; } -static __inline MDBX_meta *mdbx_env_meta_flipflop(const MDBX_env *env, - MDBX_meta *meta) { - return (meta == METAPAGE_1(env)) ? METAPAGE_2(env) : METAPAGE_1(env); +static __inline bool mdbx_meta_ot(const MDBX_meta *a, const MDBX_meta *b, + const bool roolback2steady) { + mdbx_jitter4testing(true); + if (a->mm_txnid == b->mm_txnid) + return META_IS_STEADY(b); + + mdbx_jitter4testing(true); + if (roolback2steady && META_IS_STEADY(a) != META_IS_STEADY(b)) + return META_IS_STEADY(b); + + mdbx_jitter4testing(true); + return a->mm_txnid < b->mm_txnid; } -static __inline int mdbx_meta_lt(const MDBX_meta *a, const MDBX_meta *b) { - if (META_IS_STEADY(a) == META_IS_STEADY(b)) - return a->mm_txnid < b->mm_txnid; - return META_IS_STEADY(b); +static __inline bool mdbx_meta_eq(const MDBX_meta *a, const MDBX_meta *b) { + mdbx_jitter4testing(true); + if (a->mm_txnid != b->mm_txnid) + return false; + + mdbx_jitter4testing(true); + if (META_IS_STEADY(a) != META_IS_STEADY(b)) + return false; + + mdbx_jitter4testing(true); + return true; +} + +#define METAPAGE(env, n) \ + (&((MDBX_page *)((env)->me_map + env->me_psize * (n)))->mp_meta) + +static int mdbx_meta_eq_mask(const MDBX_env *env) { + MDBX_meta *m0 = METAPAGE(env, 0); + MDBX_meta *m1 = METAPAGE(env, 1); + MDBX_meta *m2 = METAPAGE(env, 2); + + int rc = mdbx_meta_eq(m0, m1) ? 1 : 0; + if (mdbx_meta_eq(m1, m2)) + rc += 2; + if (mdbx_meta_eq(m2, m0)) + rc += 4; + return rc; +} + +static __inline MDBX_meta *mdbx_meta_recent(const MDBX_env *env, MDBX_meta *a, + MDBX_meta *b, + const bool roolback2steady) { + const bool a_older_that_b = mdbx_meta_ot(a, b, roolback2steady); + mdbx_assert(env, !mdbx_meta_eq(a, b)); + return a_older_that_b ? b : a; +} + +static __inline MDBX_meta *mdbx_meta_ancient(const MDBX_env *env, MDBX_meta *a, + MDBX_meta *b, + const bool roolback2steady) { + const bool a_older_that_b = mdbx_meta_ot(a, b, roolback2steady); + mdbx_assert(env, !mdbx_meta_eq(a, b)); + return a_older_that_b ? 
a : b; +} + +static __inline MDBX_meta *mdbx_meta_head(const MDBX_env *env, + const bool roolback2steady) { + MDBX_meta *m0 = METAPAGE(env, 0); + MDBX_meta *m1 = METAPAGE(env, 1); + MDBX_meta *m2 = METAPAGE(env, 2); + + MDBX_meta *head = mdbx_meta_recent(env, m0, m1, roolback2steady); + head = mdbx_meta_recent(env, head, m2, roolback2steady); + return head; +} + +static __hot MDBX_meta *mdbx_meta_steady_head(const MDBX_env *env) { + return mdbx_meta_head(env, true); +} + +static __hot MDBX_meta *mdbx_meta_fluid_head(const MDBX_env *env) { + return mdbx_meta_head(env, false); +} + +static const char *mdbx_durable_str(const MDBX_meta *const meta) { + if (META_IS_WEAK(meta)) + return "Weak"; + if (META_IS_STEADY(meta)) + return (meta->mm_datasync_sign == mdbx_meta_sign(meta)) ? "Steady" + : "Tainted"; + return "Legacy"; } /* Find oldest txnid still referenced. */ static txnid_t mdbx_find_oldest(MDBX_env *env, int *laggard) { - const MDBX_meta *const a = METAPAGE_1(env); - const MDBX_meta *const b = METAPAGE_2(env); - txnid_t oldest = mdbx_meta_lt(a, b) ? b->mm_txnid : a->mm_txnid; + const MDBX_meta *const head = mdbx_meta_head( + env, F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) ? 
false : true); + txnid_t oldest = head->mm_txnid; int i, reader; const MDBX_reader *const r = env->me_lck->mti_readers; @@ -1589,12 +1665,11 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, if ((flags & MDBX_ALLOC_GC) && ((flags & MDBX_ALLOC_KICK) || rc == MDBX_MAP_FULL)) { - MDBX_meta *head = mdbx_meta_head(env); - MDBX_meta *tail = mdbx_env_meta_flipflop(env, head); + MDBX_meta *fluid = mdbx_meta_fluid_head(env); + MDBX_meta *steady = mdbx_meta_steady_head(env); - if (oldest == tail->mm_txnid && META_IS_WEAK(head) && - !META_IS_WEAK(tail)) { - MDBX_meta meta = *head; + if (oldest == steady->mm_txnid && META_IS_WEAK(fluid) && + !META_IS_WEAK(steady)) { /* LY: Here an oom was happened: * - all pages had allocated; * - reclaiming was stopped at the last steady-sync; @@ -1605,16 +1680,17 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, * don't make a steady-sync, but only a legacy-mode checkpoint, * just for resume reclaiming only, not for data consistency. */ - mdbx_debug("kick-gc: head %" PRIaTXN "/%c, tail %" PRIaTXN - "/%c, oldest %" PRIaTXN "", - head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N', - tail->mm_txnid, META_IS_WEAK(tail) ? 
'W' : 'N', oldest); + mdbx_debug("kick-gc: head %" PRIaTXN "-%s, tail %" PRIaTXN + "-%s, oldest %" PRIaTXN "", + fluid->mm_txnid, mdbx_durable_str(fluid), steady->mm_txnid, + mdbx_durable_str(steady), oldest); - int me_flags = env->me_flags & MDBX_WRITEMAP; - if ((env->me_flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC) + unsigned me_flags = env->me_flags & MDBX_WRITEMAP; + if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC)) me_flags |= MDBX_UTTERLY_NOSYNC; mdbx_assert(env, env->me_sync_pending > 0); + MDBX_meta meta = *fluid; if (mdbx_env_sync_locked(env, me_flags, &meta) == MDBX_SUCCESS) { txnid_t snap = mdbx_find_oldest(env, NULL); if (snap > oldest) { @@ -1878,7 +1954,7 @@ int mdbx_env_sync(MDBX_env *env, int force) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - MDBX_meta *head = mdbx_meta_head(env); + MDBX_meta *head = mdbx_meta_fluid_head(env); if (!META_IS_STEADY(head) || env->me_sync_pending || env->me_mapsize != head->mm_mapsize) { @@ -1907,11 +1983,16 @@ int mdbx_env_sync(MDBX_env *env, int force) { return rc; /* LY: head may be changed. 
*/ - head = mdbx_meta_head(env); + head = mdbx_meta_fluid_head(env); } if (!META_IS_STEADY(head) || env->me_sync_pending || env->me_mapsize != head->mm_mapsize) { + mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64 + ", mapsize env=%" PRIuPTR " meta=%" PRIuPTR, + container_of(head, MDBX_page, mp_data)->mp_pgno, + mdbx_durable_str(head), env->me_sync_pending, env->me_mapsize, + head->mm_mapsize); MDBX_meta meta = *head; rc = mdbx_env_sync_locked(env, flags, &meta); if (unlikely(rc != MDBX_SUCCESS)) { @@ -2058,7 +2139,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { env->me_live_reader = pid; } - for (;;) { + while (1) { nr = env->me_lck->mti_numreaders; for (i = 0; i < nr; i++) if (env->me_lck->mti_readers[i].mr_pid == 0) @@ -2096,7 +2177,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } while (1) { - MDBX_meta *const meta = mdbx_meta_head(txn->mt_env); + MDBX_meta *const meta = mdbx_meta_fluid_head(txn->mt_env); mdbx_jitter4testing(false); const txnid_t snap = meta->mm_txnid; mdbx_jitter4testing(false); @@ -2114,8 +2195,11 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { txn->mt_canary = meta->mm_canary; /* LY: Retry on a race, ITS#7970. */ - if (likely(meta == mdbx_meta_head(txn->mt_env) && snap == meta->mm_txnid)) + if (likely(meta == mdbx_meta_fluid_head(txn->mt_env) && + snap == meta->mm_txnid)) { + mdbx_jitter4testing(false); break; + } } txn->mt_ro_reader = r; @@ -2128,7 +2212,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { return rc; mdbx_jitter4testing(false); - MDBX_meta *meta = mdbx_meta_head(env); + MDBX_meta *meta = mdbx_meta_fluid_head(env); mdbx_jitter4testing(false); txn->mt_canary = meta->mm_canary; txn->mt_txnid = meta->mm_txnid + 1; @@ -3188,63 +3272,64 @@ fail: /* Read the environment parameters of a DB environment * before mapping it into memory. 
*/ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { - assert(offsetof(MDBX_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); + assert(offsetof(MDBX_page, mp_meta) == PAGEHDRSZ); memset(meta, 0, sizeof(MDBX_meta)); meta->mm_datasync_sign = MDBX_DATASIGN_WEAK; - unsigned offset = 0; - /* Read both meta pages so we can use the latest one. */ - for (int loops_left = 2; --loops_left >= 0;) { - MDBX_metabuf buf; + /* Read twice all meta pages so we can find the latest one. */ + unsigned loop_limit = NUM_METAS * 2; + for (unsigned loop_count = 0; loop_count < loop_limit; ++loop_count) { + MDBX_page page; - /* We don't know the page size on first time, so use a minimum value. */ - int rc = mdbx_pread(env->me_fd, &buf, sizeof(buf), offset); + /* We don't know the page size on first time. + * So, just guess it. */ + unsigned guess_pagesize = meta->mm_psize; + if (guess_pagesize == 0) + guess_pagesize = + (loop_count > NUM_METAS) ? env->me_psize : env->me_os_psize; + + const unsigned meta_number = loop_count % NUM_METAS; + const unsigned offset = guess_pagesize * meta_number; + int rc = mdbx_pread(env->me_fd, &page, sizeof(page), offset); if (rc != MDBX_SUCCESS) { - mdbx_debug("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(buf), rc, + mdbx_debug("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(page), rc, mdbx_strerror(rc)); return rc; } - MDBX_page *p = (MDBX_page *)&buf; - if (!F_ISSET(p->mp_flags, P_META)) { - mdbx_debug("page %" PRIaPGNO " not a meta-page", p->mp_pgno); + if (page.mp_pgno != meta_number) { + mdbx_debug("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, + page.mp_pgno); return MDBX_INVALID; } - MDBX_meta *m = PAGEDATA(p); - if (m->mm_magic != MDBX_MAGIC) { - mdbx_debug("meta[%u] has invalid magic", offset); + if (!F_ISSET(page.mp_flags, P_META)) { + mdbx_debug("page #%u not a meta-page", meta_number); return MDBX_INVALID; } - if (m->mm_version != MDBX_DATA_VERSION) { - mdbx_debug("database is version %u, expected version %u", 
m->mm_version, - MDBX_DATA_VERSION); + if (page.mp_meta.mm_magic != MDBX_MAGIC) { + mdbx_debug("meta[%u] has invalid magic", meta_number); + return MDBX_INVALID; + } + + if (page.mp_meta.mm_version != MDBX_DATA_VERSION) { + mdbx_debug("database is version %u, expected version %u", + page.mp_meta.mm_version, MDBX_DATA_VERSION); return MDBX_VERSION_MISMATCH; } -#ifndef MDBX_TEMPORARY_CRUTCH /* LY: check signature as a checksum */ - if (META_IS_STEADY(m) && m->mm_datasync_sign != mdbx_meta_sign(m)) { - mdbx_debug("steady-meta[%u] has invalid checksum", offset); + if (META_IS_STEADY(&page.mp_meta) && + page.mp_meta.mm_datasync_sign != mdbx_meta_sign(&page.mp_meta)) { + mdbx_debug("steady-meta[%u] has invalid checksum", meta_number); continue; } -#endif /* FIXME: MDBX_TEMPORARY_CRUTCH */ - if (mdbx_meta_lt(meta, m)) { - *meta = *m; + if (mdbx_meta_ot(meta, &page.mp_meta, true)) { + *meta = page.mp_meta; if (META_IS_WEAK(meta)) - loops_left += 1; /* LY: should re-read to avoid race */ - } - - if (offset) - offset = 0; - else { - offset = meta->mm_psize; - if (!offset) - offset = m->mm_psize; - if (!offset) - offset = env->me_os_psize; + loop_limit += 1; /* LY: should re-read to hush race with update */ } } @@ -3256,78 +3341,67 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { return MDBX_SUCCESS; } -/* Fill in most of the zeroed MDBX_meta for an empty database environment */ -static void __cold mdbx_meta_model(const MDBX_env *env, MDBX_meta *model) { +static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model, + unsigned num) { memset(model, 0, sizeof(*model)); - model->mm_magic = MDBX_MAGIC; - model->mm_version = MDBX_DATA_VERSION; - model->mm_mapsize = env->me_mapsize; - model->mm_psize = env->me_psize; - model->mm_last_pg = NUM_METAS - 1; - model->mm_flags = (uint16_t)env->me_flags; - model->mm_flags |= MDBX_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ - model->mm_dbs[FREE_DBI].md_root = P_INVALID; - 
model->mm_dbs[MAIN_DBI].md_root = P_INVALID; - model->mm_datasync_sign = mdbx_meta_sign(model); + model->mp_pgno = num; + model->mp_flags = P_META; + model->mp_meta.mm_magic = MDBX_MAGIC; + model->mp_meta.mm_version = MDBX_DATA_VERSION; + model->mp_meta.mm_mapsize = env->me_mapsize; + model->mp_meta.mm_psize = env->me_psize; + model->mp_meta.mm_last_pg = NUM_METAS - 1; + model->mp_meta.mm_flags = (uint16_t)env->me_flags; + model->mp_meta.mm_flags |= + MDBX_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ + model->mp_meta.mm_dbs[FREE_DBI].md_root = P_INVALID; + model->mp_meta.mm_dbs[MAIN_DBI].md_root = P_INVALID; + model->mp_meta.mm_txnid = num; + model->mp_meta.mm_datasync_sign = mdbx_meta_sign(&model->mp_meta); + return (MDBX_page *)((uint8_t *)model + env->me_psize); } -/* Write the environment parameters of a freshly created DB environment. */ -static int __cold mdbx_env_init_metas(const MDBX_env *env, MDBX_meta *model) { - mdbx_debug("writing new meta pages"); - assert(offsetof(MDBX_metabuf, mb_metabuf.mm_meta) == PAGEHDRSZ); - - unsigned page_size = env->me_psize; - MDBX_page *first = calloc(NUM_METAS, page_size); - if (!first) - return MDBX_ENOMEM; - first->mp_pgno = 0; - first->mp_flags = P_META; - MDBX_meta *first_meta = (MDBX_meta *)PAGEDATA(first); - - MDBX_page *second = (MDBX_page *)((char *)first + page_size); - second->mp_pgno = 1; - second->mp_flags = P_META; - MDBX_meta *second_meta = (MDBX_meta *)PAGEDATA(second); - - *first_meta = *model; - model->mm_txnid += 1; - *second_meta = *model; - - int rc = mdbx_pwrite(env->me_fd, first, page_size * NUM_METAS, 0); - - free(first); - return rc; +/* Fill in most of the zeroed meta-pages for an empty database environment. + * Return pointer to recenly (head) meta-page. 
*/ +static MDBX_page *__cold mdbx_init_metas(const MDBX_env *env, void *buffer) { + MDBX_page *page0 = (MDBX_page *)buffer; + MDBX_page *page1 = mdbx_meta_model(env, page0, 0); + MDBX_page *page2 = mdbx_meta_model(env, page1, 1); + mdbx_meta_model(env, page2, 2); + page2->mp_meta.mm_datasync_sign = MDBX_DATASIGN_WEAK; + mdbx_assert(env, !mdbx_meta_eq(&page0->mp_meta, &page1->mp_meta)); + mdbx_assert(env, !mdbx_meta_eq(&page1->mp_meta, &page2->mp_meta)); + mdbx_assert(env, !mdbx_meta_eq(&page2->mp_meta, &page0->mp_meta)); + return page1; } static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *pending) { - int rc; - MDBX_meta *head = mdbx_meta_head(env); - size_t prev_mapsize = head->mm_mapsize; - size_t used_size = env->me_psize * (pending->mm_last_pg + 1); + MDBX_meta *const pending) { + MDBX_meta *const meta0 = METAPAGE(env, 0); + MDBX_meta *const meta1 = METAPAGE(env, 1); + MDBX_meta *const meta2 = METAPAGE(env, 2); + MDBX_meta *const head = mdbx_meta_fluid_head(env); - mdbx_assert(env, pending != METAPAGE_1(env) && pending != METAPAGE_2(env)); + const size_t prev_mapsize = head->mm_mapsize; + const size_t used_size = env->me_psize * (pending->mm_last_pg + 1); + + mdbx_assert(env, mdbx_meta_eq_mask(env) == 0); + mdbx_assert(env, + pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); mdbx_assert(env, !META_IS_STEADY(head) || env->me_sync_pending != 0 || env->me_mapsize != prev_mapsize); pending->mm_mapsize = env->me_mapsize; mdbx_assert(env, pending->mm_mapsize >= used_size); - if (unlikely(pending->mm_mapsize != prev_mapsize)) { - if (pending->mm_mapsize < prev_mapsize) { - /* LY: currently this can't happen, but force full-sync. 
*/ - flags &= MDBX_WRITEMAP; - } else { - /* Persist any increases of mapsize config */ - } - } if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold) flags &= MDBX_WRITEMAP; /* LY: step#1 - sync previously written/updated data-pages */ + int rc = MDBX_RESULT_TRUE; if (env->me_sync_pending && (flags & MDBX_NOSYNC) == 0) { - assert(((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & MDBX_WRITEMAP) { rc = mdbx_msync(env->me_map, used_size, flags & MDBX_MAPASYNC); if (unlikely(rc != MDBX_SUCCESS)) @@ -3356,7 +3430,7 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, } } - /* LY: step#2 - update meta-page. */ + /* Steady or Weak */ if (env->me_sync_pending == 0) { pending->mm_datasync_sign = mdbx_meta_sign(pending); } else { @@ -3366,27 +3440,60 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, : MDBX_DATASIGN_WEAK; } - volatile MDBX_meta *target = - (pending->mm_txnid == head->mm_txnid || META_IS_WEAK(head)) - ? 
head - : mdbx_env_meta_flipflop(env, head); - size_t offset = (char *)target - env->me_map; + volatile MDBX_meta *target = nullptr; + if (head->mm_txnid == pending->mm_txnid) { + mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs, + sizeof(head->mm_dbs)) == 0); + mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary, + sizeof(head->mm_canary)) == 0); + mdbx_assert(env, head->mm_last_pg == pending->mm_last_pg); + mdbx_assert(env, head->mm_mapsize == pending->mm_mapsize); + if (!META_IS_STEADY(head) && META_IS_STEADY(pending)) + target = head; + else { + mdbx_assert(env, mdbx_meta_eq(head, pending)); + mdbx_debug("skip update meta"); + return MDBX_SUCCESS; + } + } else if (head == meta0) + target = mdbx_meta_ancient(env, meta1, meta2, true); + else if (head == meta1) + target = mdbx_meta_ancient(env, meta0, meta2, true); + else if (head == meta2) + target = mdbx_meta_ancient(env, meta0, meta1, true); - MDBX_meta *stay = mdbx_env_meta_flipflop(env, (MDBX_meta *)target); - mdbx_debug( - "writing meta %d (%s, was %" PRIaTXN "/%s, stay %s %" PRIaTXN - "/%s), root %" PRIaPGNO ", " - "txn_id %" PRIaTXN ", %s", - offset >= env->me_psize, target == head ? "head" : "tail", - target->mm_txnid, - META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" - : "Legacy", - stay == head ? "head" : "tail", stay->mm_txnid, - META_IS_WEAK(stay) ? "Weak" : META_IS_STEADY(stay) ? "Steady" : "Legacy", - pending->mm_dbs[MAIN_DBI].md_root, pending->mm_txnid, - META_IS_WEAK(pending) ? "Weak" : META_IS_STEADY(pending) ? "Steady" - : "Legacy"); + /* LY: step#2 - update meta-page. */ + mdbx_debug("writing meta%" PRIaPGNO " (%s, was %" PRIaTXN + ", %s), root %" PRIaPGNO "/%" PRIaPGNO ", " + "txn_id %" PRIaTXN ", %s", + container_of(target, MDBX_page, mp_data)->mp_pgno, + (target == head) ? 
"head" : "tail", target->mm_txnid, + mdbx_durable_str((const MDBX_meta *)target), + pending->mm_dbs[MAIN_DBI].md_root, + pending->mm_dbs[FREE_DBI].md_root, pending->mm_txnid, + mdbx_durable_str(pending)); + mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO + "/%" PRIaPGNO, + (meta0 == head) ? "head" : (meta0 == target) ? "tail" : "stay", + mdbx_durable_str(meta0), meta0->mm_txnid, + meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); + mdbx_debug("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO + "/%" PRIaPGNO, + (meta1 == head) ? "head" : (meta1 == target) ? "tail" : "stay", + mdbx_durable_str(meta1), meta1->mm_txnid, + meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); + mdbx_debug("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO + "/%" PRIaPGNO, + (meta2 == head) ? "head" : (meta2 == target) ? "tail" : "stay", + mdbx_durable_str(meta2), meta2->mm_txnid, + meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); + + mdbx_assert(env, !mdbx_meta_eq(pending, meta0)); + mdbx_assert(env, !mdbx_meta_eq(pending, meta1)); + mdbx_assert(env, !mdbx_meta_eq(pending, meta2)); + + const size_t offset = (char *)target - env->me_map; if (env->me_flags & MDBX_WRITEMAP) { /* LY: 'invalidate' the meta. */ mdbx_jitter4testing(true); @@ -3432,7 +3539,7 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, /* LY: step#3 - sync meta-pages. */ if ((flags & (MDBX_NOSYNC | MDBX_NOMETASYNC)) == 0) { - assert(((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & MDBX_WRITEMAP) { char *ptr = env->me_map + (offset & ~(env->me_os_psize - 1)); rc = mdbx_msync(ptr, env->me_os_psize, flags & MDBX_MAPASYNC); @@ -3570,9 +3677,9 @@ static int __cold mdbx_env_map(MDBX_env *env, void *addr, size_t usedsize) { #endif /* Lock meta pages to avoid unexpected write, - * before the data pages would be synchronized. 
*/ + * before the data pages would be synchronized. */ if (flags & MDBX_WRITEMAP) { - rc = mdbx_mlock(env->me_map, env->me_psize * 2); + rc = mdbx_mlock(env->me_map, env->me_psize * NUM_METAS); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -3604,7 +3711,7 @@ int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { return MDBX_EINVAL; /* FIXME: lock/unlock */ - meta = mdbx_meta_head(env); + meta = mdbx_meta_fluid_head(env); if (!size) size = meta->mm_mapsize; /* Silently round up to minimum if the size is too small */ @@ -3674,9 +3781,10 @@ int __cold mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers) { } /* Further setup required for opening an MDBX environment */ -static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { +static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { + MDBX_meta meta; int rc = MDBX_RESULT_FALSE; - int err = mdbx_read_header(env, meta); + int err = mdbx_read_header(env, &meta); if (unlikely(err != MDBX_SUCCESS)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || (env->me_flags & MDBX_RDONLY) != 0) @@ -3689,26 +3797,43 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { env->me_psize = env->me_os_psize; if (env->me_psize > MAX_PAGESIZE) env->me_psize = MAX_PAGESIZE; + env->me_mapsize = roundup2( env->me_mapsize ? 
env->me_mapsize : DEFAULT_MAPSIZE, env->me_os_psize); - mdbx_meta_model(env, meta); - err = mdbx_env_init_metas(env, meta); + + void *buffer = calloc(NUM_METAS, env->me_psize); + if (!buffer) + return MDBX_ENOMEM; + + meta = mdbx_init_metas(env, buffer)->mp_meta; + err = mdbx_pwrite(env->me_fd, buffer, env->me_psize * NUM_METAS, 0); + free(buffer); + if (unlikely(err != MDBX_SUCCESS)) + return err; + +#ifndef NDEBUG /* just for checking */ + err = mdbx_read_header(env, &meta); + if (unlikely(err != MDBX_SUCCESS)) + return err; +#endif + + err = mdbx_ftruncate(env->me_fd, env->me_mapsize); if (unlikely(err != MDBX_SUCCESS)) return err; } else { - env->me_psize = meta->mm_psize; + env->me_psize = meta.mm_psize; /* Make sure mapsize >= committed data size. Even when using * mm_mapsize, which could be broken in old files (ITS#7789). */ const size_t usedsize = - roundup2((meta->mm_last_pg + 1) * meta->mm_psize, env->me_os_psize); - if (meta->mm_mapsize < usedsize) - meta->mm_mapsize = usedsize; + roundup2((meta.mm_last_pg + 1) * env->me_psize, env->me_os_psize); + if (meta.mm_mapsize < usedsize) + meta.mm_mapsize = usedsize; /* Was a mapsize configured? 
*/ if (!env->me_mapsize || (env->me_flags & MDBX_RDONLY) || lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) - env->me_mapsize = meta->mm_mapsize; + env->me_mapsize = meta.mm_mapsize; else if (env->me_mapsize < usedsize) env->me_mapsize = usedsize; } @@ -3717,9 +3842,9 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { err = mdbx_filesize(env->me_fd, &size); if (unlikely(err != MDBX_SUCCESS)) return err; - if (size != env->me_mapsize) { - mdbx_trace("filesize mismatch"); + mdbx_notice("filesize mismatch (wanna %" PRIu64 ", have %" PRIu64 ")", + env->me_mapsize, size); if ((env->me_flags & MDBX_RDONLY) || lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) return MDBX_WANNA_RECOVERY /* LY: could not mdbx_ftruncate */; @@ -3733,20 +3858,28 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { if (err) return err; - const MDBX_meta *head = mdbx_meta_head(env); - if (head->mm_txnid != meta->mm_txnid) { - mdbx_trace("head->mm_txnid (%" PRIaTXN ") != (%" PRIaTXN ") meta->mm_txnid", - head->mm_txnid, meta->mm_txnid); + const unsigned meta_clash_mask = mdbx_meta_eq_mask(env); + if (meta_clash_mask) { + mdbx_error("meta-pages are clashed: mask 0x%d", meta_clash_mask); + return MDBX_WANNA_RECOVERY; + } + + const MDBX_meta *head = mdbx_meta_fluid_head(env); + if (head->mm_txnid != meta.mm_txnid) { if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { - assert(META_IS_STEADY(meta) && !META_IS_STEADY(head)); + assert(META_IS_STEADY(&meta) && !META_IS_STEADY(head)); if (env->me_flags & MDBX_RDONLY) { - mdbx_trace("exclusive, but read-only, unable recovery/rollback"); + mdbx_error("rollback needed: (from head %" PRIaTXN + " to steady %" PRIaTXN "), but unable in read-only mode", + head->mm_txnid, meta.mm_txnid); return MDBX_WANNA_RECOVERY /* LY: could not recovery/rollback */; } /* LY: rollback weak checkpoint */ MDBX_meta rollback = *head; rollback.mm_txnid = 0; + mdbx_trace("rollback: from %" PRIaTXN ", to %" PRIaTXN, 
head->mm_txnid, + meta.mm_txnid); err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta), (uint8_t *)head - (uint8_t *)env->me_map); if (err) @@ -3763,18 +3896,19 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, MDBX_meta *meta, int lck_rc) { } } - head = mdbx_meta_head(env); + head = mdbx_meta_fluid_head(env); if (head->mm_mapsize != env->me_mapsize) { - mdbx_trace("head->mm_mapsize (%" PRIu64 ") != (%" PRIu64 - ") env->mm_mapsize", - head->mm_mapsize, env->me_mapsize); + mdbx_info("mismatch meta.mapsize: present %" PRIu64 ", should %" PRIu64, + head->mm_mapsize, env->me_mapsize); if ((env->me_flags & MDBX_RDONLY) || lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) return MDBX_MAP_RESIZED; - *meta = *head; - meta->mm_mapsize = env->me_mapsize; - err = mdbx_env_sync_locked(env, env->me_flags & MDBX_WRITEMAP, meta); + mdbx_trace("updating meta.mapsize: from %" PRIu64 " to %" PRIu64, + head->mm_mapsize, env->me_mapsize); + meta = *head; + meta.mm_mapsize = env->me_mapsize; + err = mdbx_env_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); if (err) return err; } @@ -3989,8 +4123,7 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, goto bailout; } - MDBX_meta meta; - const int dxb_rc = mdbx_setup_dxb(env, &meta, lck_rc); + const int dxb_rc = mdbx_setup_dxb(env, lck_rc); if (MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; goto bailout; @@ -4051,13 +4184,13 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, #if MDBX_DEBUG if (rc == MDBX_SUCCESS) { - MDBX_meta *meta = mdbx_meta_head(env); + MDBX_meta *meta = mdbx_meta_fluid_head(env); MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; - int toggle = ((char *)meta == PAGEDATA(env->me_map)) ? 
0 : 1; mdbx_debug("opened database version %u, pagesize %u", meta->mm_version, env->me_psize); - mdbx_debug("using meta page %d, txn %" PRIaTXN "", toggle, meta->mm_txnid); + mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN "", + container_of(meta, MDBX_page, mp_data)->mp_pgno, meta->mm_txnid); mdbx_debug("depth: %u", db->md_depth); mdbx_debug("entries: %" PRIu64 "", db->md_entries); mdbx_debug("branch pages: %" PRIaPGNO "", db->md_branch_pages); @@ -4653,7 +4786,7 @@ static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) { } } - mdbx_cassert(mc, root > 1); + mdbx_cassert(mc, root >= NUM_METAS); if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)) return rc; @@ -8337,18 +8470,7 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { if (unlikely(rc != MDBX_SUCCESS)) goto finish; - MDBX_page* mp = (MDBX_page *)my.mc_wbuf[0]; - memset(mp, 0, NUM_METAS * env->me_psize); - mp->mp_pgno = 0; - mp->mp_flags = P_META; - MDBX_meta* mm = (MDBX_meta *)PAGEDATA(mp); - mdbx_meta_model(env, mm); - - mp = (MDBX_page *)(my.mc_wbuf[0] + env->me_psize); - mp->mp_pgno = 1; - mp->mp_flags = P_META; - *(MDBX_meta *)PAGEDATA(mp) = *mm; - mm = (MDBX_meta *)PAGEDATA(mp); + MDBX_page *meta = mdbx_init_metas(env, my.mc_wbuf[0]); /* Set metapage 1 with current main DB */ pgno_t new_root, root = txn->mt_dbs[MAIN_DBI].md_root; @@ -8370,18 +8492,24 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { txn->mt_dbs[FREE_DBI].md_overflow_pages; new_root = txn->mt_next_pgno - 1 - freecount; - mm->mm_last_pg = new_root; - mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - mm->mm_dbs[MAIN_DBI].md_root = new_root; + meta->mp_meta.mm_last_pg = new_root; + meta->mp_meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + meta->mp_meta.mm_dbs[MAIN_DBI].md_root = new_root; } else { /* When the DB is empty, handle it specially to * fix any breakage like page leaks from ITS#8174. 
*/ - mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; + meta->mp_meta.mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; } - if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { - mm->mm_txnid = 1; /* use metapage 1 */ + + /* copy canary sequenses if present */ + if (txn->mt_canary.v) { + meta->mp_meta.mm_canary = txn->mt_canary; + meta->mp_meta.mm_canary.v = meta->mp_meta.mm_txnid; } + /* update signature */ + meta->mp_meta.mm_datasync_sign = mdbx_meta_sign(&meta->mp_meta); + my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_txn = txn; rc = mdbx_env_cwalk(&my, &root, 0); @@ -8582,12 +8710,11 @@ int __cold mdbx_env_stat(MDBX_env *env, MDBX_stat *arg, size_t bytes) { if (unlikely(bytes != sizeof(MDBX_stat))) return MDBX_EINVAL; - meta = mdbx_meta_head(env); + meta = mdbx_meta_fluid_head(env); return mdbx_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); } int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { - MDBX_meta *meta; if (unlikely(env == NULL || arg == NULL)) return MDBX_EINVAL; @@ -8595,37 +8722,38 @@ int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { if (bytes != sizeof(MDBX_envinfo)) return MDBX_EINVAL; - MDBX_meta *m1, *m2; - MDBX_reader *r; - unsigned i; - - m1 = METAPAGE_1(env); - m2 = METAPAGE_2(env); - + const MDBX_meta *const meta0 = METAPAGE(env, 0); + const MDBX_meta *const meta1 = METAPAGE(env, 1); + const MDBX_meta *const meta2 = METAPAGE(env, 2); do { - meta = mdbx_meta_head(env); - arg->me_last_txnid = meta->mm_txnid; - arg->me_last_pgno = meta->mm_last_pg; - arg->me_meta1_txnid = m1->mm_txnid; - arg->me_meta1_sign = m1->mm_datasync_sign; - arg->me_meta2_txnid = m2->mm_txnid; - arg->me_meta2_sign = m2->mm_datasync_sign; - } while (unlikely(arg->me_last_txnid != mdbx_meta_head(env)->mm_txnid || - arg->me_meta1_sign != m1->mm_datasync_sign || - arg->me_meta2_sign != m2->mm_datasync_sign)); + const MDBX_meta *meta = mdbx_meta_fluid_head(env); + arg->me_meta0_txnid = 
meta0->mm_txnid; + arg->me_meta0_sign = meta0->mm_datasync_sign; + arg->me_meta1_txnid = meta1->mm_txnid; + arg->me_meta1_sign = meta1->mm_datasync_sign; + arg->me_meta2_txnid = meta2->mm_txnid; + arg->me_meta2_sign = meta2->mm_datasync_sign; + arg->me_recent_txnid = meta->mm_txnid; + arg->me_recent_pgno = meta->mm_last_pg; + } while (unlikely(arg->me_meta0_txnid != meta0->mm_txnid || + arg->me_meta0_sign != meta0->mm_datasync_sign || + arg->me_meta1_txnid != meta1->mm_txnid || + arg->me_meta1_sign != meta1->mm_datasync_sign || + arg->me_meta2_txnid != meta2->mm_txnid || + arg->me_meta2_sign != meta2->mm_datasync_sign)); arg->me_mapsize = env->me_mapsize; arg->me_maxreaders = env->me_maxreaders; arg->me_numreaders = env->me_lck->mti_numreaders; - arg->me_tail_txnid = 0; + arg->me_latter_reader_txnid = 0; - r = env->me_lck->mti_readers; - arg->me_tail_txnid = arg->me_last_txnid; - for (i = 0; i < arg->me_numreaders; ++i) { + MDBX_reader *r = env->me_lck->mti_readers; + arg->me_latter_reader_txnid = arg->me_recent_txnid; + for (unsigned i = 0; i < arg->me_numreaders; ++i) { if (r[i].mr_pid) { txnid_t mr = r[i].mr_txnid; - if (arg->me_tail_txnid > mr) - arg->me_tail_txnid = mr; + if (arg->me_latter_reader_txnid > mr) + arg->me_latter_reader_txnid = mr; } } @@ -9264,7 +9392,7 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { continue; rc = env->me_oom_func(env, pid, tid, oldest, - mdbx_meta_head(env)->mm_txnid - oldest, retry); + mdbx_meta_fluid_head(env)->mm_txnid - oldest, retry); if (rc < 0) break; @@ -9329,7 +9457,7 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) return -1; MDBX_env *env = txn->mt_env; - MDBX_meta *meta = mdbx_meta_head(env); + MDBX_meta *meta = mdbx_meta_fluid_head(env); if (percent) { size_t maxpg = env->me_maxpg; size_t last = meta->mm_last_pg + 1; @@ -9487,9 +9615,10 @@ int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, ctx.mw_user = user; ctx.mw_visitor = visitor; - int rc = visitor(0, 2, user, 
"mdbx", "meta", 2, sizeof(MDBX_meta) * 2, - PAGEHDRSZ * 2, - (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * 2); + int rc = visitor(0, NUM_METAS, user, "mdbx", "meta", NUM_METAS, + sizeof(MDBX_meta) * NUM_METAS, PAGEHDRSZ * NUM_METAS, + (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * + NUM_METAS); if (!rc) rc = mdbx_env_walk(&ctx, "free", txn->mt_dbs[FREE_DBI].md_root, 0); if (!rc) diff --git a/src/osal.c b/src/osal.c index 458e50e3..0611ad38 100644 --- a/src/osal.c +++ b/src/osal.c @@ -48,8 +48,8 @@ __extern_C __declspec(dllimport) void __cdecl _assert(char const *message, #endif /* _MSC_VER */ #ifndef mdbx_assert_fail -void __cold mdbx_assert_fail(MDBX_env *env, const char *msg, const char *func, - int line) { +void __cold mdbx_assert_fail(const MDBX_env *env, const char *msg, + const char *func, int line) { #if MDBX_DEBUG if (env && env->me_assert_func) { env->me_assert_func(env, msg, func, line); diff --git a/src/osal.h b/src/osal.h index daea002c..123d42ac 100644 --- a/src/osal.h +++ b/src/osal.h @@ -338,7 +338,7 @@ static __inline void mdbx_invalidate_cache(void *addr, size_t nbytes) { /* libc compatibility stuff */ #ifndef mdbx_assert_fail -void mdbx_assert_fail(MDBX_env *env, const char *msg, const char *func, +void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, int line); #endif /* mdbx_assert_fail */ diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 71e8e103..f4578a6e 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -323,7 +323,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, if (key->iov_len != sizeof(txnid_t)) problem_add("entry", record_number, "wrong txn-id size", "key-size %" PRIiPTR "", key->iov_len); - else if (txnid < 1 || txnid > envinfo.me_last_txnid) + else if (txnid < 1 || txnid > envinfo.me_recent_txnid) problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN "", txnid); if (data->iov_len < sizeof(pgno_t) || data->iov_len % 
sizeof(pgno_t)) @@ -340,14 +340,14 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, data->iov_len); else { freedb_pages += number; - if (envinfo.me_tail_txnid > txnid) + if (envinfo.me_latter_reader_txnid > txnid) reclaimable_pages += number; for (i = number, prev = 1; --i >= 0;) { pg = iptr[i]; - if (pg < NUM_METAS || pg > envinfo.me_last_pgno) + if (pg < NUM_METAS || pg > envinfo.me_recent_pgno) problem_add("entry", record_number, "wrong idl entry", "%u < %" PRIiPTR " < %" PRIiPTR "", NUM_METAS, pg, - envinfo.me_last_pgno); + envinfo.me_recent_pgno); else if (pg <= prev) { bad = " [bad sequence]"; problem_add("entry", record_number, "bad sequence", @@ -431,8 +431,7 @@ static int process_db(MDBX_dbi dbi, char *name, visitor *handler, int silent) { } } - if (dbi >= 2 /* CORE_DBS */ && name && only_subdb && - strcmp(only_subdb, name)) { + if (dbi >= CORE_DBS && name && only_subdb && strcmp(only_subdb, name)) { if (verbose) { print("Skip processing '%s'...\n", name); fflush(NULL); @@ -592,19 +591,132 @@ static void usage(char *prog) { const char *meta_synctype(uint64_t sign) { switch (sign) { - case 0: + case MDBX_DATASIGN_NONE: return "no-sync/legacy"; - case 1: + case MDBX_DATASIGN_WEAK: return "weak"; default: return "steady"; } } -int meta_lt(txnid_t txn1, uint64_t sign1, txnid_t txn2, uint64_t sign2) { - return (SIGN_IS_STEADY(sign1) == SIGN_IS_STEADY(sign2)) - ? 
txn1 < txn2 - : txn2 && SIGN_IS_STEADY(sign2); +static __inline bool meta_ot(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b, + uint64_t sign_b, const bool roolback2steady) { + if (txn_a == txn_b) + return SIGN_IS_STEADY(sign_b); + + if (roolback2steady && SIGN_IS_STEADY(sign_a) != SIGN_IS_STEADY(sign_b)) + return SIGN_IS_STEADY(sign_b); + + return txn_a < txn_b; +} + +static __inline bool meta_eq(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b, + uint64_t sign_b) { + if (txn_a != txn_b) + return false; + + if (SIGN_IS_STEADY(sign_a) != SIGN_IS_STEADY(sign_b)) + return false; + + return true; +} + +static __inline int meta_recent(const bool roolback2steady) { + + if (meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign, roolback2steady)) + return meta_ot(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign, + roolback2steady) + ? 1 + : 2; + + return meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, + envinfo.me_meta2_txnid, envinfo.me_meta2_sign, roolback2steady) + ? 2 + : 0; +} + +static __inline int meta_ancient(const bool roolback2steady) { + + if (meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign, roolback2steady)) + return meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, + envinfo.me_meta2_txnid, envinfo.me_meta2_sign, + roolback2steady) + ? 0 + : 2; + return meta_ot(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign, roolback2steady) + ? 
2 + : 1; +} + +static int meta_steady_head(void) { return meta_recent(true); } + +static int meta_weak_head(void) { return meta_recent(false); } + +static int meta_tail(void) { return meta_ancient(true); } + +void verbose_meta(int num, txnid_t txnid, uint64_t sign) { + print(" - meta-%d: %s %" PRIu64, num, meta_synctype(sign), txnid); + bool stay = true; + + if (num == meta_steady_head() && num == meta_weak_head()) { + print(", head"); + stay = false; + } else if (num == meta_steady_head()) { + print(", head-steady"); + stay = false; + } else if (num == meta_weak_head()) { + print(", head-weak"); + stay = false; + } + if (num == meta_tail()) { + print(", tail"); + stay = false; + } + if (stay) + print(", stay"); + + if (txnid > envinfo.me_recent_txnid) + print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", + txnid - envinfo.me_recent_txnid, txnid, envinfo.me_recent_txnid); + print("\n"); +} + +static int check_meta_head(bool steady) { + switch (meta_recent(steady)) { + default: + assert(false); + error(" - unexpected internal error (%s)\n", + steady ? 
"meta_steady_head" : "meta_weak_head"); + case 0: + if (envinfo.me_meta0_txnid != envinfo.me_recent_txnid) { + print(" - meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 + ")\n", + 0, envinfo.me_meta0_txnid, envinfo.me_recent_txnid); + return 1; + } + break; + case 1: + if (envinfo.me_meta1_txnid != envinfo.me_recent_txnid) { + print(" - meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 + ")\n", + 1, envinfo.me_meta1_txnid, envinfo.me_recent_txnid); + return 1; + } + break; + case 2: + if (envinfo.me_meta2_txnid != envinfo.me_recent_txnid) { + print(" - meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 + ")\n", + 2, envinfo.me_meta2_txnid, envinfo.me_recent_txnid); + return 1; + } + } + return 0; } int main(int argc, char *argv[]) { @@ -739,7 +851,7 @@ int main(int argc, char *argv[]) { goto bailout; } - lastpgno = envinfo.me_last_pgno + 1; + lastpgno = envinfo.me_recent_pgno + 1; errno = 0; if (verbose) { @@ -754,71 +866,45 @@ int main(int argc, char *argv[]) { print(" - mapaddr %p\n", envinfo.me_mapaddr); print(" - pagesize %u, max keysize %" PRIuPTR ", max readers %u\n", envstat.ms_psize, maxkeysize, envinfo.me_maxreaders); - print(" - transactions: last %" PRIu64 ", bottom %" PRIu64 - ", lag reading %" PRIi64 "\n", - envinfo.me_last_txnid, envinfo.me_tail_txnid, - envinfo.me_last_txnid - envinfo.me_tail_txnid); + print(" - transactions: recent %" PRIu64 ", latter reader %" PRIu64 + ", lag %" PRIi64 "\n", + envinfo.me_recent_txnid, envinfo.me_latter_reader_txnid, + envinfo.me_recent_txnid - envinfo.me_latter_reader_txnid); - print(" - meta-1: %s %" PRIu64 ", %s", meta_synctype(envinfo.me_meta1_sign), - envinfo.me_meta1_txnid, - meta_lt(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, - envinfo.me_meta2_txnid, envinfo.me_meta2_sign) - ? 
"tail" - : "head"); - if (envinfo.me_meta1_txnid > envinfo.me_last_txnid) - print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", - envinfo.me_meta1_txnid - envinfo.me_last_txnid, - envinfo.me_meta1_txnid, envinfo.me_last_txnid); - print("\n"); + verbose_meta(0, envinfo.me_meta0_txnid, envinfo.me_meta0_sign); + verbose_meta(1, envinfo.me_meta1_txnid, envinfo.me_meta1_sign); + verbose_meta(2, envinfo.me_meta2_txnid, envinfo.me_meta2_sign); + } - print(" - meta-2: %s %" PRIu64 ", %s", meta_synctype(envinfo.me_meta2_sign), - envinfo.me_meta2_txnid, - meta_lt(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, - envinfo.me_meta1_txnid, envinfo.me_meta1_sign) - ? "tail" - : "head"); - if (envinfo.me_meta2_txnid > envinfo.me_last_txnid) - print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", - envinfo.me_meta2_txnid - envinfo.me_last_txnid, - envinfo.me_meta2_txnid, envinfo.me_last_txnid); - print("\n"); + if (verbose) + print(" - performs check for meta-pages overlap\n"); + if (meta_eq(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign)) { + print(" - meta-%d and meta-%d are clashed\n", 0, 1); + ++problems_meta; + } + if (meta_eq(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, + envinfo.me_meta2_txnid, envinfo.me_meta2_sign)) { + print(" - meta-%d and meta-%d are clashed\n", 1, 2); + ++problems_meta; + } + if (meta_eq(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, + envinfo.me_meta0_txnid, envinfo.me_meta0_sign)) { + print(" - meta-%d and meta-%d are clashed\n", 2, 0); + ++problems_meta; } if (exclusive > 1) { if (verbose) - print(" - perform full check last-txn-id with meta-pages\n"); - - if (!meta_lt(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, - envinfo.me_meta2_txnid, envinfo.me_meta2_sign) && - envinfo.me_meta1_txnid != envinfo.me_last_txnid) { - print(" - meta-1 txn-id mismatch last-txn-id (%" PRIi64 " != %" PRIi64 - ")\n", - envinfo.me_meta1_txnid, envinfo.me_last_txnid); - ++problems_meta; - } - - if 
(!meta_lt(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, - envinfo.me_meta1_txnid, envinfo.me_meta1_sign) && - envinfo.me_meta2_txnid != envinfo.me_last_txnid) { - print(" - meta-2 txn-id mismatch last-txn-id (%" PRIi64 " != %" PRIi64 - ")\n", - envinfo.me_meta2_txnid, envinfo.me_last_txnid); - ++problems_meta; - } + print(" - performs full check recent-txn-id with meta-pages\n"); + problems_meta += check_meta_head(true); } else if (locktxn) { if (verbose) - print(" - perform lite check last-txn-id with meta-pages (not a " + print(" - performs lite check recent-txn-id with meta-pages (not a " "monopolistic mode)\n"); - uint64_t last = (envinfo.me_meta2_txnid > envinfo.me_meta1_txnid) - ? envinfo.me_meta2_txnid - : envinfo.me_meta1_txnid; - if (last != envinfo.me_last_txnid) { - print(" - last-meta mismatch last-txn-id (%" PRIi64 " != %" PRIi64 ")\n", - last, envinfo.me_last_txnid); - ++problems_meta; - } + problems_meta += check_meta_head(false); } else if (verbose) { - print(" - skip check last-txn-id with meta-pages (monopolistic or " + print(" - skip check recent-txn-id with meta-pages (monopolistic or " "write-lock mode only)\n"); } diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index e3c53a9d..efa69b42 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -125,10 +125,11 @@ int main(int argc, char *argv[]) { printf(" Map size: %" PRIu64 "\n", mei.me_mapsize); printf(" Page size: %u\n", mst.ms_psize); printf(" Max pages: %" PRIu64 "\n", mei.me_mapsize / mst.ms_psize); - printf(" Number of pages used: %" PRIu64 "\n", mei.me_last_pgno + 1); - printf(" Last transaction ID: %" PRIu64 "\n", mei.me_last_txnid); + printf(" Number of pages used: %" PRIu64 "\n", mei.me_recent_pgno + 1); + printf(" Last transaction ID: %" PRIu64 "\n", mei.me_recent_txnid); printf(" Tail transaction ID: %" PRIu64 " (%" PRIi64 ")\n", - mei.me_tail_txnid, mei.me_tail_txnid - mei.me_last_txnid); + mei.me_latter_reader_txnid, + mei.me_latter_reader_txnid - 
mei.me_recent_txnid); printf(" Max readers: %u\n", mei.me_maxreaders); printf(" Number of readers used: %u\n", mei.me_numreaders); } else { @@ -181,7 +182,7 @@ int main(int argc, char *argv[]) { while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) == 0) { iptr = data.iov_base; pages += *iptr; - if (envinfo && mei.me_tail_txnid > *(size_t *)key.iov_base) + if (envinfo && mei.me_latter_reader_txnid > *(size_t *)key.iov_base) reclaimable += *iptr; if (freinfo > 1) { char *bad = ""; @@ -220,14 +221,14 @@ int main(int argc, char *argv[]) { printf("Page Allocation Info\n"); printf(" Max pages: %9zu 100%%\n", value); - value = mei.me_last_pgno + 1; + value = mei.me_recent_pgno + 1; printf(" Number of pages used: %" PRIuPTR " %.1f%%\n", value, value / percent); - value = mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1); + value = mei.me_mapsize / mst.ms_psize - (mei.me_recent_pgno + 1); printf(" Remained: %" PRIuPTR " %.1f%%\n", value, value / percent); - value = mei.me_last_pgno + 1 - pages; + value = mei.me_recent_pgno + 1 - pages; printf(" Used now: %" PRIuPTR " %.1f%%\n", value, value / percent); value = pages; @@ -239,8 +240,8 @@ int main(int argc, char *argv[]) { value = reclaimable; printf(" Reclaimable: %" PRIuPTR " %.1f%%\n", value, value / percent); - value = - mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1) + reclaimable; + value = mei.me_mapsize / mst.ms_psize - (mei.me_recent_pgno + 1) + + reclaimable; printf(" Available: %" PRIuPTR " %.1f%%\n", value, value / percent); } else printf(" Free pages: %" PRIuPTR "\n", pages); From 700f3e9c2acfcd9f95675e0034f488f477d1bbd2 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 25 May 2017 14:07:58 +0300 Subject: [PATCH 176/303] mdbx: more checks for meta-params. 
--- src/mdbx.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 083e300f..4624b642 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3798,6 +3798,8 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { if (env->me_psize > MAX_PAGESIZE) env->me_psize = MAX_PAGESIZE; + assert(is_power2(env->me_psize)); + assert(env->me_psize >= MIN_PAGESIZE); env->me_mapsize = roundup2( env->me_mapsize ? env->me_mapsize : DEFAULT_MAPSIZE, env->me_os_psize); @@ -3822,6 +3824,11 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { return err; } else { env->me_psize = meta.mm_psize; + if (!is_power2(env->me_psize) || env->me_psize < MIN_PAGESIZE) { + mdbx_error("wrong pagesize %u (system %u)", env->me_psize, + env->me_os_psize); + return MDBX_WANNA_RECOVERY; + } /* Make sure mapsize >= committed data size. Even when using * mm_mapsize, which could be broken in old files (ITS#7789). */ @@ -3846,8 +3853,10 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { mdbx_notice("filesize mismatch (wanna %" PRIu64 ", have %" PRIu64 ")", env->me_mapsize, size); if ((env->me_flags & MDBX_RDONLY) || - lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) + lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { + mdbx_error("exclusive, but read-only, unable ftruncate/set-size"); return MDBX_WANNA_RECOVERY /* LY: could not mdbx_ftruncate */; + } err = mdbx_ftruncate(env->me_fd, env->me_mapsize); if (unlikely(err != MDBX_SUCCESS)) @@ -3887,12 +3896,12 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { } else if (!env->me_lck) { /* LY: without-lck (read-only) mode, so it is imposible that other * process made weak checkpoint. 
*/ - mdbx_trace("without-lck, unable recovery/rollback"); + mdbx_error("without-lck, unable recovery/rollback"); return MDBX_WANNA_RECOVERY; } else { /* LY: assume just have a collision with other running process, * or someone make a weak checkpoint */ - mdbx_trace("assume collision or online weak checkpoint"); + mdbx_info("assume collision or online weak checkpoint"); } } @@ -3973,7 +3982,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { const uint64_t maxreaders = (size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader) + 1; if (maxreaders > UINT16_MAX) { - mdbx_notice("lck-size too big (up to %" PRIu64 " readers)", maxreaders); + mdbx_error("lck-size too big (up to %" PRIu64 " readers)", maxreaders); return MDBX_PROBLEM; } env->me_maxreaders = (unsigned)maxreaders; @@ -4019,11 +4028,11 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { env->me_lck->mti_format = MDBX_LOCK_FORMAT; } else { if (env->me_lck->mti_magic != MDBX_MAGIC) { - mdbx_debug("lock region has invalid magic"); + mdbx_error("lock region has invalid magic"); return MDBX_INVALID; } if (env->me_lck->mti_format != MDBX_LOCK_FORMAT) { - mdbx_debug("lock region has format+version 0x%" PRIx64 + mdbx_error("lock region has format+version 0x%" PRIx64 ", expected 0x%" PRIx64, env->me_lck->mti_format, MDBX_LOCK_FORMAT); return MDBX_VERSION_MISMATCH; @@ -4148,7 +4157,7 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, *exclusive = 0; } if ((env->me_lck->mti_envmode ^ env->me_flags) & mode_flags) { - /* LY: Current mode/flags incompatible with requested. */ + mdbx_error("current mode/flags incompatible with requested"); rc = MDBX_INCOMPATIBLE; goto bailout; } From 47cc45dec4888ab828bcc67c07b6d9b6a2553d47 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 10:50:20 +0300 Subject: [PATCH 177/303] mdbx: don't check present durability mode for MDBX_RDONLY. 
--- src/mdbx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 4624b642..4e7c9d88 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4156,7 +4156,8 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, /* LY: just indicate that is not an exclusive access. */ *exclusive = 0; } - if ((env->me_lck->mti_envmode ^ env->me_flags) & mode_flags) { + if ((env->me_flags & MDBX_RDONLY) == 0 && + ((env->me_lck->mti_envmode ^ env->me_flags) & mode_flags) != 0) { mdbx_error("current mode/flags incompatible with requested"); rc = MDBX_INCOMPATIBLE; goto bailout; From 1e0654a4a8a569e0b720d10aa9951be82e6d25bf Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 11:08:21 +0300 Subject: [PATCH 178/303] tool: show operational txnid in mdbx_chk. --- src/tools/mdbx_chk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index f4578a6e..f876c1ee 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -913,7 +913,7 @@ int main(int argc, char *argv[]) { size_t traversal_problems; size_t empty_pages, lost_bytes; - print("Traversal b-tree...\n"); + print("Traversal b-tree by txn#%" PRIaTXN "...\n", txn->mt_txnid); fflush(NULL); walk.pagemap = calloc(lastpgno, sizeof(*walk.pagemap)); if (!walk.pagemap) { From 5d836d2b03a0443ef2af3bf1f73732b40a437ece Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 11:45:13 +0300 Subject: [PATCH 179/303] mdbx: clarify steady/fluid/weak names of a head. --- src/mdbx.c | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 4e7c9d88..e9b74531 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1344,8 +1344,8 @@ static __inline MDBX_meta *mdbx_meta_ancient(const MDBX_env *env, MDBX_meta *a, return a_older_that_b ? 
a : b; } -static __inline MDBX_meta *mdbx_meta_head(const MDBX_env *env, - const bool roolback2steady) { +static __inline MDBX_meta *mdbx_meta_mostrecent(const MDBX_env *env, + const bool roolback2steady) { MDBX_meta *m0 = METAPAGE(env, 0); MDBX_meta *m1 = METAPAGE(env, 1); MDBX_meta *m2 = METAPAGE(env, 2); @@ -1355,12 +1355,12 @@ static __inline MDBX_meta *mdbx_meta_head(const MDBX_env *env, return head; } -static __hot MDBX_meta *mdbx_meta_steady_head(const MDBX_env *env) { - return mdbx_meta_head(env, true); +static __hot MDBX_meta *mdbx_meta_steady(const MDBX_env *env) { + return mdbx_meta_mostrecent(env, true); } -static __hot MDBX_meta *mdbx_meta_fluid_head(const MDBX_env *env) { - return mdbx_meta_head(env, false); +static __hot MDBX_meta *mdbx_meta_head(const MDBX_env *env) { + return mdbx_meta_mostrecent(env, false); } static const char *mdbx_durable_str(const MDBX_meta *const meta) { @@ -1374,7 +1374,7 @@ static const char *mdbx_durable_str(const MDBX_meta *const meta) { /* Find oldest txnid still referenced. */ static txnid_t mdbx_find_oldest(MDBX_env *env, int *laggard) { - const MDBX_meta *const head = mdbx_meta_head( + const MDBX_meta *const head = mdbx_meta_mostrecent( env, F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) ? 
false : true); txnid_t oldest = head->mm_txnid; @@ -1665,10 +1665,10 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, if ((flags & MDBX_ALLOC_GC) && ((flags & MDBX_ALLOC_KICK) || rc == MDBX_MAP_FULL)) { - MDBX_meta *fluid = mdbx_meta_fluid_head(env); - MDBX_meta *steady = mdbx_meta_steady_head(env); + MDBX_meta *head = mdbx_meta_head(env); + MDBX_meta *steady = mdbx_meta_steady(env); - if (oldest == steady->mm_txnid && META_IS_WEAK(fluid) && + if (oldest == steady->mm_txnid && META_IS_WEAK(head) && !META_IS_WEAK(steady)) { /* LY: Here an oom was happened: * - all pages had allocated; @@ -1682,7 +1682,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, mdbx_debug("kick-gc: head %" PRIaTXN "-%s, tail %" PRIaTXN "-%s, oldest %" PRIaTXN "", - fluid->mm_txnid, mdbx_durable_str(fluid), steady->mm_txnid, + head->mm_txnid, mdbx_durable_str(head), steady->mm_txnid, mdbx_durable_str(steady), oldest); unsigned me_flags = env->me_flags & MDBX_WRITEMAP; @@ -1690,7 +1690,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, me_flags |= MDBX_UTTERLY_NOSYNC; mdbx_assert(env, env->me_sync_pending > 0); - MDBX_meta meta = *fluid; + MDBX_meta meta = *head; if (mdbx_env_sync_locked(env, me_flags, &meta) == MDBX_SUCCESS) { txnid_t snap = mdbx_find_oldest(env, NULL); if (snap > oldest) { @@ -1954,7 +1954,7 @@ int mdbx_env_sync(MDBX_env *env, int force) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - MDBX_meta *head = mdbx_meta_fluid_head(env); + MDBX_meta *head = mdbx_meta_head(env); if (!META_IS_STEADY(head) || env->me_sync_pending || env->me_mapsize != head->mm_mapsize) { @@ -1983,7 +1983,7 @@ int mdbx_env_sync(MDBX_env *env, int force) { return rc; /* LY: head may be changed. 
*/ - head = mdbx_meta_fluid_head(env); + head = mdbx_meta_head(env); } if (!META_IS_STEADY(head) || env->me_sync_pending || @@ -2177,7 +2177,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } while (1) { - MDBX_meta *const meta = mdbx_meta_fluid_head(txn->mt_env); + MDBX_meta *const meta = mdbx_meta_head(txn->mt_env); mdbx_jitter4testing(false); const txnid_t snap = meta->mm_txnid; mdbx_jitter4testing(false); @@ -2195,7 +2195,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { txn->mt_canary = meta->mm_canary; /* LY: Retry on a race, ITS#7970. */ - if (likely(meta == mdbx_meta_fluid_head(txn->mt_env) && + if (likely(meta == mdbx_meta_head(txn->mt_env) && snap == meta->mm_txnid)) { mdbx_jitter4testing(false); break; @@ -2212,7 +2212,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { return rc; mdbx_jitter4testing(false); - MDBX_meta *meta = mdbx_meta_fluid_head(env); + MDBX_meta *meta = mdbx_meta_head(env); mdbx_jitter4testing(false); txn->mt_canary = meta->mm_canary; txn->mt_txnid = meta->mm_txnid + 1; @@ -3380,7 +3380,7 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const meta0 = METAPAGE(env, 0); MDBX_meta *const meta1 = METAPAGE(env, 1); MDBX_meta *const meta2 = METAPAGE(env, 2); - MDBX_meta *const head = mdbx_meta_fluid_head(env); + MDBX_meta *const head = mdbx_meta_head(env); const size_t prev_mapsize = head->mm_mapsize; const size_t used_size = env->me_psize * (pending->mm_last_pg + 1); @@ -3451,7 +3451,7 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, if (!META_IS_STEADY(head) && META_IS_STEADY(pending)) target = head; else { - mdbx_assert(env, mdbx_meta_eq(head, pending)); + mdbx_ensure(env, mdbx_meta_eq(head, pending)); mdbx_debug("skip update meta"); return MDBX_SUCCESS; } @@ -3711,7 +3711,7 @@ int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { return MDBX_EINVAL; /* FIXME: lock/unlock */ - meta = mdbx_meta_fluid_head(env); + meta = mdbx_meta_head(env); 
if (!size) size = meta->mm_mapsize; /* Silently round up to minimum if the size is too small */ @@ -3873,7 +3873,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { return MDBX_WANNA_RECOVERY; } - const MDBX_meta *head = mdbx_meta_fluid_head(env); + const MDBX_meta *head = mdbx_meta_head(env); if (head->mm_txnid != meta.mm_txnid) { if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { assert(META_IS_STEADY(&meta) && !META_IS_STEADY(head)); @@ -3905,7 +3905,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { } } - head = mdbx_meta_fluid_head(env); + head = mdbx_meta_head(env); if (head->mm_mapsize != env->me_mapsize) { mdbx_info("mismatch meta.mapsize: present %" PRIu64 ", should %" PRIu64, head->mm_mapsize, env->me_mapsize); @@ -4194,7 +4194,7 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, #if MDBX_DEBUG if (rc == MDBX_SUCCESS) { - MDBX_meta *meta = mdbx_meta_fluid_head(env); + MDBX_meta *meta = mdbx_meta_head(env); MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; mdbx_debug("opened database version %u, pagesize %u", meta->mm_version, @@ -8720,7 +8720,7 @@ int __cold mdbx_env_stat(MDBX_env *env, MDBX_stat *arg, size_t bytes) { if (unlikely(bytes != sizeof(MDBX_stat))) return MDBX_EINVAL; - meta = mdbx_meta_fluid_head(env); + meta = mdbx_meta_head(env); return mdbx_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); } @@ -8736,7 +8736,7 @@ int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); do { - const MDBX_meta *meta = mdbx_meta_fluid_head(env); + const MDBX_meta *meta = mdbx_meta_head(env); arg->me_meta0_txnid = meta0->mm_txnid; arg->me_meta0_sign = meta0->mm_datasync_sign; arg->me_meta1_txnid = meta1->mm_txnid; @@ -9402,7 +9402,7 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { continue; rc = env->me_oom_func(env, pid, tid, oldest, - mdbx_meta_fluid_head(env)->mm_txnid - oldest, 
retry); + mdbx_meta_head(env)->mm_txnid - oldest, retry); if (rc < 0) break; @@ -9467,7 +9467,7 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) return -1; MDBX_env *env = txn->mt_env; - MDBX_meta *meta = mdbx_meta_fluid_head(env); + MDBX_meta *meta = mdbx_meta_head(env); if (percent) { size_t maxpg = env->me_maxpg; size_t last = meta->mm_last_pg + 1; From d642127d100436bfbee0ff8581266829225ca3bb Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 12:00:05 +0300 Subject: [PATCH 180/303] tools: fix 'tail' show in mdbx_chk. --- src/tools/mdbx_chk.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index f876c1ee..3ea53521 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -638,42 +638,48 @@ static __inline int meta_recent(const bool roolback2steady) { : 0; } -static __inline int meta_ancient(const bool roolback2steady) { +static __inline int meta_tail(int head) { - if (meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, - envinfo.me_meta1_txnid, envinfo.me_meta1_sign, roolback2steady)) + if (head == 0) + return meta_ot(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, + envinfo.me_meta2_txnid, envinfo.me_meta2_sign, true) + ? 1 + : 2; + if (head == 1) return meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, - envinfo.me_meta2_txnid, envinfo.me_meta2_sign, - roolback2steady) + envinfo.me_meta2_txnid, envinfo.me_meta2_sign, true) ? 0 : 2; - return meta_ot(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, - envinfo.me_meta1_txnid, envinfo.me_meta1_sign, roolback2steady) - ? 2 - : 1; + if (head == 2) + return meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, + envinfo.me_meta1_txnid, envinfo.me_meta1_sign, true) + ? 
0 + : 1; + assert(false); + return -1; } -static int meta_steady_head(void) { return meta_recent(true); } +static int meta_steady(void) { return meta_recent(true); } -static int meta_weak_head(void) { return meta_recent(false); } - -static int meta_tail(void) { return meta_ancient(true); } +static int meta_head(void) { return meta_recent(false); } void verbose_meta(int num, txnid_t txnid, uint64_t sign) { print(" - meta-%d: %s %" PRIu64, num, meta_synctype(sign), txnid); bool stay = true; - if (num == meta_steady_head() && num == meta_weak_head()) { + const int steady = meta_steady(); + const int head = meta_head(); + if (num == steady && num == head) { print(", head"); stay = false; - } else if (num == meta_steady_head()) { + } else if (num == steady) { print(", head-steady"); stay = false; - } else if (num == meta_weak_head()) { + } else if (num == head) { print(", head-weak"); stay = false; } - if (num == meta_tail()) { + if (num == meta_tail(head)) { print(", tail"); stay = false; } From 0e49b1b2b42f79e28592b975f85c5706ffbe1380 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 15:11:31 +0300 Subject: [PATCH 181/303] mdbx: update 'mapsize' while opening with txn-increment. --- src/mdbx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mdbx.c b/src/mdbx.c index e9b74531..51a6f1ee 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3917,6 +3917,9 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { head->mm_mapsize, env->me_mapsize); meta = *head; meta.mm_mapsize = env->me_mapsize; + meta.mm_txnid += 1; + if (META_IS_STEADY(head)) + meta.mm_datasync_sign = mdbx_meta_sign(&meta); err = mdbx_env_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); if (err) return err; From c02a3ac687efb3bfe59f49a0f12765e8b2146b84 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 15:20:25 +0300 Subject: [PATCH 182/303] mdbx: ignore meta-pages with wrong pagesize. 
--- src/mdbx.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 51a6f1ee..3f0984a4 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3319,10 +3319,22 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { return MDBX_VERSION_MISMATCH; } + /* LY: check pagesize */ + if (!is_power2(page.mp_meta.mm_psize) || + page.mp_meta.mm_psize < MIN_PAGESIZE || + page.mp_meta.mm_psize > MAX_PAGESIZE) { + mdbx_debug("meta[%u] has invalid pagesize %u", meta_number, + page.mp_meta.mm_psize); + continue; + } + /* LY: check signature as a checksum */ if (META_IS_STEADY(&page.mp_meta) && page.mp_meta.mm_datasync_sign != mdbx_meta_sign(&page.mp_meta)) { - mdbx_debug("steady-meta[%u] has invalid checksum", meta_number); + mdbx_debug("meta[%u] has invalid steady-checksum (0x%" PRIx64 + " != 0x%" PRIx64 ")", + meta_number, page.mp_meta.mm_datasync_sign, + mdbx_meta_sign(&page.mp_meta)); continue; } From 9959fe87117a11cb7cd91c59fa160090c80e9701 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 15:46:45 +0300 Subject: [PATCH 183/303] mdbx: reorder fields in metapages. --- src/bits.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bits.h b/src/bits.h index cbd2cc09..96967ff3 100644 --- a/src/bits.h +++ b/src/bits.h @@ -246,10 +246,10 @@ typedef struct MDBX_meta { #define mm_psize mm_dbs[FREE_DBI].md_xsize /* Any persistent environment flags, see mdbx_env */ #define mm_flags mm_dbs[FREE_DBI].md_flags + mdbx_canary mm_canary; /* Last used page in the datafile. - * Actually the file may be shorter if the freeDB lists the final pages. */ + * Actually the file may be shorter if the freeDB lists the final pages. 
*/ pgno_t mm_last_pg; - volatile txnid_t mm_txnid; /* txnid that committed this page */ #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u volatile uint64_t mm_datasync_sign; @@ -259,7 +259,7 @@ typedef struct MDBX_meta { #define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign) #define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - mdbx_canary mm_canary; + volatile txnid_t mm_txnid; /* txnid that committed this page */ } MDBX_meta; /* Common header for all page types. The page type depends on mp_flags. From f4d3b76f81012a7b829909909afe313672b15408 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 15:39:58 +0300 Subject: [PATCH 184/303] mdbx: minor refine and rename mdbx_sync_locked(). Change-Id: Iab650b091006e5646d9d22316d19aa58a517684b --- src/mdbx.c | 22 +++++++++++++--------- src/osal.c | 4 ++-- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 3f0984a4..ed666d6d 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -552,8 +552,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, pgno_t newpgno, unsigned nflags); static int mdbx_read_header(MDBX_env *env, MDBX_meta *meta); -static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *const pending); +static int mdbx_sync_locked(MDBX_env *env, unsigned flags, + MDBX_meta *const pending); static void mdbx_env_close0(MDBX_env *env); static MDBX_node *mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, int *exactp); @@ -1691,7 +1691,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, mdbx_assert(env, env->me_sync_pending > 0); MDBX_meta meta = *head; - if (mdbx_env_sync_locked(env, me_flags, &meta) == MDBX_SUCCESS) { + if (mdbx_sync_locked(env, me_flags, &meta) == MDBX_SUCCESS) { txnid_t snap = mdbx_find_oldest(env, NULL); if (snap > oldest) { continue; @@ -1994,7 +1994,7 @@ int mdbx_env_sync(MDBX_env *env, int force) { mdbx_durable_str(head), env->me_sync_pending, 
env->me_mapsize, head->mm_mapsize); MDBX_meta meta = *head; - rc = mdbx_env_sync_locked(env, flags, &meta); + rc = mdbx_sync_locked(env, flags, &meta); if (unlikely(rc != MDBX_SUCCESS)) { mdbx_txn_unlock(env); return rc; @@ -3255,7 +3255,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { meta.mm_txnid = txn->mt_txnid; meta.mm_canary = txn->mt_canary; - rc = mdbx_env_sync_locked(env, env->me_flags | txn->mt_flags, &meta); + rc = mdbx_sync_locked(env, env->me_flags | txn->mt_flags, &meta); } if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -3387,8 +3387,9 @@ static MDBX_page *__cold mdbx_init_metas(const MDBX_env *env, void *buffer) { return page1; } -static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *const pending) { +static int mdbx_sync_locked(MDBX_env *env, unsigned flags, + MDBX_meta *const pending) { + mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); MDBX_meta *const meta0 = METAPAGE(env, 0); MDBX_meta *const meta1 = METAPAGE(env, 1); MDBX_meta *const meta2 = METAPAGE(env, 2); @@ -3452,7 +3453,7 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, : MDBX_DATASIGN_WEAK; } - volatile MDBX_meta *target = nullptr; + MDBX_meta *target = nullptr; if (head->mm_txnid == pending->mm_txnid) { mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs, sizeof(head->mm_dbs)) == 0); @@ -3506,6 +3507,7 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, !mdbx_meta_eq(pending, meta2)); const size_t offset = (char *)target - env->me_map; + mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); if (env->me_flags & MDBX_WRITEMAP) { /* LY: 'invalidate' the meta. */ mdbx_jitter4testing(true); @@ -3550,6 +3552,7 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, * how stale their view of these values is. */ /* LY: step#3 - sync meta-pages. 
*/ + mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); if ((flags & (MDBX_NOSYNC | MDBX_NOMETASYNC)) == 0) { mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & MDBX_WRITEMAP) { @@ -3565,6 +3568,7 @@ static int mdbx_env_sync_locked(MDBX_env *env, unsigned flags, } /* LY: currently this can't happen, but... */ + mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); if (unlikely(pending->mm_mapsize < prev_mapsize)) { mdbx_assert(env, pending->mm_mapsize == env->me_mapsize); rc = mdbx_ftruncate(env->me_fd, pending->mm_mapsize); @@ -3932,7 +3936,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { meta.mm_txnid += 1; if (META_IS_STEADY(head)) meta.mm_datasync_sign = mdbx_meta_sign(&meta); - err = mdbx_env_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); + err = mdbx_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); if (err) return err; } diff --git a/src/osal.c b/src/osal.c index 0611ad38..7e98aa82 100644 --- a/src/osal.c +++ b/src/osal.c @@ -653,8 +653,8 @@ int mdbx_msync(void *addr, size_t length, int async) { return FlushViewOfFile(addr, length) ? MDBX_SUCCESS : mdbx_get_errno_checked(); #else - return (msync(addr, length, async ? MS_ASYNC : MS_SYNC) == 0) ? MDBX_SUCCESS - : errno; + const int mode = async ? MS_ASYNC : MS_SYNC; + return (msync(addr, length, mode) == 0) ? MDBX_SUCCESS : errno; #endif } From 429384005563f80148e4f94e3507d9ac2457016e Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 15:44:00 +0300 Subject: [PATCH 185/303] mdbx: fix meta-invalidate race in MDBX_WRITEMAP mode (when target == head). 
--- src/mdbx.c | 50 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index ed666d6d..655fb310 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3508,27 +3508,49 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, const size_t offset = (char *)target - env->me_map; mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); + mdbx_ensure(env, target == head || target->mm_txnid < pending->mm_txnid); if (env->me_flags & MDBX_WRITEMAP) { - /* LY: 'invalidate' the meta. */ mdbx_jitter4testing(true); - if (target->mm_datasync_sign != MDBX_DATASIGN_WEAK || - target->mm_txnid != pending->mm_txnid) { + if (likely(target != head)) { +#ifdef NDEBUG + /* nodebug: 'invalidate' the meta to avoid false-reading + * from violators (make safer) */ target->mm_datasync_sign = MDBX_DATASIGN_WEAK; - mdbx_jitter4testing(true); target->mm_txnid = 0; + mdbx_coherent_barrier(); +#else + /* debug: provoke failure to catch a violators */ + memset(target->mm_dbs, 0xCC, + sizeof(target->mm_dbs) + sizeof(target->mm_canary)); + mdbx_jitter4testing(false); +#endif + + /* LY: update info */ + target->mm_mapsize = pending->mm_mapsize; + target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; + target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; + target->mm_canary = pending->mm_canary; + target->mm_last_pg = pending->mm_last_pg; mdbx_jitter4testing(true); + mdbx_coherent_barrier(); + + /* LY: 'commit' the meta */ + target->mm_txnid = pending->mm_txnid; + mdbx_jitter4testing(true); + } else { + /* dangerous case (target == head), only mm_datasync_sign could + * me updated, check assertions once again */ + mdbx_ensure(env, head->mm_txnid == pending->mm_txnid && + !META_IS_STEADY(head) && META_IS_STEADY(pending)); + mdbx_ensure(env, head->mm_last_pg == pending->mm_last_pg); + mdbx_ensure(env, head->mm_mapsize == pending->mm_mapsize); + mdbx_ensure(env, memcmp(&head->mm_dbs, &pending->mm_dbs, + 
sizeof(head->mm_dbs)) == 0); + mdbx_ensure(env, memcmp(&head->mm_canary, &pending->mm_canary, + sizeof(head->mm_canary)) == 0); } - /* LY: update info */ - target->mm_mapsize = pending->mm_mapsize; - target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; - target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; - target->mm_last_pg = pending->mm_last_pg; - target->mm_canary = pending->mm_canary; - /* LY: 'commit' the meta */ - mdbx_jitter4testing(true); - target->mm_txnid = pending->mm_txnid; - mdbx_jitter4testing(true); target->mm_datasync_sign = pending->mm_datasync_sign; + mdbx_coherent_barrier(); mdbx_jitter4testing(true); } else { pending->mm_magic = MDBX_MAGIC; From b038767a18b006e75a7805b7bd826822970c8af9 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 16:17:24 +0300 Subject: [PATCH 186/303] mdbx: add 'mdbx_chk' to Makefile's 'check' target. --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a8bae342..2a2b762d 100644 --- a/Makefile +++ b/Makefile @@ -72,7 +72,7 @@ install: $(LIBRARIES) $(TOOLS) $(HEADERS) clean: rm -rf $(TOOLS) test/test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err src/*.o test/*.o -check: test/test +check: test/test mdbx_chk rm -f $(TESTDB) && (set -o pipefail; test/test --pathname=$(TESTDB) --dont-cleanup-after basic | tee test.log | tail -n 42) && ./mdbx_chk -vn $(TESTDB) define core-rule From 18cf804d0b85c66ba27ed80a412f17097330d506 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 17:11:48 +0300 Subject: [PATCH 187/303] mdbx: add STATIC_ASSERT_MSG, refine STATIC_ASSERT. 
--- src/bits.h | 3 +++ src/defs.h | 15 ++++++++++----- src/osal.c | 20 ++++++++++---------- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/bits.h b/src/bits.h index 96967ff3..9aa76327 100644 --- a/src/bits.h +++ b/src/bits.h @@ -31,6 +31,9 @@ /*----------------------------------------------------------------------------*/ /* Should be defined before any includes */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif #ifndef _FILE_OFFSET_BITS #define _FILE_OFFSET_BITS 64 #endif diff --git a/src/defs.h b/src/defs.h index 290a854d..79e65d44 100644 --- a/src/defs.h +++ b/src/defs.h @@ -368,15 +368,20 @@ #define FIXME "FIXME: " __FILE__ ", " STRINGIFY(__LINE__) -#ifndef STATIC_ASSERT -# if __STDC_VERSION__ >= 201112L -# define STATIC_ASSERT(expr, msg) _Static_assert(expr, msg) +#ifndef STATIC_ASSERT_MSG +# if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) \ + || __has_feature(c_static_assert) +# define STATIC_ASSERT_MSG(expr, msg) _Static_assert(expr, msg) # elif defined(static_assert) -# define STATIC_ASSERT(expr, msg) static_assert(expr, msg) +# define STATIC_ASSERT_MSG(expr, msg) static_assert(expr, msg) # else -# define STATIC_ASSERT(expr, msg) switch (0) {case 0:case (expr):;} +# define STATIC_ASSERT_MSG(expr, msg) switch (0) {case 0:case (expr):;} # endif #endif /* STATIC_ASSERT */ +#ifndef STATIC_ASSERT +# define STATIC_ASSERT(expr) STATIC_ASSERT_MSG(expr, #expr) +#endif + /* *INDENT-ON* */ /* clang-format on */ diff --git a/src/osal.c b/src/osal.c index 7e98aa82..4b56be70 100644 --- a/src/osal.c +++ b/src/osal.c @@ -407,8 +407,8 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, uint64_t offset) { return (rc == MDBX_SUCCESS) ? 
/* paranoia */ ERROR_READ_FAULT : rc; } #else - STATIC_ASSERT(sizeof(off_t) >= sizeof(size_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); ssize_t read = pread(fd, buf, bytes, offset); if (read < 0) { int rc = errno; @@ -437,8 +437,8 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, int rc; ssize_t written; do { - STATIC_ASSERT(sizeof(off_t) >= sizeof(size_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); written = pwrite(fd, buf, bytes, offset); if (likely(bytes == (size_t)written)) return MDBX_SUCCESS; @@ -465,8 +465,8 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, int rc; ssize_t written; do { - STATIC_ASSERT(sizeof(off_t) >= sizeof(size_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); written = pwritev(fd, iov, iovcnt, offset); if (likely(expected_written == (size_t)written)) return MDBX_SUCCESS; @@ -556,8 +556,8 @@ int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { #else struct stat st; - STATIC_ASSERT(sizeof(off_t) <= sizeof(uint64_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); + STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); if (fstat(fd, &st)) return errno; @@ -574,8 +574,8 @@ int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { ? MDBX_SUCCESS : mdbx_get_errno_checked(); #else - STATIC_ASSERT(sizeof(off_t) >= sizeof(size_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); return ftruncate(fd, length) == 0 ? 
MDBX_SUCCESS : errno; #endif } From 1c170cb866e02cfa3d08a75740e48f3cf10bea10 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 17:23:09 +0300 Subject: [PATCH 188/303] mdbx: add MAX_PAGENO, MIN_PAGENO, MAX_MAPSIZE, MIN_MAPSIZE. --- src/bits.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/bits.h b/src/bits.h index 9aa76327..a2e2e840 100644 --- a/src/bits.h +++ b/src/bits.h @@ -139,6 +139,8 @@ * size up to 2^44 bytes, in case of 4K pages. */ typedef uint64_t pgno_t; #define PRIaPGNO PRIu64 /* TODO */ +#define MAX_PAGENO ((pgno_t)UINT64_C(0xffffFFFFffff)) +#define MIN_PAGENO (NUM_METAS - 1) /* A transaction ID. */ typedef uint64_t txnid_t; @@ -149,6 +151,13 @@ typedef uint64_t txnid_t; * this is plenty. */ typedef uint16_t indx_t; +#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO) +#define MAX_MAPSIZE \ + ((sizeof(size_t) < 8) \ + ? UINT32_C(0x7ff80000) \ + : ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \ + : MAX_PAGENO * (uint64_t)MAX_PAGESIZE)) + /*----------------------------------------------------------------------------*/ /* Core structures for database and shared memory (i.e. 
format definition) */ #pragma pack(push, 1) @@ -966,7 +975,7 @@ static __inline pgno_t NODEPGNO(const MDBX_node *node) { if (UNALIGNED_OK) { pgno = node->mn_ksize_and_pgno; if (sizeof(pgno_t) > 4) - pgno &= UINT64_C(0xffffFFFFffff); + pgno &= MAX_PAGENO; } else { pgno = node->mn_lo | ((pgno_t)node->mn_hi << 16); if (sizeof(pgno_t) > 4) @@ -977,7 +986,7 @@ static __inline pgno_t NODEPGNO(const MDBX_node *node) { /* Set the page number in a branch node */ static __inline void SETPGNO(MDBX_node *node, pgno_t pgno) { - assert(pgno <= (pgno_t)UINT64_C(0xffffFFFFffff)); + assert(pgno <= MAX_PAGENO); if (UNALIGNED_OK) { if (sizeof(pgno_t) > 4) From 93c5a84bcd27136396cad0d611827fede6a22d74 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 17:25:08 +0300 Subject: [PATCH 189/303] mdbx: more checks mdbx_read_header() for metapage's content. --- src/mdbx.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 12 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 655fb310..c149b222 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3292,49 +3292,115 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { const unsigned offset = guess_pagesize * meta_number; int rc = mdbx_pread(env->me_fd, &page, sizeof(page), offset); if (rc != MDBX_SUCCESS) { - mdbx_debug("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(page), rc, + mdbx_error("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(page), rc, mdbx_strerror(rc)); return rc; } if (page.mp_pgno != meta_number) { - mdbx_debug("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, + mdbx_error("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, page.mp_pgno); return MDBX_INVALID; } if (!F_ISSET(page.mp_flags, P_META)) { - mdbx_debug("page #%u not a meta-page", meta_number); + mdbx_error("page #%u not a meta-page", meta_number); return MDBX_INVALID; } if (page.mp_meta.mm_magic != MDBX_MAGIC) { - mdbx_debug("meta[%u] has invalid magic", meta_number); + 
mdbx_error("meta[%u] has invalid magic", meta_number); return MDBX_INVALID; } if (page.mp_meta.mm_version != MDBX_DATA_VERSION) { - mdbx_debug("database is version %u, expected version %u", + mdbx_error("database is version %u, expected version %u", page.mp_meta.mm_version, MDBX_DATA_VERSION); return MDBX_VERSION_MISMATCH; } /* LY: check pagesize */ + STATIC_ASSERT(MIN_PAGESIZE < MAX_PAGESIZE); if (!is_power2(page.mp_meta.mm_psize) || page.mp_meta.mm_psize < MIN_PAGESIZE || page.mp_meta.mm_psize > MAX_PAGESIZE) { - mdbx_debug("meta[%u] has invalid pagesize %u", meta_number, - page.mp_meta.mm_psize); + mdbx_notice("meta[%u] has invalid pagesize %u, skip it", meta_number, + page.mp_meta.mm_psize); + continue; + } + + /* LY: check mapsize limits */ + STATIC_ASSERT(MAX_MAPSIZE < SSIZE_MAX - MAX_PAGESIZE); + STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); + if (page.mp_meta.mm_mapsize < MIN_MAPSIZE || + page.mp_meta.mm_mapsize > MAX_MAPSIZE) { + mdbx_notice("meta[%u] has invalid mapsize %" PRIu64 ", skip it", + meta_number, page.mp_meta.mm_mapsize); continue; } /* LY: check signature as a checksum */ if (META_IS_STEADY(&page.mp_meta) && page.mp_meta.mm_datasync_sign != mdbx_meta_sign(&page.mp_meta)) { - mdbx_debug("meta[%u] has invalid steady-checksum (0x%" PRIx64 - " != 0x%" PRIx64 ")", - meta_number, page.mp_meta.mm_datasync_sign, - mdbx_meta_sign(&page.mp_meta)); + mdbx_notice("meta[%u] has invalid steady-checksum (0x%" PRIx64 + " != 0x%" PRIx64 "), skip it", + meta_number, page.mp_meta.mm_datasync_sign, + mdbx_meta_sign(&page.mp_meta)); + continue; + } + + /* LY: check mapsize with given given pagesize */ + if (page.mp_meta.mm_mapsize < + MIN_PAGENO * (uint64_t)page.mp_meta.mm_psize || + page.mp_meta.mm_mapsize > + MAX_PAGENO * (uint64_t)page.mp_meta.mm_psize) { + mdbx_notice("meta[%u] has invalid mapsize %" PRIu64 + ", with given pagesize %u, skip it", + meta_number, page.mp_meta.mm_mapsize, page.mp_meta.mm_psize); + continue; + } + + /* LY: check last_pgno */ + if 
(page.mp_meta.mm_last_pg < MIN_PAGENO || + page.mp_meta.mm_last_pg > MAX_PAGENO || + page.mp_meta.mm_last_pg > + page.mp_meta.mm_mapsize / page.mp_meta.mm_psize) { + mdbx_notice("meta[%u] has invalid last-pageno %" PRIaPGNO ", skip it", + meta_number, page.mp_meta.mm_last_pg); + continue; + } + + /* LY: FreeDB root */ + if (page.mp_meta.mm_dbs[FREE_DBI].md_root == P_INVALID) { + if (page.mp_meta.mm_dbs[FREE_DBI].md_branch_pages || + page.mp_meta.mm_dbs[FREE_DBI].md_depth || + page.mp_meta.mm_dbs[FREE_DBI].md_entries || + page.mp_meta.mm_dbs[FREE_DBI].md_leaf_pages || + page.mp_meta.mm_dbs[FREE_DBI].md_overflow_pages) { + mdbx_notice("meta[%u] has false-empty freedb, skip it", meta_number); + continue; + } + } else if (page.mp_meta.mm_dbs[FREE_DBI].md_root > + page.mp_meta.mm_last_pg) { + mdbx_notice("meta[%u] has invalid freedb-root %" PRIaPGNO ", skip it", + meta_number, page.mp_meta.mm_dbs[FREE_DBI].md_root); + continue; + } + + /* LY: MainDB root */ + if (page.mp_meta.mm_dbs[MAIN_DBI].md_root == P_INVALID) { + if (page.mp_meta.mm_dbs[MAIN_DBI].md_branch_pages || + page.mp_meta.mm_dbs[MAIN_DBI].md_depth || + page.mp_meta.mm_dbs[MAIN_DBI].md_entries || + page.mp_meta.mm_dbs[MAIN_DBI].md_leaf_pages || + page.mp_meta.mm_dbs[MAIN_DBI].md_overflow_pages) { + mdbx_notice("meta[%u] has false-empty maindb", meta_number); + continue; + } + } else if (page.mp_meta.mm_dbs[MAIN_DBI].md_root > + page.mp_meta.mm_last_pg) { + mdbx_notice("meta[%u] has invalid maindb-root %" PRIaPGNO ", skip it", + meta_number, page.mp_meta.mm_dbs[MAIN_DBI].md_root); continue; } @@ -3346,7 +3412,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { } if (META_IS_WEAK(meta)) { - mdbx_debug("both meta-pages are weak, database is corrupted"); + mdbx_error("no usable meta-pages, database is corrupted"); return MDBX_CORRUPTED; } From bd70135db1d6f020c70f26f211b7c7386fa73581 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 18:10:23 +0300 Subject: [PATCH 190/303] 
mdbx: re-read and check meta-pages in mdbx_read_header() for consistency. --- src/mdbx.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index c149b222..54dfd74e 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3290,11 +3290,33 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { const unsigned meta_number = loop_count % NUM_METAS; const unsigned offset = guess_pagesize * meta_number; - int rc = mdbx_pread(env->me_fd, &page, sizeof(page), offset); - if (rc != MDBX_SUCCESS) { - mdbx_error("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(page), rc, - mdbx_strerror(rc)); - return rc; + + unsigned retryleft = 42; + while (1) { + int rc = mdbx_pread(env->me_fd, &page, sizeof(page), offset); + if (rc != MDBX_SUCCESS) { + mdbx_error("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(page), + rc, mdbx_strerror(rc)); + return rc; + } + + MDBX_page again; + rc = mdbx_pread(env->me_fd, &again, sizeof(again), offset); + if (rc != MDBX_SUCCESS) { + mdbx_error("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(again), + rc, mdbx_strerror(rc)); + return rc; + } + + if (memcmp(&page, &again, sizeof(page)) == 0 || --retryleft == 0) + break; + + mdbx_info("meta[%u] was updated, re-read it", meta_number); + } + + if (!retryleft) { + mdbx_error("meta[%u] is too volatile, skip it", meta_number); + continue; } if (page.mp_pgno != meta_number) { From dd33bea1d8aa215924c1500f263bdcbaf08a281f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 18:17:17 +0300 Subject: [PATCH 191/303] tools: mdbx_chk - don't 'rolled-back' in cooperative mode.
--- src/tools/mdbx_chk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 3ea53521..726f5712 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -73,6 +73,7 @@ static __attribute__((constructor)) void init_walk(void) { uint64_t total_unused_bytes; int exclusive = 2; +int envflags = MDBX_RDONLY; MDBX_env *env; MDBX_txn *txn, *locktxn; @@ -686,7 +687,8 @@ void verbose_meta(int num, txnid_t txnid, uint64_t sign) { if (stay) print(", stay"); - if (txnid > envinfo.me_recent_txnid) + if (txnid > envinfo.me_recent_txnid && + (exclusive || (envflags & MDBX_RDONLY) == 0)) print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", txnid - envinfo.me_recent_txnid, txnid, envinfo.me_recent_txnid); print("\n"); @@ -729,7 +731,6 @@ int main(int argc, char *argv[]) { int i, rc; char *prog = argv[0]; char *envname; - int envflags = MDBX_RDONLY; int problems_maindb = 0, problems_freedb = 0, problems_meta = 0; int dont_traversal = 0; struct timespec timestamp_start, timestamp_finish; From 76f459c08ed56eda5ca71789a9e7b5c8635863bd Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 20:00:52 +0300 Subject: [PATCH 192/303] mdbx: two-phase txnid to avoid bottom-top DMA updates. --- src/bits.h | 6 +- src/mdbx.c | 216 ++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 151 insertions(+), 71 deletions(-) diff --git a/src/bits.h b/src/bits.h index a2e2e840..6479b11d 100644 --- a/src/bits.h +++ b/src/bits.h @@ -252,6 +252,9 @@ typedef struct MDBX_meta { uint32_t mm_magic; /* Version number of this file. Must be set to MDBX_DATA_VERSION. 
*/ uint32_t mm_version; + /* txnid that committed this page, */ + volatile txnid_t mm_txnid_top; + size_t mm_mapsize; /* size of mmap region */ MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ /* The size of pages used in this DB */ @@ -271,7 +274,8 @@ typedef struct MDBX_meta { #define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign) #define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - volatile txnid_t mm_txnid; /* txnid that committed this page */ + /* txnid that committed this page */ + volatile txnid_t mm_txnid_bottom; } MDBX_meta; /* Common header for all page types. The page type depends on mp_flags. diff --git a/src/mdbx.c b/src/mdbx.c index 54dfd74e..d12b3834 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1272,6 +1272,61 @@ bailout: return rc; } +/*----------------------------------------------------------------------------*/ + +#define METAPAGE(env, n) \ + (&((MDBX_page *)((env)->me_map + env->me_psize * (n)))->mp_meta) + +#define METAPAGE_END(env) METAPAGE(env, NUM_METAS) + +static __inline txnid_t mdbx_meta_txnid(const MDBX_env *env, + const MDBX_meta *meta, + bool allow_volatile) { + mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); + txnid_t top = meta->mm_txnid_top; + txnid_t bottom = meta->mm_txnid_bottom; + if (!allow_volatile) + mdbx_assert(env, top == bottom); + return (top < bottom) ? 
top : bottom; +} + +static __inline txnid_t mdbx_meta_txnid_stable(const MDBX_env *env, + const MDBX_meta *meta) { + return mdbx_meta_txnid(env, meta, false); +} + +static __inline txnid_t mdbx_meta_txnid_fluid(const MDBX_env *env, + const MDBX_meta *meta) { + return mdbx_meta_txnid(env, meta, true); +} + +static __inline void mdbx_meta_update_begin(const MDBX_env *env, + MDBX_meta *meta, txnid_t txnid) { + mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); + mdbx_assert(env, meta->mm_txnid_top < txnid && meta->mm_txnid_bottom < txnid); + meta->mm_txnid_top = txnid; + (void)env; + mdbx_coherent_barrier(); +} + +static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta, + txnid_t txnid) { + mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); + mdbx_assert(env, meta->mm_txnid_top == txnid); + mdbx_assert(env, meta->mm_txnid_bottom < txnid); + + mdbx_jitter4testing(true); + meta->mm_txnid_bottom = txnid; + mdbx_coherent_barrier(); +} + +static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, + txnid_t txnid) { + mdbx_assert(env, meta < METAPAGE(env, 0) || meta > METAPAGE_END(env)); + meta->mm_txnid_top = txnid; + meta->mm_txnid_bottom = txnid; +} + static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) { uint64_t sign = MDBX_DATASIGN_NONE; #if 0 /* TODO */ @@ -1285,10 +1340,13 @@ static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) { return (sign > MDBX_DATASIGN_WEAK) ? 
sign : ~sign; } -static __inline bool mdbx_meta_ot(const MDBX_meta *a, const MDBX_meta *b, +static __inline bool mdbx_meta_ot(const MDBX_env *env, const MDBX_meta *a, + const MDBX_meta *b, const bool roolback2steady) { mdbx_jitter4testing(true); - if (a->mm_txnid == b->mm_txnid) + txnid_t txnid_a = mdbx_meta_txnid_fluid(env, a); + txnid_t txnid_b = mdbx_meta_txnid_fluid(env, b); + if (txnid_a == txnid_b) return META_IS_STEADY(b); mdbx_jitter4testing(true); @@ -1296,12 +1354,13 @@ static __inline bool mdbx_meta_ot(const MDBX_meta *a, const MDBX_meta *b, return META_IS_STEADY(b); mdbx_jitter4testing(true); - return a->mm_txnid < b->mm_txnid; + return txnid_a < txnid_b; } -static __inline bool mdbx_meta_eq(const MDBX_meta *a, const MDBX_meta *b) { +static __inline bool mdbx_meta_eq(const MDBX_env *env, const MDBX_meta *a, + const MDBX_meta *b) { mdbx_jitter4testing(true); - if (a->mm_txnid != b->mm_txnid) + if (mdbx_meta_txnid_fluid(env, a) != mdbx_meta_txnid_fluid(env, b)) return false; mdbx_jitter4testing(true); @@ -1312,18 +1371,15 @@ static __inline bool mdbx_meta_eq(const MDBX_meta *a, const MDBX_meta *b) { return true; } -#define METAPAGE(env, n) \ - (&((MDBX_page *)((env)->me_map + env->me_psize * (n)))->mp_meta) - static int mdbx_meta_eq_mask(const MDBX_env *env) { MDBX_meta *m0 = METAPAGE(env, 0); MDBX_meta *m1 = METAPAGE(env, 1); MDBX_meta *m2 = METAPAGE(env, 2); - int rc = mdbx_meta_eq(m0, m1) ? 1 : 0; - if (mdbx_meta_eq(m1, m2)) + int rc = mdbx_meta_eq(env, m0, m1) ? 
1 : 0; + if (mdbx_meta_eq(env, m1, m2)) rc += 2; - if (mdbx_meta_eq(m2, m0)) + if (mdbx_meta_eq(env, m2, m0)) rc += 4; return rc; } @@ -1331,16 +1387,16 @@ static int mdbx_meta_eq_mask(const MDBX_env *env) { static __inline MDBX_meta *mdbx_meta_recent(const MDBX_env *env, MDBX_meta *a, MDBX_meta *b, const bool roolback2steady) { - const bool a_older_that_b = mdbx_meta_ot(a, b, roolback2steady); - mdbx_assert(env, !mdbx_meta_eq(a, b)); + const bool a_older_that_b = mdbx_meta_ot(env, a, b, roolback2steady); + mdbx_assert(env, !mdbx_meta_eq(env, a, b)); return a_older_that_b ? b : a; } static __inline MDBX_meta *mdbx_meta_ancient(const MDBX_env *env, MDBX_meta *a, MDBX_meta *b, const bool roolback2steady) { - const bool a_older_that_b = mdbx_meta_ot(a, b, roolback2steady); - mdbx_assert(env, !mdbx_meta_eq(a, b)); + const bool a_older_that_b = mdbx_meta_ot(env, a, b, roolback2steady); + mdbx_assert(env, !mdbx_meta_eq(env, a, b)); return a_older_that_b ? a : b; } @@ -1372,11 +1428,13 @@ static const char *mdbx_durable_str(const MDBX_meta *const meta) { return "Legacy"; } +/*----------------------------------------------------------------------------*/ + /* Find oldest txnid still referenced. */ static txnid_t mdbx_find_oldest(MDBX_env *env, int *laggard) { const MDBX_meta *const head = mdbx_meta_mostrecent( env, F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) ? 
false : true); - txnid_t oldest = head->mm_txnid; + txnid_t oldest = mdbx_meta_txnid_stable(env, head); int i, reader; const MDBX_reader *const r = env->me_lck->mti_readers; @@ -1668,7 +1726,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, MDBX_meta *head = mdbx_meta_head(env); MDBX_meta *steady = mdbx_meta_steady(env); - if (oldest == steady->mm_txnid && META_IS_WEAK(head) && + if (oldest == mdbx_meta_txnid_stable(env, steady) && META_IS_WEAK(head) && !META_IS_WEAK(steady)) { /* LY: Here an oom was happened: * - all pages had allocated; @@ -1682,7 +1740,8 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, mdbx_debug("kick-gc: head %" PRIaTXN "-%s, tail %" PRIaTXN "-%s, oldest %" PRIaTXN "", - head->mm_txnid, mdbx_durable_str(head), steady->mm_txnid, + mdbx_meta_txnid_stable(env, head), mdbx_durable_str(head), + mdbx_meta_txnid_stable(env, steady), mdbx_durable_str(steady), oldest); unsigned me_flags = env->me_flags & MDBX_WRITEMAP; @@ -2179,7 +2238,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { while (1) { MDBX_meta *const meta = mdbx_meta_head(txn->mt_env); mdbx_jitter4testing(false); - const txnid_t snap = meta->mm_txnid; + const txnid_t snap = mdbx_meta_txnid_fluid(env, meta); mdbx_jitter4testing(false); if (r) { r->mr_txnid = snap; @@ -2196,7 +2255,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { /* LY: Retry on a race, ITS#7970. 
*/ if (likely(meta == mdbx_meta_head(txn->mt_env) && - snap == meta->mm_txnid)) { + snap == mdbx_meta_txnid_fluid(env, meta))) { mdbx_jitter4testing(false); break; } @@ -2215,7 +2274,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { MDBX_meta *meta = mdbx_meta_head(env); mdbx_jitter4testing(false); txn->mt_canary = meta->mm_canary; - txn->mt_txnid = meta->mm_txnid + 1; + const txnid_t snap = mdbx_meta_txnid_stable(env, meta); + txn->mt_txnid = snap + 1; #if MDBX_DEBUG if (unlikely(txn->mt_txnid == mdbx_debug_edge)) { if (!mdbx_debug_logger) @@ -2225,7 +2285,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { "on/off edge (txn %" PRIaTXN ")", txn->mt_txnid); } #endif - if (unlikely(txn->mt_txnid < meta->mm_txnid)) { + if (unlikely(txn->mt_txnid < snap)) { mdbx_debug("txnid overflow!"); rc = MDBX_TXN_FULL; goto bailout; @@ -3252,7 +3312,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; meta.mm_last_pg = txn->mt_next_pgno - 1; - meta.mm_txnid = txn->mt_txnid; + mdbx_meta_set_txnid(env, &meta, txn->mt_txnid); meta.mm_canary = txn->mt_canary; rc = mdbx_sync_locked(env, env->me_flags | txn->mt_flags, &meta); @@ -3341,6 +3401,11 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { return MDBX_VERSION_MISMATCH; } + if (page.mp_meta.mm_txnid_top != page.mp_meta.mm_txnid_bottom) { + mdbx_warning("meta[%u] not completely updated, skip it", meta_number); + continue; + } + /* LY: check pagesize */ STATIC_ASSERT(MIN_PAGESIZE < MAX_PAGESIZE); if (!is_power2(page.mp_meta.mm_psize) || @@ -3426,7 +3491,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { continue; } - if (mdbx_meta_ot(meta, &page.mp_meta, true)) { + if (mdbx_meta_ot(env, meta, &page.mp_meta, true)) { *meta = page.mp_meta; if (META_IS_WEAK(meta)) loop_limit += 1; /* LY: should re-read to hush race with update */ @@ -3456,7 +3521,7 @@ static MDBX_page *__cold 
mdbx_meta_model(const MDBX_env *env, MDBX_page *model, MDBX_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ model->mp_meta.mm_dbs[FREE_DBI].md_root = P_INVALID; model->mp_meta.mm_dbs[MAIN_DBI].md_root = P_INVALID; - model->mp_meta.mm_txnid = num; + mdbx_meta_set_txnid(env, &model->mp_meta, num); model->mp_meta.mm_datasync_sign = mdbx_meta_sign(&model->mp_meta); return (MDBX_page *)((uint8_t *)model + env->me_psize); } @@ -3469,9 +3534,9 @@ static MDBX_page *__cold mdbx_init_metas(const MDBX_env *env, void *buffer) { MDBX_page *page2 = mdbx_meta_model(env, page1, 1); mdbx_meta_model(env, page2, 2); page2->mp_meta.mm_datasync_sign = MDBX_DATASIGN_WEAK; - mdbx_assert(env, !mdbx_meta_eq(&page0->mp_meta, &page1->mp_meta)); - mdbx_assert(env, !mdbx_meta_eq(&page1->mp_meta, &page2->mp_meta)); - mdbx_assert(env, !mdbx_meta_eq(&page2->mp_meta, &page0->mp_meta)); + mdbx_assert(env, !mdbx_meta_eq(env, &page0->mp_meta, &page1->mp_meta)); + mdbx_assert(env, !mdbx_meta_eq(env, &page1->mp_meta, &page2->mp_meta)); + mdbx_assert(env, !mdbx_meta_eq(env, &page2->mp_meta, &page0->mp_meta)); return page1; } @@ -3542,7 +3607,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, } MDBX_meta *target = nullptr; - if (head->mm_txnid == pending->mm_txnid) { + if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_top) { mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs, sizeof(head->mm_dbs)) == 0); mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary, @@ -3552,7 +3617,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if (!META_IS_STEADY(head) && META_IS_STEADY(pending)) target = head; else { - mdbx_ensure(env, mdbx_meta_eq(head, pending)); + mdbx_ensure(env, mdbx_meta_eq(env, head, pending)); mdbx_debug("skip update meta"); return MDBX_SUCCESS; } @@ -3564,47 +3629,49 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target = mdbx_meta_ancient(env, meta0, meta1, true); /* LY: step#2 - update meta-page. 
*/ - mdbx_debug("writing meta%" PRIaPGNO " (%s, was %" PRIaTXN - ", %s), root %" PRIaPGNO "/%" PRIaPGNO ", " - "txn_id %" PRIaTXN ", %s", - container_of(target, MDBX_page, mp_data)->mp_pgno, - (target == head) ? "head" : "tail", target->mm_txnid, - mdbx_durable_str((const MDBX_meta *)target), - pending->mm_dbs[MAIN_DBI].md_root, - pending->mm_dbs[FREE_DBI].md_root, pending->mm_txnid, - mdbx_durable_str(pending)); + mdbx_debug( + "writing meta%" PRIaPGNO " (%s, was %" PRIaTXN ", %s), root %" PRIaPGNO + "/%" PRIaPGNO ", " + "txn_id %" PRIaTXN ", %s", + container_of(target, MDBX_page, mp_data)->mp_pgno, + (target == head) ? "head" : "tail", mdbx_meta_txnid_stable(env, target), + mdbx_durable_str((const MDBX_meta *)target), + pending->mm_dbs[MAIN_DBI].md_root, pending->mm_dbs[FREE_DBI].md_root, + pending->mm_txnid_top, mdbx_durable_str(pending)); mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, (meta0 == head) ? "head" : (meta0 == target) ? "tail" : "stay", - mdbx_durable_str(meta0), meta0->mm_txnid, + mdbx_durable_str(meta0), mdbx_meta_txnid_fluid(env, meta0), meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); mdbx_debug("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, (meta1 == head) ? "head" : (meta1 == target) ? "tail" : "stay", - mdbx_durable_str(meta1), meta1->mm_txnid, + mdbx_durable_str(meta1), mdbx_meta_txnid_fluid(env, meta1), meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); mdbx_debug("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, (meta2 == head) ? "head" : (meta2 == target) ? 
"tail" : "stay", - mdbx_durable_str(meta2), meta2->mm_txnid, + mdbx_durable_str(meta2), mdbx_meta_txnid_fluid(env, meta2), meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); - mdbx_assert(env, !mdbx_meta_eq(pending, meta0)); - mdbx_assert(env, !mdbx_meta_eq(pending, meta1)); - mdbx_assert(env, !mdbx_meta_eq(pending, meta2)); + mdbx_assert(env, !mdbx_meta_eq(env, pending, meta0)); + mdbx_assert(env, !mdbx_meta_eq(env, pending, meta1)); + mdbx_assert(env, !mdbx_meta_eq(env, pending, meta2)); const size_t offset = (char *)target - env->me_map; mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); - mdbx_ensure(env, target == head || target->mm_txnid < pending->mm_txnid); + mdbx_ensure(env, + target == head || + mdbx_meta_txnid_stable(env, target) < pending->mm_txnid_top); if (env->me_flags & MDBX_WRITEMAP) { mdbx_jitter4testing(true); if (likely(target != head)) { + mdbx_meta_update_begin(env, target, pending->mm_txnid_top); #ifdef NDEBUG /* nodebug: 'invalidate' the meta to avoid false-reading * from violators (make safer) */ target->mm_datasync_sign = MDBX_DATASIGN_WEAK; - target->mm_txnid = 0; mdbx_coherent_barrier(); #else /* debug: provoke failure to catch a violators */ @@ -3623,13 +3690,14 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_coherent_barrier(); /* LY: 'commit' the meta */ - target->mm_txnid = pending->mm_txnid; + mdbx_meta_update_end(env, target, pending->mm_txnid_bottom); mdbx_jitter4testing(true); } else { /* dangerous case (target == head), only mm_datasync_sign could * me updated, check assertions once again */ - mdbx_ensure(env, head->mm_txnid == pending->mm_txnid && - !META_IS_STEADY(head) && META_IS_STEADY(pending)); + mdbx_ensure(env, + mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_top && + !META_IS_STEADY(head) && META_IS_STEADY(pending)); mdbx_ensure(env, head->mm_last_pg == pending->mm_last_pg); mdbx_ensure(env, head->mm_mapsize == pending->mm_mapsize); mdbx_ensure(env, 
memcmp(&head->mm_dbs, &pending->mm_dbs, @@ -4000,25 +4068,28 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { } const MDBX_meta *head = mdbx_meta_head(env); - if (head->mm_txnid != meta.mm_txnid) { + const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head); + if (head_txnid != meta.mm_txnid_top) { if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { assert(META_IS_STEADY(&meta) && !META_IS_STEADY(head)); if (env->me_flags & MDBX_RDONLY) { mdbx_error("rollback needed: (from head %" PRIaTXN " to steady %" PRIaTXN "), but unable in read-only mode", - head->mm_txnid, meta.mm_txnid); + head_txnid, meta.mm_txnid_top); return MDBX_WANNA_RECOVERY /* LY: could not recovery/rollback */; } /* LY: rollback weak checkpoint */ MDBX_meta rollback = *head; - rollback.mm_txnid = 0; - mdbx_trace("rollback: from %" PRIaTXN ", to %" PRIaTXN, head->mm_txnid, - meta.mm_txnid); + mdbx_meta_set_txnid(env, &rollback, 0); + mdbx_trace("rollback: from %" PRIaTXN ", to %" PRIaTXN, head_txnid, + meta.mm_txnid_top); + mdbx_ensure(env, head_txnid == mdbx_meta_txnid_stable(env, head)); err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta), (uint8_t *)head - (uint8_t *)env->me_map); if (err) return err; + mdbx_ensure(env, 0 == mdbx_meta_txnid_fluid(env, head)); } else if (!env->me_lck) { /* LY: without-lck (read-only) mode, so it is imposible that other * process made weak checkpoint. 
*/ @@ -4043,7 +4114,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { head->mm_mapsize, env->me_mapsize); meta = *head; meta.mm_mapsize = env->me_mapsize; - meta.mm_txnid += 1; + mdbx_meta_set_txnid(env, &meta, meta.mm_txnid_top + 1); if (META_IS_STEADY(head)) meta.mm_datasync_sign = mdbx_meta_sign(&meta); err = mdbx_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); @@ -4329,7 +4400,8 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, mdbx_debug("opened database version %u, pagesize %u", meta->mm_version, env->me_psize); mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN "", - container_of(meta, MDBX_page, mp_data)->mp_pgno, meta->mm_txnid); + container_of(meta, MDBX_page, mp_data)->mp_pgno, + mdbx_meta_txnid_fluid(env, meta)); mdbx_debug("depth: %u", db->md_depth); mdbx_debug("entries: %" PRIu64 "", db->md_entries); mdbx_debug("branch pages: %" PRIaPGNO "", db->md_branch_pages); @@ -8643,7 +8715,7 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { /* copy canary sequenses if present */ if (txn->mt_canary.v) { meta->mp_meta.mm_canary = txn->mt_canary; - meta->mp_meta.mm_canary.v = meta->mp_meta.mm_txnid; + meta->mp_meta.mm_canary.v = mdbx_meta_txnid_stable(env, &meta->mp_meta); } /* update signature */ @@ -8864,22 +8936,25 @@ int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); + const MDBX_meta *meta; do { - const MDBX_meta *meta = mdbx_meta_head(env); - arg->me_meta0_txnid = meta0->mm_txnid; + meta = mdbx_meta_head(env); + arg->me_recent_txnid = mdbx_meta_txnid_fluid(env, meta); + arg->me_meta0_txnid = mdbx_meta_txnid_fluid(env, meta0); arg->me_meta0_sign = meta0->mm_datasync_sign; - arg->me_meta1_txnid = meta1->mm_txnid; + arg->me_meta1_txnid = mdbx_meta_txnid_fluid(env, meta1); arg->me_meta1_sign = 
meta1->mm_datasync_sign; - arg->me_meta2_txnid = meta2->mm_txnid; + arg->me_meta2_txnid = mdbx_meta_txnid_fluid(env, meta2); arg->me_meta2_sign = meta2->mm_datasync_sign; - arg->me_recent_txnid = meta->mm_txnid; arg->me_recent_pgno = meta->mm_last_pg; - } while (unlikely(arg->me_meta0_txnid != meta0->mm_txnid || + } while (unlikely(arg->me_meta0_txnid != mdbx_meta_txnid_fluid(env, meta0) || arg->me_meta0_sign != meta0->mm_datasync_sign || - arg->me_meta1_txnid != meta1->mm_txnid || + arg->me_meta1_txnid != mdbx_meta_txnid_fluid(env, meta1) || arg->me_meta1_sign != meta1->mm_datasync_sign || - arg->me_meta2_txnid != meta2->mm_txnid || - arg->me_meta2_sign != meta2->mm_datasync_sign)); + arg->me_meta2_txnid != mdbx_meta_txnid_fluid(env, meta2) || + arg->me_meta2_sign != meta2->mm_datasync_sign || + meta != mdbx_meta_head(env) || + arg->me_recent_txnid != mdbx_meta_txnid_fluid(env, meta))); arg->me_mapsize = env->me_mapsize; arg->me_maxreaders = env->me_maxreaders; @@ -9530,8 +9605,9 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { if (r->mr_txnid != oldest || pid <= 0) continue; - rc = env->me_oom_func(env, pid, tid, oldest, - mdbx_meta_head(env)->mm_txnid - oldest, retry); + rc = env->me_oom_func( + env, pid, tid, oldest, + mdbx_meta_txnid_stable(env, mdbx_meta_head(env)) - oldest, retry); if (rc < 0) break; @@ -9604,7 +9680,7 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) last = env->me_txn0->mt_next_pgno; *percent = (last * 100ull + maxpg / 2) / maxpg; } - txnid_t lag = meta->mm_txnid - txn->mt_ro_reader->mr_txnid; + txnid_t lag = mdbx_meta_txnid_fluid(env, meta) - txn->mt_ro_reader->mr_txnid; return (lag > INT_MAX) ? INT_MAX : (int)lag; } From 5c3691eff11c4c3a7676a3bcdc02198bed8de314 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 20:27:27 +0300 Subject: [PATCH 193/303] mdbx: split atomics for 32/64 uints. 
--- src/osal.h | 64 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/src/osal.h b/src/osal.h index 123d42ac..5491cd45 100644 --- a/src/osal.h +++ b/src/osal.h @@ -526,60 +526,68 @@ int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid); #error FIXME atomic-ops #endif -static __inline size_t mdbx_atomic_add(volatile size_t *p, size_t v) { +static __inline uint32_t mdbx_atomic_add32(volatile uint32_t *p, uint32_t v) { #ifdef ATOMIC_VAR_INIT return atomic_fetch_add(p, v); #elif defined(__GNUC__) || defined(__clang__) return __sync_fetch_and_add(p, v); #else - switch (sizeof(size_t)) { - case 4: #ifdef _MSC_VER - return _InterlockedExchangeAdd(p, v); + return _InterlockedExchangeAdd(p, v); #endif #ifdef __APPLE__ - return OSAtomicAdd32(v, (volatile int32_t *)p); + return OSAtomicAdd32(v, (volatile int32_t *)p); #endif - case 8: -#ifdef _MSC_VER - return _InterlockedExchangeAdd64(p, v); -#endif -#ifdef __APPLE__ - return OSAtomicAdd64(v, (volatile int64_t *)p); -#endif - } - while (1) - ; #endif } -#define mdbx_atomic_sub(p, v) mdbx_atomic_add(p, -(v)) +static __inline uint64_t mdbx_atomic_add64(volatile uint64_t *p, uint64_t v) { +#ifdef ATOMIC_VAR_INIT + return atomic_fetch_add(p, v); +#elif defined(__GNUC__) || defined(__clang__) + return __sync_fetch_and_add(p, v); +#else +#ifdef _MSC_VER + return _InterlockedExchangeAdd64(p, v); +#endif +#ifdef __APPLE__ + return OSAtomicAdd64(v, (volatile int64_t *)p); +#endif +#endif +} -static __inline bool mdbx_atomic_compare_and_swap(volatile size_t *p, size_t c, - size_t v) { +#define mdbx_atomic_sub32(p, v) mdbx_atomic_add32(p, -(v)) +#define mdbx_atomic_sub64(p, v) mdbx_atomic_add64(p, -(v)) + +static __inline bool mdbx_atomic_compare_and_swap32(volatile uint32_t *p, + uint32_t c, uint32_t v) { #ifdef ATOMIC_VAR_INIT return atomic_compare_exchange_strong(p, &c, v); #elif defined(__GNUC__) || defined(__clang__) return __sync_bool_compare_and_swap(p, c, v); 
#else - switch (sizeof(size_t)) { - case 4: #ifdef _MSC_VER - return c == _InterlockedCompareExchange(p, v, c); + return c == _InterlockedCompareExchange(p, v, c); #endif #ifdef __APPLE__ - return c == OSAtomicCompareAndSwap32Barrier(c, v, (volatile int32_t *)p); + return c == OSAtomicCompareAndSwap32Barrier(c, v, (volatile int32_t *)p); #endif - case 8: +#endif +} + +static __inline bool mdbx_atomic_compare_and_swap64(volatile uint64_t *p, + uint64_t c, uint64_t v) { +#ifdef ATOMIC_VAR_INIT + return atomic_compare_exchange_strong(p, &c, v); +#elif defined(__GNUC__) || defined(__clang__) + return __sync_bool_compare_and_swap(p, c, v); +#else #ifdef _MSC_VER - return c == _InterlockedCompareExchange64(p, v, c); + return c == _InterlockedCompareExchange64(p, v, c); #endif #ifdef __APPLE__ - return c == OSAtomicCompareAndSwap64Barrier(c, v, (volatile int32_t *)p); + return c == OSAtomicCompareAndSwap64Barrier(c, v, (volatile uint64_t *)p); #endif - } - while (1) - ; #endif } From 60fed8bbca70e89245a0d146ad1ed702249c4427 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 26 May 2017 20:28:09 +0300 Subject: [PATCH 194/303] mdbx: fix first-rdonly-blocker bug. --- src/bits.h | 3 ++- src/mdbx.c | 20 ++++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/bits.h b/src/bits.h index 6479b11d..76a6b9a8 100644 --- a/src/bits.h +++ b/src/bits.h @@ -337,7 +337,8 @@ typedef struct MDBX_lockinfo { /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */ uint64_t mti_format; /* Flags which environment was opened. 
*/ - uint64_t mti_envmode; + uint32_t mti_envmode; + uint32_t mti_reserved; #ifdef MDBX_OSAL_LOCK MDBX_OSAL_LOCK mti_wmutex; diff --git a/src/mdbx.c b/src/mdbx.c index d12b3834..c2cf517f 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4342,7 +4342,7 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, const unsigned mode_flags = MDBX_WRITEMAP | MDBX_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC; if (lck_rc == MDBX_RESULT_TRUE) { - env->me_lck->mti_envmode = env->me_flags & mode_flags; + env->me_lck->mti_envmode = env->me_flags & (mode_flags | MDBX_RDONLY); if (exclusive == NULL || *exclusive < 2) { /* LY: downgrade lock only if exclusive access not requested. * in case exclusive==1, just leave value as is. */ @@ -4356,11 +4356,19 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, /* LY: just indicate that is not an exclusive access. */ *exclusive = 0; } - if ((env->me_flags & MDBX_RDONLY) == 0 && - ((env->me_lck->mti_envmode ^ env->me_flags) & mode_flags) != 0) { - mdbx_error("current mode/flags incompatible with requested"); - rc = MDBX_INCOMPATIBLE; - goto bailout; + if ((env->me_flags & MDBX_RDONLY) == 0) { + while (env->me_lck->mti_envmode == MDBX_RDONLY) { + if (mdbx_atomic_compare_and_swap32(&env->me_lck->mti_envmode, + MDBX_RDONLY, + env->me_flags & mode_flags)) + break; + /* TODO: yield/relax cpu */ + } + if ((env->me_lck->mti_envmode ^ env->me_flags) & mode_flags) { + mdbx_error("current mode/flags incompatible with requested"); + rc = MDBX_INCOMPATIBLE; + goto bailout; + } } } From 77f0e5e38ef2d84b74b543b69c36a8d6441d4c10 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sat, 27 May 2017 20:03:46 +0300 Subject: [PATCH 195/303] mdbx: drop unused debug features. 
--- mdbx.h | 7 +------ src/mdbx.c | 28 +++------------------------- test/test.cc | 2 +- 3 files changed, 5 insertions(+), 32 deletions(-) diff --git a/mdbx.h b/mdbx.h index 45668cf6..07fa432e 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1550,16 +1550,11 @@ LIBMDBX_API MDBX_oom_func *mdbx_env_get_oomfunc(MDBX_env *env); #define MDBX_DBG_TRACE 4 #define MDBX_DBG_EXTRA 8 #define MDBX_DBG_AUDIT 16 -#define MDBX_DBG_EDGE 32 - -/* LY: a "don't touch" value */ -#define MDBX_DBG_DNT (-1L) typedef void MDBX_debug_func(int type, const char *function, int line, const char *msg, va_list args); -LIBMDBX_API int mdbx_setup_debug(int flags, MDBX_debug_func *logger, - long edge_txn); +LIBMDBX_API int mdbx_setup_debug(int flags, MDBX_debug_func *logger); typedef int MDBX_pgvisitor_func(uint64_t pgno, unsigned pgnumber, void *ctx, const char *dbi, const char *type, int nentries, diff --git a/src/mdbx.c b/src/mdbx.c index c2cf517f..4042969e 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -502,12 +502,6 @@ int mdbx_runtime_flags = MDBX_DBG_PRINT MDBX_debug_func *mdbx_debug_logger; -int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn); - -#if MDBX_DEBUG -txnid_t mdbx_debug_edge; -#endif - static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, int flags); static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, int num, MDBX_page **mp); @@ -2276,15 +2270,6 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { txn->mt_canary = meta->mm_canary; const txnid_t snap = mdbx_meta_txnid_stable(env, meta); txn->mt_txnid = snap + 1; -#if MDBX_DEBUG - if (unlikely(txn->mt_txnid == mdbx_debug_edge)) { - if (!mdbx_debug_logger) - mdbx_runtime_flags |= - MDBX_DBG_TRACE | MDBX_DBG_EXTRA | MDBX_DBG_AUDIT | MDBX_DBG_ASSERT; - mdbx_debug_log(MDBX_DBG_EDGE, __FUNCTION__, __LINE__, - "on/off edge (txn %" PRIaTXN ")", txn->mt_txnid); - } -#endif if (unlikely(txn->mt_txnid < snap)) { mdbx_debug("txnid overflow!"); rc = MDBX_TXN_FULL; @@ -9566,17 +9551,10 @@ int __cold 
mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { return rc; } -int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn) { +int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger) { unsigned ret = mdbx_runtime_flags; - if (flags != (int)MDBX_DBG_DNT) - mdbx_runtime_flags = flags; - if (logger != (MDBX_debug_func *)MDBX_DBG_DNT) - mdbx_debug_logger = logger; - if (edge_txn != (long)MDBX_DBG_DNT) { -#if MDBX_DEBUG - mdbx_debug_edge = edge_txn; -#endif - } + mdbx_runtime_flags = flags; + mdbx_debug_logger = logger; return ret; } diff --git a/test/test.cc b/test/test.cc index 44d59209..0d29ad8e 100644 --- a/test/test.cc +++ b/test/test.cc @@ -119,7 +119,7 @@ void testcase::db_prepare() { mdbx_dbg_opts |= MDBX_DBG_TRACE; if (config.params.loglevel <= logging::verbose) mdbx_dbg_opts |= MDBX_DBG_PRINT; - int rc = mdbx_setup_debug(mdbx_dbg_opts, mdbx_debug_logger, MDBX_DBG_DNT); + int rc = mdbx_setup_debug(mdbx_dbg_opts, mdbx_debug_logger); log_info("set mdbx debug-opts: 0x%02x", rc); MDBX_env *env = nullptr; From 17c805768431e1620087eb1185bd23554583dc04 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 28 May 2017 13:07:29 +0300 Subject: [PATCH 196/303] mdbx: fix gcc 'uninitialized' warning with -Og. --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 4042969e..8684a12d 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -5856,7 +5856,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, int do_sub = 0, insert_key, insert_data; unsigned mcount = 0, dcount = 0, nospill; size_t nsize; - int rc, rc2; + int rc = MDBX_SUCCESS, rc2; unsigned nflags; DKBUF; From bb5e1b43afe08d01e9d46ec31a788afd6a3c5302 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 28 May 2017 13:09:05 +0300 Subject: [PATCH 197/303] mdbx: relax mdbx_is_dirrty() assertion for map-range. 
--- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 8684a12d..5b65980c 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -10187,7 +10187,7 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { * в пределах mmap, но за границей распределенных страниц. Это тяжелая * ошибка, к которой не возможно прийти без каких-то больших нарушений. * Поэтому не проверяем этот случай кроме как assert-ом, ибо бестолку. */ - mdbx_tassert(txn, env->me_map + env->me_mapsize > (char *)page); + mdbx_tassert(txn, env->me_map + env->me_mapsize >= (char *)page); } /* Страница вне используемого mmap-диапазона, т.е. либо в функцию был From f945e8a06277ac43ec55b404320d6003e5d363a3 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 28 May 2017 13:48:11 +0300 Subject: [PATCH 198/303] mdbx: fix cache-aligned size for MDBX_reader. Change-Id: I1e48dd279106daa629fbcd427ec841828799c8f6 --- src/bits.h | 12 ++++++------ src/mdbx.c | 3 +++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/bits.h b/src/bits.h index 76a6b9a8..a46ac42c 100644 --- a/src/bits.h +++ b/src/bits.h @@ -225,10 +225,10 @@ typedef struct MDBX_reader { volatile mdbx_tid_t mr_tid; /* cache line alignment */ - uint8_t pad[~(MDBX_CACHELINE_SIZE - 1) & - (sizeof(txnid_t) + sizeof(mdbx_pid_t) + sizeof(mdbx_tid_t) + - MDBX_CACHELINE_SIZE - 1)]; -} MDBX_reader; + uint8_t pad[MDBX_CACHELINE_SIZE - + (sizeof(txnid_t) + sizeof(mdbx_pid_t) + sizeof(mdbx_tid_t)) % + MDBX_CACHELINE_SIZE]; +} __cache_aligned MDBX_reader; /* Information about a single database in the environment. */ typedef struct MDBX_db { @@ -347,12 +347,12 @@ typedef struct MDBX_lockinfo { /* The number of slots that have been used in the reader table. * This always records the maximum count, it is not decremented * when readers release their slots. 
*/ - __cache_aligned volatile unsigned mti_numreaders; + volatile unsigned __cache_aligned mti_numreaders; #ifdef MDBX_OSAL_LOCK /* Mutex protecting access to this table. */ MDBX_OSAL_LOCK mti_rmutex; #endif - MDBX_reader mti_readers[1]; + MDBX_reader __cache_aligned mti_readers[1]; } MDBX_lockinfo; #pragma pack(pop) diff --git a/src/mdbx.c b/src/mdbx.c index 5b65980c..85378b31 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2208,6 +2208,9 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } } + STATIC_ASSERT(sizeof(MDBX_reader) == MDBX_CACHELINE_SIZE); + STATIC_ASSERT( + offsetof(MDBX_lockinfo, mti_numreaders) % MDBX_CACHELINE_SIZE == 0); r = &env->me_lck->mti_readers[i]; /* Claim the reader slot, carefully since other code * uses the reader table un-mutexed: First reset the From 9318e4cabbb0eebd8e7ce99552e9ec3e8ba91533 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 28 May 2017 13:09:36 +0300 Subject: [PATCH 199/303] test: fix oom_callback() nasty bug. --- test/test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test.cc b/test/test.cc index 0d29ad8e..0302479d 100644 --- a/test/test.cc +++ b/test/test.cc @@ -104,7 +104,7 @@ int testcase::oom_callback(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, osal_yield(); if (retry > 0) osal_udelay(retry * 100); - return 1 /* always retry */; + return 0 /* always retry */; } return -1; From 91bc3129d5fb61b03a6ec7fedc02235c6913a44f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 28 May 2017 13:43:52 +0300 Subject: [PATCH 200/303] mdbx: refine mdbx_page_search(). 
Change-Id: I9c87ae2ffe317538813ebec971ad093936f4c504 --- src/mdbx.c | 68 +++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 85378b31..aeffe2a0 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4952,45 +4952,45 @@ static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) { if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) { mdbx_debug("transaction has failed, must abort"); return MDBX_BAD_TXN; - } else { - /* Make sure we're using an up-to-date root */ - if (unlikely(*mc->mc_dbflag & DB_STALE)) { - MDBX_cursor mc2; - if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) - return MDBX_BAD_DBI; - mdbx_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); - rc = mdbx_page_search(&mc2, &mc->mc_dbx->md_name, 0); + } + + /* Make sure we're using an up-to-date root */ + if (unlikely(*mc->mc_dbflag & DB_STALE)) { + MDBX_cursor mc2; + if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) + return MDBX_BAD_DBI; + mdbx_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); + rc = mdbx_page_search(&mc2, &mc->mc_dbx->md_name, 0); + if (rc) + return rc; + { + MDBX_val data; + int exact = 0; + MDBX_node *leaf = mdbx_node_search(&mc2, &mc->mc_dbx->md_name, &exact); + if (!exact) + return MDBX_NOTFOUND; + if (unlikely((leaf->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) + return MDBX_INCOMPATIBLE; /* not a named DB */ + rc = mdbx_node_read(&mc2, leaf, &data); if (rc) return rc; - { - MDBX_val data; - int exact = 0; - MDBX_node *leaf = mdbx_node_search(&mc2, &mc->mc_dbx->md_name, &exact); - if (!exact) - return MDBX_NOTFOUND; - if (unlikely((leaf->mn_flags & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) - return MDBX_INCOMPATIBLE; /* not a named DB */ - rc = mdbx_node_read(&mc2, leaf, &data); - if (rc) - return rc; - uint16_t md_flags; - memcpy(&md_flags, ((char *)data.iov_base + offsetof(MDBX_db, md_flags)), - sizeof(uint16_t)); - /* The txn may not know this DBI, or another process may - 
* have dropped and recreated the DB with other flags. */ - if (unlikely((mc->mc_db->md_flags & PERSISTENT_FLAGS) != md_flags)) - return MDBX_INCOMPATIBLE; - memcpy(mc->mc_db, data.iov_base, sizeof(MDBX_db)); - } - *mc->mc_dbflag &= ~DB_STALE; + uint16_t md_flags; + memcpy(&md_flags, ((char *)data.iov_base + offsetof(MDBX_db, md_flags)), + sizeof(uint16_t)); + /* The txn may not know this DBI, or another process may + * have dropped and recreated the DB with other flags. */ + if (unlikely((mc->mc_db->md_flags & PERSISTENT_FLAGS) != md_flags)) + return MDBX_INCOMPATIBLE; + memcpy(mc->mc_db, data.iov_base, sizeof(MDBX_db)); } - root = mc->mc_db->md_root; + *mc->mc_dbflag &= ~DB_STALE; + } + root = mc->mc_db->md_root; - if (unlikely(root == P_INVALID)) { /* Tree is empty. */ - mdbx_debug("tree is empty"); - return MDBX_NOTFOUND; - } + if (unlikely(root == P_INVALID)) { /* Tree is empty. */ + mdbx_debug("tree is empty"); + return MDBX_NOTFOUND; } mdbx_cassert(mc, root >= NUM_METAS); From 2ea97ae281d6568a6ccc6a285a99e17e4193532d Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 28 May 2017 13:04:36 +0300 Subject: [PATCH 201/303] mdbx: refine find_oldest() and meta_head(). --- src/mdbx.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index aeffe2a0..60c50950 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1273,25 +1273,29 @@ bailout: #define METAPAGE_END(env) METAPAGE(env, NUM_METAS) -static __inline txnid_t mdbx_meta_txnid(const MDBX_env *env, - const MDBX_meta *meta, - bool allow_volatile) { +static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta, + bool allow_volatile) { mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); txnid_t top = meta->mm_txnid_top; txnid_t bottom = meta->mm_txnid_bottom; - if (!allow_volatile) + if (allow_volatile) + return (top < bottom) ? 
top : bottom; + if (unlikely(top != bottom)) { + mdbx_error("top %" PRIaTXN " != bottom %" PRIaTXN, top, bottom); + *(char *)0 = 0; mdbx_assert(env, top == bottom); - return (top < bottom) ? top : bottom; + } + return top; } static __inline txnid_t mdbx_meta_txnid_stable(const MDBX_env *env, const MDBX_meta *meta) { - return mdbx_meta_txnid(env, meta, false); + return meta_txnid(env, meta, false); } static __inline txnid_t mdbx_meta_txnid_fluid(const MDBX_env *env, const MDBX_meta *meta) { - return mdbx_meta_txnid(env, meta, true); + return meta_txnid(env, meta, true); } static __inline void mdbx_meta_update_begin(const MDBX_env *env, @@ -1425,10 +1429,11 @@ static const char *mdbx_durable_str(const MDBX_meta *const meta) { /*----------------------------------------------------------------------------*/ /* Find oldest txnid still referenced. */ -static txnid_t mdbx_find_oldest(MDBX_env *env, int *laggard) { +static txnid_t mdbx_find_oldest(MDBX_txn *txn, int *laggard) { + MDBX_env *env = txn->mt_env; const MDBX_meta *const head = mdbx_meta_mostrecent( env, F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) ? false : true); - txnid_t oldest = mdbx_meta_txnid_stable(env, head); + txnid_t oldest = meta_txnid(env, head, (txn->mt_flags & MDBX_RDONLY) ? 
true : false); int i, reader; const MDBX_reader *const r = env->me_lck->mti_readers; @@ -1560,7 +1565,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, mdbx_cursor_init(&m2, txn, FREE_DBI, NULL); if (flags & MDBX_LIFORECLAIM) { if (!found_oldest) { - oldest = mdbx_find_oldest(env, NULL); + oldest = mdbx_find_oldest(txn, NULL); found_oldest = 1; } /* Begin from oldest reader if any */ @@ -1582,7 +1587,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, /* Do not fetch more if the record will be too recent */ if (op != MDBX_FIRST && ++last >= oldest) { if (!found_oldest) { - oldest = mdbx_find_oldest(env, NULL); + oldest = mdbx_find_oldest(txn, NULL); found_oldest = 1; } if (oldest <= last) @@ -1595,7 +1600,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, if (op == MDBX_SET_RANGE) continue; found_oldest = 1; - if (oldest < mdbx_find_oldest(env, NULL)) { + if (oldest < mdbx_find_oldest(txn, NULL)) { oldest = env->me_pgoldest; last = oldest - 1; key.iov_base = &last; @@ -1613,7 +1618,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, last = *(txnid_t *)key.iov_base; if (oldest <= last) { if (!found_oldest) { - oldest = mdbx_find_oldest(env, NULL); + oldest = mdbx_find_oldest(txn, NULL); found_oldest = 1; } if (oldest <= last) { @@ -1745,7 +1750,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, mdbx_assert(env, env->me_sync_pending > 0); MDBX_meta meta = *head; if (mdbx_sync_locked(env, me_flags, &meta) == MDBX_SUCCESS) { - txnid_t snap = mdbx_find_oldest(env, NULL); + txnid_t snap = mdbx_find_oldest(txn, NULL); if (snap > oldest) { continue; } @@ -9571,7 +9576,7 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { if (mdbx_reader_check(env, NULL)) break; - txnid_t snap = mdbx_find_oldest(env, &reader); + txnid_t snap = mdbx_find_oldest(env->me_txn, &reader); if (oldest < snap || reader < 0) { if (retry && env->me_oom_func) { /* LY: 
notify end of oom-loop */ @@ -9614,7 +9619,7 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { /* LY: notify end of oom-loop */ env->me_oom_func(env, 0, 0, oldest, 0, -retry); } - return mdbx_find_oldest(env, NULL); + return mdbx_find_oldest(env->me_txn, NULL); } int __cold mdbx_env_set_syncbytes(MDBX_env *env, size_t bytes) { From 236ddda1aedeb975183c42a0c69ed9a0fd70a660 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 28 May 2017 13:48:36 +0300 Subject: [PATCH 202/303] mdbx: more assertions for mdbx_txn_renew0(). Change-Id: I3c179d105c3e6388c08b2c371ada4d457af50bed --- src/mdbx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 60c50950..ae60af0c 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1280,11 +1280,7 @@ static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta, txnid_t bottom = meta->mm_txnid_bottom; if (allow_volatile) return (top < bottom) ? top : bottom; - if (unlikely(top != bottom)) { - mdbx_error("top %" PRIaTXN " != bottom %" PRIaTXN, top, bottom); - *(char *)0 = 0; - mdbx_assert(env, top == bottom); - } + mdbx_assert(env, top == bottom); return top; } @@ -1433,7 +1429,8 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn, int *laggard) { MDBX_env *env = txn->mt_env; const MDBX_meta *const head = mdbx_meta_mostrecent( env, F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) ? false : true); - txnid_t oldest = meta_txnid(env, head, (txn->mt_flags & MDBX_RDONLY) ? true : false); + txnid_t oldest = + meta_txnid(env, head, (txn->mt_flags & MDBX_RDONLY) ? 
true : false); int i, reader; const MDBX_reader *const r = env->me_lck->mti_readers; @@ -2178,8 +2175,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { if (unlikely(r->mr_pid != env->me_pid || r->mr_txnid != ~(txnid_t)0)) return MDBX_BAD_RSLOT; } else if (env->me_lck) { - mdbx_pid_t pid = env->me_pid; - mdbx_tid_t tid = mdbx_thread_self(); + const mdbx_pid_t pid = env->me_pid; + const mdbx_tid_t tid = mdbx_thread_self(); mdbx_assert(env, env->me_lck->mti_magic == MDBX_MAGIC); mdbx_assert(env, env->me_lck->mti_format == MDBX_LOCK_FORMAT); @@ -2245,6 +2242,9 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { if (r) { r->mr_txnid = snap; mdbx_jitter4testing(false); + mdbx_assert(env, r->mr_pid == mdbx_getpid()); + mdbx_assert(env, r->mr_tid == mdbx_thread_self()); + mdbx_assert(env, r->mr_txnid == snap); } mdbx_coherent_barrier(); mdbx_jitter4testing(true); From 2347282b4ffc4bfde094c5a825bb96f3fe8c72ec Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 28 May 2017 13:06:29 +0300 Subject: [PATCH 203/303] mdbx: add assertions for reclaiming-edge. 
Change-Id: I87e11f69423b4e7841c4f8ec6b5ecfdff9e96b0d --- src/mdbx.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/mdbx.c b/src/mdbx.c index ae60af0c..f74723b8 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2263,6 +2263,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } } + mdbx_assert(env, txn->mt_txnid >= mdbx_find_oldest(txn, nullptr)); txn->mt_ro_reader = r; txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ } else { @@ -4959,6 +4960,8 @@ static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) { return MDBX_BAD_TXN; } + mdbx_cassert(mc, + mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); /* Make sure we're using an up-to-date root */ if (unlikely(*mc->mc_dbflag & DB_STALE)) { MDBX_cursor mc2; @@ -4998,6 +5001,8 @@ static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) { return MDBX_NOTFOUND; } + mdbx_cassert(mc, + mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); mdbx_cassert(mc, root >= NUM_METAS); if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)) @@ -5104,6 +5109,8 @@ static __inline int mdbx_node_read(MDBX_cursor *mc, MDBX_node *leaf, pgno_t pgno; int rc; + mdbx_cassert(mc, + mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { data->iov_len = NODEDSZ(leaf); data->iov_base = NODEDATA(leaf); @@ -5160,6 +5167,8 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { MDBX_node *indx; MDBX_page *mp; + mdbx_cassert(mc, + mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); if (unlikely(mc->mc_snum < 2)) { return MDBX_NOTFOUND; /* root has no siblings */ } @@ -5389,6 +5398,8 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_node *leaf = NULL; DKBUF; + mdbx_cassert(mc, + mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); if ((mc->mc_db->md_flags & MDBX_INTEGERKEY) 
&& unlikely(key->iov_len != sizeof(uint32_t) && key->iov_len != sizeof(uint64_t))) { @@ -5675,6 +5686,8 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) return MDBX_BAD_TXN; + mdbx_cassert(mc, + mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); switch (op) { case MDBX_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) @@ -6905,6 +6918,8 @@ static void mdbx_xcursor_init0(MDBX_cursor *mc) { static void mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node) { MDBX_xcursor *mx = mc->mc_xcursor; + mdbx_cassert(mc, + mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); if (node->mn_flags & F_SUBDATA) { memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDBX_db)); mx->mx_cursor.mc_pg[0] = 0; @@ -6954,6 +6969,8 @@ static void mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, int new_dupdata) { MDBX_xcursor *mx = mc->mc_xcursor; + mdbx_cassert(mc, + mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); if (new_dupdata) { mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_top = 0; @@ -6993,6 +7010,8 @@ static void mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi, mc->mc_xcursor = mx; mdbx_xcursor_init0(mc); } + mdbx_cassert(mc, + mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); if (unlikely(*mc->mc_dbflag & DB_STALE)) { mdbx_page_search(mc, NULL, MDBX_PS_ROOTONLY); } @@ -7602,6 +7621,8 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { unsigned i; + mdbx_cassert(csrc, csrc->mc_txn->mt_txnid >= + mdbx_find_oldest(csrc->mc_txn, nullptr)); cdst->mc_txn = csrc->mc_txn; cdst->mc_dbi = csrc->mc_dbi; cdst->mc_db = csrc->mc_db; From fca74ab80c96ba72d6857dbfa9dc1f430cba7b5d Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 28 May 2017 18:31:33 +0300 Subject: [PATCH 204/303] mdbx: refine meta-commit in WRITEMAP mode. 
Change-Id: Ieb53bf6144a104fc88b0b07b4abdde20f4b01978 --- src/mdbx.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index f74723b8..d0e4d65f 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3661,13 +3661,10 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if (env->me_flags & MDBX_WRITEMAP) { mdbx_jitter4testing(true); if (likely(target != head)) { - mdbx_meta_update_begin(env, target, pending->mm_txnid_top); -#ifdef NDEBUG - /* nodebug: 'invalidate' the meta to avoid false-reading - * from violators (make safer) */ + /* LY: 'invalidate' the meta. */ target->mm_datasync_sign = MDBX_DATASIGN_WEAK; - mdbx_coherent_barrier(); -#else + mdbx_meta_update_begin(env, target, pending->mm_txnid_top); +#ifndef NDEBUG /* debug: provoke failure to catch a violators */ memset(target->mm_dbs, 0xCC, sizeof(target->mm_dbs) + sizeof(target->mm_canary)); From a0ddf167dd154a7c2144c44456aea55bc3bfd799 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 28 May 2017 18:35:32 +0300 Subject: [PATCH 205/303] mdbx: common database format for 32/64 targets (remove 'size_t' structures). Change-Id: I2d7c77603ee27c5ae4b82f762a726f71136527c8 --- src/bits.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/bits.h b/src/bits.h index a46ac42c..e17164ba 100644 --- a/src/bits.h +++ b/src/bits.h @@ -232,15 +232,15 @@ typedef struct MDBX_reader { /* Information about a single database in the environment. 
*/ typedef struct MDBX_db { - uint32_t md_xsize; /* also ksize for LEAF2 pages */ - uint16_t md_flags; /* see mdbx_dbi_open */ - uint16_t md_depth; /* depth of this tree */ - uint64_t md_seq; /* table sequence counter */ - pgno_t md_branch_pages; /* number of internal pages */ - pgno_t md_leaf_pages; /* number of leaf pages */ - pgno_t md_overflow_pages; /* number of overflow pages */ - pgno_t md_root; /* the root page of this tree */ - uint64_t md_entries; /* number of data items */ + uint32_t md_xsize; /* also ksize for LEAF2 pages */ + uint16_t md_flags; /* see mdbx_dbi_open */ + uint16_t md_depth; /* depth of this tree */ + uint64_t md_root; /* the root page of this tree */ + uint64_t md_seq; /* table sequence counter */ + uint64_t md_branch_pages; /* number of internal pages */ + uint64_t md_leaf_pages; /* number of leaf pages */ + uint64_t md_overflow_pages; /* number of overflow pages */ + uint64_t md_entries; /* number of data items */ } MDBX_db; /* Meta page content. @@ -255,7 +255,7 @@ typedef struct MDBX_meta { /* txnid that committed this page, */ volatile txnid_t mm_txnid_top; - size_t mm_mapsize; /* size of mmap region */ + uint64_t mm_mapsize; /* size of mmap region */ MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ /* The size of pages used in this DB */ #define mm_psize mm_dbs[FREE_DBI].md_xsize @@ -264,7 +264,8 @@ typedef struct MDBX_meta { mdbx_canary mm_canary; /* Last used page in the datafile. * Actually the file may be shorter if the freeDB lists the final pages. */ - pgno_t mm_last_pg; + uint64_t mm_last_pg; + volatile txnid_t mm_txnid; /* txnid that committed this page */ #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u volatile uint64_t mm_datasync_sign; From bd5d092a2b0fe64d5d9bd97b51fac3906e9701e0 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 28 May 2017 18:49:05 +0300 Subject: [PATCH 206/303] mdbx: check system pagesize (paranoia). 
Change-Id: I596c686a996f7d1521789f22900022c4da629e1b --- src/mdbx.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index d0e4d65f..94202a87 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3804,14 +3804,25 @@ int __cold mdbx_env_create(MDBX_env **penv) { env->me_fd = INVALID_HANDLE_VALUE; env->me_lfd = INVALID_HANDLE_VALUE; env->me_pid = mdbx_getpid(); - mdbx_env_setup_limits(env, env->me_os_psize = mdbx_syspagesize()); - if (!is_power2(env->me_os_psize)) - return MDBX_INCOMPATIBLE; + + int rc; + env->me_os_psize = mdbx_syspagesize(); + if (!is_power2(env->me_os_psize) || env->me_os_psize < MIN_PAGESIZE) { + mdbx_error("unsuitable system pageize %u", env->me_os_psize); + rc = MDBX_INCOMPATIBLE; + goto bailout; + } + mdbx_env_setup_limits(env, env->me_os_psize); + VALGRIND_CREATE_MEMPOOL(env, 0, 0); env->me_signature = MDBX_ME_SIGNATURE; - *penv = env; return MDBX_SUCCESS; + +bailout: + free(env); + *penv = nullptr; + return rc; } static int __cold mdbx_env_map(MDBX_env *env, void *addr, size_t usedsize) { From 0f676db491a246ff1ff116a087df9ca8ca28f3d8 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 28 May 2017 18:50:09 +0300 Subject: [PATCH 207/303] mdbx: add locking while open/close/drop dbi-handles (avoid dbi-related races). This avoids races and collisions between threads when opening, closing and deleting DBI-handles. Unfortunately, this does not resolve the collision in the case where one thread closes a DBI-handle while another thread performs a transaction.
Change-Id: I48c3ffb11a8f83739fae1712db3476645f573e09 --- src/bits.h | 1 + src/mdbx.c | 71 ++++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 18 deletions(-) diff --git a/src/bits.h b/src/bits.h index e17164ba..50eceff4 100644 --- a/src/bits.h +++ b/src/bits.h @@ -603,6 +603,7 @@ struct MDBX_env { unsigned me_maxreaders; /* size of the reader table */ /* Max MDBX_lockinfo.mti_numreaders of interest to mdbx_env_close() */ unsigned me_close_readers; + mdbx_fastmutex_t me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_dbi me_maxdbs; /* size of the DB table */ mdbx_pid_t me_pid; /* process ID of this env */ diff --git a/src/mdbx.c b/src/mdbx.c index 94202a87..a6281ec9 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3814,6 +3814,10 @@ int __cold mdbx_env_create(MDBX_env **penv) { } mdbx_env_setup_limits(env, env->me_os_psize); + rc = mdbx_fastmutex_init(&env->me_dbi_lock); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + VALGRIND_CREATE_MEMPOOL(env, 0, 0); env->me_signature = MDBX_ME_SIGNATURE; *penv = env; @@ -4507,6 +4511,7 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { } mdbx_env_close0(env); + mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); env->me_signature = 0; free(env); @@ -9106,7 +9111,8 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, } /* Fail, if no free slot and max hit */ - if (unlikely(slot >= txn->mt_env->me_maxdbs)) + MDBX_env *env = txn->mt_env; + if (unlikely(slot >= env->me_maxdbs)) return MDBX_DBS_FULL; /* Cannot mix named table with some main-table flags */ @@ -9137,7 +9143,11 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, if (unlikely(!namedup)) return MDBX_ENOMEM; - /* FIXME: lock here (to avoid races !!!) 
*/ + int err = mdbx_fastmutex_acquire(&env->me_dbi_lock); + if (unlikely(err != MDBX_SUCCESS)) { + free(namedup); + return err; + } unsigned dbflag = DB_NEW | DB_VALID | DB_USRVALID; if (unlikely(rc)) { @@ -9165,7 +9175,7 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, txn->mt_dbxs[slot].md_cmp = nullptr; txn->mt_dbxs[slot].md_dcmp = nullptr; txn->mt_dbflags[slot] = dbflag; - txn->mt_dbiseqs[slot] = (txn->mt_env->me_dbiseqs[slot] += 1); + txn->mt_dbiseqs[slot] = (env->me_dbiseqs[slot] += 1); txn->mt_dbs[slot] = *(MDBX_db *)data.iov_base; rc = mdbx_dbi_bind(txn, slot, user_flags, keycmp, datacmp); @@ -9183,7 +9193,7 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, txn->mt_numdbs++; } - /* FIXME: unlock here (to avoid races !!!) */ + mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); return rc; } @@ -9218,13 +9228,11 @@ int __cold mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *arg, return mdbx_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); } -int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { - char *ptr; +static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) return MDBX_EINVAL; - /* FIXME: locking to avoid races ? 
*/ - ptr = env->me_dbxs[dbi].md_name.iov_base; + char *ptr = env->me_dbxs[dbi].md_name.iov_base; /* If there was no name, this was already closed */ if (unlikely(!ptr)) return MDBX_BAD_DBI; @@ -9237,6 +9245,18 @@ int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { return MDBX_SUCCESS; } +int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { + if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) + return MDBX_EINVAL; + + int rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_dbi_close_locked(env, dbi); + mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + } + return rc; +} + int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { if (unlikely(!txn || !flags)) return MDBX_EINVAL; @@ -9344,9 +9364,6 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) { } int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) { - MDBX_cursor *mc, *m2; - int rc; - if (unlikely(1 < (unsigned)del || !txn)) return MDBX_EINVAL; @@ -9362,25 +9379,41 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) { if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) return MDBX_EACCESS; - rc = mdbx_cursor_open(txn, dbi, &mc); - if (unlikely(rc)) + MDBX_cursor *mc; + int rc = mdbx_cursor_open(txn, dbi, &mc); + if (unlikely(rc != MDBX_SUCCESS)) return rc; - /* FIXME: locking to avoid races ? 
*/ + MDBX_env *env = txn->mt_env; + rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); + if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_cursor_close(mc); + return rc; + } + + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) { + rc = MDBX_EINVAL; + goto bailout; + } + + if (unlikely(TXN_DBI_CHANGED(txn, dbi))) { + rc = MDBX_BAD_DBI; + goto bailout; + } rc = mdbx_drop0(mc, mc->mc_db->md_flags & MDBX_DUPSORT); /* Invalidate the dropped DB's cursors */ - for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + for (MDBX_cursor *m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) m2->mc_flags &= ~(C_INITIALIZED | C_EOF); if (unlikely(rc)) - goto leave; + goto bailout; /* Can't delete the main DB */ if (del && dbi >= CORE_DBS) { rc = mdbx_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); if (likely(!rc)) { txn->mt_dbflags[dbi] = DB_STALE; - mdbx_dbi_close(txn->mt_env, dbi); + mdbx_dbi_close_locked(env, dbi); } else { txn->mt_flags |= MDBX_TXN_ERROR; } @@ -9397,8 +9430,10 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) { txn->mt_flags |= MDBX_TXN_DIRTY; } -leave: + +bailout: mdbx_cursor_close(mc); + mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); return rc; } From d91785f635efb31f6258cf86b0c8b48da90716ac Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 29 May 2017 19:27:17 +0300 Subject: [PATCH 208/303] mdbx: disable unnecessary assertion in mdbx_is_dirty(). 
--- src/defs.h | 1 + src/mdbx.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/defs.h b/src/defs.h index 79e65d44..5e3db8cb 100644 --- a/src/defs.h +++ b/src/defs.h @@ -322,6 +322,7 @@ # define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) # define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0) # define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0) +# define RUNNING_ON_VALGRIND (0) #endif /* USE_VALGRIND */ #ifdef __SANITIZE_ADDRESS__ diff --git a/src/mdbx.c b/src/mdbx.c index a6281ec9..02a2ea76 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -10259,7 +10259,8 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { * в пределах mmap, но за границей распределенных страниц. Это тяжелая * ошибка, к которой не возможно прийти без каких-то больших нарушений. * Поэтому не проверяем этот случай кроме как assert-ом, ибо бестолку. */ - mdbx_tassert(txn, env->me_map + env->me_mapsize >= (char *)page); + /* if (!RUNNING_ON_VALGRIND) + mdbx_tassert(txn, env->me_map + env->me_mapsize >= (char *)page); */ } /* Страница вне используемого mmap-диапазона, т.е. либо в функцию был From 70fe982c3a6dbcbea231126221cc3db6c162d3fe Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 29 May 2017 19:27:49 +0300 Subject: [PATCH 209/303] mdbx: fix MDBX_WANNA_RECOVERY description. --- mdbx.h | 2 +- src/mdbx.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mdbx.h b/mdbx.h index 07fa432e..d98ae7e6 100644 --- a/mdbx.h +++ b/mdbx.h @@ -406,7 +406,7 @@ typedef enum MDBX_cursor_op { * - ABI version mismatch (rare case); */ #define MDBX_EBADSIGN (-30420) -/* Database should be recovered, but this could be done automatically +/* Database should be recovered, but this could NOT be done automatically * right now (e.g. in readonly mode and so forth). 
*/ #define MDBX_WANNA_RECOVERY (-30419) diff --git a/src/mdbx.c b/src/mdbx.c index 02a2ea76..e879a544 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -633,7 +633,7 @@ static const char *__mdbx_strerr(int errnum) { return "MDBX_EBADSIGN: Wrong signature of a runtime object(s)"; case MDBX_WANNA_RECOVERY: return "MDBX_WANNA_RECOVERY: Database should be recovered, but this could " - "be done in a read-only mode"; + "NOT be done in a read-only mode"; case MDBX_EKEYMISMATCH: return "MDBX_EKEYMISMATCH: The given key value is mismatched to the " "current cursor position"; From 11e8727483004946299f69e205e15a10c77a92ae Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 29 May 2017 20:16:09 +0300 Subject: [PATCH 210/303] mdbx: fix missing braces. --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index e879a544..e76b72ff 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4180,7 +4180,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { } } - if (size & (env->me_os_psize - 1) || size < env->me_os_psize) { + if ((size & (env->me_os_psize - 1)) || size < env->me_os_psize) { mdbx_notice("lck-file has invalid size %" PRIu64 " bytes", size); return MDBX_PROBLEM; } From a36b065cd9f172e4c15bf9df06670a238a4a2502 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 29 May 2017 20:17:02 +0300 Subject: [PATCH 211/303] mdbx: logging MDBX_PROBLEM reasons. --- src/mdbx.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index e76b72ff..16294080 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1037,6 +1037,9 @@ static int mdbx_page_loose(MDBX_cursor *mc, MDBX_page *mp) { unsigned x = mdbx_mid2l_search(dl, pgno); if (x <= dl[0].mid && dl[x].mid == pgno) { if (unlikely(mp != dl[x].mptr)) { /* bad cursor? 
*/ + mdbx_error("wrong page 0x%p #%" PRIaPGNO + " in the dirtylist[%d], expecting %p", + dl[x].mptr, pgno, x, mp); mc->mc_flags &= ~(C_INITIALIZED | C_EOF); txn->mt_flags |= MDBX_TXN_ERROR; return MDBX_PROBLEM; @@ -1935,6 +1938,9 @@ static int mdbx_page_touch(MDBX_cursor *mc) { unsigned x = mdbx_mid2l_search(dl, pgno); if (x <= dl[0].mid && dl[x].mid == pgno) { if (unlikely(mp != dl[x].mptr)) { /* bad cursor? */ + mdbx_error("wrong page 0x%p #%" PRIaPGNO + " in the dirtylist[%d], expecting %p", + dl[x].mptr, pgno, x, mp); mc->mc_flags &= ~(C_INITIALIZED | C_EOF); txn->mt_flags |= MDBX_TXN_ERROR; return MDBX_PROBLEM; @@ -5082,6 +5088,8 @@ static int mdbx_ovpage_free(MDBX_cursor *mc, MDBX_page *mp) { dl[x] = ix; } else { mdbx_cassert(mc, x > 1); + mdbx_error("not found page 0x%p #%" PRIaPGNO " in the dirtylist", mp, + mp->mp_pgno); j = ++(dl[0].mid); dl[j] = ix; /* Unsorted. OK when MDBX_TXN_ERROR. */ txn->mt_flags |= MDBX_TXN_ERROR; @@ -6448,8 +6456,9 @@ new_sub: return rc; bad_sub: if (unlikely(rc == MDBX_KEYEXIST)) - /* should not happen, we deleted that item */ - rc = MDBX_PROBLEM; + mdbx_error("unexpected %s", "MDBX_KEYEXIST"); + /* should not happen, we deleted that item */ + rc = MDBX_PROBLEM; } mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return rc; @@ -8237,8 +8246,10 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, mn.mc_top++; } if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ + if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ { + mdbx_error("unexpected %s", "MDBX_NOTFOUND"); rc = MDBX_PROBLEM; + } goto done; } if (nflags & MDBX_APPEND) { @@ -8759,8 +8770,10 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_txn = txn; rc = mdbx_env_cwalk(&my, &root, 0); - if (rc == MDBX_SUCCESS && root != new_root) + if (rc == MDBX_SUCCESS && root != new_root) { + mdbx_error("unexpected root 
%" PRIaPGNO " (%" PRIaPGNO ")", root, new_root); rc = MDBX_PROBLEM; /* page leak or corrupt DB */ + } finish: if (rc != MDBX_SUCCESS) From d5b0e6832619bccce263484b32883dbdf9034e8c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 29 May 2017 20:44:17 +0300 Subject: [PATCH 212/303] mdbx: fix merge/rebase error (remove mm_txnid field). --- src/bits.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/bits.h b/src/bits.h index 50eceff4..80e0f2a9 100644 --- a/src/bits.h +++ b/src/bits.h @@ -265,7 +265,6 @@ typedef struct MDBX_meta { /* Last used page in the datafile. * Actually the file may be shorter if the freeDB lists the final pages. */ uint64_t mm_last_pg; - volatile txnid_t mm_txnid; /* txnid that committed this page */ #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u volatile uint64_t mm_datasync_sign; From 009618560f1ebc8842b2383a4b60755f0810dd08 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 30 May 2017 15:19:46 +0300 Subject: [PATCH 213/303] mdbx: refine mdbx_is_dirty(). --- src/mdbx.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 16294080..6e204553 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -10257,38 +10257,45 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { * что было исходно задумано, детали см в логике кода mdbx_page_touch(). * * Более того, в режиме БЕЗ WRITEMAP грязные страницы выделяются через - * malloc(), т.е. находятся вне mmap-диапазона. + * malloc(), т.е. находятся вне mmap-диапазона и тогда чтобы отличить + * действительно грязную страницу от указателя на данные пользователя + * следует сканировать dirtylist, что накладно. * - * Тем не менее, однозначно страница "не грязная" если адрес находится - * внутри mmap-диапазона и в заголовке страницы нет флажка P_DIRTY. 
*/ + * Тем не менее, однозначно страница "не грязная" (не будет переписана + * во время транзакции) если адрес находится внутри mmap-диапазона + * и в заголовке страницы нет флажка P_DIRTY. */ if (env->me_map < (char *)page) { const size_t used_size = env->me_psize * txn->mt_next_pgno; if ((char *)page < env->me_map + used_size) { - /* страница внутри диапазона, смотрим на флажки */ - if ((page->mp_flags & (P_DIRTY | P_LOOSE | P_KEEP)) == 0) - return MDBX_RESULT_FALSE; + /* страница внутри диапазона, смотрим на флажки */ + return (page->mp_flags & (P_DIRTY | P_LOOSE | P_KEEP)) + ? MDBX_RESULT_TRUE + : MDBX_RESULT_FALSE; } /* Гипотетически здесь возможна ситуация, когда указатель адресует что-то * в пределах mmap, но за границей распределенных страниц. Это тяжелая * ошибка, к которой не возможно прийти без каких-то больших нарушений. - * Поэтому не проверяем этот случай кроме как assert-ом, ибо бестолку. */ - /* if (!RUNNING_ON_VALGRIND) - mdbx_tassert(txn, env->me_map + env->me_mapsize >= (char *)page); */ + * Поэтому не проверяем этот случай кроме как assert-ом, на то что + * страница вне mmap-диаппазона. */ + mdbx_tassert(txn, (char *)page >= env->me_map + env->me_mapsize); } /* Страница вне используемого mmap-диапазона, т.е. либо в функцию был * передан некорректный адрес, либо адрес в теневой странице, которая была * выделена посредством malloc(). * - * Поэтому всегда считаем что страница вне mmap-диапазона "грязная", - * не просматривая при этом списки грязных и spilled страниц у каких-либо - * транзакций. Такая логика имеет ряд преимуществ: + * Для WRITE_MAP режима такая страница однозначно "не грязная", + * а для режимов без WRITE_MAP следует просматривать списки dirty + * и spilled страниц у каких-либо транзакций (в том числе дочерних). + * + * Поэтому для WRITE_MAP возвращаем false, а для остальных режимов + * всегда true. 
Такая логика имеет ряд преимуществ: * - не тратим время на просмотр списков; - * - результат всегда безопасен (может быть ложно-положительным, но - * не ложно-отрицательным); + * - результат всегда безопасен (может быть ложно-положительным, + * но не ложно-отрицательным); * - результат не зависит от вложенности транзакций и от относительного * положения переданной транзакции в этой рекурсии. */ - return MDBX_RESULT_TRUE; + return (env->me_flags & MDBX_WRITEMAP) ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; } int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, From e7ee0bc7625e77408b77654cadb61f849f8764af Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Tue, 6 Jun 2017 03:10:52 +0300 Subject: [PATCH 214/303] mdbx: refine testing-jitter, add MDBX_DBG_JITTER. Change-Id: Ibc3bd8a16626e97aabc2cc544a3803f2e2bc3a10 --- mdbx.h | 1 + src/bits.h | 3 ++- test/test.cc | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/mdbx.h b/mdbx.h index d98ae7e6..b74bef0e 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1550,6 +1550,7 @@ LIBMDBX_API MDBX_oom_func *mdbx_env_get_oomfunc(MDBX_env *env); #define MDBX_DBG_TRACE 4 #define MDBX_DBG_EXTRA 8 #define MDBX_DBG_AUDIT 16 +#define MDBX_DBG_JITTER 32 typedef void MDBX_debug_func(int type, const char *function, int line, const char *msg, va_list args); diff --git a/src/bits.h b/src/bits.h index 80e0f2a9..b21c639b 100644 --- a/src/bits.h +++ b/src/bits.h @@ -792,7 +792,8 @@ void mdbx_panic(const char *fmt, ...) 
static __inline void mdbx_jitter4testing(bool tiny) { #ifndef NDEBUG - mdbx_osal_jitter(tiny); + if (MDBX_DBG_JITTER & mdbx_runtime_flags) + mdbx_osal_jitter(tiny); #else (void)tiny; #endif diff --git a/test/test.cc b/test/test.cc index 0302479d..04a1b82e 100644 --- a/test/test.cc +++ b/test/test.cc @@ -114,7 +114,7 @@ void testcase::db_prepare() { log_trace(">> db_prepare"); assert(!db_guard); - int mdbx_dbg_opts = MDBX_DBG_ASSERT; + int mdbx_dbg_opts = MDBX_DBG_ASSERT | MDBX_DBG_JITTER; if (config.params.loglevel <= logging::trace) mdbx_dbg_opts |= MDBX_DBG_TRACE; if (config.params.loglevel <= logging::verbose) From 98a8fbdc7a311b2aa118150d45fc85736a1933c1 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 31 May 2017 14:04:40 +0300 Subject: [PATCH 215/303] mdbx: restore SIGPIPE hushing in the db-copy-thread. --- src/mdbx.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 6e204553..2bdb8681 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -8491,6 +8491,16 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { mdbx_copy *my = arg; char *ptr; int toggle = 0, wsize; + int rc; + +#if defined(SIGPIPE) && !defined(_WIN32) && !defined(_WIN64) + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGPIPE); + rc = pthread_sigmask(SIG_BLOCK, &set, NULL); + if (rc != 0) + my->mc_error = rc; +#endif mdbx_condmutex_lock(&my->mc_condmutex); while (!my->mc_error) { @@ -8502,9 +8512,18 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { ptr = my->mc_wbuf[toggle]; again: if (wsize > 0 && !my->mc_error) { - int rc = mdbx_write(my->mc_fd, ptr, wsize); - if (rc != MDBX_SUCCESS) + rc = mdbx_write(my->mc_fd, ptr, wsize); + if (rc != MDBX_SUCCESS) { +#if defined(SIGPIPE) && !defined(_WIN32) && !defined(_WIN64) + if (rc == EPIPE) { + /* Collect the pending SIGPIPE, otherwise (at least OS X) + * gives it to the process on thread-exit (ITS#8504). 
*/ + int tmp; + sigwait(&set, &tmp); + } +#endif my->mc_error = rc; + } } /* If there's an overflow page tail, write it too */ From d99b2a4b1651b44424d517c1798c43cba50ca7ee Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 31 May 2017 14:06:25 +0300 Subject: [PATCH 216/303] mdbx: use F_SETNOSIGPIPE to SIGPIPE husing on a OS X. --- src/mdbx.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/mdbx.c b/src/mdbx.c index 2bdb8681..856e638f 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -8493,6 +8493,14 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { int toggle = 0, wsize; int rc; +#if defined(F_SETNOSIGPIPE) + /* OS X delivers SIGPIPE to the whole process, not the thread that caused it. + * Disable SIGPIPE using platform specific fcntl. */ + int enabled = 1; + if (fcntl(my->mc_fd, F_SETNOSIGPIPE, &enabled)) + my->mc_error = errno; +#endif + #if defined(SIGPIPE) && !defined(_WIN32) && !defined(_WIN64) sigset_t set; sigemptyset(&set); From 61a3766e23673f662f288644ab457e17bd306e72 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 30 May 2017 16:22:42 +0300 Subject: [PATCH 217/303] mdbx: update DB format and signatures. Change-Id: I9c4b187e8ebc3df63fef15ae98872e27d56a01ab --- src/bits.h | 87 ++++++++++++++++++++++++++++++------------------- src/mdbx.c | 95 ++++++++++++++++++++++++++---------------------------- src/osal.h | 4 +-- 3 files changed, 100 insertions(+), 86 deletions(-) diff --git a/src/bits.h b/src/bits.h index b21c639b..734c1fe7 100644 --- a/src/bits.h +++ b/src/bits.h @@ -25,7 +25,7 @@ /* Features under development */ #ifndef MDBX_DEVEL -# define MDBX_DEVEL 0 +# define MDBX_DEVEL 1 #endif /*----------------------------------------------------------------------------*/ @@ -116,12 +116,12 @@ /* A stamp that identifies a file as an MDBX file. * There's nothing special about this value other than that it is easily * recognizable, and it will reflect any byte order mismatches. 
*/ -#define MDBX_MAGIC 0xBEEFC0DE +#define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11) /* The version number for a database's datafile format. */ -#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 999 : 1) +#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 255 : 2) /* The version number for a database's lockfile format. */ -#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 999 : 1) +#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 255 : 2) /* handle for the DB used to track free pages. */ #define FREE_DBI 0 @@ -241,21 +241,29 @@ typedef struct MDBX_db { uint64_t md_leaf_pages; /* number of leaf pages */ uint64_t md_overflow_pages; /* number of overflow pages */ uint64_t md_entries; /* number of data items */ + uint64_t md_merkle; /* Merkle tree checksum */ } MDBX_db; /* Meta page content. * A meta page is the start point for accessing a database snapshot. * Pages 0-1 are meta pages. Transaction N writes meta page (N % 2). */ typedef struct MDBX_meta { - /* Stamp identifying this as an MDBX file. It must be set - * to MDBX_MAGIC. */ - uint32_t mm_magic; - /* Version number of this file. Must be set to MDBX_DATA_VERSION. */ - uint32_t mm_version; - /* txnid that committed this page, */ - volatile txnid_t mm_txnid_top; + /* Stamp identifying this as an MDBX file. + * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. 
*/ + uint64_t mm_magic_and_version; - uint64_t mm_mapsize; /* size of mmap region */ + /* txnid that committed this page, the first of a two-phase-update pair */ + volatile txnid_t mm_txnid_a; + + uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ + uint8_t mm_validator_id; /* ID of checksum and page validation method, + * zero (nothing) for now */ + uint8_t mm_extra_pagehdr; /* extra bytes in the page header, + * zero (nothing) for now */ + uint32_t mm_reserved_pad; /* padding for alignment, unused for now */ + + uint64_t mm_dbsize_min; /* minimal size of db */ + uint64_t mm_dbsize_max; /* maximal size of db */ MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ /* The size of pages used in this DB */ #define mm_psize mm_dbs[FREE_DBI].md_xsize @@ -265,17 +273,20 @@ typedef struct MDBX_meta { /* Last used page in the datafile. * Actually the file may be shorter if the freeDB lists the final pages. */ uint64_t mm_last_pg; + #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u - volatile uint64_t mm_datasync_sign; - #define SIGN_IS_WEAK(sign) ((sign) == MDBX_DATASIGN_WEAK) #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) - #define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign) #define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) - /* txnid that committed this page */ - volatile txnid_t mm_txnid_bottom; + volatile uint64_t mm_datasync_sign; + + /* to be removed */ + uint64_t mm_mapsize; /* current size of mmap region */ + + /* txnid that committed this page, the second of a two-phase-update pair */ + volatile txnid_t mm_txnid_b; } MDBX_meta; /* Common header for all page types. The page type depends on mp_flags. @@ -297,7 +308,8 @@ typedef struct MDBX_meta { * in the snapshot: Either used by a database or listed in a freeDB record.
*/ typedef struct MDBX_page { union { - pgno_t mp_pgno; /* page number */ + uint64_t mp_validator; /* checksum of page content or a txnid during + * which the page has been updated */ struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ @@ -318,6 +330,7 @@ typedef struct MDBX_page { }; uint32_t mp_pages; /* number of overflow pages */ }; + pgno_t mp_pgno; /* page number */ /* dynamic size */ union { @@ -330,15 +343,19 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data)) +#pragma pack(pop) + /* The header for the reader table (a memory-mapped lock file). */ typedef struct MDBX_lockinfo { - /* Stamp identifying this as an MDBX file. It must be set to MDBX_MAGIC. */ - uint64_t mti_magic; + /* Stamp identifying this as an MDBX file. + * It must be set to MDBX_MAGIC with MDBX_LOCK_VERSION. */ + uint64_t mti_magic_and_version; + /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */ - uint64_t mti_format; + uint32_t mti_os_and_format; + /* Flags which environment was opened.
*/ - uint32_t mti_envmode; - uint32_t mti_reserved; + volatile uint32_t mti_envmode; #ifdef MDBX_OSAL_LOCK MDBX_OSAL_LOCK mti_wmutex; @@ -355,7 +372,19 @@ typedef struct MDBX_lockinfo { MDBX_reader __cache_aligned mti_readers[1]; } MDBX_lockinfo; -#pragma pack(pop) +#define MDBX_LOCKINFO_WHOLE_SIZE \ + ((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \ + ~((size_t)MDBX_CACHELINE_SIZE - 1)) + +/* Lockfile format signature: version, features and field layout */ +#define MDBX_LOCK_FORMAT \ + ((MDBX_OSAL_LOCK_SIGN << 16) + \ + (uint16_t)(MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1)) + +#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION) + +#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) + /*----------------------------------------------------------------------------*/ /* Two kind lists of pages (aka IDL) */ @@ -574,16 +603,6 @@ typedef struct MDBX_pgstate { txnid_t mf_pglast; /* ID of last used record, or 0 if !mf_pghead */ } MDBX_pgstate; -#define MDBX_LOCKINFO_WHOLE_SIZE \ - ((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \ - ~((size_t)MDBX_CACHELINE_SIZE - 1)) - -/* Lockfile format signature: version, features and field layout */ -#define MDBX_LOCK_FORMAT \ - (((uint64_t)(MDBX_OSAL_LOCK_SIGN) << 32) + \ - ((MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1) << 16) + \ - (MDBX_LOCK_VERSION) /* Flags which describe functionality */) - /* The database environment. */ struct MDBX_env { #define MDBX_ME_SIGNATURE (0x9A899641) diff --git a/src/mdbx.c b/src/mdbx.c index 856e638f..e7f55865 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1279,12 +1279,12 @@ bailout: static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta, bool allow_volatile) { mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); - txnid_t top = meta->mm_txnid_top; - txnid_t bottom = meta->mm_txnid_bottom; + txnid_t a = meta->mm_txnid_a; + txnid_t b = meta->mm_txnid_b; if (allow_volatile) - return (top < bottom) ? 
top : bottom; - mdbx_assert(env, top == bottom); - return top; + return (a < b) ? a : b; + mdbx_assert(env, a == b); + return a; } static __inline txnid_t mdbx_meta_txnid_stable(const MDBX_env *env, @@ -1300,8 +1300,8 @@ static __inline txnid_t mdbx_meta_txnid_fluid(const MDBX_env *env, static __inline void mdbx_meta_update_begin(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); - mdbx_assert(env, meta->mm_txnid_top < txnid && meta->mm_txnid_bottom < txnid); - meta->mm_txnid_top = txnid; + mdbx_assert(env, meta->mm_txnid_a < txnid && meta->mm_txnid_b < txnid); + meta->mm_txnid_a = txnid; (void)env; mdbx_coherent_barrier(); } @@ -1309,19 +1309,19 @@ static __inline void mdbx_meta_update_begin(const MDBX_env *env, static __inline void mdbx_meta_update_end(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { mdbx_assert(env, meta >= METAPAGE(env, 0) || meta < METAPAGE_END(env)); - mdbx_assert(env, meta->mm_txnid_top == txnid); - mdbx_assert(env, meta->mm_txnid_bottom < txnid); + mdbx_assert(env, meta->mm_txnid_a == txnid); + mdbx_assert(env, meta->mm_txnid_b < txnid); mdbx_jitter4testing(true); - meta->mm_txnid_bottom = txnid; + meta->mm_txnid_b = txnid; mdbx_coherent_barrier(); } static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { mdbx_assert(env, meta < METAPAGE(env, 0) || meta > METAPAGE_END(env)); - meta->mm_txnid_top = txnid; - meta->mm_txnid_bottom = txnid; + meta->mm_txnid_a = txnid; + meta->mm_txnid_b = txnid; } static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) { @@ -1329,7 +1329,7 @@ static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) { #if 0 /* TODO */ sign = hippeus_hash64(&meta->mm_mapsize, sizeof(MDBX_meta) - offsetof(MDBX_meta, mm_mapsize), - meta->mm_version | (uint64_t)MDBX_MAGIC << 32); + meta->mm_version | (uint64_t)MDBX_DXD_MAGIC << 32); #else (void)meta; #endif @@ -2183,8 +2183,8 @@ static int 
mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } else if (env->me_lck) { const mdbx_pid_t pid = env->me_pid; const mdbx_tid_t tid = mdbx_thread_self(); - mdbx_assert(env, env->me_lck->mti_magic == MDBX_MAGIC); - mdbx_assert(env, env->me_lck->mti_format == MDBX_LOCK_FORMAT); + mdbx_assert(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); + mdbx_assert(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT); rc = mdbx_rdt_lock(env); if (unlikely(MDBX_IS_ERROR(rc))) @@ -3390,18 +3390,14 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { return MDBX_INVALID; } - if (page.mp_meta.mm_magic != MDBX_MAGIC) { - mdbx_error("meta[%u] has invalid magic", meta_number); - return MDBX_INVALID; + if (page.mp_meta.mm_magic_and_version != MDBX_DATA_MAGIC) { + mdbx_error("meta[%u] has invalid magic/version", meta_number); + return ((page.mp_meta.mm_magic_and_version >> 8) != MDBX_MAGIC) + ? MDBX_INVALID + : MDBX_VERSION_MISMATCH; } - if (page.mp_meta.mm_version != MDBX_DATA_VERSION) { - mdbx_error("database is version %u, expected version %u", - page.mp_meta.mm_version, MDBX_DATA_VERSION); - return MDBX_VERSION_MISMATCH; - } - - if (page.mp_meta.mm_txnid_top != page.mp_meta.mm_txnid_bottom) { + if (page.mp_meta.mm_txnid_a != page.mp_meta.mm_txnid_b) { mdbx_warning("meta[%u] not completely updated, skip it", meta_number); continue; } @@ -3511,8 +3507,7 @@ static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model, memset(model, 0, sizeof(*model)); model->mp_pgno = num; model->mp_flags = P_META; - model->mp_meta.mm_magic = MDBX_MAGIC; - model->mp_meta.mm_version = MDBX_DATA_VERSION; + model->mp_meta.mm_magic_and_version = MDBX_DATA_MAGIC; model->mp_meta.mm_mapsize = env->me_mapsize; model->mp_meta.mm_psize = env->me_psize; model->mp_meta.mm_last_pg = NUM_METAS - 1; @@ -3607,7 +3602,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, } MDBX_meta *target = nullptr; - if (mdbx_meta_txnid_stable(env, head) == 
pending->mm_txnid_top) { + if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) { mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs, sizeof(head->mm_dbs)) == 0); mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary, @@ -3637,7 +3632,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, (target == head) ? "head" : "tail", mdbx_meta_txnid_stable(env, target), mdbx_durable_str((const MDBX_meta *)target), pending->mm_dbs[MAIN_DBI].md_root, pending->mm_dbs[FREE_DBI].md_root, - pending->mm_txnid_top, mdbx_durable_str(pending)); + pending->mm_txnid_a, mdbx_durable_str(pending)); mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, @@ -3663,13 +3658,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); mdbx_ensure(env, target == head || - mdbx_meta_txnid_stable(env, target) < pending->mm_txnid_top); + mdbx_meta_txnid_stable(env, target) < pending->mm_txnid_a); if (env->me_flags & MDBX_WRITEMAP) { mdbx_jitter4testing(true); if (likely(target != head)) { /* LY: 'invalidate' the meta. 
*/ target->mm_datasync_sign = MDBX_DATASIGN_WEAK; - mdbx_meta_update_begin(env, target, pending->mm_txnid_top); + mdbx_meta_update_begin(env, target, pending->mm_txnid_a); #ifndef NDEBUG /* debug: provoke failure to catch a violators */ memset(target->mm_dbs, 0xCC, @@ -3687,13 +3682,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_coherent_barrier(); /* LY: 'commit' the meta */ - mdbx_meta_update_end(env, target, pending->mm_txnid_bottom); + mdbx_meta_update_end(env, target, pending->mm_txnid_b); mdbx_jitter4testing(true); } else { /* dangerous case (target == head), only mm_datasync_sign could * me updated, check assertions once again */ mdbx_ensure(env, - mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_top && + mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a && !META_IS_STEADY(head) && META_IS_STEADY(pending)); mdbx_ensure(env, head->mm_last_pg == pending->mm_last_pg); mdbx_ensure(env, head->mm_mapsize == pending->mm_mapsize); @@ -3706,8 +3701,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_coherent_barrier(); mdbx_jitter4testing(true); } else { - pending->mm_magic = MDBX_MAGIC; - pending->mm_version = MDBX_DATA_VERSION; + pending->mm_magic_and_version = MDBX_DATA_MAGIC; rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), offset); if (unlikely(rc != MDBX_SUCCESS)) { undo: @@ -4081,13 +4075,13 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { const MDBX_meta *head = mdbx_meta_head(env); const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head); - if (head_txnid != meta.mm_txnid_top) { + if (head_txnid != meta.mm_txnid_a) { if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { assert(META_IS_STEADY(&meta) && !META_IS_STEADY(head)); if (env->me_flags & MDBX_RDONLY) { mdbx_error("rollback needed: (from head %" PRIaTXN " to steady %" PRIaTXN "), but unable in read-only mode", - head_txnid, meta.mm_txnid_top); + head_txnid, meta.mm_txnid_a); return MDBX_WANNA_RECOVERY /* LY: could not 
recovery/rollback */; } @@ -4095,7 +4089,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { MDBX_meta rollback = *head; mdbx_meta_set_txnid(env, &rollback, 0); mdbx_trace("rollback: from %" PRIaTXN ", to %" PRIaTXN, head_txnid, - meta.mm_txnid_top); + meta.mm_txnid_a); mdbx_ensure(env, head_txnid == mdbx_meta_txnid_stable(env, head)); err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta), (uint8_t *)head - (uint8_t *)env->me_map); @@ -4126,7 +4120,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { head->mm_mapsize, env->me_mapsize); meta = *head; meta.mm_mapsize = env->me_mapsize; - mdbx_meta_set_txnid(env, &meta, meta.mm_txnid_top + 1); + mdbx_meta_set_txnid(env, &meta, meta.mm_txnid_a + 1); if (META_IS_STEADY(head)) meta.mm_datasync_sign = mdbx_meta_sign(&meta); err = mdbx_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); @@ -4236,17 +4230,18 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { if (err) return err; - env->me_lck->mti_magic = MDBX_MAGIC; - env->me_lck->mti_format = MDBX_LOCK_FORMAT; + env->me_lck->mti_magic_and_version = MDBX_LOCK_MAGIC; + env->me_lck->mti_os_and_format = MDBX_LOCK_FORMAT; } else { - if (env->me_lck->mti_magic != MDBX_MAGIC) { - mdbx_error("lock region has invalid magic"); - return MDBX_INVALID; + if (env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { + mdbx_error("lock region has invalid magic/version"); + return ((env->me_lck->mti_magic_and_version >> 8) != MDBX_MAGIC) + ? 
MDBX_INVALID + : MDBX_VERSION_MISMATCH; } - if (env->me_lck->mti_format != MDBX_LOCK_FORMAT) { - mdbx_error("lock region has format+version 0x%" PRIx64 - ", expected 0x%" PRIx64, - env->me_lck->mti_format, MDBX_LOCK_FORMAT); + if (env->me_lck->mti_os_and_format != MDBX_LOCK_FORMAT) { + mdbx_error("lock region has os/format 0x%" PRIx32 ", expected 0x%" PRIx32, + env->me_lck->mti_os_and_format, MDBX_LOCK_FORMAT); return MDBX_VERSION_MISMATCH; } } @@ -4417,8 +4412,8 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, MDBX_meta *meta = mdbx_meta_head(env); MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; - mdbx_debug("opened database version %u, pagesize %u", meta->mm_version, - env->me_psize); + mdbx_debug("opened database version %u, pagesize %u", + (uint8_t)meta->mm_magic_and_version, env->me_psize); mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN "", container_of(meta, MDBX_page, mp_data)->mp_pgno, mdbx_meta_txnid_fluid(env, meta)); diff --git a/src/osal.h b/src/osal.h index 5491cd45..ee776358 100644 --- a/src/osal.h +++ b/src/osal.h @@ -472,10 +472,10 @@ void mdbx_osal_jitter(bool tiny); #if defined(_WIN32) || defined(_WIN64) #undef MDBX_OSAL_LOCK -#define MDBX_OSAL_LOCK_SIGN MDBX_TETRAD('f', 'l', 'c', 'k') +#define MDBX_OSAL_LOCK_SIGN UINT32_C(0xF10C) #else #define MDBX_OSAL_LOCK pthread_mutex_t -#define MDBX_OSAL_LOCK_SIGN MDBX_TETRAD('P', 'T', 'M', 'X') +#define MDBX_OSAL_LOCK_SIGN UINT32_C(0x8017) #endif int mdbx_lck_init(MDBX_env *env); From 7f4684abff824e576a4851cbcaf0fa701b300260 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 31 May 2017 16:44:43 +0300 Subject: [PATCH 218/303] mdbx: update TODO. 
--- TODO.md | 85 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 16 deletions(-) diff --git a/TODO.md b/TODO.md index 92dd691c..e977a898 100644 --- a/TODO.md +++ b/TODO.md @@ -1,18 +1,71 @@ +Допеределки - [x] разделение errno и GetLastError() - [x] CI посредством AppVeyor -- [ ] uint32/uint64 в структурах -- [ ] правки API (много...) -- [ ] инкрементальный mmap -- [ ] возврат выделенных страниц в unallocated tail -- [ ] устранение всех предупреждений -- [ ] перевод mdbx-tools на С++ и сборка для Windows -- [ ] тест конкурентного доступа -- [ ] тест основного функционала (заменить текущий треш) -- [ ] базовый бенчмарк -- [ ] переработка формата: заголовки страниц, meta, clk... -- [ ] зачистка Doxygen и бесполезных коментариев -- [ ] сборка через CMake -- [ ] актуализация README.md -- [ ] возможность хранения ключей внутри data (libfptu) -- [ ] асинхронная фиксация (https://github.com/ReOpen/libmdbx/issues/5) -- [ ] (пере)выделять память под IDL-списки с учетом реального кол-ва страниц, т.е. max(MDB_IDL_UM_MAX/MDB_IDL_UM_MAX, npages) +- [x] тест конкурентного доступа. +- [x] тест основного функционала (заменить текущий треш). +- [x] uint32/uint64 в структурах +- [x] Завершить переименование +- [x] Макросы версионности, сделать как в fpta (cmake?) +- [x] Попробовать убрать yield (или что там с местом?) +- [x] trinity для copy/compaction +- [x] trinity для mdbx_chk и mdbx_stat +- [x] проверки с mdbx_meta_eq +- [x] Не проверять режим при открытии в readonly +- [x] Поправить выбор tail в mdbx_chk +- [x] Там-же проверять позицию реклайминга +- [x] поправить проблему открытия после READ-ONLY. +- [x] static-assertы на размер/выравнивание lck, meta и т.п. +- [x] Зачистить size_t +- [x] Добавить локи вокруг dbi +- [x] Привести в порядок volatile +- [x] контроль meta.mapsize +- [x] переработка формата: заголовки страниц, meta, clk... +- [x] зачистка Doxygen и бесполезных коментариев. +- [x] Добавить поле типа контрольной суммы. 
+- [x] Добавить поле/флаг размера pgno_t. +- [x] Поменять сигнатуры. +- [ ] Добавить мета-страницы в coredump, проверить lck +- [ ] Сделать список для txnid_t, когда sizeof(txnid_t) > sizeof(pgno_t) и вернуть размер pgno_t +- [ ] Избавиться от умножения на размер страницы (заменить на сдвиг). +- [ ] Устранение всех предупреждений (в том числе под Windows). +- [ ] Перевод mdbx-tools на С++ и сборка для Windows +- [ ] Заменить заглушки mdbx_version и mdbx_build +- [ ] Актуализация README.md + +CI +- [ ] Прикрутить проверку coverity +- [ ] Добавить в CI linux сборки для 32-битных таргетов + +Доработки API +- [ ] Добавить возможность "подбора" режима для mdbx_env_open() +- [ ] Дать возможность задавать размер страницы при создании БД. +- [ ] Изменение mapsize через API с блокировкой и увеличением txn, плюс поправить доку. +- [ ] Контроль размера страницы полного размера и кол-ва страниц при создании и обновлении. +- [ ] Инкрементальный mmap. +- [ ] Возврат выделенных страниц в unallocated tail-pool. +- [ ] Инкрементальное приращение размера (колбэк стратегии?). +- [ ] Переименовать в API: env->db, db->tbl + +Тест +- [ ] Реализовать cleanup в тесте +- [ ] usage для теста +- [ ] Логирование в файл, плюс более полный progress bar +- [ ] Опция игнорирования (пропуска части теста) при переполнении БД +- [ ] Додумать имя и размещение тестовой БД по-умолчанию. +- [ ] Базовый бенчмарк + +Отладка +- [ ] Убрать MDB_DEBUG (всегда: логирование важных ситуаций и ошибок, опционально: включение ассертов и трассировка) +- [ ] Заменить mdbx_debug на mdbx_trace, и почистить... +- [ ] Заменить максимум assert() на mdbx_assert(env, ...) + +Развитие +- [ ] Валидатор страниц БД по номеру транзакции: + ~0 при переработке и номер транзакции при выделении, + проверять что этот номер больше головы реклайминга и не-больше текущей транзакции. +- [ ] Добавить free_backlog в meta +- [ ] Валидатор страниц по CRC32, плюс контроль номера транзакции по модулю 2^32.
+- [ ] Валидатор страниц по t1ha c контролем снимков/версий БД на основе Merkle Tree. +- [ ] Возможность хранения ключей внутри data (libfptu) +- [ ] Асинхронная фиксация (https://github.com/ReOpen/libmdbx/issues/5) +- [ ] (Пере)Выделять память под IDL-списки с учетом реального кол-ва страниц, т.е. max(MDB_IDL_UM_MAX/MDB_IDL_UM_MAX, npages) From 19dd181b6f58d1a02bf66c3d8e18ac2dd21211e0 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 31 May 2017 17:09:43 +0300 Subject: [PATCH 219/303] mdbx: fix STATIC_ASSERT for MSVC. --- src/defs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/defs.h b/src/defs.h index 5e3db8cb..6ef6e35a 100644 --- a/src/defs.h +++ b/src/defs.h @@ -375,6 +375,9 @@ # define STATIC_ASSERT_MSG(expr, msg) _Static_assert(expr, msg) # elif defined(static_assert) # define STATIC_ASSERT_MSG(expr, msg) static_assert(expr, msg) +# elif defined(_MSC_VER) +# include +# define STATIC_ASSERT_MSG(expr, msg) _STATIC_ASSERT(expr) # else # define STATIC_ASSERT_MSG(expr, msg) switch (0) {case 0:case (expr):;} # endif From 95ebdb706519edaeaf6144a6ca3fa083deeda1ea Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 31 May 2017 17:10:17 +0300 Subject: [PATCH 220/303] mdbx: MAX_PAGESIZE always 64K. --- src/bits.h | 59 ++++++++++++++++++++++++++---------------------------- src/mdbx.c | 45 ++++++++++++++++++++++------------------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/src/bits.h b/src/bits.h index 734c1fe7..89d3f808 100644 --- a/src/bits.h +++ b/src/bits.h @@ -82,24 +82,6 @@ /*----------------------------------------------------------------------------*/ /* Basic constants and types */ -/* The maximum size of a database page. - * - * It is 32k or 64k, since value-PAGEBASE must fit in - * MDBX_page.mp_upper. - * - * MDBX will use database pages < OS pages if needed. - * That causes more I/O in write transactions: The OS must - * know (read) the whole page before writing a partial page. 
- * - * Note that we don't currently support Huge pages. On Linux, - * regular data files cannot use Huge pages, and in general - * Huge pages aren't actually pageable. We rely on the OS - * demand-pager to read our data and page it out when memory - * pressure from other processes is high. So until OSs have - * actual paging support for Huge pages, they're not viable. */ -#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) -#define MIN_PAGESIZE 1024 - /* The minimum number of keys required in a database page. * Setting this to a larger value will place a smaller bound on the * maximum size of a data item. Data items larger than this size will @@ -151,13 +133,6 @@ typedef uint64_t txnid_t; * this is plenty. */ typedef uint16_t indx_t; -#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO) -#define MAX_MAPSIZE \ - ((sizeof(size_t) < 8) \ - ? UINT32_C(0x7ff80000) \ - : ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \ - : MAX_PAGENO * (uint64_t)MAX_PAGESIZE)) - /*----------------------------------------------------------------------------*/ /* Core structures for database and shared memory (i.e. format definition) */ #pragma pack(push, 1) @@ -308,9 +283,10 @@ typedef struct MDBX_meta { * in the snapshot: Either used by a database or listed in a freeDB record. */ typedef struct MDBX_page { union { + struct MDBX_page *mp_next; /* for in-memory list of freed pages, + * must be first field, see NEXT_LOOSE_PAGE */ uint64_t mp_validator; /* checksum of page content or a txnid during * which the page has been updated */ - struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */ @@ -343,6 +319,30 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data)) +/* The maximum size of a database page. +* +* It is 64K, but value-PAGEHDRSZ must fit in MDBX_page.mp_upper. 
+* +* MDBX will use database pages < OS pages if needed. +* That causes more I/O in write transactions: The OS must +* know (read) the whole page before writing a partial page. +* +* Note that we don't currently support Huge pages. On Linux, +* regular data files cannot use Huge pages, and in general +* Huge pages aren't actually pageable. We rely on the OS +* demand-pager to read our data and page it out when memory +* pressure from other processes is high. So until OSs have +* actual paging support for Huge pages, they're not viable. */ +#define MAX_PAGESIZE 0x10000u +#define MIN_PAGESIZE 512u + +#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO) +#define MAX_MAPSIZE \ + ((sizeof(size_t) < 8) \ + ? UINT32_C(0x7ff80000) \ + : ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \ + : MAX_PAGENO * (uint64_t)MAX_PAGESIZE)) + #pragma pack(pop) /* The header for the reader table (a memory-mapped lock file). */ @@ -885,11 +885,8 @@ static __inline size_t roundup2(size_t value, size_t granularity) { /* Address of first usable data byte in a page, after the header */ #define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) -/* ITS#7713, change PAGEBASE to handle 65536 byte pages */ -#define PAGEBASE ((MDBX_DEVEL) ? 
PAGEHDRSZ : 0) - /* Number of nodes on a page */ -#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1) +#define NUMKEYS(p) ((p)->mp_lower >> 1) /* The amount of space remaining in the page */ #define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) @@ -986,7 +983,7 @@ typedef struct MDBX_node { /* Address of node i in page p */ static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { assert(NUMKEYS(p) > (unsigned)(i)); - return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); + return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEHDRSZ); } /* Address of the key for the node */ diff --git a/src/mdbx.c b/src/mdbx.c index e7f55865..f3c1567e 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -833,7 +833,7 @@ static void mdbx_page_list(MDBX_page *mp) { total = EVEN(total); } mdbx_print("Total: header %u + contents %u + unused %u\n", - IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, + IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, SIZELEFT(mp)); } @@ -1816,14 +1816,16 @@ done: * [in] src page to copy from * [in] psize size of a page */ static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, unsigned psize) { + STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); + STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 42); enum { Align = sizeof(pgno_t) }; indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; /* If page isn't full, just copy the used portion. Adjust * alignment so memcpy may copy words instead of bytes. 
*/ if ((unused &= -Align) && !IS_LEAF2(src)) { - upper = (upper + PAGEBASE) & -Align; - memcpy(dst, src, (lower + PAGEBASE + (Align - 1)) & -Align); + upper = (upper + PAGEHDRSZ) & -Align; + memcpy(dst, src, (lower + PAGEHDRSZ + (Align - 1)) & -Align); memcpy((pgno_t *)((char *)dst + upper), (pgno_t *)((char *)src + upper), psize - upper); } else { @@ -6068,7 +6070,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, fp_flags = P_LEAF | P_DIRTY; fp = env->me_pbuf; fp->mp_leaf2_ksize = (uint16_t)data->iov_len; /* used if MDBX_DUPFIXED */ - fp->mp_lower = fp->mp_upper = (PAGEHDRSZ - PAGEBASE); + fp->mp_lower = fp->mp_upper = 0; olddata.iov_len = PAGEHDRSZ; goto prep_subDB; } @@ -6140,7 +6142,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, /* Make sub-page header for the dup items, with dummy body */ fp->mp_flags = P_LEAF | P_DIRTY | P_SUBP; - fp->mp_lower = (PAGEHDRSZ - PAGEBASE); + fp->mp_lower = 0; xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len; if (mc->mc_db->md_flags & MDBX_DUPFIXED) { fp->mp_flags |= P_LEAF2; @@ -6150,7 +6152,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + (dkey.iov_len & 1) + (data->iov_len & 1); } - fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEBASE); + fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ); olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ } else if (leaf->mn_flags & F_SUBDATA) { /* Data is on sub-DB, just store it */ @@ -6218,9 +6220,9 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (fp_flags & P_LEAF2) { memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize); } else { - memcpy((char *)mp + mp->mp_upper + PAGEBASE, - (char *)fp + fp->mp_upper + PAGEBASE, - olddata.iov_len - fp->mp_upper - PAGEBASE); + memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ, + (char *)fp + fp->mp_upper + PAGEHDRSZ, + olddata.iov_len - fp->mp_upper - PAGEHDRSZ); for (i = 0; i < 
NUMKEYS(fp); i++) mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; } @@ -6594,8 +6596,8 @@ static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, int num, mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize); np->mp_flags = flags | P_DIRTY; - np->mp_lower = (PAGEHDRSZ - PAGEBASE); - np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; + np->mp_lower = 0; + np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ; if (IS_BRANCH(np)) mc->mc_db->md_branch_pages++; @@ -6850,7 +6852,7 @@ static void mdbx_node_del(MDBX_cursor *mc, int ksize) { } } - base = (char *)mp + mp->mp_upper + PAGEBASE; + base = (char *)mp + mp->mp_upper + PAGEHDRSZ; memmove(base + sz, base, ptr - mp->mp_upper); mp->mp_lower -= sizeof(indx_t); @@ -6888,7 +6890,7 @@ static void mdbx_node_shrink(MDBX_page *mp, indx_t indx) { SETDSZ(node, nsize); /* Shift upward */ - base = (char *)mp + mp->mp_upper + PAGEBASE; + base = (char *)mp + mp->mp_upper + PAGEHDRSZ; memmove(base + delta, base, (char *)sp + len - base); ptr = mp->mp_ptrs[indx]; @@ -7231,7 +7233,7 @@ static int mdbx_update_key(MDBX_cursor *mc, MDBX_val *key) { mp->mp_ptrs[i] -= delta; } - base = (char *)mp + mp->mp_upper + PAGEBASE; + base = (char *)mp + mp->mp_upper + PAGEHDRSZ; len = ptr - mp->mp_upper + NODESIZE; memmove(base - delta, base, len); mp->mp_upper -= delta; @@ -8130,8 +8132,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, } copy->mp_pgno = mp->mp_pgno; copy->mp_flags = mp->mp_flags; - copy->mp_lower = (PAGEHDRSZ - PAGEBASE); - copy->mp_upper = env->me_psize - PAGEBASE; + copy->mp_lower = 0; + copy->mp_upper = env->me_psize - PAGEHDRSZ; /* prepare to insert */ for (i = 0, j = 0; i < nkeys; i++) { @@ -8173,7 +8175,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, psize += nsize; node = NULL; } else { - node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + node = (MDBX_node *)((char *)mp + 
copy->mp_ptrs[i] + PAGEHDRSZ); psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); if (IS_LEAF(mp)) { if (F_ISSET(node->mn_flags, F_BIGDATA)) @@ -8193,7 +8195,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, sepkey.iov_len = newkey->iov_len; sepkey.iov_base = newkey->iov_base; } else { - node = (MDBX_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); + node = + (MDBX_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEHDRSZ); sepkey.iov_len = node->mn_ksize; sepkey.iov_base = NODEKEY(node); } @@ -8272,7 +8275,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, /* Update index for the new key. */ mc->mc_ki[mc->mc_top] = j; } else { - node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ); rkey.iov_base = NODEKEY(node); rkey.iov_len = node->mn_ksize; if (IS_LEAF(mp)) { @@ -8308,7 +8311,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, mp->mp_lower = copy->mp_lower; mp->mp_upper = copy->mp_upper; memcpy(NODEPTR(mp, nkeys - 1), NODEPTR(copy, nkeys - 1), - env->me_psize - copy->mp_upper - PAGEBASE); + env->me_psize - copy->mp_upper - PAGEHDRSZ); /* reset back to original page */ if (newindx < split_indx) { @@ -9806,7 +9809,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, return MDBX_CORRUPTED; nkeys = NUMKEYS(mp); - header_size = IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower; + header_size = IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower; unused_size = SIZELEFT(mp); payload_size = 0; From 52f52de2d512d3c0518fbbfb78fb6b5cbb818232 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 31 May 2017 18:58:55 +0300 Subject: [PATCH 221/303] mdbx: include meta-pages into a coredump. 
--- TODO.md | 2 +- src/mdbx.c | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/TODO.md b/TODO.md index e977a898..1c1c9a25 100644 --- a/TODO.md +++ b/TODO.md @@ -24,7 +24,7 @@ - [x] Добавить поле типа контрольной суммы. - [x] Добавить поле/флаг размера pgno_t. - [x] Поменять сигнатуры. -- [ ] Добавить мета-страницы в coredump, проверить lck +- [x] Добавить мета-страницы в coredump, проверить lck - [ ] Сделать список для txnid_t, кода sizeof(txnid_t) > sizeof(pgno_t) и вернуть размер pgno_t - [ ] Избавиться от умножения на размер страницы (заменить на сдвиг). - [ ] Устранение всех предупреждений (в том числе под Windows). diff --git a/src/mdbx.c b/src/mdbx.c index f3c1567e..a93bdab1 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3858,9 +3858,12 @@ static int __cold mdbx_env_map(MDBX_env *env, void *addr, size_t usedsize) { (void)madvise(env->me_map, env->me_mapsize, MADV_NOHUGEPAGE); #endif -#ifdef MADV_DONTDUMP +#if defined(MADV_DODUMP) && defined(MADV_DONTDUMP) + const size_t meta_length = env->me_psize * NUM_METAS; + (void)madvise(env->me_map, env->me_psize * NUM_METAS, MADV_DODUMP); if (!(flags & MDBX_PAGEPERTURB)) - (void)madvise(env->me_map, env->me_mapsize, MADV_DONTDUMP); + (void)madvise(env->me_map + meta_length, env->me_mapsize - meta_length, + MADV_DONTDUMP); #endif #ifdef MADV_REMOVE @@ -4202,10 +4205,6 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { assert(addr != nullptr); env->me_lck = addr; -#ifdef MADV_NOHUGEPAGE - (void)madvise(env->me_lck, size, MADV_NOHUGEPAGE); -#endif - #ifdef MADV_DODUMP (void)madvise(env->me_lck, size, MADV_DODUMP); #endif From 1343b46466285f5d92937c4ecbdd490efc976818 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 5 Jun 2017 14:02:44 +0300 Subject: [PATCH 222/303] mdbx: add txnid-list functions. 
--- src/bits.h | 5 +++- src/mdbx.c | 86 +++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 73 insertions(+), 18 deletions(-) diff --git a/src/bits.h b/src/bits.h index 89d3f808..7d23cb62 100644 --- a/src/bits.h +++ b/src/bits.h @@ -394,6 +394,9 @@ typedef struct MDBX_lockinfo { * descending order. */ typedef pgno_t *MDBX_IDL; +/* List of txnid, only for MDBX_env.mt_lifo_reclaimed */ +typedef txnid_t *MDBX_TXL; + /* An ID2 is an ID/pointer pair. */ typedef struct MDBX_ID2 { pgno_t mid; /* The ID */ @@ -451,7 +454,7 @@ struct MDBX_txn { txnid_t mt_txnid; MDBX_env *mt_env; /* the DB environment */ /* The list of reclaimed txns from freeDB */ - MDBX_IDL mt_lifo_reclaimed; + MDBX_TXL mt_lifo_reclaimed; /* The list of pages that became unused during this transaction. */ MDBX_IDL mt_free_pages; /* The list of loose pages that became unused and may be reused diff --git a/src/mdbx.c b/src/mdbx.c index a93bdab1..8e832f23 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -168,13 +168,27 @@ static MDBX_IDL mdbx_midl_alloc(unsigned size) { return ids; } +static MDBX_TXL mdbx_txl_alloc(unsigned size) { + MDBX_TXL ptr = malloc((size + 2) * sizeof(txnid_t)); + if (likely(ptr)) { + *ptr++ = size; + *ptr = 0; + } + return ptr; +} + /* Free an IDL. * [in] ids The IDL to free. */ static void mdbx_midl_free(MDBX_IDL ids) { - if (ids) + if (likely(ids)) free(ids - 1); } +static void mdbx_txl_free(MDBX_TXL list) { + if (likely(list)) + free(list - 1); +} + /* Append ID to IDL. The IDL must be big enough. 
*/ static __inline void mdbx_midl_xappend(MDBX_IDL idl, pgno_t id) { assert(idl[0] + (size_t)1 < MDBX_IDL_ALLOCLEN(idl)); @@ -244,6 +258,17 @@ static int mdbx_midl_grow(MDBX_IDL *idp, unsigned num) { return 0; } +static int mdbx_txl_grow(MDBX_TXL *ptr, unsigned num) { + MDBX_TXL list = *ptr - 1; + /* grow it */ + list = realloc(list, (*list + num + 2) * sizeof(txnid_t)); + if (unlikely(!list)) + return MDBX_ENOMEM; + *list++ += num; + *ptr = list; + return 0; +} + /* Make room for num additional elements in an IDL. * [in,out] idp Address of the IDL. * [in] num Number of elements to make room for. @@ -251,7 +276,7 @@ static int mdbx_midl_grow(MDBX_IDL *idp, unsigned num) { static int mdbx_midl_need(MDBX_IDL *idp, unsigned num) { MDBX_IDL ids = *idp; num += ids[0]; - if (num > ids[-1]) { + if (unlikely(num > ids[-1])) { num = (num + num / 4 + (256 + 2)) & -256; ids = realloc(ids - 1, num * sizeof(pgno_t)); if (unlikely(!ids)) @@ -269,7 +294,7 @@ static int mdbx_midl_need(MDBX_IDL *idp, unsigned num) { static int mdbx_midl_append(MDBX_IDL *idp, pgno_t id) { MDBX_IDL ids = *idp; /* Too big? */ - if (ids[0] >= ids[-1]) { + if (unlikely(ids[0] >= ids[-1])) { if (mdbx_midl_grow(idp, MDBX_IDL_UM_MAX)) return MDBX_ENOMEM; ids = *idp; @@ -279,6 +304,19 @@ static int mdbx_midl_append(MDBX_IDL *idp, pgno_t id) { return 0; } +static int mdbx_txl_append(MDBX_TXL *ptr, txnid_t id) { + MDBX_TXL list = *ptr; + /* Too big? */ + if (unlikely(list[0] >= list[-1])) { + if (mdbx_txl_grow(ptr, list[0])) + return MDBX_ENOMEM; + list = *ptr; + } + list[0]++; + list[list[0]] = id; + return 0; +} + /* Append an IDL onto an IDL. * [in,out] idp Address of the IDL to append to. * [in] app The IDL to append. @@ -286,7 +324,7 @@ static int mdbx_midl_append(MDBX_IDL *idp, pgno_t id) { static int mdbx_midl_append_list(MDBX_IDL *idp, MDBX_IDL app) { MDBX_IDL ids = *idp; /* Too big? 
*/ - if (ids[0] + app[0] >= ids[-1]) { + if (unlikely(ids[0] + app[0] >= ids[-1])) { if (mdbx_midl_grow(idp, app[0])) return MDBX_ENOMEM; ids = *idp; @@ -296,6 +334,19 @@ static int mdbx_midl_append_list(MDBX_IDL *idp, MDBX_IDL app) { return 0; } +static int mdbx_txl_append_list(MDBX_TXL *ptr, MDBX_TXL append) { + MDBX_TXL list = *ptr; + /* Too big? */ + if (unlikely(list[0] + append[0] >= list[-1])) { + if (mdbx_txl_grow(ptr, append[0])) + return MDBX_ENOMEM; + list = *ptr; + } + memcpy(&list[list[0] + 1], &append[1], append[0] * sizeof(txnid_t)); + list[0] += append[0]; + return 0; +} + /* Append an ID range onto an IDL. * [in,out] idp Address of the IDL to append to. * [in] id The lowest ID to append. @@ -304,7 +355,7 @@ static int mdbx_midl_append_list(MDBX_IDL *idp, MDBX_IDL app) { static int mdbx_midl_append_range(MDBX_IDL *idp, pgno_t id, unsigned n) { pgno_t *ids = *idp, len = ids[0]; /* Too big? */ - if (len + n > ids[-1]) { + if (unlikely(len + n > ids[-1])) { if (mdbx_midl_grow(idp, n | MDBX_IDL_UM_MAX)) return MDBX_ENOMEM; ids = *idp; @@ -1644,7 +1695,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, goto fail; if ((flags & MDBX_LIFORECLAIM) && !txn->mt_lifo_reclaimed) { - txn->mt_lifo_reclaimed = mdbx_midl_alloc(env->me_maxfree_1pg); + txn->mt_lifo_reclaimed = mdbx_txl_alloc(env->me_maxfree_1pg); if (unlikely(!txn->mt_lifo_reclaimed)) { rc = MDBX_ENOMEM; goto fail; @@ -1666,7 +1717,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, mop = env->me_pghead; } if (flags & MDBX_LIFORECLAIM) { - if ((rc = mdbx_midl_append(&txn->mt_lifo_reclaimed, last)) != 0) + if ((rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, last)) != 0) goto fail; } env->me_pglast = last; @@ -2559,7 +2610,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { if (txn->mt_lifo_reclaimed) { txn->mt_lifo_reclaimed[0] = 0; if (txn != env->me_txn0) { - mdbx_midl_free(txn->mt_lifo_reclaimed); + mdbx_txl_free(txn->mt_lifo_reclaimed); 
txn->mt_lifo_reclaimed = NULL; } } @@ -2823,14 +2874,14 @@ again: } if (unlikely(!txn->mt_lifo_reclaimed)) { - txn->mt_lifo_reclaimed = mdbx_midl_alloc(env->me_maxfree_1pg); + txn->mt_lifo_reclaimed = mdbx_txl_alloc(env->me_maxfree_1pg); if (unlikely(!txn->mt_lifo_reclaimed)) { rc = MDBX_ENOMEM; goto bailout; } } /* LY: append the list. */ - rc = mdbx_midl_append(&txn->mt_lifo_reclaimed, env->me_pglast - 1); + rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, env->me_pglast - 1); if (unlikely(rc)) goto bailout; --env->me_pglast; @@ -2970,7 +3021,7 @@ bailout: } txn->mt_lifo_reclaimed[0] = 0; if (txn != env->me_txn0) { - mdbx_midl_free(txn->mt_lifo_reclaimed); + mdbx_txl_free(txn->mt_lifo_reclaimed); txn->mt_lifo_reclaimed = NULL; } } @@ -3122,11 +3173,11 @@ int mdbx_txn_commit(MDBX_txn *txn) { /* Append our reclaim list to parent's */ if (txn->mt_lifo_reclaimed) { if (parent->mt_lifo_reclaimed) { - rc = mdbx_midl_append_list(&parent->mt_lifo_reclaimed, - txn->mt_lifo_reclaimed); + rc = mdbx_txl_append_list(&parent->mt_lifo_reclaimed, + txn->mt_lifo_reclaimed); if (unlikely(rc != MDBX_SUCCESS)) goto fail; - mdbx_midl_free(txn->mt_lifo_reclaimed); + mdbx_txl_free(txn->mt_lifo_reclaimed); } else parent->mt_lifo_reclaimed = txn->mt_lifo_reclaimed; txn->mt_lifo_reclaimed = NULL; @@ -4457,9 +4508,10 @@ static void __cold mdbx_env_close0(MDBX_env *env) { free(env->me_dbflags); free(env->me_path); free(env->me_dirtylist); - if (env->me_txn0) - mdbx_midl_free(env->me_txn0->mt_lifo_reclaimed); - free(env->me_txn0); + if (env->me_txn0) { + mdbx_txl_free(env->me_txn0->mt_lifo_reclaimed); + free(env->me_txn0); + } mdbx_midl_free(env->me_free_pgs); if (env->me_flags & MDBX_ENV_TXKEY) { From 59c22ab0365ac65b177abc6f69970b18e7e6a824 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 5 Jun 2017 14:22:52 +0300 Subject: [PATCH 223/303] mdbx: use 32-bit pgno_t (page numbers). 
Change-Id: Ie9f3528e12b7be27765c0225edde26e4d8282692 --- TODO.md | 2 +- src/bits.h | 38 ++++++++++++++++++++++---------------- src/mdbx.c | 2 +- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/TODO.md b/TODO.md index 1c1c9a25..3c4c1c80 100644 --- a/TODO.md +++ b/TODO.md @@ -25,7 +25,7 @@ - [x] Добавить поле/флаг размера pgno_t. - [x] Поменять сигнатуры. - [x] Добавить мета-страницы в coredump, проверить lck -- [ ] Сделать список для txnid_t, кода sizeof(txnid_t) > sizeof(pgno_t) и вернуть размер pgno_t +- [x] Сделать список для txnid_t, кода sizeof(txnid_t) > sizeof(pgno_t) и вернуть размер pgno_t - [ ] Избавиться от умножения на размер страницы (заменить на сдвиг). - [ ] Устранение всех предупреждений (в том числе под Windows). - [ ] Перевод mdbx-tools на С++ и сборка для Windows diff --git a/src/bits.h b/src/bits.h index 7d23cb62..1c46708e 100644 --- a/src/bits.h +++ b/src/bits.h @@ -119,14 +119,21 @@ * * MDBX uses 32 bit for page numbers. This limits database * size up to 2^44 bytes, in case of 4K pages. */ -typedef uint64_t pgno_t; -#define PRIaPGNO PRIu64 /* TODO */ +typedef uint32_t pgno_t; +#define PRIaPGNO PRIu32 #define MAX_PAGENO ((pgno_t)UINT64_C(0xffffFFFFffff)) #define MIN_PAGENO (NUM_METAS - 1) /* A transaction ID. */ typedef uint64_t txnid_t; #define PRIaTXN PRIi64 +#if MDBX_DEVEL +#define MIN_TXNID (UINT64_MAX - UINT32_MAX) +#elif MDBX_DEBUG +#define MIN_TXNID UINT64_C(0x100000000) +#else +#define MIN_TXNID UINT64_C(0) +#endif /* MIN_TXNID */ /* Used for offsets within a single page. * Since memory pages are typically 4 or 8KB in size, 12-13 bits, @@ -207,16 +214,16 @@ typedef struct MDBX_reader { /* Information about a single database in the environment. 
*/ typedef struct MDBX_db { - uint32_t md_xsize; /* also ksize for LEAF2 pages */ - uint16_t md_flags; /* see mdbx_dbi_open */ - uint16_t md_depth; /* depth of this tree */ - uint64_t md_root; /* the root page of this tree */ - uint64_t md_seq; /* table sequence counter */ - uint64_t md_branch_pages; /* number of internal pages */ - uint64_t md_leaf_pages; /* number of leaf pages */ - uint64_t md_overflow_pages; /* number of overflow pages */ - uint64_t md_entries; /* number of data items */ - uint64_t md_merkle; /* Merkle tree checksum */ + uint16_t md_flags; /* see mdbx_dbi_open */ + uint16_t md_depth; /* depth of this tree */ + uint32_t md_xsize; /* also ksize for LEAF2 pages */ + pgno_t md_root; /* the root page of this tree */ + pgno_t md_branch_pages; /* number of internal pages */ + pgno_t md_leaf_pages; /* number of leaf pages */ + pgno_t md_overflow_pages; /* number of overflow pages */ + uint64_t md_seq; /* table sequence counter */ + uint64_t md_entries; /* number of data items */ + uint64_t md_merkle; /* Merkle tree checksum */ } MDBX_db; /* Meta page content. @@ -235,7 +242,9 @@ typedef struct MDBX_meta { * zero (nothing) for now */ uint8_t mm_extra_pagehdr; /* extra bytes in the page header, * zero (nothing) for now */ - uint32_t mm_reserved_pad; /* padding for aligment, unused for now */ + /* Last used page in the datafile. + * Actually the file may be shorter if the freeDB lists the final pages. */ + pgno_t mm_last_pg; uint64_t mm_dbsize_min; /* minimal size of db */ uint64_t mm_dbsize_max; /* maximal size of db */ @@ -245,9 +254,6 @@ typedef struct MDBX_meta { /* Any persistent environment flags, see mdbx_env */ #define mm_flags mm_dbs[FREE_DBI].md_flags mdbx_canary mm_canary; - /* Last used page in the datafile. - * Actually the file may be shorter if the freeDB lists the final pages. 
*/ - uint64_t mm_last_pg; #define MDBX_DATASIGN_NONE 0u #define MDBX_DATASIGN_WEAK 1u diff --git a/src/mdbx.c b/src/mdbx.c index 8e832f23..add26908 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3569,7 +3569,7 @@ static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model, MDBX_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ model->mp_meta.mm_dbs[FREE_DBI].md_root = P_INVALID; model->mp_meta.mm_dbs[MAIN_DBI].md_root = P_INVALID; - mdbx_meta_set_txnid(env, &model->mp_meta, num); + mdbx_meta_set_txnid(env, &model->mp_meta, MIN_TXNID + num); model->mp_meta.mm_datasync_sign = mdbx_meta_sign(&model->mp_meta); return (MDBX_page *)((uint8_t *)model + env->me_psize); } From 439ae3983c485c7d3bced8b10b18ee1ab4591255 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 5 Jun 2017 16:56:10 +0300 Subject: [PATCH 224/303] mdbx: use 2^N constants for fill-threshold. Change-Id: I3e190f5af77282a7995ae733c7a6865a21daa6c3 --- src/bits.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bits.h b/src/bits.h index 1c46708e..f06685b9 100644 --- a/src/bits.h +++ b/src/bits.h @@ -902,11 +902,11 @@ static __inline size_t roundup2(size_t value, size_t granularity) { /* The percentage of space used in the page, in tenths of a percent. */ #define PAGEFILL(env, p) \ - (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ + (1024L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ ((env)->me_psize - PAGEHDRSZ)) /* The minimum page fill factor, in tenths of a percent. * Pages emptier than this are candidates for merging. */ -#define FILL_THRESHOLD 250 +#define FILL_THRESHOLD 256 /* Test if a page is a leaf page */ #define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) From 9ef81ac16c421ba106ab943f99d19b8326967176 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 5 Jun 2017 16:56:59 +0300 Subject: [PATCH 225/303] mdbx: add 'unlikely' to IS_OVERFLOW(). 
Change-Id: Ia92aac948046890ca4f9d9c3fbff0b4ac5728fab --- src/bits.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bits.h b/src/bits.h index f06685b9..3e91348e 100644 --- a/src/bits.h +++ b/src/bits.h @@ -915,7 +915,7 @@ static __inline size_t roundup2(size_t value, size_t granularity) { /* Test if a page is a branch page */ #define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) /* Test if a page is an overflow page */ -#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) +#define IS_OVERFLOW(p) unlikely(F_ISSET((p)->mp_flags, P_OVERFLOW)) /* Test if a page is a sub page */ #define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) From 802c10f13f0618a9e3683463f3aa2c7f46788c9b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 5 Jun 2017 17:16:21 +0300 Subject: [PATCH 226/303] mdbx: log2shift instead of multiplication by pagesize. Change-Id: I72c4c7699a409795459b466ff74c388b15b13687 --- TODO.md | 2 +- src/bits.h | 17 ++++++- src/mdbx.c | 140 ++++++++++++++++++++++++++++------------------------- 3 files changed, 92 insertions(+), 67 deletions(-) diff --git a/TODO.md b/TODO.md index 3c4c1c80..338f6e83 100644 --- a/TODO.md +++ b/TODO.md @@ -26,7 +26,7 @@ - [x] Поменять сигнатуры. - [x] Добавить мета-страницы в coredump, проверить lck - [x] Сделать список для txnid_t, кода sizeof(txnid_t) > sizeof(pgno_t) и вернуть размер pgno_t -- [ ] Избавиться от умножения на размер страницы (заменить на сдвиг). +- [x] Избавиться от умножения на размер страницы (заменить на сдвиг). - [ ] Устранение всех предупреждений (в том числе под Windows). 
- [ ] Перевод mdbx-tools на С++ и сборка для Windows - [ ] Заменить заглушки mdbx_version и mdbx_build diff --git a/src/bits.h b/src/bits.h index 3e91348e..865ade2e 100644 --- a/src/bits.h +++ b/src/bits.h @@ -626,6 +626,7 @@ struct MDBX_env { #define MDBX_ENV_TXKEY 0x10000000U uint32_t me_flags; /* see mdbx_env */ unsigned me_psize; /* DB page size, inited from me_os_psize */ + unsigned me_psize2log; /* log2 of DB page size */ unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ /* Max MDBX_lockinfo.mti_numreaders of interest to mdbx_env_close() */ @@ -920,7 +921,7 @@ static __inline size_t roundup2(size_t value, size_t granularity) { #define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) /* The number of overflow pages needed to store the given size. */ -#define OVPAGES(size, psize) ((PAGEHDRSZ - 1 + (size)) / (psize) + 1) +#define OVPAGES(env, size) (bytes2pgno(env, PAGEHDRSZ - 1 + (size)) + 1) /* Link in MDBX_txn.mt_loose_pages list. * Kept outside the page header, which is needed when reusing the page. 
*/ @@ -1112,3 +1113,17 @@ static __inline void SETDSZ(MDBX_node *node, unsigned size) { #else #define mdbx_cmp2int(a, b) (((a) > (b)) - ((b) > (a))) #endif + +static __inline size_t pgno2bytes(const MDBX_env *env, pgno_t pgno) { + mdbx_assert(env, (1u << env->me_psize2log) == env->me_psize); + return ((size_t)pgno) << env->me_psize2log; +} + +static __inline MDBX_page *pgno2page(const MDBX_env *env, pgno_t pgno) { + return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno)); +} + +static __inline pgno_t bytes2pgno(const MDBX_env *env, size_t bytes) { + mdbx_assert(env, (env->me_psize >> env->me_psize2log) == 1); + return (pgno_t)(bytes >> env->me_psize2log); +} diff --git a/src/mdbx.c b/src/mdbx.c index add26908..e7b11cb8 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -553,8 +553,9 @@ int mdbx_runtime_flags = MDBX_DBG_PRINT MDBX_debug_func *mdbx_debug_logger; -static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, int flags); -static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, int num, +static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, + int flags); +static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, unsigned num, MDBX_page **mp); static int mdbx_page_touch(MDBX_cursor *mc); static int mdbx_cursor_touch(MDBX_cursor *mc); @@ -983,15 +984,15 @@ int mdbx_dcmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, * Set MDBX_TXN_ERROR on failure. 
*/ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { MDBX_env *env = txn->mt_env; - size_t size = env->me_psize; MDBX_page *np = env->me_dpages; + size_t size = env->me_psize; if (likely(num == 1 && np)) { ASAN_UNPOISON_MEMORY_REGION(np, size); VALGRIND_MEMPOOL_ALLOC(env, np, size); VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); env->me_dpages = np->mp_next; } else { - size *= num; + size = pgno2bytes(env, num); np = malloc(size); if (unlikely(!np)) { txn->mt_flags |= MDBX_TXN_ERROR; @@ -1006,7 +1007,7 @@ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { * many pages they will be filling in at least up to the last page. */ size_t skip = PAGEHDRSZ; if (num > 1) - skip += (num - 1) * env->me_psize; + skip += pgno2bytes(env, num - 1); memset((char *)np + skip, 0, size - skip); } VALGRIND_MAKE_MEM_UNDEFINED(np, size); @@ -1048,7 +1049,7 @@ static void mdbx_dlist_free(MDBX_txn *txn) { } static void __cold mdbx_kill_page(MDBX_env *env, pgno_t pgno) { - const size_t offs = env->me_psize * pgno; + const size_t offs = pgno2bytes(env, pgno); const size_t shift = offsetof(MDBX_page, mp_pages); if (env->me_flags & MDBX_WRITEMAP) { @@ -1241,7 +1242,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { i += txn->mt_dbs[MAIN_DBI].md_depth; /* For puts, roughly factor in the key+data size */ if (key) - i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; + i += bytes2pgno(txn->mt_env, LEAFSIZE(key, data) + txn->mt_env->me_psize); i += i; /* double it for good measure */ size_t need = i; @@ -1322,8 +1323,7 @@ bailout: /*----------------------------------------------------------------------------*/ -#define METAPAGE(env, n) \ - (&((MDBX_page *)((env)->me_map + env->me_psize * (n)))->mp_meta) +#define METAPAGE(env, n) (&pgno2page(env, n)->mp_meta) #define METAPAGE_END(env) METAPAGE(env, NUM_METAS) @@ -1545,7 +1545,7 @@ static void mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { #define 
MDBX_ALLOC_ALL \ (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW | MDBX_ALLOC_KICK) -static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, +static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, int flags) { int rc; MDBX_txn *txn = mc->mc_txn; @@ -1829,10 +1829,10 @@ static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, done: assert(mp && num); if (env->me_flags & MDBX_WRITEMAP) { - np = (MDBX_page *)(env->me_map + env->me_psize * pgno); + np = pgno2page(env, pgno); /* LY: reset no-access flag from mdbx_kill_page() */ - VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); - ASAN_UNPOISON_MEMORY_REGION(np, env->me_psize * num); + VALGRIND_MAKE_MEM_UNDEFINED(np, pgno2bytes(env, num)); + ASAN_UNPOISON_MEMORY_REGION(np, pgno2bytes(env, num)); } else { if (unlikely(!(np = mdbx_page_malloc(txn, num)))) { rc = MDBX_ENOMEM; @@ -1849,8 +1849,8 @@ done: } if (env->me_flags & MDBX_PAGEPERTURB) - memset(np, 0x71 /* 'q', 113 */, env->me_psize * num); - VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); + memset(np, 0x71 /* 'q', 113 */, pgno2bytes(env, num)); + VALGRIND_MAKE_MEM_UNDEFINED(np, pgno2bytes(env, num)); np->mp_pgno = pgno; np->mp_leaf2_ksize = 0; @@ -1908,18 +1908,15 @@ static int mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, MDBX_page **ret) { int num; if (txn->mt_dirtyroom == 0) return MDBX_TXN_FULL; - if (IS_OVERFLOW(mp)) - num = mp->mp_pages; - else - num = 1; + num = IS_OVERFLOW(mp) ? 
mp->mp_pages : 1; if (env->me_flags & MDBX_WRITEMAP) { np = mp; } else { np = mdbx_page_malloc(txn, num); if (unlikely(!np)) return MDBX_ENOMEM; - if (num > 1) - memcpy(np, mp, num * env->me_psize); + if (unlikely(num > 1)) + memcpy(np, mp, pgno2bytes(env, num)); else mdbx_page_copy(np, mp, env->me_psize); } @@ -1932,8 +1929,7 @@ static int mdbx_page_unspill(MDBX_txn *txn, MDBX_page *mp, MDBX_page **ret) { else txn->mt_spill_pages[x] |= 1; } /* otherwise, if belonging to a parent txn, the - * page remains spilled until child commits - */ + * page remains spilled until child commits */ mdbx_page_dirty(txn, np); np->mp_flags |= P_DIRTY; @@ -2077,10 +2073,11 @@ int mdbx_env_sync(MDBX_env *env, int force) { env->me_sync_pending >= env->me_sync_threshold)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; - if (env->me_sync_pending > env->me_psize * 16 && + if (env->me_sync_pending > + pgno2bytes(env, 16 /* FIXME: define threshold */) && (flags & MDBX_NOSYNC) == 0) { assert(((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - size_t used_size = env->me_psize * (head->mm_last_pg + 1); + size_t used_size = pgno2bytes(env, head->mm_last_pg + 1); mdbx_txn_unlock(env); /* LY: pre-sync without holding lock to reduce latency for writer(s) */ @@ -3036,7 +3033,7 @@ bailout: static int mdbx_page_flush(MDBX_txn *txn, size_t keep) { MDBX_env *env = txn->mt_env; MDBX_ID2L dl = txn->mt_rw_dirtylist; - unsigned psize = env->me_psize, j; + unsigned j; int i, pagecount = dl[0].mid, rc; size_t size = 0, pos = 0; pgno_t pgno = 0; @@ -3059,7 +3056,8 @@ static int mdbx_page_flush(MDBX_txn *txn, size_t keep) { continue; } dp->mp_flags &= ~P_DIRTY; - env->me_sync_pending += IS_OVERFLOW(dp) ? psize * dp->mp_pages : psize; + env->me_sync_pending += + IS_OVERFLOW(dp) ? 
pgno2bytes(env, dp->mp_pages) : env->me_psize; } goto done; } @@ -3077,10 +3075,8 @@ static int mdbx_page_flush(MDBX_txn *txn, size_t keep) { pgno = dl[i].mid; /* clear dirty flag */ dp->mp_flags &= ~P_DIRTY; - pos = pgno * psize; - size = psize; - if (IS_OVERFLOW(dp)) - size *= dp->mp_pages; + pos = pgno2bytes(env, pgno); + size = IS_OVERFLOW(dp) ? pgno2bytes(env, dp->mp_pages) : env->me_psize; env->me_sync_pending += size; } /* Write up to MDBX_COMMIT_PAGES dirty pages at a time. */ @@ -3107,7 +3103,7 @@ static int mdbx_page_flush(MDBX_txn *txn, size_t keep) { n++; } - mdbx_invalidate_cache(env->me_map, txn->mt_next_pgno * env->me_psize); + mdbx_invalidate_cache(env->me_map, pgno2bytes(env, txn->mt_next_pgno)); for (i = keep; ++i <= pagecount;) { dp = dl[i].mptr; @@ -3597,7 +3593,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const head = mdbx_meta_head(env); const size_t prev_mapsize = head->mm_mapsize; - const size_t used_size = env->me_psize * (pending->mm_last_pg + 1); + const size_t used_size = pgno2bytes(env, pending->mm_last_pg + 1); mdbx_assert(env, mdbx_meta_eq_mask(env) == 0); mdbx_assert(env, @@ -3838,13 +3834,25 @@ int mdbx_get_maxkeysize(size_t pagesize) { return (maxkey > 0 && maxkey < INT_MAX) ? 
(int)maxkey : -MDBX_EINVAL; } -static void __cold mdbx_env_setup_limits(MDBX_env *env, size_t pagesize) { - env->me_maxfree_1pg = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - env->me_maxpg = env->me_mapsize / pagesize; +static void __cold mdbx_setup_pagesize(MDBX_env *env, size_t pagesize) { + mdbx_ensure(env, is_power2(pagesize)); + mdbx_ensure(env, pagesize >= MIN_PAGESIZE); + mdbx_ensure(env, pagesize <= MAX_PAGESIZE); + env->me_psize = pagesize; + env->me_maxfree_1pg = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; env->me_nodemax = mdbx_calc_nodemax(pagesize); env->me_maxkey_limit = mdbx_calc_maxkey(env->me_nodemax); - assert(env->me_maxkey_limit > 42 && env->me_maxkey_limit < pagesize); + mdbx_assert(env, + env->me_maxkey_limit > 42 && env->me_maxkey_limit < pagesize); + + env->me_psize2log = 0; + while (pagesize > 1) { + env->me_psize2log += 1; + pagesize >>= 1; + } + + env->me_maxpg = bytes2pgno(env, env->me_mapsize); } int __cold mdbx_env_create(MDBX_env **penv) { @@ -3865,7 +3873,7 @@ int __cold mdbx_env_create(MDBX_env **penv) { rc = MDBX_INCOMPATIBLE; goto bailout; } - mdbx_env_setup_limits(env, env->me_os_psize); + mdbx_setup_pagesize(env, env->me_os_psize); rc = mdbx_fastmutex_init(&env->me_dbi_lock); if (unlikely(rc != MDBX_SUCCESS)) @@ -3910,8 +3918,8 @@ static int __cold mdbx_env_map(MDBX_env *env, void *addr, size_t usedsize) { #endif #if defined(MADV_DODUMP) && defined(MADV_DONTDUMP) - const size_t meta_length = env->me_psize * NUM_METAS; - (void)madvise(env->me_map, env->me_psize * NUM_METAS, MADV_DODUMP); + const size_t meta_length = pgno2bytes(env, NUM_METAS); + (void)madvise(env->me_map, meta_length, MADV_DODUMP); if (!(flags & MDBX_PAGEPERTURB)) (void)madvise(env->me_map + meta_length, env->me_mapsize - meta_length, MADV_DONTDUMP); @@ -3935,7 +3943,7 @@ static int __cold mdbx_env_map(MDBX_env *env, void *addr, size_t usedsize) { /* Lock meta pages to avoid unexpected write, * before the data pages would be synchronized. 
*/ if (flags & MDBX_WRITEMAP) { - rc = mdbx_mlock(env->me_map, env->me_psize * NUM_METAS); + rc = mdbx_mlock(env->me_map, pgno2bytes(env, NUM_METAS)); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -3955,7 +3963,7 @@ int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(size < env->me_psize * 8)) + if (unlikely(size < pgno2bytes(env, MIN_PAGENO))) return MDBX_EINVAL; /* If env is already open, caller is responsible for making @@ -3971,7 +3979,7 @@ int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { if (!size) size = meta->mm_mapsize; /* Silently round up to minimum if the size is too small */ - const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; + const size_t usedsize = pgno2bytes(env, meta->mm_last_pg + 1); if (size < usedsize) size = usedsize; @@ -3992,8 +4000,7 @@ int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { } env->me_mapsize = size; - if (env->me_psize) - env->me_maxpg = env->me_mapsize / env->me_psize; + env->me_maxpg = bytes2pgno(env, env->me_mapsize); return MDBX_SUCCESS; } @@ -4053,9 +4060,9 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { env->me_psize = env->me_os_psize; if (env->me_psize > MAX_PAGESIZE) env->me_psize = MAX_PAGESIZE; + mdbx_ensure(env, is_power2(env->me_psize)); + mdbx_ensure(env, env->me_psize >= MIN_PAGESIZE); - assert(is_power2(env->me_psize)); - assert(env->me_psize >= MIN_PAGESIZE); env->me_mapsize = roundup2( env->me_mapsize ? 
env->me_mapsize : DEFAULT_MAPSIZE, env->me_os_psize); @@ -4080,7 +4087,8 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { return err; } else { env->me_psize = meta.mm_psize; - if (!is_power2(env->me_psize) || env->me_psize < MIN_PAGESIZE) { + if (!is_power2(env->me_psize) || env->me_psize < MIN_PAGESIZE || + env->me_psize > MAX_PAGESIZE) { mdbx_error("wrong pagesize %u (system %u)", env->me_psize, env->me_os_psize); return MDBX_WANNA_RECOVERY; @@ -4088,8 +4096,8 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { /* Make sure mapsize >= committed data size. Even when using * mm_mapsize, which could be broken in old files (ITS#7789). */ - const size_t usedsize = - roundup2((meta.mm_last_pg + 1) * env->me_psize, env->me_os_psize); + const size_t usedsize = roundup2( + (meta.mm_last_pg + 1) * (size_t)env->me_psize, env->me_os_psize); if (meta.mm_mapsize < usedsize) meta.mm_mapsize = usedsize; @@ -4101,6 +4109,8 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { env->me_mapsize = usedsize; } + mdbx_setup_pagesize(env, env->me_psize); + uint64_t size; err = mdbx_filesize(env->me_fd, &size); if (unlikely(err != MDBX_SUCCESS)) @@ -4184,7 +4194,6 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { return err; } - mdbx_env_setup_limits(env, env->me_psize); return rc; } @@ -4894,7 +4903,7 @@ static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, level = 0; mapped: - p = (MDBX_page *)(env->me_map + env->me_psize * pgno); + p = pgno2page(env, pgno); done: *ret = p; @@ -6294,7 +6303,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { MDBX_page *omp; pgno_t pg; - int level, ovpages, dpages = OVPAGES(data->iov_len, env->me_psize); + int level, ovpages, dpages = OVPAGES(env, data->iov_len); memcpy(&pg, olddata.iov_base, sizeof(pg)); if (unlikely((rc2 = mdbx_page_get(mc, pg, &omp, &level)) != 0)) @@ -6330,11 +6339,12 @@ int 
mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, /* Currently we make the page look as with put() in the * parent txn, in case the user peeks at MDBX_RESERVEd * or unused parts. Some users treat ovpages specially. */ - size_t whole = (size_t)env->me_psize * ovpages; + const size_t whole = pgno2bytes(env, ovpages); /* Skip the part where MDBX will put *data. * Copy end of page, adjusting alignment so * compiler may copy words instead of bytes. */ - size_t off = (PAGEHDRSZ + data->iov_len) & -(ssize_t)sizeof(size_t); + const size_t off = + (PAGEHDRSZ + data->iov_len) & -(ssize_t)sizeof(size_t); memcpy((size_t *)((char *)np + off), (size_t *)((char *)omp + off), whole - off); memcpy(np, omp, PAGEHDRSZ); /* Copy header of page */ @@ -6637,7 +6647,7 @@ fail: * [out] mp Address of a page, or NULL on failure. * * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, int num, +static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, unsigned num, MDBX_page **mp) { MDBX_page *np; int rc; @@ -6779,7 +6789,7 @@ static int mdbx_node_add(MDBX_cursor *mc, indx_t indx, MDBX_val *key, node_size += sizeof(pgno_t); } else if (unlikely(node_size + data->iov_len > mc->mc_txn->mt_env->me_nodemax)) { - int ovpages = OVPAGES(data->iov_len, mc->mc_txn->mt_env->me_psize); + unsigned ovpages = OVPAGES(mc->mc_txn->mt_env, data->iov_len); int rc; /* Put data on overflow page. 
*/ mdbx_debug("data size is %" PRIuPTR ", node would be %" PRIuPTR @@ -8646,7 +8656,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { return rc; /* Make cursor pages writable */ - buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum); + buf = ptr = malloc(pgno2bytes(my->mc_env, mc.mc_snum)); if (buf == NULL) return MDBX_ENOMEM; @@ -8698,7 +8708,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { my->mc_next_pgno += omp->mp_pages; my->mc_wlen[toggle] += my->mc_env->me_psize; if (omp->mp_pages > 1) { - my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1); + my->mc_olen[toggle] = pgno2bytes(my->mc_env, omp->mp_pages - 1); my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize; rc = mdbx_env_cthr_toggle(my, 1); if (unlikely(rc != MDBX_SUCCESS)) @@ -8843,7 +8853,7 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { /* update signature */ meta->mp_meta.mm_datasync_sign = mdbx_meta_sign(&meta->mp_meta); - my.mc_wlen[0] = env->me_psize * NUM_METAS; + my.mc_wlen[0] = pgno2bytes(env, NUM_METAS); my.mc_txn = txn; rc = mdbx_env_cwalk(&my, &root, 0); if (rc == MDBX_SUCCESS && root != new_root) { @@ -8890,12 +8900,12 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, mdbx_filehandle_t fd) { goto bailout; } - rc = mdbx_write(fd, env->me_map, env->me_psize * NUM_METAS); + rc = mdbx_write(fd, env->me_map, pgno2bytes(env, NUM_METAS)); mdbx_txn_unlock(env); if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_write(fd, env->me_map + env->me_psize * NUM_METAS, - (txn->mt_next_pgno - NUM_METAS) * env->me_psize); + rc = mdbx_write(fd, env->me_map + pgno2bytes(env, NUM_METAS), + pgno2bytes(env, txn->mt_next_pgno - NUM_METAS)); if (likely(rc == MDBX_SUCCESS)) rc = mdbx_ftruncate(fd, env->me_mapsize); @@ -9932,7 +9942,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, over_header = PAGEHDRSZ; over_payload = NODEDSZ(node); - over_unused = omp->mp_pages * 
ctx->mw_txn->mt_env->me_psize - + over_unused = pgno2bytes(ctx->mw_txn->mt_env, omp->mp_pages) - over_payload - over_header; rc = ctx->mw_visitor(*opg, omp->mp_pages, ctx->mw_user, dbi, @@ -10341,7 +10351,7 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { * во время транзакции) если адрес находится внутри mmap-диапазона * и в заголовке страницы нет флажка P_DIRTY. */ if (env->me_map < (char *)page) { - const size_t used_size = env->me_psize * txn->mt_next_pgno; + const size_t used_size = pgno2bytes(env, txn->mt_next_pgno); if ((char *)page < env->me_map + used_size) { /* страница внутри диапазона, смотрим на флажки */ return (page->mp_flags & (P_DIRTY | P_LOOSE | P_KEEP)) From 61d7ea283f1d302e4f4523d24695fb1868160e8b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 5 Jun 2017 17:26:08 +0300 Subject: [PATCH 227/303] mdbx: size_t for sync_threshold and mapsize. --- src/bits.h | 6 +++--- src/mdbx.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/bits.h b/src/bits.h index 865ade2e..b2d219eb 100644 --- a/src/bits.h +++ b/src/bits.h @@ -666,9 +666,9 @@ struct MDBX_env { #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif - uint64_t me_sync_pending; /* Total dirty/non-sync'ed bytes - * since the last mdbx_env_sync() */ - uint64_t me_sync_threshold; /* Treshold of above to force synchronous flush */ + size_t me_sync_pending; /* Total dirty/non-sync'ed bytes + * since the last mdbx_env_sync() */ + size_t me_sync_threshold; /* Treshold of above to force synchronous flush */ MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */ #ifdef USE_VALGRIND int me_valgrind_handle; diff --git a/src/mdbx.c b/src/mdbx.c index e7b11cb8..71a53374 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2099,8 +2099,8 @@ int mdbx_env_sync(MDBX_env *env, int force) { if (!META_IS_STEADY(head) || env->me_sync_pending || env->me_mapsize != head->mm_mapsize) { - mdbx_debug("meta-head %" PRIaPGNO ", 
%s, sync_pending %" PRIu64 - ", mapsize env=%" PRIuPTR " meta=%" PRIuPTR, + mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIuPTR + ", mapsize env=%" PRIuPTR " meta=%" PRIu64, container_of(head, MDBX_page, mp_data)->mp_pgno, mdbx_durable_str(head), env->me_sync_pending, env->me_mapsize, head->mm_mapsize); @@ -4116,7 +4116,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { if (unlikely(err != MDBX_SUCCESS)) return err; if (size != env->me_mapsize) { - mdbx_notice("filesize mismatch (wanna %" PRIu64 ", have %" PRIu64 ")", + mdbx_notice("filesize mismatch (wanna %" PRIuPTR ", have %" PRIu64 ")", env->me_mapsize, size); if ((env->me_flags & MDBX_RDONLY) || lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { @@ -4176,13 +4176,13 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { head = mdbx_meta_head(env); if (head->mm_mapsize != env->me_mapsize) { - mdbx_info("mismatch meta.mapsize: present %" PRIu64 ", should %" PRIu64, + mdbx_info("mismatch meta.mapsize: present %" PRIu64 ", should %" PRIuPTR, head->mm_mapsize, env->me_mapsize); if ((env->me_flags & MDBX_RDONLY) || lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) return MDBX_MAP_RESIZED; - mdbx_trace("updating meta.mapsize: from %" PRIu64 " to %" PRIu64, + mdbx_trace("updating meta.mapsize: from %" PRIu64 " to %" PRIuPTR, head->mm_mapsize, env->me_mapsize); meta = *head; meta.mm_mapsize = env->me_mapsize; From 7e85ad82f13897e4c436ca3a8213b10ac5539d38 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 5 Jun 2017 22:45:13 +0300 Subject: [PATCH 228/303] mdbx: refine read_header(), add MDBX_TOO_LARGE. 
Change-Id: I25b9bb2e3817993d627b2b784dfa496d9ba7e7b0 --- mdbx.h | 4 +++ src/mdbx.c | 82 +++++++++++++++++++++++++++++++++--------------------- 2 files changed, 54 insertions(+), 32 deletions(-) diff --git a/mdbx.h b/mdbx.h index b74bef0e..9429eb71 100644 --- a/mdbx.h +++ b/mdbx.h @@ -414,6 +414,10 @@ typedef enum MDBX_cursor_op { * when mdbx_cursor_put() called with MDBX_CURRENT option. */ #define MDBX_EKEYMISMATCH (-30418) +/* Database is too large for current system, i.e. could NOT be mapped into RAM. + */ +#define MDBX_TOO_LARGE (-30417) + /* Statistics for a database in the environment */ typedef struct MDBX_stat { uint32_t ms_psize; /* Size of a database page. diff --git a/src/mdbx.c b/src/mdbx.c index 71a53374..93bcf94b 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -689,6 +689,9 @@ static const char *__mdbx_strerr(int errnum) { case MDBX_EKEYMISMATCH: return "MDBX_EKEYMISMATCH: The given key value is mismatched to the " "current cursor position"; + case MDBX_TOO_LARGE: + return "Database is too large for current system, i.e.could NOT be mapped " + "into RAM."; default: return NULL; } @@ -3384,6 +3387,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { assert(offsetof(MDBX_page, mp_meta) == PAGEHDRSZ); memset(meta, 0, sizeof(MDBX_meta)); meta->mm_datasync_sign = MDBX_DATASIGN_WEAK; + int rc = MDBX_CORRUPTED; /* Read twice all meta pages so we can find the latest one. 
*/ unsigned loop_limit = NUM_METAS * 2; @@ -3402,19 +3406,19 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { unsigned retryleft = 42; while (1) { - int rc = mdbx_pread(env->me_fd, &page, sizeof(page), offset); - if (rc != MDBX_SUCCESS) { + int err = mdbx_pread(env->me_fd, &page, sizeof(page), offset); + if (err != MDBX_SUCCESS) { mdbx_error("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(page), - rc, mdbx_strerror(rc)); - return rc; + err, mdbx_strerror(err)); + return err; } MDBX_page again; - rc = mdbx_pread(env->me_fd, &again, sizeof(again), offset); - if (rc != MDBX_SUCCESS) { + err = mdbx_pread(env->me_fd, &again, sizeof(again), offset); + if (err != MDBX_SUCCESS) { mdbx_error("read meta[%u,%u]: %i, %s", offset, (unsigned)sizeof(again), - rc, mdbx_strerror(rc)); - return rc; + err, mdbx_strerror(err)); + return err; } if (memcmp(&page, &again, sizeof(page)) == 0 || --retryleft == 0) @@ -3451,26 +3455,6 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { continue; } - /* LY: check pagesize */ - STATIC_ASSERT(MIN_PAGESIZE < MAX_PAGESIZE); - if (!is_power2(page.mp_meta.mm_psize) || - page.mp_meta.mm_psize < MIN_PAGESIZE || - page.mp_meta.mm_psize > MAX_PAGESIZE) { - mdbx_notice("meta[%u] has invalid pagesize %u, skip it", meta_number, - page.mp_meta.mm_psize); - continue; - } - - /* LY: check mapsize limits */ - STATIC_ASSERT(MAX_MAPSIZE < SSIZE_MAX - MAX_PAGESIZE); - STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); - if (page.mp_meta.mm_mapsize < MIN_MAPSIZE || - page.mp_meta.mm_mapsize > MAX_MAPSIZE) { - mdbx_notice("meta[%u] has invalid mapsize %" PRIu64 ", skip it", - meta_number, page.mp_meta.mm_mapsize); - continue; - } - /* LY: check signature as a checksum */ if (META_IS_STEADY(&page.mp_meta) && page.mp_meta.mm_datasync_sign != mdbx_meta_sign(&page.mp_meta)) { @@ -3481,6 +3465,33 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { continue; } + /* LY: check pagesize */ + if 
(!is_power2(page.mp_meta.mm_psize) || + page.mp_meta.mm_psize < MIN_PAGESIZE || + page.mp_meta.mm_psize > MAX_PAGESIZE) { + mdbx_notice("meta[%u] has invalid pagesize %u, skip it", meta_number, + page.mp_meta.mm_psize); + rc = MDBX_VERSION_MISMATCH; + continue; + } + + /* LY: check mapsize limits */ + STATIC_ASSERT(MAX_MAPSIZE < SSIZE_MAX - MAX_PAGESIZE); + if (page.mp_meta.mm_mapsize < MIN_MAPSIZE) { + mdbx_notice("meta[%u] has invalid mapsize %" PRIu64 ", skip it", + meta_number, page.mp_meta.mm_mapsize); + rc = MDBX_VERSION_MISMATCH; + continue; + } + + STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); + if (page.mp_meta.mm_mapsize > MAX_MAPSIZE) { + mdbx_notice("meta[%u] has too large mapsize %" PRIu64 ", skip it", + meta_number, page.mp_meta.mm_mapsize); + rc = MDBX_TOO_LARGE; + continue; + } + /* LY: check mapsize with given given pagesize */ if (page.mp_meta.mm_mapsize < MIN_PAGENO * (uint64_t)page.mp_meta.mm_psize || @@ -3489,6 +3500,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { mdbx_notice("meta[%u] has invalid mapsize %" PRIu64 ", with given pagesize %u, skip it", meta_number, page.mp_meta.mm_mapsize, page.mp_meta.mm_psize); + rc = MDBX_CORRUPTED; continue; } @@ -3499,6 +3511,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { page.mp_meta.mm_mapsize / page.mp_meta.mm_psize) { mdbx_notice("meta[%u] has invalid last-pageno %" PRIaPGNO ", skip it", meta_number, page.mp_meta.mm_last_pg); + rc = MDBX_CORRUPTED; continue; } @@ -3510,12 +3523,14 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { page.mp_meta.mm_dbs[FREE_DBI].md_leaf_pages || page.mp_meta.mm_dbs[FREE_DBI].md_overflow_pages) { mdbx_notice("meta[%u] has false-empty freedb, skip it", meta_number); + rc = MDBX_CORRUPTED; continue; } } else if (page.mp_meta.mm_dbs[FREE_DBI].md_root > page.mp_meta.mm_last_pg) { mdbx_notice("meta[%u] has invalid freedb-root %" PRIaPGNO ", skip it", meta_number, page.mp_meta.mm_dbs[FREE_DBI].md_root); + rc = 
MDBX_CORRUPTED; continue; } @@ -3527,12 +3542,14 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { page.mp_meta.mm_dbs[MAIN_DBI].md_leaf_pages || page.mp_meta.mm_dbs[MAIN_DBI].md_overflow_pages) { mdbx_notice("meta[%u] has false-empty maindb", meta_number); + rc = MDBX_CORRUPTED; continue; } } else if (page.mp_meta.mm_dbs[MAIN_DBI].md_root > page.mp_meta.mm_last_pg) { mdbx_notice("meta[%u] has invalid maindb-root %" PRIaPGNO ", skip it", meta_number, page.mp_meta.mm_dbs[MAIN_DBI].md_root); + rc = MDBX_CORRUPTED; continue; } @@ -3545,7 +3562,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { if (META_IS_WEAK(meta)) { mdbx_error("no usable meta-pages, database is corrupted"); - return MDBX_CORRUPTED; + return rc; } return MDBX_SUCCESS; @@ -3963,6 +3980,9 @@ int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(size < MIN_MAPSIZE || size > MAX_MAPSIZE)) + return MDBX_EINVAL; + if (unlikely(size < pgno2bytes(env, MIN_PAGENO))) return MDBX_EINVAL; @@ -3976,8 +3996,6 @@ int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { /* FIXME: lock/unlock */ meta = mdbx_meta_head(env); - if (!size) - size = meta->mm_mapsize; /* Silently round up to minimum if the size is too small */ const size_t usedsize = pgno2bytes(env, meta->mm_last_pg + 1); if (size < usedsize) @@ -4104,7 +4122,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { /* Was a mapsize configured? */ if (!env->me_mapsize || (env->me_flags & MDBX_RDONLY) || lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) - env->me_mapsize = meta.mm_mapsize; + env->me_mapsize = (size_t)meta.mm_mapsize; else if (env->me_mapsize < usedsize) env->me_mapsize = usedsize; } From bfa6dea7845a308b30f96a5f206708d904b18c5c Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 5 Jun 2017 20:48:05 +0300 Subject: [PATCH 229/303] mdbx: fix size_t/unsigned/int warnings. 
Change-Id: Ic5a8684aed232b8b732d6e7a87a6757f3f7afcec --- TODO.md | 2 +- mdbx.h | 8 +- src/bits.h | 13 +- src/mdbx.c | 380 ++++++++++++++++++++++++------------------- src/tools/mdbx_chk.c | 12 +- test/utils.cc | 4 +- 6 files changed, 233 insertions(+), 186 deletions(-) diff --git a/TODO.md b/TODO.md index 338f6e83..c0170135 100644 --- a/TODO.md +++ b/TODO.md @@ -27,7 +27,7 @@ - [x] Добавить мета-страницы в coredump, проверить lck - [x] Сделать список для txnid_t, кода sizeof(txnid_t) > sizeof(pgno_t) и вернуть размер pgno_t - [x] Избавиться от умножения на размер страницы (заменить на сдвиг). -- [ ] Устранение всех предупреждений (в том числе под Windows). +- [x] Устранение всех предупреждений (в том числе под Windows). - [ ] Перевод mdbx-tools на С++ и сборка для Windows - [ ] Заменить заглушки mdbx_version и mdbx_build - [ ] Актуализация README.md diff --git a/mdbx.h b/mdbx.h index 9429eb71..f696d0a3 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1562,9 +1562,9 @@ typedef void MDBX_debug_func(int type, const char *function, int line, LIBMDBX_API int mdbx_setup_debug(int flags, MDBX_debug_func *logger); typedef int MDBX_pgvisitor_func(uint64_t pgno, unsigned pgnumber, void *ctx, - const char *dbi, const char *type, int nentries, - int payload_bytes, int header_bytes, - int unused_bytes); + const char *dbi, const char *type, + size_t nentries, size_t payload_bytes, + size_t header_bytes, size_t unused_bytes); LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, void *ctx); @@ -1595,7 +1595,7 @@ LIBMDBX_API int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, * of multi-values/duplicates for a given key. * 2) updates the key for pointing to the actual key's data inside DB. 
*/ LIBMDBX_API int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data, int *values_count); + MDBX_val *data, size_t *values_count); LIBMDBX_API int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr); diff --git a/src/bits.h b/src/bits.h index b2d219eb..73bea6f5 100644 --- a/src/bits.h +++ b/src/bits.h @@ -660,9 +660,9 @@ struct MDBX_env { unsigned me_maxfree_1pg; /* Max size of a node on a page */ unsigned me_nodemax; - unsigned me_maxkey_limit; /* max size of a key */ - int me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ + unsigned me_maxkey_limit; /* max size of a key */ + mdbx_pid_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif @@ -896,7 +896,7 @@ static __inline size_t roundup2(size_t value, size_t granularity) { #define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) /* Number of nodes on a page */ -#define NUMKEYS(p) ((p)->mp_lower >> 1) +#define NUMKEYS(p) ((unsigned)(p)->mp_lower >> 1) /* The amount of space remaining in the page */ #define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) @@ -1045,9 +1045,10 @@ static __inline size_t NODEDSZ(const MDBX_node *node) { } /* Set the size of the data for a leaf node */ -static __inline void SETDSZ(MDBX_node *node, unsigned size) { +static __inline void SETDSZ(MDBX_node *node, size_t size) { + assert(size < INT_MAX); if (UNALIGNED_OK) { - node->mn_dsize = size; + node->mn_dsize = (uint32_t)size; } else { node->mn_lo = (uint16_t)size; node->mn_hi = (uint16_t)(size >> 16); diff --git a/src/mdbx.c b/src/mdbx.c index 93bcf94b..7ba9b54f 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -159,16 +159,16 @@ __cold void mdbx_rthc_remove(mdbx_thread_key_t key) { /* Allocate an IDL. * Allocates memory for an IDL of the given size. * Returns IDL on success, NULL on failure. 
*/ -static MDBX_IDL mdbx_midl_alloc(unsigned size) { +static MDBX_IDL mdbx_midl_alloc(size_t size) { MDBX_IDL ids = malloc((size + 2) * sizeof(pgno_t)); if (likely(ids)) { - *ids++ = size; + *ids++ = (pgno_t)size; *ids = 0; } return ids; } -static MDBX_TXL mdbx_txl_alloc(unsigned size) { +static MDBX_TXL mdbx_txl_alloc(size_t size) { MDBX_TXL ptr = malloc((size + 2) * sizeof(txnid_t)); if (likely(ptr)) { *ptr++ = size; @@ -247,21 +247,21 @@ static void mdbx_midl_shrink(MDBX_IDL *idp) { /* Grow an IDL. * Return the IDL to the size growed by given number. * [in,out] idp Address of the IDL to grow. */ -static int mdbx_midl_grow(MDBX_IDL *idp, unsigned num) { +static int mdbx_midl_grow(MDBX_IDL *idp, size_t num) { MDBX_IDL idn = *idp - 1; /* grow it */ idn = realloc(idn, (*idn + num + 2) * sizeof(pgno_t)); if (unlikely(!idn)) return MDBX_ENOMEM; - *idn++ += num; + *idn++ += (pgno_t)num; *idp = idn; return 0; } -static int mdbx_txl_grow(MDBX_TXL *ptr, unsigned num) { +static int mdbx_txl_grow(MDBX_TXL *ptr, size_t num) { MDBX_TXL list = *ptr - 1; /* grow it */ - list = realloc(list, (*list + num + 2) * sizeof(txnid_t)); + list = realloc(list, ((size_t)*list + num + 2) * sizeof(txnid_t)); if (unlikely(!list)) return MDBX_ENOMEM; *list++ += num; @@ -273,7 +273,7 @@ static int mdbx_txl_grow(MDBX_TXL *ptr, unsigned num) { * [in,out] idp Address of the IDL. * [in] num Number of elements to make room for. * Returns 0 on success, MDBX_ENOMEM on failure. */ -static int mdbx_midl_need(MDBX_IDL *idp, unsigned num) { +static int mdbx_midl_need(MDBX_IDL *idp, size_t num) { MDBX_IDL ids = *idp; num += ids[0]; if (unlikely(num > ids[-1])) { @@ -281,7 +281,7 @@ static int mdbx_midl_need(MDBX_IDL *idp, unsigned num) { ids = realloc(ids - 1, num * sizeof(pgno_t)); if (unlikely(!ids)) return MDBX_ENOMEM; - *ids++ = num - 2; + *ids++ = (pgno_t)num - 2; *idp = ids; } return 0; @@ -308,7 +308,7 @@ static int mdbx_txl_append(MDBX_TXL *ptr, txnid_t id) { MDBX_TXL list = *ptr; /* Too big? 
*/ if (unlikely(list[0] >= list[-1])) { - if (mdbx_txl_grow(ptr, list[0])) + if (mdbx_txl_grow(ptr, (size_t)list[0])) return MDBX_ENOMEM; list = *ptr; } @@ -338,11 +338,11 @@ static int mdbx_txl_append_list(MDBX_TXL *ptr, MDBX_TXL append) { MDBX_TXL list = *ptr; /* Too big? */ if (unlikely(list[0] + append[0] >= list[-1])) { - if (mdbx_txl_grow(ptr, append[0])) + if (mdbx_txl_grow(ptr, (size_t)append[0])) return MDBX_ENOMEM; list = *ptr; } - memcpy(&list[list[0] + 1], &append[1], append[0] * sizeof(txnid_t)); + memcpy(&list[list[0] + 1], &append[1], (size_t)append[0] * sizeof(txnid_t)); list[0] += append[0]; return 0; } @@ -352,7 +352,7 @@ static int mdbx_txl_append_list(MDBX_TXL *ptr, MDBX_TXL append) { * [in] id The lowest ID to append. * [in] n Number of IDs to append. * Returns 0 on success, MDBX_ENOMEM if the IDL is too large. */ -static int mdbx_midl_append_range(MDBX_IDL *idp, pgno_t id, unsigned n) { +static int mdbx_midl_append_range(MDBX_IDL *idp, pgno_t id, size_t n) { pgno_t *ids = *idp, len = ids[0]; /* Too big? 
*/ if (unlikely(len + n > ids[-1])) { @@ -360,7 +360,7 @@ static int mdbx_midl_append_range(MDBX_IDL *idp, pgno_t id, unsigned n) { return MDBX_ENOMEM; ids = *idp; } - ids[0] = len + n; + ids[0] = len + (pgno_t)n; ids += len; while (n) ids[n--] = id++; @@ -605,7 +605,7 @@ static void mdbx_env_close0(MDBX_env *env); static MDBX_node *mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, int *exactp); static int mdbx_node_add(MDBX_cursor *mc, indx_t indx, MDBX_val *key, MDBX_val *data, pgno_t pgno, unsigned flags); -static void mdbx_node_del(MDBX_cursor *mc, int ksize); +static void mdbx_node_del(MDBX_cursor *mc, size_t ksize); static void mdbx_node_shrink(MDBX_page *mp, indx_t indx); static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft); static int mdbx_node_read(MDBX_cursor *mc, MDBX_node *leaf, MDBX_val *data); @@ -923,18 +923,16 @@ static void mdbx_cursor_chk(MDBX_cursor *mc) { static void mdbx_audit(MDBX_txn *txn) { MDBX_cursor mc; MDBX_val key, data; - pgno_t freecount, count; - MDBX_dbi i; int rc; - freecount = 0; + pgno_t freecount = 0; mdbx_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdbx_cursor_get(&mc, &key, &data, MDBX_NEXT)) == 0) freecount += *(pgno_t *)data.iov_base; mdbx_tassert(txn, rc == MDBX_NOTFOUND); - count = 0; - for (i = 0; i < txn->mt_numdbs; i++) { + pgno_t count = 0; + for (MDBX_dbi i = 0; i < txn->mt_numdbs; i++) { MDBX_xcursor mx; if (!(txn->mt_dbflags[i] & DB_VALID)) continue; @@ -946,10 +944,8 @@ static void mdbx_audit(MDBX_txn *txn) { if (txn->mt_dbs[i].md_flags & MDBX_DUPSORT) { rc = mdbx_page_search(&mc, NULL, MDBX_PS_FIRST); for (; rc == MDBX_SUCCESS; rc = mdbx_cursor_sibling(&mc, 1)) { - unsigned j; - MDBX_page *mp; - mp = mc.mc_pg[mc.mc_top]; - for (j = 0; j < NUMKEYS(mp); j++) { + MDBX_page *mp = mc.mc_pg[mc.mc_top]; + for (unsigned j = 0; j < NUMKEYS(mp); j++) { MDBX_node *leaf = NODEPTR(mp, j); if (leaf->mn_flags & F_SUBDATA) { MDBX_db db; @@ -1195,7 +1191,7 @@ mark_done: return rc; } -static int 
mdbx_page_flush(MDBX_txn *txn, size_t keep); +static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep); /* Spill pages from the dirty list back to disk. * This is intended to prevent running into MDBX_TXN_FULL situations, @@ -1239,7 +1235,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { return MDBX_SUCCESS; /* Estimate how much space this op will take */ - size_t i = m0->mc_db->md_depth; + pgno_t i = m0->mc_db->md_depth; /* Named DBs also dirty the main DB */ if (m0->mc_dbi >= CORE_DBS) i += txn->mt_dbs[MAIN_DBI].md_depth; @@ -1247,7 +1243,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { if (key) i += bytes2pgno(txn->mt_env, LEAFSIZE(key, data) + txn->mt_env->me_psize); i += i; /* double it for good measure */ - size_t need = i; + pgno_t need = i; if (txn->mt_dirtyroom > i) return MDBX_SUCCESS; @@ -1259,7 +1255,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { } else { /* purge deleted slots */ MDBX_IDL sl = txn->mt_spill_pages; - unsigned num = sl[0], j = 0; + pgno_t num = sl[0], j = 0; for (i = 1; i <= num; i++) { if (!(sl[i] & 1)) sl[++j] = sl[i]; @@ -1268,7 +1264,7 @@ static int mdbx_page_spill(MDBX_cursor *m0, MDBX_val *key, MDBX_val *data) { } /* Preserve pages which may soon be dirtied again */ - int rc = mdbx_pages_xkeep(m0, P_DIRTY, 1); + int rc = mdbx_pages_xkeep(m0, P_DIRTY, true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -1684,7 +1680,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (flags & MDBX_LIFORECLAIM) { if (txn->mt_lifo_reclaimed) { - for (j = txn->mt_lifo_reclaimed[0]; j > 0; --j) + for (j = (unsigned)txn->mt_lifo_reclaimed[0]; j > 0; --j) if (txn->mt_lifo_reclaimed[j] == last) break; if (j) @@ -2896,7 +2892,8 @@ again: head_room = mop_len - total_room; if (head_room > maxfree_1pg && head_id > 1) { /* Overflow multi-page for part of me_pghead */ - head_room /= head_id; /* amortize page sizes */ + head_room 
/= (head_id < INT16_MAX) ? (pgno_t)head_id + : INT16_MAX; /* amortize page sizes */ head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); } else if (head_room < 0) { /* Rare case, not bothering to delete this record */ @@ -2989,7 +2986,7 @@ again: data.iov_base = mop -= len; save = mop[0]; - mop[0] = len; + mop[0] = (pgno_t)len; rc = mdbx_cursor_put(&mc, &key, &data, MDBX_CURRENT); mdbx_tassert( txn, cleanup_idx == @@ -3015,7 +3012,7 @@ bailout: * records. */ cleanup_idx = 0; /* LY: restart filling */ - refill_idx = total_room = head_room = 0; + total_room = head_room = refill_idx = 0; more = 1; goto again; } @@ -3033,11 +3030,11 @@ bailout: * [in] txn the transaction that's being committed * [in] keep number of initial pages in dirtylist to keep dirty. * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_flush(MDBX_txn *txn, size_t keep) { +static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) { MDBX_env *env = txn->mt_env; MDBX_ID2L dl = txn->mt_rw_dirtylist; - unsigned j; - int i, pagecount = dl[0].mid, rc; + unsigned i, j, pagecount = dl[0].mid; + int rc; size_t size = 0, pos = 0; pgno_t pgno = 0; MDBX_page *dp = NULL; @@ -3609,7 +3606,10 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const meta2 = METAPAGE(env, 2); MDBX_meta *const head = mdbx_meta_head(env); - const size_t prev_mapsize = head->mm_mapsize; + mdbx_assert(env, head->mm_mapsize < MAX_MAPSIZE); + STATIC_ASSERT(SSIZE_MAX > MAX_MAPSIZE); + const size_t prev_mapsize = (size_t)head->mm_mapsize; + const size_t used_size = pgno2bytes(env, pending->mm_last_pg + 1); mdbx_assert(env, mdbx_meta_eq_mask(env) == 0); @@ -3806,11 +3806,17 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); if (unlikely(pending->mm_mapsize < prev_mapsize)) { mdbx_assert(env, pending->mm_mapsize == env->me_mapsize); - rc = mdbx_ftruncate(env->me_fd, pending->mm_mapsize); + if (pending->mm_mapsize > 
MAX_MAPSIZE) { + rc = MDBX_PROBLEM; + goto fail; + } + const size_t mapsize = (size_t)pending->mm_mapsize; + mdbx_assert(env, pending->mm_mapsize == mapsize); + + rc = mdbx_ftruncate(env->me_fd, mapsize); if (unlikely(rc != MDBX_SUCCESS)) goto fail; - rc = mdbx_mremap_size((void **)&env->me_map, prev_mapsize, - pending->mm_mapsize); + rc = mdbx_mremap_size((void **)&env->me_map, prev_mapsize, mapsize); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } @@ -3828,26 +3834,22 @@ int __cold mdbx_env_get_maxkeysize(MDBX_env *env) { return env->me_maxkey_limit; } -static __inline ssize_t mdbx_calc_nodemax(ssize_t pagesize) { - assert(pagesize > 0); - return (((pagesize - PAGEHDRSZ) / MDBX_MINKEYS) & -(ssize_t)2) - - sizeof(indx_t); -} +#define mdbx_nodemax(pagesize) \ + (((((pagesize)-PAGEHDRSZ) / MDBX_MINKEYS) & -(ssize_t)2) - sizeof(indx_t)) -static __inline ssize_t mdbx_calc_maxkey(ssize_t nodemax) { - assert(nodemax > 0); - return nodemax - (NODESIZE + sizeof(MDBX_db)); -} +#define mdbx_maxkey(nodemax) ((nodemax) - (NODESIZE + sizeof(MDBX_db))) + +#define mdbx_maxfree1pg(pagesize) (((pagesize)-PAGEHDRSZ) / sizeof(pgno_t) - 1) int mdbx_get_maxkeysize(size_t pagesize) { if (pagesize == 0) pagesize = mdbx_syspagesize(); - ssize_t nodemax = mdbx_calc_nodemax(pagesize); + ssize_t nodemax = mdbx_nodemax(pagesize); if (nodemax < 0) return -MDBX_EINVAL; - ssize_t maxkey = mdbx_calc_maxkey(nodemax); + ssize_t maxkey = mdbx_maxkey(nodemax); return (maxkey > 0 && maxkey < INT_MAX) ? 
(int)maxkey : -MDBX_EINVAL; } @@ -3856,12 +3858,27 @@ static void __cold mdbx_setup_pagesize(MDBX_env *env, size_t pagesize) { mdbx_ensure(env, pagesize >= MIN_PAGESIZE); mdbx_ensure(env, pagesize <= MAX_PAGESIZE); - env->me_psize = pagesize; - env->me_maxfree_1pg = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - env->me_nodemax = mdbx_calc_nodemax(pagesize); - env->me_maxkey_limit = mdbx_calc_maxkey(env->me_nodemax); - mdbx_assert(env, - env->me_maxkey_limit > 42 && env->me_maxkey_limit < pagesize); + env->me_psize = (unsigned)pagesize; + + STATIC_ASSERT(mdbx_maxfree1pg(MIN_PAGESIZE) > 42); + STATIC_ASSERT(mdbx_maxfree1pg(MAX_PAGESIZE) < MDBX_IDL_DB_MAX); + const ssize_t maxfree_1pg = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + mdbx_ensure(env, maxfree_1pg > 42 && maxfree_1pg < MDBX_IDL_DB_MAX); + env->me_maxfree_1pg = (unsigned)maxfree_1pg; + + STATIC_ASSERT(mdbx_nodemax(MIN_PAGESIZE) > 42); + STATIC_ASSERT(mdbx_nodemax(MAX_PAGESIZE) < UINT16_MAX); + const ssize_t nodemax = mdbx_nodemax(pagesize); + mdbx_ensure(env, nodemax > 42 && nodemax < UINT16_MAX); + env->me_nodemax = (unsigned)nodemax; + + STATIC_ASSERT(mdbx_maxkey(MIN_PAGESIZE) > 42); + STATIC_ASSERT(mdbx_maxkey(MIN_PAGESIZE) < MIN_PAGESIZE); + STATIC_ASSERT(mdbx_maxkey(MAX_PAGESIZE) > 42); + STATIC_ASSERT(mdbx_maxkey(MAX_PAGESIZE) < MAX_PAGESIZE); + const ssize_t maxkey_limit = mdbx_maxkey(env->me_nodemax); + mdbx_ensure(env, maxkey_limit > 42 && (size_t)maxkey_limit < pagesize); + env->me_maxkey_limit = (unsigned)maxkey_limit; env->me_psize2log = 0; while (pagesize > 1) { @@ -3884,12 +3901,13 @@ int __cold mdbx_env_create(MDBX_env **penv) { env->me_pid = mdbx_getpid(); int rc; - env->me_os_psize = mdbx_syspagesize(); - if (!is_power2(env->me_os_psize) || env->me_os_psize < MIN_PAGESIZE) { - mdbx_error("unsuitable system pageize %u", env->me_os_psize); + const size_t os_psize = mdbx_syspagesize(); + if (!is_power2(os_psize) || os_psize < MIN_PAGESIZE) { + mdbx_error("unsuitable system pageize %" 
PRIuPTR, os_psize); rc = MDBX_INCOMPATIBLE; goto bailout; } + env->me_os_psize = (unsigned)os_psize; mdbx_setup_pagesize(env, env->me_os_psize); rc = mdbx_fastmutex_init(&env->me_dbi_lock); @@ -4261,23 +4279,22 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { return err; size = wanna; } - } - - if ((size & (env->me_os_psize - 1)) || size < env->me_os_psize) { + } else if (size > SSIZE_MAX || (size & (env->me_os_psize - 1)) || + size < env->me_os_psize) { mdbx_notice("lck-file has invalid size %" PRIu64 " bytes", size); return MDBX_PROBLEM; } - const uint64_t maxreaders = - (size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader) + 1; + const size_t maxreaders = + ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader) + 1; if (maxreaders > UINT16_MAX) { - mdbx_error("lck-size too big (up to %" PRIu64 " readers)", maxreaders); + mdbx_error("lck-size too big (up to %" PRIuPTR " readers)", maxreaders); return MDBX_PROBLEM; } env->me_maxreaders = (unsigned)maxreaders; void *addr = NULL; - err = mdbx_mmap(&addr, size, true, env->me_lfd); + err = mdbx_mmap(&addr, (size_t)size, true, env->me_lfd); if (unlikely(err != MDBX_SUCCESS)) return err; assert(addr != nullptr); @@ -4304,7 +4321,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { if (rc == MDBX_RESULT_TRUE) { /* LY: exlcusive mode, init lck */ - memset(env->me_lck, 0, size); + memset(env->me_lck, 0, (size_t)size); err = mdbx_lck_init(env); if (err) return err; @@ -4344,9 +4361,6 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, mode_t mode, int *exclusive) { - int oflags, rc, len; - char *lck_pathname, *dxb_pathname; - if (unlikely(!env || !path)) return MDBX_EINVAL; @@ -4357,16 +4371,17 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, (flags & ~(CHANGEABLE | CHANGELESS))) return MDBX_EINVAL; - len = 
strlen(path); + size_t len_full, len = strlen(path); if (flags & MDBX_NOSUBDIR) { - rc = len + sizeof(MDBX_LOCK_SUFFIX) + len + 1; + len_full = len + sizeof(MDBX_LOCK_SUFFIX) + len + 1; } else { - rc = len + sizeof(MDBX_LOCKNAME) + len + sizeof(MDBX_DATANAME); + len_full = len + sizeof(MDBX_LOCKNAME) + len + sizeof(MDBX_DATANAME); } - lck_pathname = malloc(rc); + char *lck_pathname = malloc(len_full); if (!lck_pathname) return MDBX_ENOMEM; + char *dxb_pathname; if (flags & MDBX_NOSUBDIR) { dxb_pathname = lck_pathname + len + sizeof(MDBX_LOCK_SUFFIX); sprintf(lck_pathname, "%s" MDBX_LOCK_SUFFIX, path); @@ -4377,7 +4392,7 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, sprintf(dxb_pathname, "%s" MDBX_DATANAME, path); } - rc = MDBX_SUCCESS; + int rc = MDBX_SUCCESS; flags |= env->me_flags; if (flags & MDBX_RDONLY) { /* LY: silently ignore irrelevant flags when @@ -4403,6 +4418,7 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, } env->me_dbxs[FREE_DBI].md_cmp = mdbx_cmp_int_ai; /* aligned MDBX_INTEGERKEY */ + int oflags; if (F_ISSET(flags, MDBX_RDONLY)) oflags = O_RDONLY; else @@ -5970,7 +5986,6 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, uint16_t fp_flags; MDBX_val xdata, *rdata, dkey, olddata; MDBX_db dummy; - int do_sub = 0, insert_key, insert_data; unsigned mcount = 0, dcount = 0, nospill; size_t nsize; int rc = MDBX_SUCCESS, rc2; @@ -5987,10 +6002,12 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, /* Check this first so counter will always be zero on any early failures. 
*/ if (flags & MDBX_MULTIPLE) { - dcount = data[1].iov_len; - data[1].iov_len = 0; if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDBX_DUPFIXED))) return MDBX_INCOMPATIBLE; + if (unlikely(data[1].iov_len >= INT_MAX)) + return MDBX_EINVAL; + dcount = (unsigned)data[1].iov_len; + data[1].iov_len = 0; } if (flags & MDBX_RESERVE) { @@ -6137,7 +6154,8 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return rc2; } - insert_key = insert_data = rc; + bool insert_key, insert_data, do_sub = false; + insert_key = insert_data = (rc != MDBX_SUCCESS); if (insert_key) { /* The key does not exist */ mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); @@ -6285,7 +6303,8 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, xdata.iov_base = &dummy; if ((rc = mdbx_page_alloc(mc, 1, &mp, MDBX_ALLOC_ALL))) return rc; - offset = env->me_psize - olddata.iov_len; + mdbx_cassert(mc, env->me_psize > olddata.iov_len); + offset = env->me_psize - (unsigned)olddata.iov_len; flags |= F_DUPDATA | F_SUBDATA; dummy.md_root = mp->mp_pgno; sub_root = mp; @@ -6308,7 +6327,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rdata = &xdata; flags |= F_DUPDATA; - do_sub = 1; + do_sub = true; if (!insert_key) mdbx_node_del(mc, 0); goto new_sub; @@ -6388,14 +6407,16 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, else if (!(mc->mc_flags & C_SUB)) memcpy(olddata.iov_base, data->iov_base, data->iov_len); else { - assert(NUMKEYS(mc->mc_pg[mc->mc_top]) == 1); - assert(mc->mc_pg[mc->mc_top]->mp_upper == - mc->mc_pg[mc->mc_top]->mp_lower); - assert(IS_LEAF(mc->mc_pg[mc->mc_top]) && - !IS_LEAF2(mc->mc_pg[mc->mc_top])); - assert(NODEDSZ(leaf) == 0); - assert(leaf->mn_flags == 0); - memcpy(NODEKEY(leaf), key->iov_base, leaf->mn_ksize = key->iov_len); + mdbx_cassert(mc, NUMKEYS(mc->mc_pg[mc->mc_top]) == 1); + mdbx_cassert(mc, mc->mc_pg[mc->mc_top]->mp_upper == + mc->mc_pg[mc->mc_top]->mp_lower); + mdbx_cassert(mc, 
IS_LEAF(mc->mc_pg[mc->mc_top]) && + !IS_LEAF2(mc->mc_pg[mc->mc_top])); + mdbx_cassert(mc, NODEDSZ(leaf) == 0); + mdbx_cassert(mc, leaf->mn_flags == 0); + mdbx_cassert(mc, key->iov_len < UINT16_MAX); + leaf->mn_ksize = (uint16_t)key->iov_len; + memcpy(NODEKEY(leaf), key->iov_base, key->iov_len); assert((char *)NODEDATA(leaf) + NODEDSZ(leaf) < (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); goto fix_parent; @@ -6496,7 +6517,8 @@ new_sub: } } } - ecount = mc->mc_xcursor->mx_db.md_entries; + mdbx_cassert(mc, mc->mc_xcursor->mx_db.md_entries < SIZE_MAX); + ecount = (size_t)mc->mc_xcursor->mx_db.md_entries; if (flags & MDBX_APPENDDUP) xflags |= MDBX_APPEND; rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); @@ -6504,7 +6526,7 @@ new_sub: void *db = NODEDATA(leaf); memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } - insert_data = mc->mc_xcursor->mx_db.md_entries - ecount; + insert_data = (ecount != (size_t)mc->mc_xcursor->mx_db.md_entries); } /* Increment count unless we just replaced an existing item. */ if (insert_data) @@ -6524,7 +6546,7 @@ new_sub: data[1].iov_len = mcount; if (mcount < dcount) { data[0].iov_base = (char *)data[0].iov_base + data[0].iov_len; - insert_key = insert_data = 0; + insert_key = insert_data = false; goto more; } } @@ -6765,7 +6787,6 @@ static int mdbx_node_add(MDBX_cursor *mc, indx_t indx, MDBX_val *key, unsigned i; size_t node_size = NODESIZE; ssize_t room; - unsigned ofs; MDBX_node *node; MDBX_page *mp = mc->mc_pg[mc->mc_top]; MDBX_page *ofp = NULL; /* overflow page */ @@ -6783,17 +6804,19 @@ static int mdbx_node_add(MDBX_cursor *mc, indx_t indx, MDBX_val *key, if (IS_LEAF2(mp)) { mdbx_cassert(mc, key); /* Move higher keys up one slot. 
*/ - int ksize = mc->mc_db->md_xsize, dif; - char *ptr = LEAF2KEY(mp, indx, ksize); - dif = NUMKEYS(mp) - indx; - if (dif > 0) - memmove(ptr + ksize, ptr, dif * ksize); + const int ksize = mc->mc_db->md_xsize; + char *const ptr = LEAF2KEY(mp, indx, ksize); + const int diff = NUMKEYS(mp) - indx; + if (diff > 0) + memmove(ptr + ksize, ptr, diff * ksize); /* insert new key */ memcpy(ptr, key->iov_base, ksize); /* Just using these for counting */ + mdbx_cassert(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); mp->mp_lower += sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); + mdbx_cassert(mc, mp->mp_upper >= ksize - sizeof(indx_t)); + mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); return MDBX_SUCCESS; } @@ -6807,7 +6830,7 @@ static int mdbx_node_add(MDBX_cursor *mc, indx_t indx, MDBX_val *key, node_size += sizeof(pgno_t); } else if (unlikely(node_size + data->iov_len > mc->mc_txn->mt_env->me_nodemax)) { - unsigned ovpages = OVPAGES(mc->mc_txn->mt_env, data->iov_len); + pgno_t ovpages = OVPAGES(mc->mc_txn->mt_env, data->iov_len); int rc; /* Put data on overflow page. */ mdbx_debug("data size is %" PRIuPTR ", node would be %" PRIuPTR @@ -6835,8 +6858,9 @@ update: mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; /* Adjust free space offsets. */ - ofs = mp->mp_upper - node_size; + size_t ofs = mp->mp_upper - node_size; mdbx_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t)); + mdbx_cassert(mc, ofs <= UINT16_MAX); mp->mp_ptrs[indx] = (uint16_t)ofs; mp->mp_upper = (uint16_t)ofs; mp->mp_lower += sizeof(indx_t); @@ -6888,10 +6912,9 @@ full: * [in] mc Cursor pointing to the node to delete. * [in] ksize The size of a node. Only used if the page is * part of a MDBX_DUPFIXED database. 
*/ -static void mdbx_node_del(MDBX_cursor *mc, int ksize) { +static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; indx_t indx = mc->mc_ki[mc->mc_top]; - unsigned sz; indx_t i, j, numkeys, ptr; MDBX_node *node; char *base; @@ -6902,17 +6925,21 @@ static void mdbx_node_del(MDBX_cursor *mc, int ksize) { mdbx_cassert(mc, indx < numkeys); if (IS_LEAF2(mp)) { - int x = numkeys - 1 - indx; + mdbx_cassert(mc, ksize >= sizeof(indx_t)); + unsigned diff = numkeys - 1 - indx; base = LEAF2KEY(mp, indx, ksize); - if (x) - memmove(base, base + ksize, x * ksize); + if (diff) + memmove(base, base + ksize, diff * ksize); + mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); - mp->mp_upper += ksize - sizeof(indx_t); + mdbx_cassert(mc, + (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); + mp->mp_upper += (indx_t)(ksize - sizeof(indx_t)); return; } node = NODEPTR(mp, indx); - sz = NODESIZE + node->mn_ksize; + size_t sz = NODESIZE + node->mn_ksize; if (IS_LEAF(mp)) { if (F_ISSET(node->mn_flags, F_BIGDATA)) sz += sizeof(pgno_t); @@ -6925,8 +6952,10 @@ static void mdbx_node_del(MDBX_cursor *mc, int ksize) { for (i = j = 0; i < numkeys; i++) { if (i != indx) { mp->mp_ptrs[j] = mp->mp_ptrs[i]; - if (mp->mp_ptrs[i] < ptr) - mp->mp_ptrs[j] += sz; + if (mp->mp_ptrs[i] < ptr) { + mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_ptrs[j] >= sz); + mp->mp_ptrs[j] += (indx_t)sz; + } j++; } } @@ -6934,8 +6963,10 @@ static void mdbx_node_del(MDBX_cursor *mc, int ksize) { base = (char *)mp + mp->mp_upper + PAGEHDRSZ; memmove(base + sz, base, ptr - mp->mp_upper); + mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); - mp->mp_upper += sz; + mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_upper >= sz); + mp->mp_upper += (indx_t)sz; } /* Compact the main page after deleting a node on a subpage. 
@@ -6945,7 +6976,7 @@ static void mdbx_node_shrink(MDBX_page *mp, indx_t indx) { MDBX_node *node; MDBX_page *sp, *xp; char *base; - unsigned nsize, delta, len, ptr; + size_t nsize, delta, len, ptr; int i; node = NODEPTR(mp, indx); @@ -6960,8 +6991,10 @@ static void mdbx_node_shrink(MDBX_page *mp, indx_t indx) { return; /* do not make the node uneven-sized */ } else { xp = (MDBX_page *)((char *)sp + delta); /* destination subpage */ - for (i = NUMKEYS(sp); --i >= 0;) - xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta; + for (i = NUMKEYS(sp); --i >= 0;) { + assert(sp->mp_ptrs[i] >= delta); + xp->mp_ptrs[i] = (indx_t)(sp->mp_ptrs[i] - delta); + } len = PAGEHDRSZ; } sp->mp_upper = sp->mp_lower; @@ -6974,10 +7007,13 @@ static void mdbx_node_shrink(MDBX_page *mp, indx_t indx) { ptr = mp->mp_ptrs[indx]; for (i = NUMKEYS(mp); --i >= 0;) { - if (mp->mp_ptrs[i] <= ptr) - mp->mp_ptrs[i] += delta; + if (mp->mp_ptrs[i] <= ptr) { + assert((size_t)UINT16_MAX - mp->mp_ptrs[i] >= delta); + mp->mp_ptrs[i] += (indx_t)delta; + } } - mp->mp_upper += delta; + assert((size_t)UINT16_MAX - mp->mp_upper >= delta); + mp->mp_upper += (indx_t)delta; } /* Initial setup of a sorted-dups cursor. @@ -7049,7 +7085,7 @@ static void mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node) { mdbx_debug("Sub-db -%u root page %" PRIaPGNO "", mx->mx_cursor.mc_dbi, mx->mx_db.md_root); mx->mx_dbflag = DB_VALID | DB_USRVALID | DB_DUPDATA; - /* #if UINT_MAX < SIZE_MAX + /* FIXME: #if UINT_MAX < SIZE_MAX if (mx->mx_dbx.md_cmp == mdbx_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) mx->mx_dbx.md_cmp = mdbx_cmp_clong; @@ -7217,9 +7253,9 @@ int mdbx_cursor_count(MDBX_cursor *mc, size_t *countp) { if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); - *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > INT_MAX) - ? INT_MAX - : mc->mc_xcursor->mx_db.md_entries; + *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > SIZE_MAX) + ? 
SIZE_MAX + : (size_t)mc->mc_xcursor->mx_db.md_entries; } } return MDBX_SUCCESS; @@ -8079,21 +8115,19 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, pgno_t newpgno, unsigned nflags) { unsigned flags; int rc = MDBX_SUCCESS, new_root = 0, did_split = 0; - indx_t newindx; pgno_t pgno = 0; - int i, j, split_indx, nkeys, pmax; + unsigned i, ptop; MDBX_env *env = mc->mc_txn->mt_env; MDBX_node *node; MDBX_val sepkey, rkey, xdata, *rdata = &xdata; MDBX_page *copy = NULL; - MDBX_page *mp, *rp, *pp; - int ptop; + MDBX_page *rp, *pp; MDBX_cursor mn; DKBUF; - mp = mc->mc_pg[mc->mc_top]; - newindx = mc->mc_ki[mc->mc_top]; - nkeys = NUMKEYS(mp); + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + unsigned newindx = mc->mc_ki[mc->mc_top]; + unsigned nkeys = NUMKEYS(mp); mdbx_debug("-----> splitting %s page %" PRIaPGNO " and adding [%s] at index %i/%i", @@ -8147,6 +8181,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, mn.mc_pg[mn.mc_top] = rp; mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1; + unsigned split_indx; if (nflags & MDBX_APPEND) { mn.mc_ki[mn.mc_top] = 0; sepkey = *newkey; @@ -8176,27 +8211,32 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, sepkey.iov_base = split; } if (x < 0) { + mdbx_cassert(mc, ksize >= sizeof(indx_t)); ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize); memcpy(rp->mp_ptrs, split, rsize); sepkey.iov_base = rp->mp_ptrs; memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); memcpy(ins, newkey->iov_base, ksize); + mdbx_cassert(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); mp->mp_lower += sizeof(indx_t); - mp->mp_upper -= ksize - sizeof(indx_t); + mdbx_cassert(mc, mp->mp_upper >= ksize - sizeof(indx_t)); + mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); } else { if (x) memcpy(rp->mp_ptrs, split, x * ksize); ins = LEAF2KEY(rp, x, ksize); memcpy(ins, newkey->iov_base, ksize); memcpy(ins + ksize, split + x * ksize, rsize - x * ksize); + 
mdbx_cassert(mc, UINT16_MAX - rp->mp_lower >= (int)sizeof(indx_t)); rp->mp_lower += sizeof(indx_t); - rp->mp_upper -= ksize - sizeof(indx_t); + mdbx_cassert(mc, rp->mp_upper >= ksize - sizeof(indx_t)); + rp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); mc->mc_ki[mc->mc_top] = x; } } else { - int psize, nsize, k; + size_t psize, nsize, k; /* Maximum free space in an empty page */ - pmax = env->me_psize - PAGEHDRSZ; + unsigned pmax = env->me_psize - PAGEHDRSZ; if (IS_LEAF(mp)) nsize = mdbx_leaf_size(env, newkey, newdata); else @@ -8215,10 +8255,9 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, copy->mp_upper = env->me_psize - PAGEHDRSZ; /* prepare to insert */ - for (i = 0, j = 0; i < nkeys; i++) { - if (i == newindx) { + for (unsigned j = i = 0; i < nkeys; i++) { + if (i == newindx) copy->mp_ptrs[j++] = 0; - } copy->mp_ptrs[j++] = mp->mp_ptrs[i]; } @@ -8237,19 +8276,20 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, * the split so the new page is emptier than the old page. * This yields better packing during sequential inserts. */ + int dir; if (nkeys < 20 || nsize > pmax / 16 || newindx >= nkeys) { /* Find split point */ psize = 0; if (newindx <= split_indx || newindx >= nkeys) { i = 0; - j = 1; - k = newindx >= nkeys ? nkeys : split_indx + 1 + IS_LEAF(mp); + dir = 1; + k = (newindx >= nkeys) ? nkeys : split_indx + 1 + IS_LEAF(mp); } else { i = nkeys; - j = -1; + dir = -1; k = split_indx - 1; } - for (; i != k; i += j) { + for (; i != k; i += dir) { if (i == newindx) { psize += nsize; node = NULL; @@ -8264,8 +8304,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, } psize = EVEN(psize); } - if (psize > pmax || i == k - j) { - split_indx = i + (j < 0); + if (psize > pmax || i == k - dir) { + split_indx = i + (dir < 0); break; } } @@ -8297,9 +8337,9 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, goto done; /* root split? 
*/ - if (mc->mc_snum > snum) { + if (mc->mc_snum > snum) ptop++; - } + /* Right page might now have changed parent. * Check if left page also changed parent. */ if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && @@ -8341,7 +8381,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, /* Move nodes */ mc->mc_pg[mc->mc_top] = rp; i = split_indx; - j = 0; + indx_t n = 0; do { if (i == newindx) { rkey.iov_base = newkey->iov_base; @@ -8352,7 +8392,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, pgno = newpgno; flags = nflags; /* Update index for the new key. */ - mc->mc_ki[mc->mc_top] = j; + mc->mc_ki[mc->mc_top] = n; } else { node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ); rkey.iov_base = NODEKEY(node); @@ -8366,21 +8406,21 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, flags = node->mn_flags; } - if (!IS_LEAF(mp) && j == 0) { + if (!IS_LEAF(mp) && n == 0) { /* First branch index doesn't need key data. 
*/ rkey.iov_len = 0; } - rc = mdbx_node_add(mc, j, &rkey, rdata, pgno, flags); + rc = mdbx_node_add(mc, n, &rkey, rdata, pgno, flags); if (rc) goto done; if (i == nkeys) { i = 0; - j = 0; + n = 0; mc->mc_pg[mc->mc_top] = copy; } else { i++; - j++; + n++; } } while (i != split_indx); @@ -8552,8 +8592,8 @@ typedef struct mdbx_copy { mdbx_condmutex_t mc_condmutex; char *mc_wbuf[2]; char *mc_over[2]; - int mc_wlen[2]; - int mc_olen[2]; + size_t mc_wlen[2]; + size_t mc_olen[2]; pgno_t mc_next_pgno; mdbx_filehandle_t mc_fd; int mc_toggle; /* Buffer number in provider */ @@ -8567,7 +8607,7 @@ typedef struct mdbx_copy { static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { mdbx_copy *my = arg; char *ptr; - int toggle = 0, wsize; + int toggle = 0; int rc; #if defined(F_SETNOSIGPIPE) @@ -8593,7 +8633,7 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { mdbx_condmutex_wait(&my->mc_condmutex); if (my->mc_new == 0 + MDBX_EOF) /* 0 buffers, just EOF */ break; - wsize = my->mc_wlen[toggle]; + size_t wsize = my->mc_wlen[toggle]; ptr = my->mc_wbuf[toggle]; again: if (wsize > 0 && !my->mc_error) { @@ -8942,14 +8982,13 @@ int __cold mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, } int __cold mdbx_env_copy(MDBX_env *env, const char *path, unsigned flags) { - int rc, len; char *lck_pathname; mdbx_filehandle_t newfd = INVALID_HANDLE_VALUE; if (env->me_flags & MDBX_NOSUBDIR) { lck_pathname = (char *)path; } else { - len = strlen(path); + size_t len = strlen(path); len += sizeof(MDBX_DATANAME); lck_pathname = malloc(len); if (!lck_pathname) @@ -8960,7 +8999,8 @@ int __cold mdbx_env_copy(MDBX_env *env, const char *path, unsigned flags) { /* The destination path must exist, but the destination file must not. * We don't want the OS to cache the writes, since the source data is * already in the OS cache. 
*/ - rc = mdbx_openfile(lck_pathname, O_WRONLY | O_CREAT | O_EXCL, 0666, &newfd); + int rc = + mdbx_openfile(lck_pathname, O_WRONLY | O_CREAT | O_EXCL, 0666, &newfd); if (rc == MDBX_SUCCESS) { if (env->me_psize >= env->me_os_psize) { #ifdef F_NOCACHE /* __APPLE__ */ @@ -9761,7 +9801,9 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { if (oldest < snap || reader < 0) { if (retry && env->me_oom_func) { /* LY: notify end of oom-loop */ - env->me_oom_func(env, 0, 0, oldest, snap - oldest, -retry); + const txnid_t gap = snap - oldest; + env->me_oom_func(env, 0, 0, oldest, + (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, -retry); } return snap; } @@ -9780,9 +9822,10 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { if (r->mr_txnid != oldest || pid <= 0) continue; - rc = env->me_oom_func( - env, pid, tid, oldest, - mdbx_meta_txnid_stable(env, mdbx_meta_head(env)) - oldest, retry); + const txnid_t gap = + mdbx_meta_txnid_stable(env, mdbx_meta_head(env)) - oldest; + rc = env->me_oom_func(env, pid, tid, oldest, + (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, retry); if (rc < 0) break; @@ -9849,11 +9892,11 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) MDBX_env *env = txn->mt_env; MDBX_meta *meta = mdbx_meta_head(env); if (percent) { - size_t maxpg = env->me_maxpg; - size_t last = meta->mm_last_pg + 1; + pgno_t maxpg = env->me_maxpg; + pgno_t last = meta->mm_last_pg + 1; if (env->me_txn) last = env->me_txn0->mt_next_pgno; - *percent = (last * 100ull + maxpg / 2) / maxpg; + *percent = (int)((last * UINT64_C(100) + maxpg / 2) / maxpg); } txnid_t lag = mdbx_meta_txnid_fluid(env, meta) - txn->mt_ro_reader->mr_txnid; return (lag > INT_MAX) ? 
INT_MAX : (int)lag; @@ -9870,7 +9913,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, pgno_t pg, int deep) { MDBX_page *mp; int rc, i, nkeys; - unsigned header_size, unused_size, payload_size, align_bytes; + size_t header_size, unused_size, payload_size, align_bytes; const char *type; if (pg == P_INVALID) @@ -10278,7 +10321,7 @@ bailout: } int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - int *values_count) { + size_t *values_count) { DKBUF; mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); @@ -10313,7 +10356,10 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { mdbx_tassert(txn, mc.mc_xcursor == &mx && (mx.mx_cursor.mc_flags & C_INITIALIZED)); - *values_count = mx.mx_db.md_entries; + *values_count = (sizeof(*values_count) >= sizeof(mx.mx_db.md_entries) || + mx.mx_db.md_entries <= SIZE_MAX) + ? (size_t)mx.mx_db.md_entries + : SIZE_MAX; } } } diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 726f5712..7c14188c 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -224,8 +224,9 @@ static uint64_t problems_pop(struct problem *list) { } static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, - const char *dbi, const char *type, int nentries, - int payload_bytes, int header_bytes, int unused_bytes) { + const char *dbi, const char *type, size_t nentries, + size_t payload_bytes, size_t header_bytes, + size_t unused_bytes) { (void)ctx; if (type) { @@ -240,13 +241,14 @@ static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, print(" %s-page %" PRIu64, type, pgno); else print(" %s-span %" PRIu64 "[%u]", type, pgno, pgnumber); - print(" of %s: header %i, payload %i, unused %i\n", dbi, header_bytes, - payload_bytes, unused_bytes); + print(" of %s: header %" PRIiPTR ", payload %" PRIiPTR + ", unused %" PRIiPTR "\n", + dbi, header_bytes, payload_bytes, unused_bytes); } walk.pgcount += pgnumber; - if 
(unused_bytes < 0 || (size_t)unused_bytes > page_size) + if (unused_bytes > page_size) problem_add("page", pgno, "illegal unused-bytes", "%u < %i < %u", 0, unused_bytes, envstat.ms_psize); diff --git a/test/utils.cc b/test/utils.cc index ae58311f..fd2162f1 100644 --- a/test/utils.cc +++ b/test/utils.cc @@ -163,9 +163,7 @@ uint64_t entropy_ticks(void) { #elif defined(_M_IX86) || defined(_M_X64) return __rdtsc(); -#endif /* __GNUC__ || __clang__ */ - -#if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) +#elif defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) LARGE_INTEGER PerformanceCount; if (QueryPerformanceCounter(&PerformanceCount)) return PerformanceCount.QuadPart; From 90b30e75849b1cc200db29fdcd928ac251eca2a9 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 5 Jun 2017 21:04:42 +0300 Subject: [PATCH 230/303] mdbx: level-4 warnings for debug-build. --- dll.vcxproj | 4 ++-- test/test.vcxproj | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dll.vcxproj b/dll.vcxproj index 44c71646..2866a857 100644 --- a/dll.vcxproj +++ b/dll.vcxproj @@ -79,7 +79,7 @@ WIN32;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions);MDBX_DEBUG=1 MultiThreadedDebugDLL - Level3 + Level4 ProgramDatabase Disabled true @@ -120,7 +120,7 @@ - Level3 + Level4 WIN64;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions);MDBX_DEBUG=1 MultiThreadedDebugDLL true diff --git a/test/test.vcxproj b/test/test.vcxproj index 6676ffc0..400090b5 100644 --- a/test/test.vcxproj +++ b/test/test.vcxproj @@ -93,7 +93,7 @@ Use - Level3 + Level4 Disabled WIN32;_DEBUG;_CONSOLE;MDBX_DEBUG=1;%(PreprocessorDefinitions) true @@ -108,7 +108,7 @@ Use - Level3 + Level4 Disabled _DEBUG;_CONSOLE;MDBX_DEBUG=1;%(PreprocessorDefinitions) true From 318646794e6a8af653a4648956147b9a36c8cd85 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 5 Jun 2017 22:46:15 +0300 Subject: [PATCH 231/303] mdbx: partial fix MSVC level-4 warnings. 
--- src/bits.h | 14 ++++++++------ src/mdbx.c | 6 +++--- test/log.cc | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/bits.h b/src/bits.h index 73bea6f5..de87d85d 100644 --- a/src/bits.h +++ b/src/bits.h @@ -66,12 +66,14 @@ #endif /* __SANITIZE_THREAD__ */ #ifdef _MSC_VER -#pragma warning(disable : 4464) /* C4464: relative include path contains '..' */ -#pragma warning(disable : 4710) /* C4710: 'xyz': function not inlined */ -#pragma warning(disable : 4711) /* C4711: function 'xyz' selected for automatic inline expansion */ -#pragma warning(disable : 4201) /* C4201: nonstandard extension used : nameless struct / union */ -#pragma warning(disable : 4706) /* C4706: assignment within conditional expression */ -#pragma warning(disable : 4127) /* C4127: conditional expression is constant */ +#pragma warning(disable : 4464) /* relative include path contains '..' */ +#pragma warning(disable : 4710) /* 'xyz': function not inlined */ +#pragma warning(disable : 4711) /* function 'xyz' selected for automatic inline expansion */ +#pragma warning(disable : 4201) /* nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4706) /* assignment within conditional expression */ +#pragma warning(disable : 4127) /* conditional expression is constant */ +#pragma warning(disable : 4324) /* 'xyz': structure was padded due to alignment specifier */ +#pragma warning(disable : 4310) /* cast truncates constant value */ #endif /* _MSC_VER (warnings) */ #include "./osal.h" diff --git a/src/mdbx.c b/src/mdbx.c index 7ba9b54f..2ff871e6 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3200,8 +3200,8 @@ int mdbx_txn_commit(MDBX_txn *txn) { parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI]; for (i = CORE_DBS; i < txn->mt_numdbs; i++) { /* preserve parent's DB_NEW status */ - x = parent->mt_dbflags[i] & DB_NEW; - parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; + parent->mt_dbflags[i] = + txn->mt_dbflags[i] | (parent->mt_dbflags[i] & DB_NEW); } 
dst = parent->mt_rw_dirtylist; @@ -7294,7 +7294,7 @@ MDBX_txn *mdbx_cursor_txn(MDBX_cursor *mc) { MDBX_dbi mdbx_cursor_dbi(MDBX_cursor *mc) { if (unlikely(!mc || mc->mc_signature != MDBX_MC_SIGNATURE)) - return INT_MIN; + return UINT_MAX; return mc->mc_dbi; } diff --git a/test/log.cc b/test/log.cc index 3c0a4b11..ebb859b6 100644 --- a/test/log.cc +++ b/test/log.cc @@ -97,10 +97,10 @@ bool output(const logging::loglevel priority, const char *format, va_list ap) { chrono::time now = chrono::now_realtime(); struct tm tm; - time_t time = now.utc; #ifdef _MSC_VER int rc = _localtime32_s(&tm, (const __time32_t *)&now.utc); #else + time_t time = now.utc; int rc = localtime_r(&time, &tm) ? MDBX_SUCCESS : errno; #endif if (rc != MDBX_SUCCESS) From 3bf3a08f9641792944b3ca8f0845269e08d6b5d0 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 6 Jun 2017 16:42:21 +0300 Subject: [PATCH 232/303] mdbx: fix write-txn thread cleanup. --- src/mdbx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 2ff871e6..14ae7d45 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2621,6 +2621,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { env->me_pglast = 0; env->me_txn = NULL; + txn->mt_signature = 0; mode = 0; /* txn == env->me_txn0, do not free() it */ /* The writer mutex was locked in mdbx_txn_begin. */ @@ -2638,9 +2639,9 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { } if (mode & MDBX_END_FREE) { + mdbx_ensure(env, txn != env->me_txn0); txn->mt_signature = 0; - if (txn != env->me_txn0) - free(txn); + free(txn); } return MDBX_SUCCESS; From 465459dc58b2ee2ae2976c4bbfc0ce44c156374a Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 6 Jun 2017 17:05:30 +0300 Subject: [PATCH 233/303] mdbx: add MDBX_txn.mt_owner and MDBX_THREAD_MISMATCH. 
--- libmdbx.files | 1 + mdbx.h | 8 +++-- src/bits.h | 1 + src/mdbx.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++--- src/osal.c | 8 ----- src/osal.h | 9 ++++- 6 files changed, 104 insertions(+), 16 deletions(-) diff --git a/libmdbx.files b/libmdbx.files index 3f51a9b5..b6b1d7a2 100644 --- a/libmdbx.files +++ b/libmdbx.files @@ -2,6 +2,7 @@ AUTHORS LICENSE Makefile README.md +TODO.md mdbx.h src/bits.h src/defs.h diff --git a/mdbx.h b/mdbx.h index f696d0a3..ec38193c 100644 --- a/mdbx.h +++ b/mdbx.h @@ -414,10 +414,14 @@ typedef enum MDBX_cursor_op { * when mdbx_cursor_put() called with MDBX_CURRENT option. */ #define MDBX_EKEYMISMATCH (-30418) -/* Database is too large for current system, i.e. could NOT be mapped into RAM. - */ +/* Database is too large for current system, + * e.g. could NOT be mapped into RAM. */ #define MDBX_TOO_LARGE (-30417) +/* A thread has attempted to use a not owned object, + * e.g. a transaction that started by another thread. */ +#define MDBX_THREAD_MISMATCH (-30416) + /* Statistics for a database in the environment */ typedef struct MDBX_stat { uint32_t ms_psize; /* Size of a database page. diff --git a/src/bits.h b/src/bits.h index de87d85d..3753f7c0 100644 --- a/src/bits.h +++ b/src/bits.h @@ -527,6 +527,7 @@ struct MDBX_txn { * dirtylist into mt_parent after freeing hidden mt_parent pages. */ unsigned mt_dirtyroom; mdbx_canary mt_canary; + mdbx_tid_t mt_owner; /* thread ID that owns this transaction */ }; /* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. 
diff --git a/src/mdbx.c b/src/mdbx.c index 14ae7d45..31720305 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -690,8 +690,11 @@ static const char *__mdbx_strerr(int errnum) { return "MDBX_EKEYMISMATCH: The given key value is mismatched to the " "current cursor position"; case MDBX_TOO_LARGE: - return "Database is too large for current system, i.e.could NOT be mapped " - "into RAM."; + return "MDBX_TOO_LARGE: Database is too large for current system, " + "e.g. could NOT be mapped into RAM"; + case MDBX_THREAD_MISMATCH: + return "MDBX_THREAD_MISMATCH: A thread has attempted to use a not " + "owned object, e.g. a transaction that started by another thread"; default: return NULL; } @@ -968,13 +971,13 @@ static void mdbx_audit(MDBX_txn *txn) { int mdbx_cmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { - mdbx_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_cmp(a, b); } int mdbx_dcmp(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { - mdbx_ensure(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_dcmp(a, b); } @@ -2394,6 +2397,9 @@ int mdbx_txn_renew(MDBX_txn *txn) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(!F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY | MDBX_TXN_FINISHED))) return MDBX_EINVAL; @@ -2434,6 +2440,9 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, if (unlikely(parent->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EINVAL; + if (unlikely(parent->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + /* Nested transactions: Max 1 child, write txns only, no writemap */ flags |= parent->mt_flags; if (unlikely(flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED))) { @@ -2510,6 +2519,7 
@@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, if (txn != env->me_txn0) free(txn); } else { + txn->mt_owner = mdbx_thread_self(); txn->mt_signature = MDBX_MT_SIGNATURE; *ret = txn; mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "", @@ -2593,7 +2603,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { mdbx_coherent_barrier(); txn->mt_numdbs = 0; /* prevent further DBI activity */ txn->mt_flags |= MDBX_TXN_FINISHED; - + txn->mt_owner = 0; } else if (!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED)) { pgno_t *pghead = env->me_pghead; @@ -2621,6 +2631,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { env->me_pglast = 0; env->me_txn = NULL; + txn->mt_owner = 0; txn->mt_signature = 0; mode = 0; /* txn == env->me_txn0, do not free() it */ @@ -2640,6 +2651,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { if (mode & MDBX_END_FREE) { mdbx_ensure(env, txn != env->me_txn0); + txn->mt_owner = 0; txn->mt_signature = 0; free(txn); } @@ -2654,6 +2666,9 @@ int mdbx_txn_reset(MDBX_txn *txn) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + /* This call is only valid for read-only txns */ if (unlikely(!(txn->mt_flags & MDBX_TXN_RDONLY))) return MDBX_EINVAL; @@ -2669,6 +2684,9 @@ int mdbx_txn_abort(MDBX_txn *txn) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) /* LY: don't close DBI-handles in MDBX mode */ return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | @@ -3133,6 +3151,9 @@ int mdbx_txn_commit(MDBX_txn *txn) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + MDBX_env *env = txn->mt_env; if 
(unlikely(env->me_pid != mdbx_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; @@ -5256,6 +5277,9 @@ int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -7162,6 +7186,9 @@ int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) return MDBX_EINVAL; @@ -7197,6 +7224,9 @@ int mdbx_cursor_renew(MDBX_txn *txn, MDBX_cursor *mc) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE && mc->mc_signature != MDBX_MC_READY4CLOSE)) return MDBX_EINVAL; @@ -7231,6 +7261,9 @@ int mdbx_cursor_count(MDBX_cursor *mc, size_t *countp) { if (unlikely(mc->mc_signature != MDBX_MC_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(mc->mc_txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) return MDBX_BAD_TXN; @@ -8054,6 +8087,9 @@ int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -8543,6 +8579,9 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return 
MDBX_THREAD_MISMATCH; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -9236,6 +9275,9 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) return MDBX_BAD_TXN; @@ -9368,6 +9410,9 @@ int __cold mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *arg, if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) return MDBX_EINVAL; @@ -9422,6 +9467,9 @@ int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_VALID))) return MDBX_EINVAL; @@ -9528,6 +9576,9 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -9602,6 +9653,9 @@ int mdbx_set_compare(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -9616,6 +9670,9 @@ int mdbx_set_dupsort(MDBX_txn *txn, MDBX_dbi dbi, MDBX_cmp_func *cmp) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if 
(unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -9887,6 +9944,9 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(!txn->mt_ro_reader)) return -1; @@ -10041,9 +10101,13 @@ int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, void *user) { if (unlikely(!txn)) return MDBX_BAD_TXN; + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + mdbx_walk_ctx_t ctx; ctx.mw_txn = txn; ctx.mw_user = user; @@ -10069,6 +10133,9 @@ int mdbx_canary_put(MDBX_txn *txn, const mdbx_canary *canary) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) return MDBX_BAD_TXN; @@ -10097,9 +10164,13 @@ int mdbx_canary_put(MDBX_txn *txn, const mdbx_canary *canary) { int mdbx_canary_get(MDBX_txn *txn, mdbx_canary *canary) { if (unlikely(txn == NULL || canary == NULL)) return MDBX_EINVAL; + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + *canary = txn->mt_canary; return MDBX_SUCCESS; } @@ -10197,6 +10268,9 @@ int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *new_data, if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(old_data->iov_base == NULL && old_data->iov_len)) return MDBX_EINVAL; @@ -10332,6 +10406,9 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + 
if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; @@ -10396,6 +10473,9 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) return MDBX_RESULT_FALSE; @@ -10457,6 +10537,9 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_EBADSIGN; + if (unlikely(txn->mt_owner != mdbx_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) return MDBX_EINVAL; diff --git a/src/osal.c b/src/osal.c index 4b56be70..7196cf6c 100644 --- a/src/osal.c +++ b/src/osal.c @@ -615,14 +615,6 @@ void mdbx_thread_rthc_set(mdbx_thread_key_t key, const void *value) { #endif } -mdbx_tid_t mdbx_thread_self(void) { -#if defined(_WIN32) || defined(_WIN64) - return GetCurrentThreadId(); -#else - return pthread_self(); -#endif -} - int mdbx_thread_create(mdbx_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg) { diff --git a/src/osal.h b/src/osal.h index ee776358..6265ea14 100644 --- a/src/osal.h +++ b/src/osal.h @@ -439,7 +439,6 @@ int mdbx_thread_create(mdbx_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); int mdbx_thread_join(mdbx_thread_t thread); -mdbx_tid_t mdbx_thread_self(void); int mdbx_thread_key_create(mdbx_thread_key_t *key); void mdbx_thread_key_delete(mdbx_thread_key_t key); void *mdbx_thread_rthc_get(mdbx_thread_key_t key); @@ -465,6 +464,14 @@ static __inline mdbx_pid_t mdbx_getpid(void) { #endif } +static __inline mdbx_tid_t mdbx_thread_self(void) { +#if defined(_WIN32) || defined(_WIN64) + return GetCurrentThreadId(); +#else + return pthread_self(); +#endif +} + void 
mdbx_osal_jitter(bool tiny); /*----------------------------------------------------------------------------*/ From 1d15ae4b13a6320176d4a3d0975bb711150a4626 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 6 Jun 2017 17:41:06 +0300 Subject: [PATCH 234/303] mdbx: zeroing MDBX_page.mp_validator for Valgrind/AddressSanitizer. --- src/mdbx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mdbx.c b/src/mdbx.c index 31720305..48314481 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3075,6 +3075,7 @@ static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) { continue; } dp->mp_flags &= ~P_DIRTY; + dp->mp_validator = 0 /* TODO */; env->me_sync_pending += IS_OVERFLOW(dp) ? pgno2bytes(env, dp->mp_pages) : env->me_psize; } @@ -3094,6 +3095,7 @@ static int mdbx_page_flush(MDBX_txn *txn, pgno_t keep) { pgno = dl[i].mid; /* clear dirty flag */ dp->mp_flags &= ~P_DIRTY; + dp->mp_validator = 0 /* TODO */; pos = pgno2bytes(env, pgno); size = IS_OVERFLOW(dp) ? pgno2bytes(env, dp->mp_pages) : env->me_psize; env->me_sync_pending += size; From fbce45cb985c7685773851a1f815001d4999432e Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 6 Jun 2017 17:59:12 +0300 Subject: [PATCH 235/303] mdbx: allow calling mdbx_env_sync() inside transaction. 
--- src/mdbx.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 48314481..13768b29 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2062,9 +2062,13 @@ int mdbx_env_sync(MDBX_env *env, int force) { if (unlikely(flags & (MDBX_RDONLY | MDBX_FATAL_ERROR))) return MDBX_EACCESS; - int rc = mdbx_txn_lock(env); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + const bool outside_txn = (env->me_txn0->mt_owner != mdbx_thread_self()); + + if (outside_txn) { + int rc = mdbx_txn_lock(env); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } MDBX_meta *head = mdbx_meta_head(env); if (!META_IS_STEADY(head) || env->me_sync_pending || @@ -2075,19 +2079,19 @@ int mdbx_env_sync(MDBX_env *env, int force) { env->me_sync_pending >= env->me_sync_threshold)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; - if (env->me_sync_pending > + if (outside_txn && + env->me_sync_pending > pgno2bytes(env, 16 /* FIXME: define threshold */) && (flags & MDBX_NOSYNC) == 0) { assert(((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); size_t used_size = pgno2bytes(env, head->mm_last_pg + 1); + mdbx_txn_unlock(env); /* LY: pre-sync without holding lock to reduce latency for writer(s) */ - if (flags & MDBX_WRITEMAP) { - rc = mdbx_msync(env->me_map, used_size, flags & MDBX_MAPASYNC); - } else { - rc = mdbx_filesync(env->me_fd, false); - } + int rc = (flags & MDBX_WRITEMAP) + ? 
mdbx_msync(env->me_map, used_size, flags & MDBX_MAPASYNC) + : mdbx_filesync(env->me_fd, false); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -2107,16 +2111,17 @@ int mdbx_env_sync(MDBX_env *env, int force) { mdbx_durable_str(head), env->me_sync_pending, env->me_mapsize, head->mm_mapsize); MDBX_meta meta = *head; - rc = mdbx_sync_locked(env, flags, &meta); + int rc = mdbx_sync_locked(env, flags, &meta); if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_txn_unlock(env); + if (outside_txn) + mdbx_txn_unlock(env); return rc; } } } - mdbx_txn_unlock(env); - assert(rc == MDBX_SUCCESS); + if (outside_txn) + mdbx_txn_unlock(env); return MDBX_SUCCESS; } From 204904df7cff07e0f12accc99e374934d3a43ce4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 6 Jun 2017 18:28:04 +0300 Subject: [PATCH 236/303] mdbx: disallow begin txn if env not yet opened. --- mdbx.h | 2 ++ src/mdbx.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/mdbx.h b/mdbx.h index ec38193c..8d5a6e2e 100644 --- a/mdbx.h +++ b/mdbx.h @@ -79,6 +79,7 @@ typedef DWORD mdbx_tid_t; #define MDBX_EROFS ERROR_FILE_READ_ONLY #define MDBX_ENOSYS ERROR_NOT_SUPPORTED #define MDBX_EIO ERROR_WRITE_FAULT +#define MDBX_EPERM ERROR_INVALID_FUNCTION #else @@ -97,6 +98,7 @@ typedef pthread_t mdbx_tid_t; #define MDBX_EROFS EROFS #define MDBX_ENOSYS ENOSYS #define MDBX_EIO EIO +#define MDBX_EPERM EPERM #endif #ifdef _MSC_VER diff --git a/src/mdbx.c b/src/mdbx.c index 13768b29..417595ad 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2434,6 +2434,9 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, return MDBX_PANIC; } + if (unlikely(!env->me_map)) + return MDBX_EPERM; + flags &= MDBX_TXN_BEGIN_FLAGS; flags |= env->me_flags & MDBX_WRITEMAP; From d2d513014de04cc9abac7fda0db5df437ee9f8c6 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 6 Jun 2017 20:18:09 +0300 Subject: [PATCH 237/303] mdbx: add mdbx_lck_upgrade() to OSAL. 
--- src/lck-posix.c | 2 ++ src/lck-windows.c | 76 ++++++++++++++++++++++++++++++++++++----------- src/osal.h | 1 + 3 files changed, 61 insertions(+), 18 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index 5eb6942c..0d79d932 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -93,6 +93,8 @@ static __inline int mdbx_lck_shared(int lfd) { int mdbx_lck_downgrade(MDBX_env *env) { return mdbx_lck_shared(env->me_lfd); } +int mdbx_lck_upgrade(MDBX_env *env) { return mdbx_lck_exclusive(env->me_lfd); } + int mdbx_rpid_set(MDBX_env *env) { return mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, env->me_pid, 1); } diff --git a/src/lck-windows.c b/src/lck-windows.c index 898f7ecd..7a083136 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -280,31 +280,71 @@ int mdbx_lck_seize(MDBX_env *env) { return rc; } -/* Transite from exclusive state (E-E) to used (S-?) */ int mdbx_lck_downgrade(MDBX_env *env) { - int rc; + /* Transite from exclusive state (E-E) to used (S-?) */ assert(env->me_fd != INVALID_HANDLE_VALUE); + assert(env->me_lfd != INVALID_HANDLE_VALUE); - if (env->me_lfd != INVALID_HANDLE_VALUE) { - /* 1) must be at E-E (exclusive), transite to ?_E (middle) */ - if (!funlock(env->me_lfd, LCK_LOWER)) - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "E-E(exclusive) >> ?-E(middle)", GetLastError()); + /* 1) must be at E-E (exclusive), transite to ?_E (middle) */ + if (!funlock(env->me_lfd, LCK_LOWER)) + mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + "E-E(exclusive) >> ?-E(middle)", GetLastError()); - /* 2) now at ?-E (middle), transite to S-E (locked) */ - if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { - rc = mdbx_get_errno_checked() /* 3) something went wrong, give up */; - return rc; - } - - /* 4) got S-E (locked), continue transition to S-? 
(used) */ - if (!funlock(env->me_lfd, LCK_UPPER)) - mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "S-E(locked) >> S-?(used)", GetLastError()); + /* 2) now at ?-E (middle), transite to S-E (locked) */ + if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { + int rc = mdbx_get_errno_checked() /* 3) something went wrong, give up */; + return rc; } + + /* 4) got S-E (locked), continue transition to S-? (used) */ + if (!funlock(env->me_lfd, LCK_UPPER)) + mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + "S-E(locked) >> S-?(used)", GetLastError()); + return MDBX_SUCCESS /* 5) now at S-? (used), done */; } +int mdbx_lck_upgrade(MDBX_env *env) { + /* Transite from locked state (S-E) to exclusive (E-E) */ + assert(env->me_fd != INVALID_HANDLE_VALUE); + assert(env->me_lfd != INVALID_HANDLE_VALUE); + + /* 1) must be at S-E (locked), transite to ?_E (middle) */ + if (!funlock(env->me_lfd, LCK_LOWER)) + mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + "S-E(locked) >> ?-E(middle)", GetLastError()); + + /* 3) now on ?-E (middle), try E-E (exclusive) */ + mdbx_jitter4testing(false); + if (flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) + return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive), done */ + + /* 5) still on ?-E (middle) */ + int rc = mdbx_get_errno_checked(); + mdbx_jitter4testing(false); + if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { + /* 6) something went wrong, report but continue */ + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, + "?-E(middle) >> E-E(exclusive)", rc); + } + + /* 7) still on ?-E (middle), try restore S-E (locked) */ + mdbx_jitter4testing(false); + rc = flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) + ? 
MDBX_RESULT_FALSE + : mdbx_get_errno_checked(); + + mdbx_jitter4testing(false); + if (rc != MDBX_RESULT_FALSE) { + mdbx_fatal("%s(%s) failed: errcode %u", mdbx_func_, + "?-E(middle) >> S-E(locked)", rc); + return rc; + } + + /* 8) now on S-E (locked) */ + return MDBX_RESULT_FALSE; +} + void mdbx_lck_destroy(MDBX_env *env) { int rc; @@ -327,7 +367,7 @@ void mdbx_lck_destroy(MDBX_env *env) { if (env->me_fd != INVALID_HANDLE_VALUE) { /* explicitly unlock to avoid latency for other processes (windows kernel - * releases such locks via deferred queues) */ + * releases such locks via deferred queues) */ while (funlock(env->me_fd, LCK_BODY)) ; rc = mdbx_get_errno_checked(); diff --git a/src/osal.h b/src/osal.h index 6265ea14..684fed86 100644 --- a/src/osal.h +++ b/src/osal.h @@ -489,6 +489,7 @@ int mdbx_lck_init(MDBX_env *env); int mdbx_lck_seize(MDBX_env *env); int mdbx_lck_downgrade(MDBX_env *env); +int mdbx_lck_upgrade(MDBX_env *env); void mdbx_lck_destroy(MDBX_env *env); int mdbx_rdt_lock(MDBX_env *env); From 3ca64535b6411923e6e2741b4c8f5d32d1fa4013 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 6 Jun 2017 21:20:16 +0300 Subject: [PATCH 238/303] mdbx: add NTAPI to OSAL. --- src/osal.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/osal.c b/src/osal.c index 7196cf6c..3d216d73 100644 --- a/src/osal.c +++ b/src/osal.c @@ -33,6 +33,38 @@ static int waitstatus2errcode(DWORD result) { return ERROR_UNHANDLED_ERROR; } } + +/* Map a result from an NTAPI call to WIN32 error code. */ +static int ntstatus2errcode(NTSTATUS status) { + DWORD dummy; + OVERLAPPED ov = {status}; + return GetOverlappedResult(NULL, &ov, &dummy, FALSE) ? MDBX_SUCCESS + : GetLastError(); +} + +/* We use native NT APIs to setup the memory map, so that we can + * let the DB file grow incrementally instead of always preallocating + * the full size. 
These APIs are defined in and + * but those headers are meant for driver-level development and + * conflict with the regular user-level headers, so we explicitly + * declare them here. Using these APIs also means we must link to + * ntdll.dll, which is not linked by default in user code. */ +NTSTATUS WINAPI NtCreateSection(OUT PHANDLE sh, IN ACCESS_MASK acc, + IN void *oa OPTIONAL, + IN PLARGE_INTEGER ms OPTIONAL, IN ULONG pp, + IN ULONG aa, IN HANDLE fh OPTIONAL); + +typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; + +NTSTATUS WINAPI NtMapViewOfSection(IN PHANDLE sh, IN HANDLE ph, + IN OUT PVOID *addr, IN ULONG_PTR zbits, + IN SIZE_T cs, + IN OUT PLARGE_INTEGER off OPTIONAL, + IN OUT PSIZE_T vs, IN SECTION_INHERIT ih, + IN ULONG at, IN ULONG pp); + +NTSTATUS WINAPI NtClose(HANDLE h); + #endif /* _WIN32 || _WIN64 */ /*----------------------------------------------------------------------------*/ From 6f53dd0719d8977169d6d20bc8836eb6126bd6b6 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 9 Jun 2017 16:35:41 +0300 Subject: [PATCH 239/303] mdbx: add mdbx_dbi_flags_ex(). --- mdbx.h | 6 ++++++ src/bits.h | 12 ++++++------ src/mdbx.c | 12 ++++++++++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/mdbx.h b/mdbx.h index 8d5a6e2e..4fe0d908 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1104,8 +1104,14 @@ LIBMDBX_API int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *stat, * [in] txn A transaction handle returned by mdbx_txn_begin() * [in] dbi A database handle returned by mdbx_dbi_open() * [out] flags Address where the flags will be returned. + * [out] state Address where the state will be returned. * * Returns A non-zero error value on failure and 0 on success. 
*/ +#define MDBX_TBL_DIRTY 0x01 /* DB was written in this txn */ +#define MDBX_TBL_STALE 0x02 /* Named-DB record is older than txnID */ +#define MDBX_TBL_NEW 0x04 /* Named-DB handle opened in this txn */ +LIBMDBX_API int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, + unsigned *state); LIBMDBX_API int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags); /* Close a database handle. Normally unnecessary. diff --git a/src/bits.h b/src/bits.h index 3753f7c0..90df0972 100644 --- a/src/bits.h +++ b/src/bits.h @@ -488,12 +488,12 @@ struct MDBX_txn { unsigned *mt_dbiseqs; /* Transaction DB Flags */ -#define DB_DIRTY 0x01 /* DB was written in this txn */ -#define DB_STALE 0x02 /* Named-DB record is older than txnID */ -#define DB_NEW 0x04 /* Named-DB handle opened in this txn */ -#define DB_VALID 0x08 /* DB handle is valid, see also MDBX_VALID */ -#define DB_USRVALID 0x10 /* As DB_VALID, but not set for FREE_DBI */ -#define DB_DUPDATA 0x20 /* DB is MDBX_DUPSORT data */ +#define DB_DIRTY MDBX_TBL_DIRTY /* DB was written in this txn */ +#define DB_STALE MDBX_TBL_STALE /* Named-DB record is older than txnID */ +#define DB_NEW MDBX_TBL_NEW /* Named-DB handle opened in this txn */ +#define DB_VALID 0x08 /* DB handle is valid, see also MDBX_VALID */ +#define DB_USRVALID 0x10 /* As DB_VALID, but not set for FREE_DBI */ +#define DB_DUPDATA 0x20 /* DB is MDBX_DUPSORT data */ /* In write txns, array of cursors for each DB */ MDBX_cursor **mt_cursors; /* Array of flags for each DB */ diff --git a/src/mdbx.c b/src/mdbx.c index 417595ad..13f4ee74 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9470,8 +9470,9 @@ int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { return rc; } -int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { - if (unlikely(!txn || !flags)) +int mdbx_dbi_flags_ex(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, + unsigned *state) { + if (unlikely(!txn || !flags || !state)) return MDBX_EINVAL; if (unlikely(txn->mt_signature != 
MDBX_MT_SIGNATURE)) @@ -9484,9 +9485,16 @@ int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { return MDBX_EINVAL; *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; + *state = txn->mt_dbflags[dbi] & (DB_NEW | DB_DIRTY | DB_STALE); + return MDBX_SUCCESS; } +int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { + unsigned state; + return mdbx_dbi_flags_ex(txn, dbi, flags, &state); +} + /* Add all the DB's pages to the free list. * [in] mc Cursor on the DB to free. * [in] subs non-Zero to check for sub-DBs in this DB. From e7f9f2bd5c05c08a290a7336c7cca2b88a328d52 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 12 Jun 2017 15:41:36 +0300 Subject: [PATCH 240/303] ci: add Coverity Scan (initial). Change-Id: I7430ae49b1e5f02dbaa2fc31a5d3b992ec2ca2e3 --- .travis.yml | 20 +++++++++++++++++++- README.md | 1 + 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index cc56c3a5..590944bb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ language: c -sudo: false +sudo: required dist: trusty compiler: @@ -10,3 +10,21 @@ os: - linux script: if [ "${COVERITY_SCAN_BRANCH}" != 1 ]; then make all check; fi + +env: + global: + - secure: "M+W+heGGyRQJoBq2W0uqWVrpL4KBXmL0MFL7FSs7f9vmAaDyEgziUXeZRj3GOKzW4kTef3LpIeiu9SmvqSMoQivGGiomZShqPVl045o/OUgRCAT7Al1RLzEZ0efSHpIPf0PZ6byEf6GR2ML76OfuL6JxTVdnz8iVyO2sgLE1HbX1VeB+wgd/jfMeOBhCCXskfK6MLyZihfMYsiYZYSaV98ZDhDLSlzuuRIgzb0bMi8aL6AErs0WLW0NelRBeHkKPYfAUc85pdQHscgrJw6Rh/zT6+8BQ/q5f4IgWhiu4xoRg3Ngl7SNoedRQh93ADM3UG2iGl6HDFpVORaXcFWKAtuYY+kHQ0HB84BRYpQmeBuXNpltsfxQ3d1Q3u0RlE45zRvmr2+X1mFnkcNUAWISLPbsOUlriDQM8irGwRpho77/uYnRC00bJsHW//s6+uPf9zrAw1nI4f0y3PAWukGF/xs6HAI3FZPsuSSnx18Tj3Opgbc9Spop+V3hkhdiJoPGpNKTkFX4ZRXfkPgoRVJmtp4PpbpH0Ps/mCriKjMEfGGi0HcVCi0pEGLXiecdqJ5KPg5+22zNycEujQBJcNTKd9shN+R3glrbmhAxTEzGdGwxXXJ2ybwJ2PWJLMYZ7g98nLyX+uQPaA3BlsbYJHNeS5283/9pJsd9DzfHKsN2nFSc=" + +before_install: + - echo -n | openssl s_client -connect scan.coverity.com:443 | sed -ne 
'/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' | sudo tee -a /etc/ssl/certs/ca- + +addons: + coverity_scan: + project: + name: "ReOpen/libmdbx" + version: 0.1 + description: "Build submitted via Travis CI" + notification_email: leo@yuriev.ru + build_command_prepend: "make clean" + build_command: "make all -j 4" + branch_pattern: coverity_scan diff --git a/README.md b/README.md index 4c85905e..a9ef1ba6 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ Extended LMDB, aka "Расширенная LMDB". *The Future will Positive. Всё будет хорошо.* [![Build Status](https://travis-ci.org/ReOpen/libmdbx.svg?branch=devel)](https://travis-ci.org/ReOpen/libmdbx) [![Build status](https://ci.appveyor.com/api/projects/status/v21jlh5kfmk85r7t/branch/devel?svg=true)](https://ci.appveyor.com/project/leo-yuriev/libmdbx/branch/devel) +[![Coverity Scan Status](https://scan.coverity.com/projects/12915/badge.svg)](https://scan.coverity.com/projects/reopen-libmdbx) English version [by Google](https://translate.googleusercontent.com/translate_c?act=url&ie=UTF8&sl=ru&tl=en&u=https://github.com/ReOpen/libmdbx/tree/devel) and [by Yandex](https://translate.yandex.ru/translate?url=https%3A%2F%2Fgithub.com%2FReOpen%2Flibmdbx%2Ftree%2Fdevel&lang=ru-en). From 0ef41bef2b64a816fd4a6387afe808ecab714a1d Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 13 Jun 2017 19:27:52 +0300 Subject: [PATCH 241/303] mdbx: fix mdbx_txn_straggler() for write-txn. 
--- src/mdbx.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 13f4ee74..4271cb99 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9965,19 +9965,25 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) if (unlikely(txn->mt_owner != mdbx_thread_self())) return MDBX_THREAD_MISMATCH; - if (unlikely(!txn->mt_ro_reader)) - return -1; - MDBX_env *env = txn->mt_env; - MDBX_meta *meta = mdbx_meta_head(env); - if (percent) { - pgno_t maxpg = env->me_maxpg; - pgno_t last = meta->mm_last_pg + 1; - if (env->me_txn) - last = env->me_txn0->mt_next_pgno; - *percent = (int)((last * UINT64_C(100) + maxpg / 2) / maxpg); + pgno_t maxpg = env->me_maxpg; + if (unlikely((txn->mt_flags & MDBX_RDONLY) == 0)) { + *percent = (int)((txn->mt_next_pgno * UINT64_C(100) + maxpg / 2) / maxpg); + return -1; } - txnid_t lag = mdbx_meta_txnid_fluid(env, meta) - txn->mt_ro_reader->mr_txnid; + + txnid_t recent; + MDBX_meta *meta; + do { + meta = mdbx_meta_head(env); + recent = mdbx_meta_txnid_fluid(env, meta); + if (percent) { + pgno_t last = meta->mm_last_pg + 1; + *percent = (int)((last * UINT64_C(100) + maxpg / 2) / maxpg); + } + } while (unlikely(recent != mdbx_meta_txnid_fluid(env, meta))); + + txnid_t lag = recent - txn->mt_ro_reader->mr_txnid; return (lag > INT_MAX) ? INT_MAX : (int)lag; } From 2392c70e2bbab422a3192b190a2b7ae2fad4c35f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 14 Jun 2017 19:36:32 +0300 Subject: [PATCH 242/303] mdbx: refine MDBX_oom_func description. --- mdbx.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mdbx.h b/mdbx.h index 4fe0d908..f5731988 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1534,9 +1534,9 @@ LIBMDBX_API int mdbx_txn_straggler(MDBX_txn *txn, int *percent); * [in] retry A retry number, less that zero for notify end of OOM-loop. 
* * Returns -1 on failure (reader is not killed), - * 0 on a race condition (no such reader), - * 1 on success (reader was killed), - * >1 on success (reader was SURE killed). */ + * 0 should wait or retry, + * 1 drop reader txn-lock (reading-txn was aborted), + * >1 drop reader registration (reader process was killed). */ typedef int(MDBX_oom_func)(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, unsigned gap, int retry); From 71e2fe3df0b96d20b85b21884acd53c4911ea215 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 15 Jun 2017 04:06:07 +0300 Subject: [PATCH 243/303] mdbx: add MDBX_DBG_DUMP for mdbx_setup_debug(). So, meta-pages and lck-section now will be included into a coredump. --- mdbx.h | 1 + src/mdbx.c | 26 ++++++++++++++++++++++++++ test/base.h | 3 +++ test/test.cc | 2 +- 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/mdbx.h b/mdbx.h index f5731988..f875fef2 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1567,6 +1567,7 @@ LIBMDBX_API MDBX_oom_func *mdbx_env_get_oomfunc(MDBX_env *env); #define MDBX_DBG_EXTRA 8 #define MDBX_DBG_AUDIT 16 #define MDBX_DBG_JITTER 32 +#define MDBX_DBG_DUMP 64 typedef void MDBX_debug_func(int type, const char *function, int line, const char *msg, va_list args); diff --git a/src/mdbx.c b/src/mdbx.c index 4271cb99..462564db 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9859,6 +9859,32 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger) { unsigned ret = mdbx_runtime_flags; mdbx_runtime_flags = flags; + +#ifdef __linux__ + if (flags & MDBX_DBG_DUMP) { + int core_filter_fd = open("/proc/self/coredump_filter", O_TRUNC | O_RDWR); + if (core_filter_fd >= 0) { + char buf[32]; + const unsigned r = pread(core_filter_fd, buf, sizeof(buf), 0); + if (r > 0 && r < sizeof(buf)) { + buf[r] = 0; + unsigned long mask = strtoul(buf, NULL, 16); + if (mask != ULONG_MAX) { + mask |= 1 << 3 /* Dump file-backed shared mappings */; + mask |= 1 << 6 /* Dump 
shared huge pages */; + mask |= 1 << 8 /* Dump shared DAX pages */; + unsigned w = snprintf(buf, sizeof(buf), "0x%lx\n", mask); + if (w > 0 && w < sizeof(buf)) { + w = pwrite(core_filter_fd, buf, w, 0); + (void)w; + } + } + } + close(core_filter_fd); + } + } +#endif /* __linux__ */ + mdbx_debug_logger = logger; return ret; } diff --git a/test/base.h b/test/base.h index 39e2c357..fe09aa89 100644 --- a/test/base.h +++ b/test/base.h @@ -37,8 +37,11 @@ #if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) #include #else +#include #include +#include #include +#include #include #endif diff --git a/test/test.cc b/test/test.cc index 04a1b82e..0874c4bb 100644 --- a/test/test.cc +++ b/test/test.cc @@ -114,7 +114,7 @@ void testcase::db_prepare() { log_trace(">> db_prepare"); assert(!db_guard); - int mdbx_dbg_opts = MDBX_DBG_ASSERT | MDBX_DBG_JITTER; + int mdbx_dbg_opts = MDBX_DBG_ASSERT | MDBX_DBG_JITTER | MDBX_DBG_DUMP; if (config.params.loglevel <= logging::trace) mdbx_dbg_opts |= MDBX_DBG_TRACE; if (config.params.loglevel <= logging::verbose) From ddc378793639f7f657ecb11f702954d89ddf7afd Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 14 Jun 2017 23:34:11 +0300 Subject: [PATCH 244/303] mdbx: fix MDBX_RESULT_TRUE handling inside mdbx_mutex_failed()...mdbx_oomkick(). --- src/lck-posix.c | 2 ++ src/mdbx.c | 47 ++++++++++++++++++++++++++++------------------- 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index 0d79d932..c1f23b38 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -295,6 +295,8 @@ static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, (rc ? "this process' env is hosed" : "recovering")); int check_rc = mdbx_reader_check0(env, rlocked, NULL); + check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; + int mreco_rc = pthread_mutex_consistent(mutex); check_rc = (mreco_rc == 0) ? 
check_rc : mreco_rc; diff --git a/src/mdbx.c b/src/mdbx.c index 462564db..c41eb6ae 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9780,6 +9780,10 @@ int __cold mdbx_reader_check(MDBX_env *env, int *dead) { return mdbx_reader_check0(env, 0, dead); } +/* Return: + * MDBX_RESULT_TRUE - done and mutex recovered + * MDBX_SUCCESS - done + * Otherwise errcode. */ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { assert(rdt_locked >= 0); @@ -9792,7 +9796,7 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { mdbx_pid_t *pids = alloca((snap_nreaders + 1) * sizeof(mdbx_pid_t)); pids[0] = 0; - int rc = MDBX_RESULT_FALSE, count = 0; + int rc = MDBX_SUCCESS, count = 0; MDBX_reader *mr = env->me_lck->mti_readers; for (unsigned i = 0; i < snap_nreaders; i++) { @@ -9802,39 +9806,44 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { if (pid != env->me_pid) continue; if (mdbx_pid_insert(pids, pid) != 0) - continue; + continue /* such pid already processed */; - rc = mdbx_rpid_check(env, pid); - if (rc == MDBX_RESULT_TRUE) - continue; /* reader is live */ + int err = mdbx_rpid_check(env, pid); + if (err == MDBX_RESULT_TRUE) + continue /* reader is live */; - if (rc != MDBX_RESULT_FALSE) - break; /* mdbx_rpid_check() failed */ + if (err != MDBX_SUCCESS) { + rc = err; + break /* mdbx_rpid_check() failed */; + } /* stale reader found */ if (!rdt_locked) { - rc = mdbx_rdt_lock(env); - if (MDBX_IS_ERROR(rc)) + err = mdbx_rdt_lock(env); + if (MDBX_IS_ERROR(rc)) { + rc = err; break; + } rdt_locked = -1; - if (rc == MDBX_RESULT_TRUE) - /* the above checked all readers */ + if (err == MDBX_RESULT_TRUE) { + /* mutex recovered, the mdbx_mutex_failed() checked all readers */ + rc = MDBX_RESULT_TRUE; break; + } /* a other process may have clean and reused slot, recheck */ if (mr[i].mr_pid != pid) continue; - rc = mdbx_rpid_check(env, pid); - if (MDBX_IS_ERROR(rc)) + err = mdbx_rpid_check(env, pid); + if (MDBX_IS_ERROR(rc)) { 
+ rc = err; break; - - if (rc != MDBX_RESULT_FALSE) { - /* the race with other process, slot reused */ - rc = MDBX_RESULT_FALSE; - continue; } + + if (err != MDBX_SUCCESS) + continue /* the race with other process, slot reused */; } /* clean it */ @@ -9896,7 +9905,7 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { for (retry = 0; retry < INT_MAX; ++retry) { int reader; - if (mdbx_reader_check(env, NULL)) + if (MDBX_IS_ERROR(mdbx_reader_check0(env, false, NULL))) break; txnid_t snap = mdbx_find_oldest(env->me_txn, &reader); From 760f7d13318ad246795670206d1402402417cc27 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 14 Jun 2017 21:34:43 +0300 Subject: [PATCH 245/303] mdbx: fix nasty 'inverted if-condition' inside mdbx_reader_check0(). Fix c2087f186ef5ae021bb1b9f6f8111e7b7eccffc1 --- src/mdbx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index c41eb6ae..dfc8a9dd 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9802,9 +9802,9 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { for (unsigned i = 0; i < snap_nreaders; i++) { const mdbx_pid_t pid = mr[i].mr_pid; if (pid == 0) - continue; - if (pid != env->me_pid) - continue; + continue /* skip empty */; + if (pid == env->me_pid) + continue /* skip self */; if (mdbx_pid_insert(pids, pid) != 0) continue /* such pid already processed */; From ae71c2aa36c11405fbfa635c2c12cc609e18f203 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 14 Jun 2017 23:33:44 +0300 Subject: [PATCH 246/303] mdbx: simplify find-oldest inside mdbx_page_alloc(). 
Change-Id: I88dfc8ee268eba2452bffd3c4e6cfd066bcd8b04 --- src/mdbx.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index dfc8a9dd..688d9799 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1558,7 +1558,6 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, txnid_t oldest = 0, last = 0; MDBX_cursor_op op; MDBX_cursor m2; - int found_oldest = 0; if (likely(flags & MDBX_ALLOC_GC)) { flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); @@ -1614,13 +1613,10 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (unlikely(!(flags & MDBX_ALLOC_GC))) break; - oldest = env->me_pgoldest; + oldest = (flags & MDBX_LIFORECLAIM) ? mdbx_find_oldest(txn, NULL) + : env->me_pgoldest; mdbx_cursor_init(&m2, txn, FREE_DBI, NULL); if (flags & MDBX_LIFORECLAIM) { - if (!found_oldest) { - oldest = mdbx_find_oldest(txn, NULL); - found_oldest = 1; - } /* Begin from oldest reader if any */ if (oldest > 2) { last = oldest - 1; @@ -1639,10 +1635,6 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (!(flags & MDBX_LIFORECLAIM)) { /* Do not fetch more if the record will be too recent */ if (op != MDBX_FIRST && ++last >= oldest) { - if (!found_oldest) { - oldest = mdbx_find_oldest(txn, NULL); - found_oldest = 1; - } if (oldest <= last) break; } @@ -1652,7 +1644,6 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (rc == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { if (op == MDBX_SET_RANGE) continue; - found_oldest = 1; if (oldest < mdbx_find_oldest(txn, NULL)) { oldest = env->me_pgoldest; last = oldest - 1; @@ -1670,10 +1661,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, last = *(txnid_t *)key.iov_base; if (oldest <= last) { - if (!found_oldest) { - oldest = mdbx_find_oldest(txn, NULL); - found_oldest = 1; - } + oldest = mdbx_find_oldest(txn, NULL); if (oldest <= last) { if (flags & 
MDBX_LIFORECLAIM) continue; @@ -1804,18 +1792,15 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, MDBX_meta meta = *head; if (mdbx_sync_locked(env, me_flags, &meta) == MDBX_SUCCESS) { txnid_t snap = mdbx_find_oldest(txn, NULL); - if (snap > oldest) { + if (snap > oldest) continue; - } } } if (rc == MDBX_MAP_FULL) { txnid_t snap = mdbx_oomkick(env, oldest); - if (snap > oldest) { - oldest = snap; + if (snap > oldest) continue; - } } } From c01aeb5c685727be781551c0cbef8cb740cdd10b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 15 Jun 2017 05:02:14 +0300 Subject: [PATCH 247/303] mdbx: refine meta_txnid(). Change-Id: I4d78b2ebed8850c90dc762405ba939be7385c9a3 --- src/bits.h | 2 +- src/mdbx.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bits.h b/src/bits.h index 90df0972..837fce76 100644 --- a/src/bits.h +++ b/src/bits.h @@ -134,7 +134,7 @@ typedef uint64_t txnid_t; #elif MDBX_DEBUG #define MIN_TXNID UINT64_C(0x100000000) #else -#define MIN_TXNID UINT64_C(0) +#define MIN_TXNID UINT64_C(1) #endif /* MIN_TXNID */ /* Used for offsets within a single page. diff --git a/src/mdbx.c b/src/mdbx.c index 688d9799..d4facf9e 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1335,7 +1335,7 @@ static __inline txnid_t meta_txnid(const MDBX_env *env, const MDBX_meta *meta, txnid_t a = meta->mm_txnid_a; txnid_t b = meta->mm_txnid_b; if (allow_volatile) - return (a < b) ? a : b; + return (a == b) ? a : 0; mdbx_assert(env, a == b); return a; } From 81661ff9523687636f6c4bba6e60c555fafabbb9 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 14 Jun 2017 23:33:13 +0300 Subject: [PATCH 248/303] mdbx: add shared cache for oldest reader's txnid. 
Change-Id: I48cbc778b873445dffa8ecef1fc3633e0193131f --- src/bits.h | 80 +++++++++++++++++++++++++++++++++--------------------- src/mdbx.c | 62 ++++++++++++++++++++---------------------- 2 files changed, 78 insertions(+), 64 deletions(-) diff --git a/src/bits.h b/src/bits.h index 837fce76..e80b7973 100644 --- a/src/bits.h +++ b/src/bits.h @@ -365,18 +365,35 @@ typedef struct MDBX_lockinfo { /* Flags which environment was opened. */ volatile uint32_t mti_envmode; + union { #ifdef MDBX_OSAL_LOCK - MDBX_OSAL_LOCK mti_wmutex; + MDBX_OSAL_LOCK mti_wmutex; #endif + uint64_t align_wmutex; + }; - /* The number of slots that have been used in the reader table. - * This always records the maximum count, it is not decremented - * when readers release their slots. */ - volatile unsigned __cache_aligned mti_numreaders; + union { + /* The number of slots that have been used in the reader table. + * This always records the maximum count, it is not decremented + * when readers release their slots. */ + volatile unsigned __cache_aligned mti_numreaders; + uint64_t align_numreaders; + }; + + union { #ifdef MDBX_OSAL_LOCK - /* Mutex protecting access to this table. */ - MDBX_OSAL_LOCK mti_rmutex; + /* Mutex protecting access to this table. 
*/ + MDBX_OSAL_LOCK mti_rmutex; #endif + uint64_t align_rmutex; + }; + + union { + volatile txnid_t mti_oldest; + uint64_t align_oldest; + }; + uint8_t pad_align[MDBX_CACHELINE_SIZE - sizeof(uint64_t) * 6]; + MDBX_reader __cache_aligned mti_readers[1]; } MDBX_lockinfo; @@ -635,23 +652,23 @@ struct MDBX_env { /* Max MDBX_lockinfo.mti_numreaders of interest to mdbx_env_close() */ unsigned me_close_readers; mdbx_fastmutex_t me_dbi_lock; - MDBX_dbi me_numdbs; /* number of DBs opened */ - MDBX_dbi me_maxdbs; /* size of the DB table */ - mdbx_pid_t me_pid; /* process ID of this env */ - char *me_path; /* path to the DB files */ - char *me_map; /* the memory map of the data file */ - MDBX_lockinfo *me_lck; /* the memory map of the lock file, never NULL */ - void *me_pbuf; /* scratch area for DUPSORT put() */ - MDBX_txn *me_txn; /* current write transaction */ - MDBX_txn *me_txn0; /* prealloc'd write transaction */ - size_t me_mapsize; /* size of the data memory map */ - pgno_t me_maxpg; /* me_mapsize / me_psize */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ - txnid_t me_pgoldest; /* ID of oldest reader last time we looked */ - MDBX_pgstate me_pgstate; /* state of old pages from freeDB */ + MDBX_dbi me_numdbs; /* number of DBs opened */ + MDBX_dbi me_maxdbs; /* size of the DB table */ + mdbx_pid_t me_pid; /* process ID of this env */ + char *me_path; /* path to the DB files */ + char *me_map; /* the memory map of the data file */ + MDBX_lockinfo *me_lck; /* the memory map of the lock file, never NULL */ + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn; /* current write transaction */ + MDBX_txn *me_txn0; /* prealloc'd write transaction */ + size_t me_mapsize; /* size of the data memory map */ + pgno_t me_maxpg; /* me_mapsize / me_psize */ + MDBX_dbx *me_dbxs; 
/* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + mdbx_thread_key_t me_txkey; /* thread-key for readers */ + volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ + MDBX_pgstate me_pgstate; /* state of old pages from freeDB */ #define me_pglast me_pgstate.mf_pglast #define me_pghead me_pgstate.mf_pghead MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ @@ -663,16 +680,17 @@ struct MDBX_env { unsigned me_maxfree_1pg; /* Max size of a node on a page */ unsigned me_nodemax; - unsigned me_maxkey_limit; /* max size of a key */ - mdbx_pid_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ + unsigned me_maxkey_limit; /* max size of a key */ + mdbx_pid_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ + size_t me_sync_pending; /* Total dirty/non-sync'ed bytes + * since the last mdbx_env_sync() */ + size_t me_sync_threshold; /* Treshold of above to force synchronous flush */ + MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */ + txnid_t me_oldest_stub; #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif - size_t me_sync_pending; /* Total dirty/non-sync'ed bytes - * since the last mdbx_env_sync() */ - size_t me_sync_threshold; /* Treshold of above to force synchronous flush */ - MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */ #ifdef USE_VALGRIND int me_valgrind_handle; #endif diff --git a/src/mdbx.c b/src/mdbx.c index d4facf9e..6697eb93 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1483,12 +1483,10 @@ static const char *mdbx_durable_str(const MDBX_meta *const meta) { /* Find oldest txnid still referenced. 
*/ static txnid_t mdbx_find_oldest(MDBX_txn *txn, int *laggard) { MDBX_env *env = txn->mt_env; - const MDBX_meta *const head = mdbx_meta_mostrecent( - env, F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) ? false : true); - txnid_t oldest = - meta_txnid(env, head, (txn->mt_flags & MDBX_RDONLY) ? true : false); + mdbx_assert(env, (txn->mt_flags & MDBX_RDONLY) == 0); int i, reader; + txnid_t oldest = txn->mt_txnid - 1; const MDBX_reader *const r = env->me_lck->mti_readers; for (reader = -1, i = env->me_lck->mti_numreaders; --i >= 0;) { if (r[i].mr_pid) { @@ -1503,7 +1501,9 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn, int *laggard) { if (laggard) *laggard = reader; - return env->me_pgoldest = oldest; + + *env->me_oldest = oldest; + return oldest; } /* Add a page to the txn's dirty list */ @@ -1614,7 +1614,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, break; oldest = (flags & MDBX_LIFORECLAIM) ? mdbx_find_oldest(txn, NULL) - : env->me_pgoldest; + : env->me_oldest[0]; mdbx_cursor_init(&m2, txn, FREE_DBI, NULL); if (flags & MDBX_LIFORECLAIM) { /* Begin from oldest reader if any */ @@ -1645,7 +1645,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (op == MDBX_SET_RANGE) continue; if (oldest < mdbx_find_oldest(txn, NULL)) { - oldest = env->me_pgoldest; + oldest = *env->me_oldest; last = oldest - 1; key.iov_base = &last; key.iov_len = sizeof(last); @@ -2283,7 +2283,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } while (1) { - MDBX_meta *const meta = mdbx_meta_head(txn->mt_env); + MDBX_meta *const meta = mdbx_meta_head(env); mdbx_jitter4testing(false); const txnid_t snap = mdbx_meta_txnid_fluid(env, meta); mdbx_jitter4testing(false); @@ -2293,8 +2293,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { mdbx_assert(env, r->mr_pid == mdbx_getpid()); mdbx_assert(env, r->mr_tid == mdbx_thread_self()); mdbx_assert(env, r->mr_txnid == snap); + mdbx_coherent_barrier(); } - 
mdbx_coherent_barrier(); mdbx_jitter4testing(true); /* Snap the state from current meta-head */ @@ -2304,14 +2304,15 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { txn->mt_canary = meta->mm_canary; /* LY: Retry on a race, ITS#7970. */ - if (likely(meta == mdbx_meta_head(txn->mt_env) && + mdbx_compiler_barrier(); + if (likely(meta == mdbx_meta_head(env) && snap == mdbx_meta_txnid_fluid(env, meta))) { mdbx_jitter4testing(false); break; } } - mdbx_assert(env, txn->mt_txnid >= mdbx_find_oldest(txn, nullptr)); + mdbx_assert(env, txn->mt_txnid >= *env->me_oldest); txn->mt_ro_reader = r; txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ } else { @@ -4314,7 +4315,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { err = mdbx_mmap(&addr, (size_t)size, true, env->me_lfd); if (unlikely(err != MDBX_SUCCESS)) return err; - assert(addr != nullptr); + mdbx_assert(env, addr != nullptr); env->me_lck = addr; #ifdef MADV_DODUMP @@ -4359,6 +4360,8 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { } } + mdbx_assert(env, !MDBX_IS_ERROR(rc)); + env->me_oldest = &env->me_lck->mti_oldest; return rc; } @@ -4451,6 +4454,7 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, goto bailout; } + env->me_oldest = &env->me_oldest_stub; const int dxb_rc = mdbx_setup_dxb(env, lck_rc); if (MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; @@ -4594,8 +4598,9 @@ static void __cold mdbx_env_close0(MDBX_env *env) { mdbx_munmap((void *)env->me_lck, (env->me_maxreaders - 1) * sizeof(MDBX_reader) + sizeof(MDBX_lockinfo)); - env->me_lck = NULL; + env->me_lck = nullptr; env->me_pid = 0; + env->me_oldest = nullptr; mdbx_lck_destroy(env); if (env->me_lfd != INVALID_HANDLE_VALUE) { @@ -5087,8 +5092,7 @@ static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) { return MDBX_BAD_TXN; } - mdbx_cassert(mc, - mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); + mdbx_cassert(mc, 
mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]); /* Make sure we're using an up-to-date root */ if (unlikely(*mc->mc_dbflag & DB_STALE)) { MDBX_cursor mc2; @@ -5128,8 +5132,7 @@ static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) { return MDBX_NOTFOUND; } - mdbx_cassert(mc, - mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); + mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]); mdbx_cassert(mc, root >= NUM_METAS); if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)) @@ -5238,8 +5241,7 @@ static __inline int mdbx_node_read(MDBX_cursor *mc, MDBX_node *leaf, pgno_t pgno; int rc; - mdbx_cassert(mc, - mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); + mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]); if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { data->iov_len = NODEDSZ(leaf); data->iov_base = NODEDATA(leaf); @@ -5299,8 +5301,7 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) { MDBX_node *indx; MDBX_page *mp; - mdbx_cassert(mc, - mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); + mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]); if (unlikely(mc->mc_snum < 2)) { return MDBX_NOTFOUND; /* root has no siblings */ } @@ -5530,8 +5531,7 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_node *leaf = NULL; DKBUF; - mdbx_cassert(mc, - mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); + mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]); if ((mc->mc_db->md_flags & MDBX_INTEGERKEY) && unlikely(key->iov_len != sizeof(uint32_t) && key->iov_len != sizeof(uint64_t))) { @@ -5818,8 +5818,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) return MDBX_BAD_TXN; - mdbx_cassert(mc, - mc->mc_txn->mt_txnid >= 
mdbx_find_oldest(mc->mc_txn, nullptr)); + mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]); switch (op) { case MDBX_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) @@ -7072,8 +7071,7 @@ static void mdbx_xcursor_init0(MDBX_cursor *mc) { static void mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node) { MDBX_xcursor *mx = mc->mc_xcursor; - mdbx_cassert(mc, - mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); + mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]); if (node->mn_flags & F_SUBDATA) { memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDBX_db)); mx->mx_cursor.mc_pg[0] = 0; @@ -7123,8 +7121,7 @@ static void mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, int new_dupdata) { MDBX_xcursor *mx = mc->mc_xcursor; - mdbx_cassert(mc, - mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); + mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]); if (new_dupdata) { mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_top = 0; @@ -7164,8 +7161,7 @@ static void mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi, mc->mc_xcursor = mx; mdbx_xcursor_init0(mc); } - mdbx_cassert(mc, - mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr)); + mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]); if (unlikely(*mc->mc_dbflag & DB_STALE)) { mdbx_page_search(mc, NULL, MDBX_PS_ROOTONLY); } @@ -7784,8 +7780,8 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { unsigned i; - mdbx_cassert(csrc, csrc->mc_txn->mt_txnid >= - mdbx_find_oldest(csrc->mc_txn, nullptr)); + mdbx_cassert(csrc, + csrc->mc_txn->mt_txnid >= csrc->mc_txn->mt_env->me_oldest[0]); cdst->mc_txn = csrc->mc_txn; cdst->mc_dbi = csrc->mc_dbi; cdst->mc_db = csrc->mc_db; From edc936dc2806a679fd5e5c0f9666d1f95a93cc34 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 16 Jun 2017 04:16:30 +0300 
Subject: [PATCH 249/303] mdbx: rework mdbx_oomkick() and speedup mdbx_find_oldest(). Change-Id: Icc2c450e7f10efe1b4ab8705e6a659cece256dc1 --- src/mdbx.c | 93 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 41 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 6697eb93..1268e1c2 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -749,7 +749,7 @@ const char *__cold mdbx_strerror(int errnum) { return msg; } -static txnid_t mdbx_oomkick(MDBX_env *env, txnid_t oldest); +static txnid_t mdbx_oomkick(MDBX_env *env, const txnid_t laggard); void __cold mdbx_debug_log(int type, const char *function, int line, const char *fmt, ...) { @@ -1481,28 +1481,29 @@ static const char *mdbx_durable_str(const MDBX_meta *const meta) { /*----------------------------------------------------------------------------*/ /* Find oldest txnid still referenced. */ -static txnid_t mdbx_find_oldest(MDBX_txn *txn, int *laggard) { +static txnid_t mdbx_find_oldest(MDBX_txn *txn) { MDBX_env *env = txn->mt_env; mdbx_assert(env, (txn->mt_flags & MDBX_RDONLY) == 0); - int i, reader; + const txnid_t last_oldest = env->me_oldest[0]; txnid_t oldest = txn->mt_txnid - 1; - const MDBX_reader *const r = env->me_lck->mti_readers; - for (reader = -1, i = env->me_lck->mti_numreaders; --i >= 0;) { - if (r[i].mr_pid) { + mdbx_assert(env, oldest >= last_oldest); + + const MDBX_reader *const rtbl = env->me_lck->mti_readers; + for (int i = env->me_lck->mti_numreaders; + oldest != last_oldest && --i >= 0;) { + if (rtbl[i].mr_pid) { mdbx_jitter4testing(true); - txnid_t snap = r[i].mr_txnid; - if (oldest > snap) { + const txnid_t snap = rtbl[i].mr_txnid; + if (oldest > snap && last_oldest <= /* ignore pending updates */ snap) oldest = snap; - reader = i; - } } } - if (laggard) - *laggard = reader; - - *env->me_oldest = oldest; + if (oldest != last_oldest) { + mdbx_assert(env, oldest >= env->me_oldest[0]); + env->me_oldest[0] = oldest; + } return oldest; } @@ -1613,7 +1614,7 @@ static 
int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (unlikely(!(flags & MDBX_ALLOC_GC))) break; - oldest = (flags & MDBX_LIFORECLAIM) ? mdbx_find_oldest(txn, NULL) + oldest = (flags & MDBX_LIFORECLAIM) ? mdbx_find_oldest(txn) : env->me_oldest[0]; mdbx_cursor_init(&m2, txn, FREE_DBI, NULL); if (flags & MDBX_LIFORECLAIM) { @@ -1644,7 +1645,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if (rc == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { if (op == MDBX_SET_RANGE) continue; - if (oldest < mdbx_find_oldest(txn, NULL)) { + if (oldest < mdbx_find_oldest(txn)) { oldest = *env->me_oldest; last = oldest - 1; key.iov_base = &last; @@ -1661,7 +1662,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, last = *(txnid_t *)key.iov_base; if (oldest <= last) { - oldest = mdbx_find_oldest(txn, NULL); + oldest = mdbx_find_oldest(txn); if (oldest <= last) { if (flags & MDBX_LIFORECLAIM) continue; @@ -1791,7 +1792,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, mdbx_assert(env, env->me_sync_pending > 0); MDBX_meta meta = *head; if (mdbx_sync_locked(env, me_flags, &meta) == MDBX_SUCCESS) { - txnid_t snap = mdbx_find_oldest(txn, NULL); + txnid_t snap = mdbx_find_oldest(txn); if (snap > oldest) continue; } @@ -9879,28 +9880,39 @@ int __cold mdbx_setup_debug(int flags, MDBX_debug_func *logger) { return ret; } -static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { +static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) { mdbx_debug("DB size maxed out"); int retry; for (retry = 0; retry < INT_MAX; ++retry) { - int reader; - if (MDBX_IS_ERROR(mdbx_reader_check0(env, false, NULL))) break; - txnid_t snap = mdbx_find_oldest(env->me_txn, &reader); - if (oldest < snap || reader < 0) { - if (retry && env->me_oom_func) { - /* LY: notify end of oom-loop */ - const txnid_t gap = snap - oldest; - env->me_oom_func(env, 0, 0, oldest, - (gap < UINT_MAX) ? 
(unsigned)gap : UINT_MAX, -retry); + txnid_t oldest = env->me_txn0->mt_txnid - 1; + MDBX_reader *const rtbl = env->me_lck->mti_readers; + MDBX_reader *asleep = nullptr; + for (int i = env->me_lck->mti_numreaders; --i >= 0;) { + if (rtbl[i].mr_pid) { + mdbx_jitter4testing(true); + const txnid_t snap = rtbl[i].mr_txnid; + if (oldest > snap && laggard <= /* ignore pending updates */ snap) { + oldest = snap; + asleep = &rtbl[i]; + } } - return snap; } - MDBX_reader *r; + if (laggard < oldest || !asleep) { + if (retry && env->me_oom_func) { + /* LY: notify end of oom-loop */ + const txnid_t gap = oldest - laggard; + env->me_oom_func(env, 0, 0, laggard, + (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, -retry); + } + mdbx_assert(env, env->me_oldest[0] <= oldest); + return env->me_oldest[0] = oldest; + } + mdbx_tid_t tid; mdbx_pid_t pid; int rc; @@ -9908,24 +9920,23 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { if (!env->me_oom_func) break; - r = &env->me_lck->mti_readers[reader]; - pid = r->mr_pid; - tid = r->mr_tid; - if (r->mr_txnid != oldest || pid <= 0) + pid = asleep->mr_pid; + tid = asleep->mr_tid; + if (asleep->mr_txnid != laggard || pid <= 0) continue; const txnid_t gap = - mdbx_meta_txnid_stable(env, mdbx_meta_head(env)) - oldest; - rc = env->me_oom_func(env, pid, tid, oldest, + mdbx_meta_txnid_stable(env, mdbx_meta_head(env)) - laggard; + rc = env->me_oom_func(env, pid, tid, laggard, (gap < UINT_MAX) ? 
(unsigned)gap : UINT_MAX, retry); if (rc < 0) break; if (rc) { - r->mr_txnid = ~(txnid_t)0; + asleep->mr_txnid = ~(txnid_t)0; if (rc > 1) { - r->mr_tid = 0; - r->mr_pid = 0; + asleep->mr_tid = 0; + asleep->mr_pid = 0; mdbx_coherent_barrier(); } } @@ -9933,9 +9944,9 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, txnid_t oldest) { if (retry && env->me_oom_func) { /* LY: notify end of oom-loop */ - env->me_oom_func(env, 0, 0, oldest, 0, -retry); + env->me_oom_func(env, 0, 0, laggard, 0, -retry); } - return mdbx_find_oldest(env->me_txn, NULL); + return mdbx_find_oldest(env->me_txn); } int __cold mdbx_env_set_syncbytes(MDBX_env *env, size_t bytes) { From f93818a926e01803583643c78b95c9281520b5d1 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 21 Jun 2017 01:20:21 +0300 Subject: [PATCH 250/303] mdbx: fix MAX_MAPSIZE for Win32. Change-Id: I04cbffb14ab7c73d935614b2f5c8140000b78d76 --- src/bits.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/bits.h b/src/bits.h index e80b7973..8776ecec 100644 --- a/src/bits.h +++ b/src/bits.h @@ -345,11 +345,16 @@ typedef struct MDBX_page { #define MIN_PAGESIZE 512u #define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO) -#define MAX_MAPSIZE \ - ((sizeof(size_t) < 8) \ - ? UINT32_C(0x7ff80000) \ - : ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \ - : MAX_PAGENO * (uint64_t)MAX_PAGESIZE)) +#if defined(_WIN32) || defined(_WIN64) +#define MAX_MAPSIZE32 UINT32_C(0x38000000) +#else +#define MAX_MAPSIZE32 UINT32_C(0x7ff80000) +#endif +#define MAX_MAPSIZE64 \ + ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \ + : MAX_PAGENO * (uint64_t)MAX_PAGESIZE) + +#define MAX_MAPSIZE ((sizeof(size_t) < 8) ? MAX_MAPSIZE32 : MAX_MAPSIZE64) #pragma pack(pop) From 151d4540de65b7cd907fd71f2ad83f71187d8c9c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 21 Jun 2017 01:19:04 +0300 Subject: [PATCH 251/303] mdbx: refine internal constant (use UINT32_C, etc minors). 
Change-Id: I0f994ee75b5aa1494fcc8ca42a46120865676e25 --- Makefile | 2 +- src/bits.h | 25 ++++++++++++++----------- src/mdbx.c | 12 +++++++----- src/tools/mdbx_chk.c | 4 +--- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index 2a2b762d..bdcaf3e3 100644 --- a/Makefile +++ b/Makefile @@ -73,7 +73,7 @@ clean: rm -rf $(TOOLS) test/test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err src/*.o test/*.o check: test/test mdbx_chk - rm -f $(TESTDB) && (set -o pipefail; test/test --pathname=$(TESTDB) --dont-cleanup-after basic | tee test.log | tail -n 42) && ./mdbx_chk -vn $(TESTDB) + rm -f $(TESTDB) test.log && (set -o pipefail; test/test --pathname=$(TESTDB) --dont-cleanup-after basic | tee -a test.log | tail -n 42) && ./mdbx_chk -vn $(TESTDB) define core-rule $(patsubst %.c,%.o,$(1)): $(1) $(CORE_INC) mdbx.h Makefile diff --git a/src/bits.h b/src/bits.h index 8776ecec..7d8d37b2 100644 --- a/src/bits.h +++ b/src/bits.h @@ -113,6 +113,7 @@ #define MAIN_DBI 1 /* Number of DBs in metapage (free and main) - also hardcoded elsewhere */ #define CORE_DBS 2 +#define MAX_DBI (INT16_MAX - CORE_DBS) /* Number of meta pages - also hardcoded elsewhere */ #define NUM_METAS 3 @@ -142,6 +143,8 @@ typedef uint64_t txnid_t; * this is plenty. */ typedef uint16_t indx_t; +#define MEGABYTE ((size_t)1 << 20) + /*----------------------------------------------------------------------------*/ /* Core structures for database and shared memory (i.e. format definition) */ #pragma pack(push, 1) @@ -472,8 +475,8 @@ typedef struct MDBX_dbx { /* A database transaction. * Every operation requires a transaction handle. 
*/ struct MDBX_txn { -#define MDBX_MT_SIGNATURE (0x93D53A31) - unsigned mt_signature; +#define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) + uint32_t mt_signature; MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ MDBX_txn *mt_child; @@ -567,10 +570,10 @@ struct MDBX_xcursor; * Exception: An xcursor's pointer to a P_SUBP page can be stale. * (A node with F_DUPDATA but no F_SUBDATA contains a subpage). */ struct MDBX_cursor { -#define MDBX_MC_SIGNATURE (0xFE05D5B1) -#define MDBX_MC_READY4CLOSE (0x2817A047) -#define MDBX_MC_WAIT4EOT (0x90E297A7) - unsigned mc_signature; +#define MDBX_MC_SIGNATURE UINT32_C(0xFE05D5B1) +#define MDBX_MC_READY4CLOSE UINT32_C(0x2817A047) +#define MDBX_MC_WAIT4EOT UINT32_C(0x90E297A7) + uint32_t mc_signature; /* Next cursor on this DB in this txn */ MDBX_cursor *mc_next; /* Backup of the original cursor if this cursor is a shadow */ @@ -639,16 +642,16 @@ typedef struct MDBX_pgstate { /* The database environment. */ struct MDBX_env { -#define MDBX_ME_SIGNATURE (0x9A899641) - unsigned me_signature; +#define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) + uint32_t me_signature; mdbx_filehandle_t me_fd; /* The main data file */ mdbx_filehandle_t me_lfd; /* The lock file */ /* Failed to update the meta page. Probably an I/O error. */ -#define MDBX_FATAL_ERROR 0x80000000U +#define MDBX_FATAL_ERROR UINT32_C(0x80000000) /* Some fields are initialized. 
*/ -#define MDBX_ENV_ACTIVE 0x20000000U +#define MDBX_ENV_ACTIVE UINT32_C(0x20000000) /* me_txkey is set */ -#define MDBX_ENV_TXKEY 0x10000000U +#define MDBX_ENV_TXKEY UINT32_C(0x10000000) uint32_t me_flags; /* see mdbx_env */ unsigned me_psize; /* DB page size, inited from me_os_psize */ unsigned me_psize2log; /* log2 of DB page size */ diff --git a/src/mdbx.c b/src/mdbx.c index 1268e1c2..edd0387b 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1380,9 +1380,7 @@ static __inline void mdbx_meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, static __inline uint64_t mdbx_meta_sign(const MDBX_meta *meta) { uint64_t sign = MDBX_DATASIGN_NONE; #if 0 /* TODO */ - sign = hippeus_hash64(&meta->mm_mapsize, - sizeof(MDBX_meta) - offsetof(MDBX_meta, mm_mapsize), - meta->mm_version | (uint64_t)MDBX_DXD_MAGIC << 32); + sign = hippeus_hash64(...); #else (void)meta; #endif @@ -4060,6 +4058,9 @@ int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { } int __cold mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { + if (unlikely(dbs > MAX_DBI)) + return MDBX_EINVAL; + if (unlikely(!env)) return MDBX_EINVAL; @@ -4961,6 +4962,7 @@ static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **ret, mapped: p = pgno2page(env, pgno); +/* TODO: check p->mp_validator here */ done: *ret = p; @@ -9759,7 +9761,7 @@ int __cold mdbx_reader_check(MDBX_env *env, int *dead) { return MDBX_EINVAL; if (dead) *dead = 0; - return mdbx_reader_check0(env, 0, dead); + return mdbx_reader_check0(env, false, dead); } /* Return: @@ -10164,7 +10166,7 @@ int __cold mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, ctx.mw_user = user; ctx.mw_visitor = visitor; - int rc = visitor(0, NUM_METAS, user, "mdbx", "meta", NUM_METAS, + int rc = visitor(0, NUM_METAS, user, "meta", "meta", NUM_METAS, sizeof(MDBX_meta) * NUM_METAS, PAGEHDRSZ * NUM_METAS, (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * NUM_METAS); diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 7c14188c..dabe2f82 
100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -48,8 +48,6 @@ static void signal_handler(int sig) { gotsignal = 1; } -#define MAX_DBI 32768 - #define EXIT_INTERRUPTED (EXIT_FAILURE + 4) #define EXIT_FAILURE_SYS (EXIT_FAILURE + 3) #define EXIT_FAILURE_MDB (EXIT_FAILURE + 2) @@ -809,7 +807,7 @@ int main(int argc, char *argv[]) { } rc = mdbx_env_set_maxdbs(env, MAX_DBI); - if (rc < 0) { + if (rc) { error("mdbx_env_set_maxdbs failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } From 10b01970261010eb8b00f317d2607512ecc62fb2 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 21 Jun 2017 01:41:25 +0300 Subject: [PATCH 252/303] mdbx: drop mdbx_get_errno_checked(). Change-Id: Ifa829b465083f17eba911e19947bdae854f9759c --- src/lck-windows.c | 35 +++++++++++++++++------------------ src/osal.c | 44 ++++++++++++++++++++------------------------ src/osal.h | 15 --------------- 3 files changed, 37 insertions(+), 57 deletions(-) diff --git a/src/lck-windows.c b/src/lck-windows.c index 7a083136..7bbef4d9 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -129,7 +129,7 @@ static __inline BOOL funlock(mdbx_filehandle_t fd, uint64_t offset, int mdbx_txn_lock(MDBX_env *env) { if (flock(env->me_fd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_BODY)) return MDBX_SUCCESS; - return mdbx_get_errno_checked(); + return GetLastError(); } void mdbx_txn_unlock(MDBX_env *env) { @@ -155,7 +155,7 @@ int mdbx_rdt_lock(MDBX_env *env) { /* transite from S-? (used) to S-E (locked), e.g. exlcusive lock upper-part */ if (flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) return MDBX_SUCCESS; - return mdbx_get_errno_checked(); + return GetLastError(); } void mdbx_rdt_unlock(MDBX_env *env) { @@ -196,7 +196,7 @@ static int internal_seize_lck(HANDLE lfd) { /* 1) now on ?-? 
(free), get ?-E (middle) */ mdbx_jitter4testing(false); if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { - rc = mdbx_get_errno_checked() /* 2) something went wrong, give up */; + rc = GetLastError() /* 2) something went wrong, give up */; mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "?-?(free) >> ?-E(middle)", rc); return rc; @@ -208,7 +208,7 @@ static int internal_seize_lck(HANDLE lfd) { return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive), done */ /* 5) still on ?-E (middle) */ - rc = mdbx_get_errno_checked(); + rc = GetLastError(); mdbx_jitter4testing(false); if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, give up */ @@ -220,9 +220,8 @@ static int internal_seize_lck(HANDLE lfd) { /* 7) still on ?-E (middle), try S-E (locked) */ mdbx_jitter4testing(false); - rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) - ? MDBX_RESULT_FALSE - : mdbx_get_errno_checked(); + rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE + : GetLastError(); mdbx_jitter4testing(false); if (rc != MDBX_RESULT_FALSE) @@ -247,7 +246,7 @@ int mdbx_lck_seize(MDBX_env *env) { /* LY: without-lck mode (e.g. on read-only filesystem) */ mdbx_jitter4testing(false); if (!flock(env->me_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { - rc = mdbx_get_errno_checked(); + rc = GetLastError(); mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); return rc; } @@ -264,7 +263,7 @@ int mdbx_lck_seize(MDBX_env *env) { * - we can't lock meta-pages, otherwise other process could get an error * while opening db in valid (non-conflict) mode. 
*/ if (!flock(env->me_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) { - rc = mdbx_get_errno_checked(); + rc = GetLastError(); mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lock-against-without-lck", rc); mdbx_jitter4testing(false); @@ -292,7 +291,7 @@ int mdbx_lck_downgrade(MDBX_env *env) { /* 2) now at ?-E (middle), transite to S-E (locked) */ if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { - int rc = mdbx_get_errno_checked() /* 3) something went wrong, give up */; + int rc = GetLastError() /* 3) something went wrong, give up */; return rc; } @@ -320,7 +319,7 @@ int mdbx_lck_upgrade(MDBX_env *env) { return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive), done */ /* 5) still on ?-E (middle) */ - int rc = mdbx_get_errno_checked(); + int rc = GetLastError(); mdbx_jitter4testing(false); if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, report but continue */ @@ -332,7 +331,7 @@ int mdbx_lck_upgrade(MDBX_env *env) { mdbx_jitter4testing(false); rc = flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? 
MDBX_RESULT_FALSE - : mdbx_get_errno_checked(); + : GetLastError(); mdbx_jitter4testing(false); if (rc != MDBX_RESULT_FALSE) { @@ -352,14 +351,14 @@ void mdbx_lck_destroy(MDBX_env *env) { /* double `unlock` for robustly remove overlapped shared/exclusive locks */ while (funlock(env->me_lfd, LCK_LOWER)) ; - rc = mdbx_get_errno_checked(); + rc = GetLastError(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); while (funlock(env->me_lfd, LCK_UPPER)) ; - rc = mdbx_get_errno_checked(); + rc = GetLastError(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); @@ -370,21 +369,21 @@ void mdbx_lck_destroy(MDBX_env *env) { * releases such locks via deferred queues) */ while (funlock(env->me_fd, LCK_BODY)) ; - rc = mdbx_get_errno_checked(); + rc = GetLastError(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); while (funlock(env->me_fd, LCK_META)) ; - rc = mdbx_get_errno_checked(); + rc = GetLastError(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); while (funlock(env->me_fd, LCK_WHOLE)) ; - rc = mdbx_get_errno_checked(); + rc = GetLastError(); assert(rc == ERROR_NOT_LOCKED); (void)rc; SetLastError(ERROR_SUCCESS); @@ -418,7 +417,7 @@ int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { rc = WaitForSingleObject(hProcess, 0); CloseHandle(hProcess); } else { - rc = mdbx_get_errno_checked(); + rc = GetLastError(); } switch (rc) { diff --git a/src/osal.c b/src/osal.c index 3d216d73..02b6ce5c 100644 --- a/src/osal.c +++ b/src/osal.c @@ -22,7 +22,7 @@ static int waitstatus2errcode(DWORD result) { case WAIT_OBJECT_0: return MDBX_SUCCESS; case WAIT_FAILED: - return mdbx_get_errno_checked(); + return GetLastError(); case WAIT_ABANDONED: return ERROR_ABANDONED_WAIT_0; case WAIT_IO_COMPLETION: @@ -202,11 +202,11 @@ int mdbx_condmutex_init(mdbx_condmutex_t *condmutex) { condmutex->event = NULL; condmutex->mutex = CreateMutex(NULL, FALSE, NULL); if (!condmutex->mutex) - return 
mdbx_get_errno_checked(); + return GetLastError(); condmutex->event = CreateEvent(NULL, FALSE, FALSE, NULL); if (!condmutex->event) { - rc = mdbx_get_errno_checked(); + rc = GetLastError(); (void)CloseHandle(condmutex->mutex); condmutex->mutex = NULL; } @@ -235,14 +235,12 @@ int mdbx_condmutex_destroy(mdbx_condmutex_t *condmutex) { int rc = MDBX_EINVAL; #if defined(_WIN32) || defined(_WIN64) if (condmutex->event) { - rc = - CloseHandle(condmutex->event) ? MDBX_SUCCESS : mdbx_get_errno_checked(); + rc = CloseHandle(condmutex->event) ? MDBX_SUCCESS : GetLastError(); if (rc == MDBX_SUCCESS) condmutex->event = NULL; } if (condmutex->mutex) { - rc = - CloseHandle(condmutex->mutex) ? MDBX_SUCCESS : mdbx_get_errno_checked(); + rc = CloseHandle(condmutex->mutex) ? MDBX_SUCCESS : GetLastError(); if (rc == MDBX_SUCCESS) condmutex->mutex = NULL; } @@ -272,8 +270,7 @@ int mdbx_condmutex_lock(mdbx_condmutex_t *condmutex) { int mdbx_condmutex_unlock(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) - return ReleaseMutex(condmutex->mutex) ? MDBX_SUCCESS - : mdbx_get_errno_checked(); + return ReleaseMutex(condmutex->mutex) ? MDBX_SUCCESS : GetLastError(); #else return pthread_mutex_unlock(&condmutex->mutex); #endif @@ -281,7 +278,7 @@ int mdbx_condmutex_unlock(mdbx_condmutex_t *condmutex) { int mdbx_condmutex_signal(mdbx_condmutex_t *condmutex) { #if defined(_WIN32) || defined(_WIN64) - return SetEvent(condmutex->event) ? MDBX_SUCCESS : mdbx_get_errno_checked(); + return SetEvent(condmutex->event) ? 
MDBX_SUCCESS : GetLastError(); #else return pthread_cond_signal(&condmutex->cond); #endif @@ -385,14 +382,14 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode, CreationDisposition, FlagsAndAttributes, NULL); if (*fd == INVALID_HANDLE_VALUE) - return mdbx_get_errno_checked(); - if ((flags & O_CREAT) && mdbx_get_errno_checked() != ERROR_ALREADY_EXISTS) { + return GetLastError(); + if ((flags & O_CREAT) && GetLastError() != ERROR_ALREADY_EXISTS) { /* set FILE_ATTRIBUTE_NOT_CONTENT_INDEXED for new file */ DWORD FileAttributes = GetFileAttributesA(pathname); if (FileAttributes == INVALID_FILE_ATTRIBUTES || !SetFileAttributesA(pathname, FileAttributes | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED)) { - int rc = mdbx_get_errno_checked(); + int rc = GetLastError(); CloseHandle(*fd); *fd = INVALID_HANDLE_VALUE; return rc; @@ -417,7 +414,7 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode, int mdbx_closefile(mdbx_filehandle_t fd) { #if defined(_WIN32) || defined(_WIN64) - return CloseHandle(fd) ? MDBX_SUCCESS : mdbx_get_errno_checked(); + return CloseHandle(fd) ? MDBX_SUCCESS : GetLastError(); #else return (close(fd) == 0) ? MDBX_SUCCESS : errno; #endif @@ -435,7 +432,7 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, uint64_t offset) { DWORD read = 0; if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) { - int rc = mdbx_get_errno_checked(); + int rc = GetLastError(); return (rc == MDBX_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc; } #else @@ -464,7 +461,7 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, DWORD written; if (likely(WriteFile(fd, buf, (DWORD)bytes, &written, &ov))) return (bytes == written) ? 
MDBX_SUCCESS : MDBX_EIO /* ERROR_WRITE_FAULT */; - return mdbx_get_errno_checked(); + return GetLastError(); #else int rc; ssize_t written; @@ -524,7 +521,7 @@ int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { #if defined(_WIN32) || defined(_WIN64) DWORD written; if (unlikely(!WriteFile(fd, ptr, (DWORD)chunk, &written, NULL))) - return mdbx_get_errno_checked(); + return GetLastError(); #else ssize_t written = write(fd, ptr, chunk); if (written < 0) { @@ -557,7 +554,7 @@ int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync) { #if defined(_WIN32) || defined(_WIN64) (void)fullsync; - return FlushFileBuffers(fd) ? MDBX_SUCCESS : mdbx_get_errno_checked(); + return FlushFileBuffers(fd) ? MDBX_SUCCESS : GetLastError(); #elif __GLIBC_PREREQ(2, 16) || _BSD_SOURCE || _XOPEN_SOURCE || \ (__GLIBC_PREREQ(2, 8) && _POSIX_C_SOURCE >= 200112L) for (;;) { @@ -583,7 +580,7 @@ int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { #if defined(_WIN32) || defined(_WIN64) BY_HANDLE_FILE_INFORMATION info; if (!GetFileInformationByHandle(fd, &info)) - return mdbx_get_errno_checked(); + return GetLastError(); *length = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32; #else struct stat st; @@ -604,7 +601,7 @@ int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { li.QuadPart = length; return (SetFilePointerEx(fd, li, NULL, FILE_BEGIN) && SetEndOfFile(fd)) ? MDBX_SUCCESS - : mdbx_get_errno_checked(); + : GetLastError(); #else STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), "libmdbx requires 64-bit file I/O on 64-bit systems"); @@ -617,7 +614,7 @@ int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { int mdbx_thread_key_create(mdbx_thread_key_t *key) { #if defined(_WIN32) || defined(_WIN64) *key = TlsAlloc(); - return (*key != TLS_OUT_OF_INDEXES) ? MDBX_SUCCESS : mdbx_get_errno_checked(); + return (*key != TLS_OUT_OF_INDEXES) ? 
MDBX_SUCCESS : GetLastError(); #else return pthread_key_create(key, mdbx_rthc_dtor); #endif @@ -652,7 +649,7 @@ int mdbx_thread_create(mdbx_thread_t *thread, void *arg) { #if defined(_WIN32) || defined(_WIN64) *thread = CreateThread(NULL, 0, start_routine, arg, 0, NULL); - return *thread ? MDBX_SUCCESS : mdbx_get_errno_checked(); + return *thread ? MDBX_SUCCESS : GetLastError(); #else return pthread_create(thread, NULL, start_routine, arg); #endif @@ -674,8 +671,7 @@ int mdbx_msync(void *addr, size_t length, int async) { #if defined(_WIN32) || defined(_WIN64) if (async) return MDBX_SUCCESS; - return FlushViewOfFile(addr, length) ? MDBX_SUCCESS - : mdbx_get_errno_checked(); + return FlushViewOfFile(addr, length) ? MDBX_SUCCESS : GetLastError(); #else const int mode = async ? MS_ASYNC : MS_SYNC; return (msync(addr, length, mode) == 0) ? MDBX_SUCCESS : errno; diff --git a/src/osal.h b/src/osal.h index 684fed86..0c62da1d 100644 --- a/src/osal.h +++ b/src/osal.h @@ -396,21 +396,6 @@ static __inline int mdbx_get_errno(void) { return rc; } -static __inline int __mdbx_get_errno_checked(const char *file, unsigned line) { -#if defined(_WIN32) || defined(_WIN64) - DWORD rc = GetLastError(); - if (unlikely(rc == MDBX_EINVAL)) - mdbx_assert_fail(nullptr, "unexpected ERROR_INVALID_PARAMETER", file, line); -#else - int rc = errno; - if (unlikely(rc == MDBX_EINVAL)) - mdbx_assert_fail(nullptr, "unexpected EINVAL", file, line); -#endif - return rc; -} - -#define mdbx_get_errno_checked() __mdbx_get_errno_checked(__FILE__, __LINE__) - int mdbx_memalign_alloc(size_t alignment, size_t bytes, void **result); void mdbx_memalign_free(void *ptr); From 45defdc170c2ec01a5cd6406dc91a229577f970b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 21 Jun 2017 01:42:27 +0300 Subject: [PATCH 253/303] mdbx: refine err returning (MDBX_EPERM, etc). 
Change-Id: Iaa21a0e6632be47d6ef2a3676ba5e1381fc03b4d --- src/mdbx.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index edd0387b..66ebd74f 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3847,7 +3847,7 @@ fail: int __cold mdbx_env_get_maxkeysize(MDBX_env *env) { if (!env || env->me_signature != MDBX_ME_SIGNATURE || !env->me_maxkey_limit) - return MDBX_EINVAL; + return -MDBX_EINVAL; return env->me_maxkey_limit; } @@ -4068,21 +4068,24 @@ int __cold mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { return MDBX_EBADSIGN; if (unlikely(env->me_map)) - return MDBX_EINVAL; + return MDBX_EPERM; env->me_maxdbs = dbs + CORE_DBS; return MDBX_SUCCESS; } int __cold mdbx_env_set_maxreaders(MDBX_env *env, unsigned readers) { - if (unlikely(!env || readers < 1)) + if (unlikely(readers < 1 || readers > INT16_MAX)) + return MDBX_EINVAL; + + if (unlikely(!env)) return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(env->me_map || readers > INT16_MAX)) - return MDBX_EINVAL; + if (unlikely(env->me_map)) + return MDBX_EPERM; env->me_maxreaders = readers; return MDBX_SUCCESS; From 9a2fff91f38dbe29de27c4a85b6c2bc2ccf69a7e Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 21 Jun 2017 01:34:56 +0300 Subject: [PATCH 254/303] mdbx: dynamic mapsize/geometry and API (mostly done). 
Change-Id: Ifb768ff3207cae1755c53c211a3ae552c6455e12 --- mdbx.h | 26 +- src/bits.h | 59 +- src/mdbx.c | 1191 ++++++++++++++++++++++++++++------------- src/osal.c | 210 ++++++-- src/osal.h | 21 +- src/tools/mdbx_chk.c | 48 +- src/tools/mdbx_dump.c | 2 - src/tools/mdbx_load.c | 4 +- src/tools/mdbx_stat.c | 74 +-- 9 files changed, 1133 insertions(+), 502 deletions(-) diff --git a/mdbx.h b/mdbx.h index f875fef2..65ff15e4 100644 --- a/mdbx.h +++ b/mdbx.h @@ -72,6 +72,7 @@ typedef unsigned mode_t; typedef HANDLE mdbx_filehandle_t; typedef DWORD mdbx_pid_t; typedef DWORD mdbx_tid_t; +typedef SSIZE_T ssize_t; #define MDBX_ENODATA ERROR_HANDLE_EOF #define MDBX_EINVAL ERROR_INVALID_PARAMETER #define MDBX_EACCESS ERROR_ACCESS_DENIED @@ -437,16 +438,24 @@ typedef struct MDBX_stat { /* Information about the environment */ typedef struct MDBX_envinfo { - void *me_mapaddr; /* Address of map, if fixed */ - uint64_t me_mapsize; /* Size of the data memory map */ - uint64_t me_recent_pgno; /* ID of the last used page */ - uint64_t me_recent_txnid; /* ID of the last committed transaction */ - uint32_t me_maxreaders; /* max reader slots in the environment */ - uint32_t me_numreaders; /* max reader slots used in the environment */ + struct { + uint64_t lower; /* lower limit for datafile size */ + uint64_t upper; /* upper limit for datafile size */ + uint64_t current; /* current datafile size */ + uint64_t shrink; /* shrink theshold for datafile */ + uint64_t grow; /* growth step for datafile */ + } me_geo; + uint64_t me_mapsize; /* Size of the data memory map */ + uint64_t me_last_pgno; /* ID of the last used page */ + uint64_t me_recent_txnid; /* ID of the last committed transaction */ uint64_t me_latter_reader_txnid; /* ID of the last reader transaction */ uint64_t me_meta0_txnid, me_meta0_sign; uint64_t me_meta1_txnid, me_meta1_sign; uint64_t me_meta2_txnid, me_meta2_sign; + uint32_t me_maxreaders; /* max reader slots in the environment */ + uint32_t me_numreaders; /* max reader 
slots used in the environment */ + uint32_t me_dxb_pagesize; /* database pagesize */ + uint32_t me_sys_pagesize; /* system pagesize */ } MDBX_envinfo; /* Return a string describing a given error code. @@ -800,6 +809,11 @@ LIBMDBX_API int mdbx_env_get_fd(MDBX_env *env, mdbx_filehandle_t *fd); * - MDBX_EINVAL - an invalid parameter was specified, * or the environment has an active write transaction. */ LIBMDBX_API int mdbx_env_set_mapsize(MDBX_env *env, size_t size); +LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower, + ssize_t size_now, ssize_t size_upper, + ssize_t growth_step, + ssize_t shrink_threshold, + ssize_t pagesize); /* Set the maximum number of threads/reader slots for the environment. * diff --git a/src/bits.h b/src/bits.h index 7d8d37b2..69689bb3 100644 --- a/src/bits.h +++ b/src/bits.h @@ -125,7 +125,7 @@ typedef uint32_t pgno_t; #define PRIaPGNO PRIu32 #define MAX_PAGENO ((pgno_t)UINT64_C(0xffffFFFFffff)) -#define MIN_PAGENO (NUM_METAS - 1) +#define MIN_PAGENO NUM_METAS /* A transaction ID. */ typedef uint64_t txnid_t; @@ -247,12 +247,17 @@ typedef struct MDBX_meta { * zero (nothing) for now */ uint8_t mm_extra_pagehdr; /* extra bytes in the page header, * zero (nothing) for now */ - /* Last used page in the datafile. - * Actually the file may be shorter if the freeDB lists the final pages. */ - pgno_t mm_last_pg; - uint64_t mm_dbsize_min; /* minimal size of db */ - uint64_t mm_dbsize_max; /* maximal size of db */ + struct { + uint16_t grow; /* datafile growth step in pages */ + uint16_t shrink; /* datafile shrink threshold in pages */ + pgno_t lower; /* minimal size of datafile in pages */ + pgno_t upper; /* maximal size of datafile in pages */ + pgno_t now; /* current size of datafile in pages */ + pgno_t next; /* first unused page in the datafile, + * but actually the file may be shorter. 
*/ + } mm_geo; + MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ /* The size of pages used in this DB */ #define mm_psize mm_dbs[FREE_DBI].md_xsize @@ -268,9 +273,6 @@ typedef struct MDBX_meta { #define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign) volatile uint64_t mm_datasync_sign; - /* to be removed */ - uint64_t mm_mapsize; /* current size of mmap region */ - /* txnid that committed this page, the second of a two-phase-update pair */ volatile txnid_t mm_txnid_b; } MDBX_meta; @@ -481,6 +483,7 @@ struct MDBX_txn { /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ MDBX_txn *mt_child; pgno_t mt_next_pgno; /* next unallocated page */ + pgno_t mt_end_pgno; /* corresponding to the current size of datafile */ /* The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. */ @@ -636,8 +639,9 @@ typedef struct MDBX_xcursor { /* State of FreeDB old pages, stored in the MDBX_env */ typedef struct MDBX_pgstate { - pgno_t *mf_pghead; /* Reclaimed freeDB pages, or NULL before use */ - txnid_t mf_pglast; /* ID of last used record, or 0 if !mf_pghead */ + pgno_t *mf_reclaimed_pglist; /* Reclaimed freeDB pages, or NULL before use */ + txnid_t mf_last_reclaimed; /* ID of last used record, or 0 if + !mf_reclaimed_pglist */ } MDBX_pgstate; /* The database environment. */ @@ -646,6 +650,10 @@ struct MDBX_env { uint32_t me_signature; mdbx_filehandle_t me_fd; /* The main data file */ mdbx_filehandle_t me_lfd; /* The lock file */ +#ifdef MDBX_OSAL_SECTION + MDBX_OSAL_SECTION me_dxb_section; + MDBX_OSAL_SECTION me_lck_section; +#endif /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) /* Some fields are initialized. 
*/ @@ -670,15 +678,14 @@ struct MDBX_env { MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ size_t me_mapsize; /* size of the data memory map */ - pgno_t me_maxpg; /* me_mapsize / me_psize */ MDBX_dbx *me_dbxs; /* array of static DB info */ uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ MDBX_pgstate me_pgstate; /* state of old pages from freeDB */ -#define me_pglast me_pgstate.mf_pglast -#define me_pghead me_pgstate.mf_pghead +#define me_last_reclaimed me_pgstate.mf_last_reclaimed +#define me_reclaimed_pglist me_pgstate.mf_reclaimed_pglist MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */ /* IDL of pages that became unused in a write txn */ MDBX_IDL me_free_pgs; @@ -702,6 +709,13 @@ struct MDBX_env { #ifdef USE_VALGRIND int me_valgrind_handle; #endif + struct { + size_t lower; /* minimal size of datafile */ + size_t upper; /* maximal size of datafile */ + size_t now; /* current size of datafile */ + size_t grow; /* step to grow datafile */ + size_t shrink; /* threshold to shrink datafile */ + } me_dbgeo; /* */ }; /* Nested transaction */ @@ -869,13 +883,24 @@ int mdbx_rthc_alloc(mdbx_thread_key_t *key, MDBX_reader *begin, void mdbx_rthc_remove(mdbx_thread_key_t key); void mdbx_rthc_cleanup(void); -static __inline bool is_power2(size_t x) { return (x & (x - 1)) == 0; } +static __inline bool mdbx_is_power2(size_t x) { return (x & (x - 1)) == 0; } -static __inline size_t roundup2(size_t value, size_t granularity) { - assert(is_power2(granularity)); +static __inline size_t mdbx_roundup2(size_t value, size_t granularity) { + assert(mdbx_is_power2(granularity)); return (value + granularity - 1) & ~(granularity - 1); } +static __inline unsigned mdbx_log2(size_t value) { + 
assert(mdbx_is_power2(value)); + + unsigned log = 0; + while (value > 1) { + log += 1; + value >>= 1; + } + return log; +} + #define MDBX_IS_ERROR(rc) \ ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) diff --git a/src/mdbx.c b/src/mdbx.c index 66ebd74f..269769d0 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1408,7 +1408,8 @@ static __inline bool mdbx_meta_ot(const MDBX_env *env, const MDBX_meta *a, static __inline bool mdbx_meta_eq(const MDBX_env *env, const MDBX_meta *a, const MDBX_meta *b) { mdbx_jitter4testing(true); - if (mdbx_meta_txnid_fluid(env, a) != mdbx_meta_txnid_fluid(env, b)) + const txnid_t txnid = mdbx_meta_txnid_fluid(env, a); + if (!txnid || txnid != mdbx_meta_txnid_fluid(env, b)) return false; mdbx_jitter4testing(true); @@ -1522,14 +1523,15 @@ static void mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { txn->mt_dirtyroom--; } -/* Allocate page numbers and memory for writing. Maintain me_pglast, - * me_pghead and mt_next_pgno. Set MDBX_TXN_ERROR on failure. +/* Allocate page numbers and memory for writing. Maintain me_last_reclaimed, + * me_reclaimed_pglist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. * * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. - * Do not modify the freedB, just merge freeDB records into me_pghead[] - * and move me_pglast to say which records were consumed. Only this - * function can create me_pghead and move me_pglast/mt_next_pgno. + * Do not modify the freedB, just merge freeDB records into me_reclaimed_pglist + * and move me_last_reclaimed to say which records were consumed. Only this + * function can create me_reclaimed_pglist and move + * me_last_reclaimed/mt_next_pgno. * * [in] mc cursor A cursor handle identifying the transaction and * database for which we are allocating. 
@@ -1551,12 +1553,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, int rc; MDBX_txn *txn = mc->mc_txn; MDBX_env *env = txn->mt_env; - pgno_t pgno, *mop = env->me_pghead; - unsigned i = 0, j, mop_len = mop ? mop[0] : 0, n2 = num - 1; MDBX_page *np; - txnid_t oldest = 0, last = 0; - MDBX_cursor_op op; - MDBX_cursor m2; if (likely(flags & MDBX_ALLOC_GC)) { flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); @@ -1582,6 +1579,12 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, } } + const MDBX_meta *head = mdbx_meta_head(env); + pgno_t pgno, *repg_list = env->me_reclaimed_pglist; + unsigned repg_pos = 0, repg_len = repg_list ? repg_list[0] : 0; + txnid_t oldest = 0, last = 0; + const unsigned wanna_range = num - 1; + /* If our dirty list is already full, we can't do anything */ if (unlikely(txn->mt_dirtyroom == 0)) { rc = MDBX_TXN_FULL; @@ -1589,41 +1592,40 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, } for (;;) { /* oom-kick retry loop */ - for (op = MDBX_FIRST;; + MDBX_cursor recur; + for (MDBX_cursor_op op = MDBX_FIRST;; op = (flags & MDBX_LIFORECLAIM) ? MDBX_PREV : MDBX_NEXT) { MDBX_val key, data; - MDBX_node *leaf; - pgno_t *idl; /* Seek a big enough contiguous page range. Prefer * pages at the tail, just truncating the list. 
*/ - if (likely(flags & MDBX_ALLOC_CACHE) && mop_len > n2 && + if (likely(flags & MDBX_ALLOC_CACHE) && repg_len > wanna_range && (!(flags & MDBX_COALESCE) || op == MDBX_FIRST)) { - i = mop_len; + repg_pos = repg_len; do { - pgno = mop[i]; - if (likely(mop[i - n2] == pgno + n2)) + pgno = repg_list[repg_pos]; + if (likely(repg_list[repg_pos - wanna_range] == pgno + wanna_range)) goto done; - } while (--i > n2); + } while (--repg_pos > wanna_range); } - if (op == MDBX_FIRST) { /* 1st iteration */ - /* Prepare to fetch more and coalesce */ + if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */ if (unlikely(!(flags & MDBX_ALLOC_GC))) - break; + break /* reclaiming is prohibited for now */; + /* Prepare to fetch more and coalesce */ oldest = (flags & MDBX_LIFORECLAIM) ? mdbx_find_oldest(txn) : env->me_oldest[0]; - mdbx_cursor_init(&m2, txn, FREE_DBI, NULL); + mdbx_cursor_init(&recur, txn, FREE_DBI, NULL); if (flags & MDBX_LIFORECLAIM) { /* Begin from oldest reader if any */ if (oldest > 2) { last = oldest - 1; op = MDBX_SET_RANGE; } - } else if (env->me_pglast) { - /* Continue lookup from env->me_pglast to higher/last */ - last = env->me_pglast; + } else if (env->me_last_reclaimed) { + /* Continue lookup from env->me_last_reclaimed to oldest reader */ + last = env->me_last_reclaimed; op = MDBX_SET_RANGE; } @@ -1632,14 +1634,15 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, } if (!(flags & MDBX_LIFORECLAIM)) { - /* Do not fetch more if the record will be too recent */ + /* Do not try fetch more if the record will be too recent */ if (op != MDBX_FIRST && ++last >= oldest) { + oldest = mdbx_find_oldest(txn); if (oldest <= last) break; } } - rc = mdbx_cursor_get(&m2, &key, NULL, op); + rc = mdbx_cursor_get(&recur, &key, NULL, op); if (rc == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { if (op == MDBX_SET_RANGE) continue; @@ -1649,7 +1652,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, key.iov_base = 
&last; key.iov_len = sizeof(last); op = MDBX_SET_RANGE; - rc = mdbx_cursor_get(&m2, &key, NULL, op); + rc = mdbx_cursor_get(&recur, &key, NULL, op); } } if (unlikely(rc)) { @@ -1669,18 +1672,21 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, } if (flags & MDBX_LIFORECLAIM) { + /* skip IDs of records that already reclaimed */ if (txn->mt_lifo_reclaimed) { - for (j = (unsigned)txn->mt_lifo_reclaimed[0]; j > 0; --j) - if (txn->mt_lifo_reclaimed[j] == last) + unsigned i; + for (i = (unsigned)txn->mt_lifo_reclaimed[0]; i > 0; --i) + if (txn->mt_lifo_reclaimed[i] == last) break; - if (j) + if (i) continue; } } - np = m2.mc_pg[m2.mc_top]; - leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); - if (unlikely((rc = mdbx_node_read(&m2, leaf, &data)) != MDBX_SUCCESS)) + /* Reading next FreeDB record */ + np = recur.mc_pg[recur.mc_top]; + MDBX_node *leaf = NODEPTR(np, recur.mc_ki[recur.mc_top]); + if (unlikely((rc = mdbx_node_read(&recur, leaf, &data)) != MDBX_SUCCESS)) goto fail; if ((flags & MDBX_LIFORECLAIM) && !txn->mt_lifo_reclaimed) { @@ -1691,70 +1697,85 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, } } - idl = (pgno_t *)data.iov_base; - mdbx_tassert(txn, idl[0] == 0 || - data.iov_len == (idl[0] + 1) * sizeof(pgno_t)); - i = idl[0]; - if (!mop) { - if (unlikely(!(env->me_pghead = mop = mdbx_midl_alloc(i)))) { + /* Append IDL from FreeDB record to me_reclaimed_pglist */ + pgno_t *re_idl = (pgno_t *)data.iov_base; + mdbx_tassert(txn, re_idl[0] == 0 || + data.iov_len == (re_idl[0] + 1) * sizeof(pgno_t)); + repg_pos = re_idl[0]; + if (!repg_list) { + if (unlikely(!(env->me_reclaimed_pglist = repg_list = + mdbx_midl_alloc(repg_pos)))) { rc = MDBX_ENOMEM; goto fail; } } else { - if (unlikely((rc = mdbx_midl_need(&env->me_pghead, i)) != 0)) + if (unlikely((rc = mdbx_midl_need(&env->me_reclaimed_pglist, + repg_pos)) != 0)) goto fail; - mop = env->me_pghead; + repg_list = env->me_reclaimed_pglist; } + + /* Remember ID of FreeDB 
record */ if (flags & MDBX_LIFORECLAIM) { if ((rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, last)) != 0) goto fail; } - env->me_pglast = last; + env->me_last_reclaimed = last; if (mdbx_debug_enabled(MDBX_DBG_EXTRA)) { mdbx_debug_extra("IDL read txn %" PRIaTXN " root %" PRIaPGNO " num %u, IDL", - last, txn->mt_dbs[FREE_DBI].md_root, i); - for (j = i; j; j--) - mdbx_debug_extra_print(" %" PRIaPGNO "", idl[j]); + last, txn->mt_dbs[FREE_DBI].md_root, repg_pos); + unsigned i; + for (i = repg_pos; i; i--) + mdbx_debug_extra_print(" %" PRIaPGNO "", re_idl[i]); mdbx_debug_extra_print("\n"); } /* Merge in descending sorted order */ - mdbx_midl_xmerge(mop, idl); - mop_len = mop[0]; + mdbx_midl_xmerge(repg_list, re_idl); + repg_len = repg_list[0]; if (unlikely((flags & MDBX_ALLOC_CACHE) == 0)) { - /* force gc reclaim mode */ + /* Done for a kick-reclaim mode, actually no page needed */ return MDBX_SUCCESS; } + /* Return suitable pages into "unallocated" pull */ + while (repg_len > wanna_range && + repg_list[repg_len] == txn->mt_next_pgno - 1) { + txn->mt_next_pgno -= 1; + repg_len -= 1; + repg_list[0] = repg_len; + } + /* Don't try to coalesce too much. 
*/ - if (mop_len > MDBX_IDL_UM_SIZE / 2) + if (repg_len > MDBX_IDL_UM_SIZE / 2) break; if (flags & MDBX_COALESCE) { - if (mop_len /* current size */ >= env->me_maxfree_1pg / 2 || - i /* prev size */ >= env->me_maxfree_1pg / 4) + if (repg_len /* current size */ >= env->me_maxfree_1pg / 2 || + repg_pos /* prev size */ >= env->me_maxfree_1pg / 4) flags &= ~MDBX_COALESCE; } } if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == (MDBX_COALESCE | MDBX_ALLOC_CACHE) && - mop_len > n2) { - i = mop_len; + repg_len > wanna_range) { + repg_pos = repg_len; do { - pgno = mop[i]; - if (mop[i - n2] == pgno + n2) + pgno = repg_list[repg_pos]; + if (repg_list[repg_pos - wanna_range] == pgno + wanna_range) goto done; - } while (--i > n2); + } while (--repg_pos > wanna_range); } /* Use new pages from the map when nothing suitable in the freeDB */ - i = 0; + repg_pos = 0; pgno = txn->mt_next_pgno; rc = MDBX_MAP_FULL; - if (likely(pgno + num <= env->me_maxpg)) { + const pgno_t next = pgno + num; + if (likely(next <= txn->mt_end_pgno)) { rc = MDBX_NOTFOUND; if (likely(flags & MDBX_ALLOC_NEW)) goto done; @@ -1762,7 +1783,6 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, if ((flags & MDBX_ALLOC_GC) && ((flags & MDBX_ALLOC_KICK) || rc == MDBX_MAP_FULL)) { - MDBX_meta *head = mdbx_meta_head(env); MDBX_meta *steady = mdbx_meta_steady(env); if (oldest == mdbx_meta_txnid_stable(env, steady) && META_IS_WEAK(head) && @@ -1783,13 +1803,11 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, mdbx_meta_txnid_stable(env, steady), mdbx_durable_str(steady), oldest); - unsigned me_flags = env->me_flags & MDBX_WRITEMAP; - if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC)) - me_flags |= MDBX_UTTERLY_NOSYNC; - - mdbx_assert(env, env->me_sync_pending > 0); + const unsigned flags = F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) + ? 
env->me_flags + : env->me_flags & MDBX_WRITEMAP; MDBX_meta meta = *head; - if (mdbx_sync_locked(env, me_flags, &meta) == MDBX_SUCCESS) { + if (mdbx_sync_locked(env, flags, &meta) == MDBX_SUCCESS) { txnid_t snap = mdbx_find_oldest(txn); if (snap > oldest) continue; @@ -1803,6 +1821,44 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, } } + if (rc == MDBX_MAP_FULL) { + mdbx_assert(env, next > txn->mt_end_pgno); + if (unlikely(pgno2bytes(env, next) <= env->me_mapsize)) { + pgno_t growth_pgno = txn->mt_next_pgno + head->mm_geo.grow; + if (growth_pgno > MAX_PAGENO) + growth_pgno = MAX_PAGENO; + size_t growth_bytes = + mdbx_roundup2(pgno2bytes(env, growth_pgno), env->me_os_psize); + if (growth_bytes > env->me_mapsize) + growth_bytes = env->me_mapsize; + growth_pgno = bytes2pgno(env, growth_bytes); + mdbx_assert(env, growth_pgno <= head->mm_geo.upper); + mdbx_assert(env, growth_pgno > txn->mt_end_pgno); + mdbx_info("growth datafile to %" PRIaPGNO " pages (+%" PRIaPGNO + "), %" PRIuPTR " bytes", + growth_pgno, growth_pgno - txn->mt_end_pgno, growth_bytes); + + mdbx_mmap_param_t mmap; + mmap.address = env->me_map; +#ifdef MDBX_OSAL_SECTION + mmap.section = env->me_dxb_section; +#endif + mmap.fd = env->me_fd; + rc = + mdbx_mresize(env->me_flags, &mmap, env->me_dbgeo.now, growth_bytes); + if (rc == MDBX_SUCCESS) { + txn->mt_end_pgno = growth_pgno; + env->me_dbgeo.now = growth_bytes; + continue; + } + mdbx_error("error while growth datafile to %" PRIaPGNO + "pages (+%" PRIaPGNO "), %" PRIuPTR " bytes, errcode %d", + growth_pgno, growth_pgno - txn->mt_end_pgno, growth_bytes, + rc); + } else if (next < head->mm_geo.upper) + rc = MDBX_MAP_RESIZED; + } + fail: if (mp) { *mp = NULL; @@ -1825,13 +1881,15 @@ done: goto fail; } } - if (i) { - mop[0] = mop_len -= num; - /* Move any stragglers down */ - for (j = i - num; j < mop_len;) - mop[++j] = mop[++i]; + + if (repg_pos) { + /* Cutoff allocated pages from me_reclaimed_pglist */ + repg_list[0] = repg_len -= 
num; + for (unsigned i = repg_pos - num; i < repg_len;) + repg_list[++i] = repg_list[++repg_pos]; } else { txn->mt_next_pgno = pgno + num; + mdbx_assert(env, txn->mt_next_pgno <= txn->mt_end_pgno); } if (env->me_flags & MDBX_PAGEPERTURB) @@ -2046,7 +2104,8 @@ int mdbx_env_sync(MDBX_env *env, int force) { if (unlikely(flags & (MDBX_RDONLY | MDBX_FATAL_ERROR))) return MDBX_EACCESS; - const bool outside_txn = (env->me_txn0->mt_owner != mdbx_thread_self()); + const bool outside_txn = + (!env->me_txn0 || env->me_txn0->mt_owner != mdbx_thread_self()); if (outside_txn) { int rc = mdbx_txn_lock(env); @@ -2055,12 +2114,10 @@ int mdbx_env_sync(MDBX_env *env, int force) { } MDBX_meta *head = mdbx_meta_head(env); - if (!META_IS_STEADY(head) || env->me_sync_pending || - env->me_mapsize != head->mm_mapsize) { + if (!META_IS_STEADY(head) || env->me_sync_pending) { - if (force || head->mm_mapsize != env->me_mapsize || - (env->me_sync_threshold && - env->me_sync_pending >= env->me_sync_threshold)) + if (force || (env->me_sync_threshold && + env->me_sync_pending >= env->me_sync_threshold)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; if (outside_txn && @@ -2068,7 +2125,8 @@ int mdbx_env_sync(MDBX_env *env, int force) { pgno2bytes(env, 16 /* FIXME: define threshold */) && (flags & MDBX_NOSYNC) == 0) { assert(((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - size_t used_size = pgno2bytes(env, head->mm_last_pg + 1); + const size_t used_size = + mdbx_roundup2(pgno2bytes(env, head->mm_geo.next), env->me_os_psize); mdbx_txn_unlock(env); @@ -2087,13 +2145,10 @@ int mdbx_env_sync(MDBX_env *env, int force) { head = mdbx_meta_head(env); } - if (!META_IS_STEADY(head) || env->me_sync_pending || - env->me_mapsize != head->mm_mapsize) { - mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIuPTR - ", mapsize env=%" PRIuPTR " meta=%" PRIu64, + if (!META_IS_STEADY(head) || env->me_sync_pending) { + mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIuPTR, 
container_of(head, MDBX_page, mp_data)->mp_pgno, - mdbx_durable_str(head), env->me_sync_pending, env->me_mapsize, - head->mm_mapsize); + mdbx_durable_str(head), env->me_sync_pending); MDBX_meta meta = *head; int rc = mdbx_sync_locked(env, flags, &meta); if (unlikely(rc != MDBX_SUCCESS)) { @@ -2250,7 +2305,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { if (likely(i < env->me_maxreaders)) break; - rc = mdbx_reader_check0(env, 1, NULL); + rc = mdbx_reader_check0(env, true, NULL); if (rc != MDBX_RESULT_TRUE) { mdbx_rdt_unlock(env); return (rc == MDBX_SUCCESS) ? MDBX_READERS_FULL : rc; @@ -2259,7 +2314,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { STATIC_ASSERT(sizeof(MDBX_reader) == MDBX_CACHELINE_SIZE); STATIC_ASSERT( - offsetof(MDBX_lockinfo, mti_numreaders) % MDBX_CACHELINE_SIZE == 0); + offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE == 0); r = &env->me_lck->mti_readers[i]; /* Claim the reader slot, carefully since other code * uses the reader table un-mutexed: First reset the @@ -2298,7 +2353,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { /* Snap the state from current meta-head */ txn->mt_txnid = snap; - txn->mt_next_pgno = meta->mm_last_pg + 1; + txn->mt_next_pgno = meta->mm_geo.next; + txn->mt_end_pgno = meta->mm_geo.now; memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); txn->mt_canary = meta->mm_canary; @@ -2350,7 +2406,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { /* Copy the DB info and flags */ memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); /* Moved to here to avoid a data race in read TXNs */ - txn->mt_next_pgno = meta->mm_last_pg + 1; + txn->mt_next_pgno = meta->mm_geo.next; + txn->mt_end_pgno = meta->mm_geo.now; } /* Setup db info */ @@ -2367,7 +2424,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { mdbx_debug("environment had fatal error, must shutdown!"); rc = MDBX_PANIC; - } 
else if (unlikely(env->me_maxpg < txn->mt_next_pgno)) { + } else if (unlikely(env->me_mapsize < pgno2bytes(env, txn->mt_next_pgno))) { rc = MDBX_MAP_RESIZED; } else { return MDBX_SUCCESS; @@ -2479,6 +2536,7 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, txn->mt_rw_dirtylist[0].mid = 0; txn->mt_spill_pages = NULL; txn->mt_next_pgno = parent->mt_next_pgno; + txn->mt_end_pgno = parent->mt_end_pgno; parent->mt_flags |= MDBX_TXN_HAS_CHILD; parent->mt_child = txn; txn->mt_parent = parent; @@ -2489,12 +2547,14 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW; rc = 0; ntxn = (MDBX_ntxn *)txn; - ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ - if (env->me_pghead) { - size = MDBX_IDL_SIZEOF(env->me_pghead); - env->me_pghead = mdbx_midl_alloc(env->me_pghead[0]); - if (likely(env->me_pghead)) - memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); + ntxn->mnt_pgstate = + env->me_pgstate; /* save parent me_reclaimed_pglist & co */ + if (env->me_reclaimed_pglist) { + size = MDBX_IDL_SIZEOF(env->me_reclaimed_pglist); + env->me_reclaimed_pglist = mdbx_midl_alloc(env->me_reclaimed_pglist[0]); + if (likely(env->me_reclaimed_pglist)) + memcpy(env->me_reclaimed_pglist, ntxn->mnt_pgstate.mf_reclaimed_pglist, + size); else rc = MDBX_ENOMEM; } @@ -2598,7 +2658,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { txn->mt_flags |= MDBX_TXN_FINISHED; txn->mt_owner = 0; } else if (!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED)) { - pgno_t *pghead = env->me_pghead; + pgno_t *pghead = env->me_reclaimed_pglist; if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ mdbx_cursors_eot(txn, 0); @@ -2620,8 +2680,8 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { mdbx_midl_shrink(&txn->mt_free_pages); env->me_free_pgs = txn->mt_free_pages; /* me_pgstate: */ - env->me_pghead = NULL; - env->me_pglast = 0; + env->me_reclaimed_pglist = NULL; + 
env->me_last_reclaimed = 0; env->me_txn = NULL; txn->mt_owner = 0; @@ -2692,7 +2752,9 @@ int mdbx_txn_abort(MDBX_txn *txn) { } static __inline int mdbx_backlog_size(MDBX_txn *txn) { - int reclaimed = txn->mt_env->me_pghead ? txn->mt_env->me_pghead[0] : 0; + int reclaimed = txn->mt_env->me_reclaimed_pglist + ? txn->mt_env->me_reclaimed_pglist[0] + : 0; return reclaimed + txn->mt_loose_count; } @@ -2725,8 +2787,8 @@ static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *mc) { * This changes the freelist. Keep trying until it stabilizes. */ static int mdbx_freelist_save(MDBX_txn *txn) { - /* env->me_pghead[] can grow and shrink during this call. - * env->me_pglast and txn->mt_free_pages[] can only grow. + /* env->me_reclaimed_pglist[] can grow and shrink during this call. + * env->me_last_reclaimed and txn->mt_free_pages[] can only grow. * Page numbers cannot disappear from txn->mt_free_pages[]. */ MDBX_cursor mc; MDBX_env *env = txn->mt_env; @@ -2753,8 +2815,8 @@ again: if (!lifo) { /* If using records from freeDB which we have not yet - * deleted, delete them and any we reserved for me_pghead. */ - while (pglast < env->me_pglast) { + * deleted, delete them and any we reserved for me_reclaimed_pglist. */ + while (pglast < env->me_last_reclaimed) { rc = mdbx_cursor_first(&mc, &key, NULL); if (unlikely(rc)) goto bailout; @@ -2764,7 +2826,7 @@ again: pglast = head_id = *(txnid_t *)key.iov_base; total_room = head_room = 0; more = 1; - mdbx_tassert(txn, pglast <= env->me_pglast); + mdbx_tassert(txn, pglast <= env->me_last_reclaimed); mc.mc_flags |= C_RECLAIMING; rc = mdbx_cursor_del(&mc, 0); mc.mc_flags &= ~C_RECLAIMING; @@ -2793,9 +2855,9 @@ again: } } - if (unlikely(!env->me_pghead) && txn->mt_loose_pages) { + if (unlikely(!env->me_reclaimed_pglist) && txn->mt_loose_pages) { /* Put loose page numbers in mt_free_pages, since - * we may be unable to return them to me_pghead. */ + * we may be unable to return them to me_reclaimed_pglist. 
*/ MDBX_page *mp = txn->mt_loose_pages; if (unlikely((rc = mdbx_midl_need(&txn->mt_free_pages, txn->mt_loose_count)) != 0)) @@ -2843,15 +2905,15 @@ again: continue; } - mop = env->me_pghead; + mop = env->me_reclaimed_pglist; mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count; if (mop_len && refill_idx == 0) refill_idx = 1; - /* Reserve records for me_pghead[]. Split it if multi-page, + /* Reserve records for me_reclaimed_pglist[]. Split it if multi-page, * to avoid searching freeDB for a page range. Use keys in - * range [1,me_pglast]: Smaller than txnid of oldest reader. */ + * range [1,me_last_reclaimed]: Smaller than txnid of oldest reader. */ if (total_room >= mop_len) { if (total_room == mop_len || --more < 0) break; @@ -2875,7 +2937,7 @@ again: goto bailout; /* LY: freedb is empty, will look any free txn-id in high2low order. */ - if (unlikely(env->me_pglast < 1)) { + if (unlikely(env->me_last_reclaimed < 1)) { /* LY: not any txn in the past of freedb. */ rc = MDBX_MAP_FULL; goto bailout; @@ -2889,10 +2951,11 @@ again: } } /* LY: append the list. */ - rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, env->me_pglast - 1); + rc = mdbx_txl_append(&txn->mt_lifo_reclaimed, + env->me_last_reclaimed - 1); if (unlikely(rc)) goto bailout; - --env->me_pglast; + --env->me_last_reclaimed; /* LY: note that freeDB cleanup is not needed. */ ++cleanup_idx; } @@ -2903,7 +2966,7 @@ again: total_room -= head_room; head_room = mop_len - total_room; if (head_room > maxfree_1pg && head_id > 1) { - /* Overflow multi-page for part of me_pghead */ + /* Overflow multi-page for part of me_reclaimed_pglist */ head_room /= (head_id < INT16_MAX) ? (pgno_t)head_id : INT16_MAX; /* amortize page sizes */ head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); @@ -2931,16 +2994,16 @@ again: cleanup_idx == (txn->mt_lifo_reclaimed ? 
txn->mt_lifo_reclaimed[0] : 0)); - /* Return loose page numbers to me_pghead, though usually none are + /* Return loose page numbers to me_reclaimed_pglist, though usually none are * left at this point. The pages themselves remain in dirtylist. */ if (txn->mt_loose_pages) { MDBX_page *mp = txn->mt_loose_pages; unsigned count = txn->mt_loose_count; MDBX_IDL loose; /* Room for loose pages + temp IDL with same */ - if ((rc = mdbx_midl_need(&env->me_pghead, 2 * count + 1)) != 0) + if ((rc = mdbx_midl_need(&env->me_reclaimed_pglist, 2 * count + 1)) != 0) goto bailout; - mop = env->me_pghead; + mop = env->me_reclaimed_pglist; loose = mop + MDBX_IDL_ALLOCLEN(mop) - count; for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp)) loose[++count] = mp->mp_pgno; @@ -2952,7 +3015,7 @@ again: mop_len = mop[0]; } - /* Fill in the reserved me_pghead records */ + /* Fill in the reserved me_reclaimed_pglist records */ rc = MDBX_SUCCESS; if (mop_len) { MDBX_val key, data; @@ -2973,7 +3036,7 @@ again: if (!lifo) { id = *(txnid_t *)key.iov_base; - mdbx_tassert(txn, id <= env->me_pglast); + mdbx_tassert(txn, id <= env->me_last_reclaimed); } else { mdbx_tassert(txn, refill_idx > 0 && refill_idx <= txn->mt_lifo_reclaimed[0]); @@ -3205,6 +3268,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { * to the parent or set MDBX_TXN_ERROR in the parent. 
*/ parent->mt_next_pgno = txn->mt_next_pgno; + parent->mt_end_pgno = txn->mt_end_pgno; parent->mt_flags = txn->mt_flags; /* Merge our cursors into parent's and close them */ @@ -3315,7 +3379,7 @@ int mdbx_txn_commit(MDBX_txn *txn) { parent->mt_loose_count += txn->mt_loose_count; parent->mt_child = NULL; - mdbx_midl_free(((MDBX_ntxn *)txn)->mnt_pgstate.mf_pghead); + mdbx_midl_free(((MDBX_ntxn *)txn)->mnt_pgstate.mf_reclaimed_pglist); txn->mt_signature = 0; free(txn); return rc; @@ -3364,8 +3428,8 @@ int mdbx_txn_commit(MDBX_txn *txn) { if (unlikely(rc != MDBX_SUCCESS)) goto fail; - mdbx_midl_free(env->me_pghead); - env->me_pghead = NULL; + mdbx_midl_free(env->me_reclaimed_pglist); + env->me_reclaimed_pglist = NULL; mdbx_midl_shrink(&txn->mt_free_pages); if (mdbx_audit_enabled()) @@ -3373,13 +3437,15 @@ int mdbx_txn_commit(MDBX_txn *txn) { rc = mdbx_page_flush(txn, 0); if (likely(rc == MDBX_SUCCESS)) { - MDBX_meta meta; + MDBX_meta meta, *head = mdbx_meta_head(env); + meta.mm_geo = head->mm_geo; + meta.mm_geo.next = txn->mt_next_pgno; + meta.mm_geo.now = txn->mt_end_pgno; meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - meta.mm_last_pg = txn->mt_next_pgno - 1; - mdbx_meta_set_txnid(env, &meta, txn->mt_txnid); meta.mm_canary = txn->mt_canary; + mdbx_meta_set_txnid(env, &meta, txn->mt_txnid); rc = mdbx_sync_locked(env, env->me_flags | txn->mt_flags, &meta); } @@ -3480,51 +3546,81 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { } /* LY: check pagesize */ - if (!is_power2(page.mp_meta.mm_psize) || + if (!mdbx_is_power2(page.mp_meta.mm_psize) || page.mp_meta.mm_psize < MIN_PAGESIZE || page.mp_meta.mm_psize > MAX_PAGESIZE) { - mdbx_notice("meta[%u] has invalid pagesize %u, skip it", meta_number, + mdbx_notice("meta[%u] has invalid pagesize (%u), skip it", meta_number, page.mp_meta.mm_psize); rc = MDBX_VERSION_MISMATCH; continue; } + /* LY: check min-pages value */ + if (page.mp_meta.mm_geo.lower < 
MIN_PAGENO || + page.mp_meta.mm_geo.lower > MAX_PAGENO) { + mdbx_notice("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", + meta_number, page.mp_meta.mm_geo.lower); + rc = MDBX_INVALID; + continue; + } + + /* LY: check max-pages value */ + if (page.mp_meta.mm_geo.upper < MIN_PAGENO || + page.mp_meta.mm_geo.upper > MAX_PAGENO || + page.mp_meta.mm_geo.upper < page.mp_meta.mm_geo.lower) { + mdbx_notice("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", + meta_number, page.mp_meta.mm_geo.upper); + rc = MDBX_INVALID; + continue; + } + /* LY: check mapsize limits */ + const uint64_t mapsize_min = + page.mp_meta.mm_geo.lower * (uint64_t)page.mp_meta.mm_psize; + const uint64_t mapsize_max = + page.mp_meta.mm_geo.upper * (uint64_t)page.mp_meta.mm_psize; STATIC_ASSERT(MAX_MAPSIZE < SSIZE_MAX - MAX_PAGESIZE); - if (page.mp_meta.mm_mapsize < MIN_MAPSIZE) { - mdbx_notice("meta[%u] has invalid mapsize %" PRIu64 ", skip it", - meta_number, page.mp_meta.mm_mapsize); + STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); + if (mapsize_min < MIN_MAPSIZE || mapsize_max > MAX_MAPSIZE) { + mdbx_notice("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it", + meta_number, mapsize_min); rc = MDBX_VERSION_MISMATCH; continue; } STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); - if (page.mp_meta.mm_mapsize > MAX_MAPSIZE) { - mdbx_notice("meta[%u] has too large mapsize %" PRIu64 ", skip it", - meta_number, page.mp_meta.mm_mapsize); + if (mapsize_max > MAX_MAPSIZE || + MAX_PAGENO < mdbx_roundup2((size_t)mapsize_max, env->me_os_psize) / + (uint64_t)page.mp_meta.mm_psize) { + mdbx_notice("meta[%u] has too large max-mapsize (%" PRIu64 "), skip it", + meta_number, mapsize_max); rc = MDBX_TOO_LARGE; continue; } - /* LY: check mapsize with given given pagesize */ - if (page.mp_meta.mm_mapsize < - MIN_PAGENO * (uint64_t)page.mp_meta.mm_psize || - page.mp_meta.mm_mapsize > - MAX_PAGENO * (uint64_t)page.mp_meta.mm_psize) { - mdbx_notice("meta[%u] has invalid mapsize %" PRIu64 - ", with given 
pagesize %u, skip it", - meta_number, page.mp_meta.mm_mapsize, page.mp_meta.mm_psize); + /* LY: check end_pgno */ + if (page.mp_meta.mm_geo.now < page.mp_meta.mm_geo.lower || + page.mp_meta.mm_geo.now > page.mp_meta.mm_geo.upper) { + mdbx_notice("meta[%u] has invalid end-pageno (%" PRIaPGNO "), skip it", + meta_number, page.mp_meta.mm_geo.now); rc = MDBX_CORRUPTED; continue; } /* LY: check last_pgno */ - if (page.mp_meta.mm_last_pg < MIN_PAGENO || - page.mp_meta.mm_last_pg > MAX_PAGENO || - page.mp_meta.mm_last_pg > - page.mp_meta.mm_mapsize / page.mp_meta.mm_psize) { - mdbx_notice("meta[%u] has invalid last-pageno %" PRIaPGNO ", skip it", - meta_number, page.mp_meta.mm_last_pg); + if (page.mp_meta.mm_geo.next < MIN_PAGENO || + page.mp_meta.mm_geo.next - 1 > MAX_PAGENO) { + mdbx_notice("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it", + meta_number, page.mp_meta.mm_geo.next); + rc = MDBX_CORRUPTED; + continue; + } + + if (page.mp_meta.mm_geo.next > page.mp_meta.mm_geo.now) { + mdbx_notice("meta[%u] next-pageno (%" PRIaPGNO + ") is beyond end-pgno (%" PRIaPGNO "), skip it", + meta_number, page.mp_meta.mm_geo.next, + page.mp_meta.mm_geo.now); rc = MDBX_CORRUPTED; continue; } @@ -3540,8 +3636,8 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { rc = MDBX_CORRUPTED; continue; } - } else if (page.mp_meta.mm_dbs[FREE_DBI].md_root > - page.mp_meta.mm_last_pg) { + } else if (page.mp_meta.mm_dbs[FREE_DBI].md_root >= + page.mp_meta.mm_geo.next) { mdbx_notice("meta[%u] has invalid freedb-root %" PRIaPGNO ", skip it", meta_number, page.mp_meta.mm_dbs[FREE_DBI].md_root); rc = MDBX_CORRUPTED; @@ -3559,14 +3655,19 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { rc = MDBX_CORRUPTED; continue; } - } else if (page.mp_meta.mm_dbs[MAIN_DBI].md_root > - page.mp_meta.mm_last_pg) { + } else if (page.mp_meta.mm_dbs[MAIN_DBI].md_root >= + page.mp_meta.mm_geo.next) { mdbx_notice("meta[%u] has invalid maindb-root %" PRIaPGNO ", skip 
it", meta_number, page.mp_meta.mm_dbs[MAIN_DBI].md_root); rc = MDBX_CORRUPTED; continue; } + if (page.mp_meta.mm_txnid_a == 0) { + mdbx_warning("meta[%u] has zero txnid, skip it", meta_number); + continue; + } + if (mdbx_meta_ot(env, meta, &page.mp_meta, true)) { *meta = page.mp_meta; if (META_IS_WEAK(meta)) @@ -3584,13 +3685,40 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta) { static MDBX_page *__cold mdbx_meta_model(const MDBX_env *env, MDBX_page *model, unsigned num) { + + mdbx_ensure(env, mdbx_is_power2(env->me_psize)); + mdbx_ensure(env, env->me_psize >= MIN_PAGESIZE); + mdbx_ensure(env, env->me_psize <= MAX_PAGESIZE); + mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE); + mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE); + mdbx_ensure(env, env->me_dbgeo.now >= env->me_dbgeo.lower); + mdbx_ensure(env, env->me_dbgeo.now <= env->me_dbgeo.upper); + memset(model, 0, sizeof(*model)); model->mp_pgno = num; model->mp_flags = P_META; model->mp_meta.mm_magic_and_version = MDBX_DATA_MAGIC; - model->mp_meta.mm_mapsize = env->me_mapsize; + + model->mp_meta.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); + model->mp_meta.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); + model->mp_meta.mm_geo.grow = (uint16_t)bytes2pgno(env, env->me_dbgeo.grow); + model->mp_meta.mm_geo.shrink = + (uint16_t)bytes2pgno(env, env->me_dbgeo.shrink); + model->mp_meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); + model->mp_meta.mm_geo.next = NUM_METAS; + + mdbx_ensure(env, model->mp_meta.mm_geo.lower >= MIN_PAGENO); + mdbx_ensure(env, model->mp_meta.mm_geo.upper <= MAX_PAGENO); + mdbx_ensure(env, model->mp_meta.mm_geo.now >= model->mp_meta.mm_geo.lower); + mdbx_ensure(env, model->mp_meta.mm_geo.now <= model->mp_meta.mm_geo.upper); + mdbx_ensure(env, model->mp_meta.mm_geo.next >= MIN_PAGENO); + mdbx_ensure(env, model->mp_meta.mm_geo.next <= model->mp_meta.mm_geo.now); + mdbx_ensure(env, model->mp_meta.mm_geo.grow == + bytes2pgno(env, env->me_dbgeo.grow)); 
+ mdbx_ensure(env, model->mp_meta.mm_geo.shrink == + bytes2pgno(env, env->me_dbgeo.shrink)); + model->mp_meta.mm_psize = env->me_psize; - model->mp_meta.mm_last_pg = NUM_METAS - 1; model->mp_meta.mm_flags = (uint16_t)env->me_flags; model->mp_meta.mm_flags |= MDBX_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */ @@ -3623,22 +3751,14 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const meta2 = METAPAGE(env, 2); MDBX_meta *const head = mdbx_meta_head(env); - mdbx_assert(env, head->mm_mapsize < MAX_MAPSIZE); - STATIC_ASSERT(SSIZE_MAX > MAX_MAPSIZE); - const size_t prev_mapsize = (size_t)head->mm_mapsize; - - const size_t used_size = pgno2bytes(env, pending->mm_last_pg + 1); - mdbx_assert(env, mdbx_meta_eq_mask(env) == 0); mdbx_assert(env, pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); - mdbx_assert(env, !META_IS_STEADY(head) || env->me_sync_pending != 0 || - env->me_mapsize != prev_mapsize); - - pending->mm_mapsize = env->me_mapsize; - mdbx_assert(env, pending->mm_mapsize >= used_size); + mdbx_assert(env, !META_IS_STEADY(head) || env->me_sync_pending != 0); + const size_t usedbytes = + mdbx_roundup2(pgno2bytes(env, pending->mm_geo.next), env->me_os_psize); if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold) flags &= MDBX_WRITEMAP; @@ -3646,34 +3766,45 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, int rc = MDBX_RESULT_TRUE; if (env->me_sync_pending && (flags & MDBX_NOSYNC) == 0) { mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + MDBX_meta *const steady = mdbx_meta_steady(env); if (flags & MDBX_WRITEMAP) { - rc = mdbx_msync(env->me_map, used_size, flags & MDBX_MAPASYNC); + rc = mdbx_msync(env->me_map, usedbytes, flags & MDBX_MAPASYNC); if (unlikely(rc != MDBX_SUCCESS)) goto fail; - if ((flags & MDBX_MAPASYNC) == 0) + if ((flags & MDBX_MAPASYNC) == 0) { + if (unlikely(pending->mm_geo.next > 
steady->mm_geo.now)) { + rc = mdbx_filesize_sync(env->me_fd); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } env->me_sync_pending = 0; - } else { - bool fullsync = false; - if (unlikely(prev_mapsize != pending->mm_mapsize)) { - /* LY: It is no reason to use fdatasync() here, even in case - * no such bug in a kernel. Because "no-bug" mean that a kernel - * internally do nearly the same, e.g. fdatasync() == fsync() - * when no-kernel-bug and file size was changed. - * - * So, this code is always safe and without appreciable - * performance degradation. - * - * For more info about of a corresponding fdatasync() bug - * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ - fullsync = true; } - rc = mdbx_filesync(env->me_fd, fullsync); + } else { + rc = mdbx_filesync(env->me_fd, pending->mm_geo.next > steady->mm_geo.now); if (unlikely(rc != MDBX_SUCCESS)) goto fail; env->me_sync_pending = 0; } } +#if defined(_WIN32) || defined(_WIN64) +/* Windows is unable shrinking a mapped file */ +#else + /* LY: check conditions to shrink datafile */ + const pgno_t shrink_pgno = pending->mm_geo.next /* + pending->mm_geo.grow */; + const size_t shrink_bytes = + mdbx_roundup2(pgno2bytes(env, shrink_pgno), env->me_os_psize); + size_t shrink_pgno_delta = 0; + if (pending->mm_geo.now > shrink_pgno && pending->mm_geo.shrink && + unlikely(pending->mm_geo.now - pending->mm_geo.shrink >= shrink_pgno)) { + if (pending->mm_geo.now > shrink_pgno && + pending->mm_geo.now - pending->mm_geo.shrink >= shrink_pgno) { + shrink_pgno_delta = pending->mm_geo.now - shrink_pgno; + pending->mm_geo.now = shrink_pgno; + } + } +#endif /* not a Windows */ + /* Steady or Weak */ if (env->me_sync_pending == 0) { pending->mm_datasync_sign = mdbx_meta_sign(pending); @@ -3690,8 +3821,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, sizeof(head->mm_dbs)) == 0); mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary, sizeof(head->mm_canary)) == 0); - mdbx_assert(env, 
head->mm_last_pg == pending->mm_last_pg); - mdbx_assert(env, head->mm_mapsize == pending->mm_mapsize); + mdbx_assert(env, memcmp(&head->mm_geo, &pending->mm_geo, + sizeof(pending->mm_geo)) == 0); if (!META_IS_STEADY(head) && META_IS_STEADY(pending)) target = head; else { @@ -3756,11 +3887,10 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, #endif /* LY: update info */ - target->mm_mapsize = pending->mm_mapsize; + target->mm_geo = pending->mm_geo; target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; target->mm_canary = pending->mm_canary; - target->mm_last_pg = pending->mm_last_pg; mdbx_jitter4testing(true); mdbx_coherent_barrier(); @@ -3773,8 +3903,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_ensure(env, mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a && !META_IS_STEADY(head) && META_IS_STEADY(pending)); - mdbx_ensure(env, head->mm_last_pg == pending->mm_last_pg); - mdbx_ensure(env, head->mm_mapsize == pending->mm_mapsize); + mdbx_ensure(env, memcmp(&head->mm_geo, &pending->mm_geo, + sizeof(head->mm_geo)) == 0); mdbx_ensure(env, memcmp(&head->mm_dbs, &pending->mm_dbs, sizeof(head->mm_dbs)) == 0); mdbx_ensure(env, memcmp(&head->mm_canary, &pending->mm_canary, @@ -3797,12 +3927,6 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_invalidate_cache(env->me_map + offset, sizeof(MDBX_meta)); } - /* Memory ordering issues are irrelevant; since the entire writer - * is wrapped by wmutex, all of these changes will become visible - * after the wmutex is unlocked. Since the DB is multi-version, - * readers will get consistent data regardless of how fresh or - * how stale their view of these values is. */ - /* LY: step#3 - sync meta-pages. 
*/ mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); if ((flags & (MDBX_NOSYNC | MDBX_NOMETASYNC)) == 0) { @@ -3819,24 +3943,24 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, } } - /* LY: currently this can't happen, but... */ - mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); - if (unlikely(pending->mm_mapsize < prev_mapsize)) { - mdbx_assert(env, pending->mm_mapsize == env->me_mapsize); - if (pending->mm_mapsize > MAX_MAPSIZE) { - rc = MDBX_PROBLEM; - goto fail; - } - const size_t mapsize = (size_t)pending->mm_mapsize; - mdbx_assert(env, pending->mm_mapsize == mapsize); - - rc = mdbx_ftruncate(env->me_fd, mapsize); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - rc = mdbx_mremap_size((void **)&env->me_map, prev_mapsize, mapsize); - if (unlikely(rc != MDBX_SUCCESS)) +#if defined(_WIN32) || defined(_WIN64) +/* Windows is unable shrinking a mapped file */ +#else + /* LY: shrink datafile if needed */ + if (shrink_pgno_delta) { + mdbx_mmap_param_t mmap; + mmap.address = env->me_map; +#ifdef MDBX_OSAL_SECTION + mmap.section = env->me_dxb_section; +#endif + mmap.fd = env->me_fd; + rc = mdbx_mresize(env->me_flags, &mmap, env->me_dbgeo.now, shrink_bytes); + if (rc == MDBX_SUCCESS) + env->me_dbgeo.now = shrink_bytes; + else if (rc != MDBX_RESULT_TRUE) goto fail; } +#endif /* not a Windows */ return MDBX_SUCCESS; @@ -3870,8 +3994,10 @@ int mdbx_get_maxkeysize(size_t pagesize) { return (maxkey > 0 && maxkey < INT_MAX) ? 
(int)maxkey : -MDBX_EINVAL; } -static void __cold mdbx_setup_pagesize(MDBX_env *env, size_t pagesize) { - mdbx_ensure(env, is_power2(pagesize)); +static void __cold mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { + STATIC_ASSERT(SSIZE_MAX > MAX_MAPSIZE); + STATIC_ASSERT(MIN_PAGESIZE > sizeof(MDBX_page)); + mdbx_ensure(env, mdbx_is_power2(pagesize)); mdbx_ensure(env, pagesize >= MIN_PAGESIZE); mdbx_ensure(env, pagesize <= MAX_PAGESIZE); @@ -3897,13 +4023,9 @@ static void __cold mdbx_setup_pagesize(MDBX_env *env, size_t pagesize) { mdbx_ensure(env, maxkey_limit > 42 && (size_t)maxkey_limit < pagesize); env->me_maxkey_limit = (unsigned)maxkey_limit; - env->me_psize2log = 0; - while (pagesize > 1) { - env->me_psize2log += 1; - pagesize >>= 1; - } - - env->me_maxpg = bytes2pgno(env, env->me_mapsize); + env->me_psize2log = mdbx_log2(pagesize); + mdbx_assert(env, pgno2bytes(env, 1) == pagesize); + mdbx_assert(env, bytes2pgno(env, pagesize + pagesize) == 2); } int __cold mdbx_env_create(MDBX_env **penv) { @@ -3919,8 +4041,8 @@ int __cold mdbx_env_create(MDBX_env **penv) { int rc; const size_t os_psize = mdbx_syspagesize(); - if (!is_power2(os_psize) || os_psize < MIN_PAGESIZE) { - mdbx_error("unsuitable system pageize %" PRIuPTR, os_psize); + if (!mdbx_is_power2(os_psize) || os_psize < MIN_PAGESIZE) { + mdbx_error("unsuitable system pagesize %" PRIuPTR, os_psize); rc = MDBX_INCOMPATIBLE; goto bailout; } @@ -3942,24 +4064,20 @@ bailout: return rc; } -static int __cold mdbx_env_map(MDBX_env *env, void *addr, size_t usedsize) { - unsigned flags = env->me_flags; - int rc; - - if (flags & MDBX_WRITEMAP) { - rc = mdbx_ftruncate(env->me_fd, env->me_mapsize); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - - env->me_map = addr; - rc = mdbx_mmap((void **)&env->me_map, env->me_mapsize, flags & MDBX_WRITEMAP, - env->me_fd); +static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) { + mdbx_mmap_param_t mmap; + mmap.fd = env->me_fd; + int rc = 
mdbx_mmap(env->me_flags, &mmap, env->me_dbgeo.now, env->me_mapsize); if (unlikely(rc != MDBX_SUCCESS)) { env->me_map = NULL; return rc; } + env->me_map = mmap.address; +#ifdef MDBX_OSAL_SECTION + env->me_dxb_section = mmap.section; +#endif + #ifdef MADV_DONTFORK if (madvise(env->me_map, env->me_mapsize, MADV_DONTFORK)) return errno; @@ -3972,13 +4090,13 @@ static int __cold mdbx_env_map(MDBX_env *env, void *addr, size_t usedsize) { #if defined(MADV_DODUMP) && defined(MADV_DONTDUMP) const size_t meta_length = pgno2bytes(env, NUM_METAS); (void)madvise(env->me_map, meta_length, MADV_DODUMP); - if (!(flags & MDBX_PAGEPERTURB)) + if (!(env->me_flags & MDBX_PAGEPERTURB)) (void)madvise(env->me_map + meta_length, env->me_mapsize - meta_length, MADV_DONTDUMP); #endif #ifdef MADV_REMOVE - if (flags & MDBX_WRITEMAP) + if (env->me_flags & MDBX_WRITEMAP) (void)madvise(env->me_map + usedsize, env->me_mapsize - usedsize, MADV_REMOVE); #else @@ -3988,14 +4106,14 @@ static int __cold mdbx_env_map(MDBX_env *env, void *addr, size_t usedsize) { #if defined(MADV_RANDOM) && defined(MADV_WILLNEED) /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ if (madvise(env->me_map, env->me_mapsize, - (flags & MDBX_NORDAHEAD) ? MADV_RANDOM : MADV_WILLNEED)) + (env->me_flags & MDBX_NORDAHEAD) ? MADV_RANDOM : MADV_WILLNEED)) return errno; #endif /* Lock meta pages to avoid unexpected write, * before the data pages would be synchronized. 
*/ - if (flags & MDBX_WRITEMAP) { - rc = mdbx_mlock(env->me_map, pgno2bytes(env, NUM_METAS)); + if (env->me_flags & MDBX_WRITEMAP) { + rc = mdbx_mlock(&mmap, pgno2bytes(env, NUM_METAS)); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -4008,53 +4126,261 @@ static int __cold mdbx_env_map(MDBX_env *env, void *addr, size_t usedsize) { return MDBX_SUCCESS; } -int __cold mdbx_env_set_mapsize(MDBX_env *env, size_t size) { +LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower, + ssize_t size_now, ssize_t size_upper, + ssize_t growth_step, + ssize_t shrink_threshold, + ssize_t pagesize) { if (unlikely(!env)) return MDBX_EINVAL; if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(size < MIN_MAPSIZE || size > MAX_MAPSIZE)) - return MDBX_EINVAL; + const bool outside_txn = + (!env->me_txn0 || env->me_txn0->mt_owner != mdbx_thread_self()); - if (unlikely(size < pgno2bytes(env, MIN_PAGENO))) - return MDBX_EINVAL; - - /* If env is already open, caller is responsible for making - * sure there are no active txns. 
*/ + int rc = MDBX_PROBLEM; if (env->me_map) { - int rc; - MDBX_meta *meta; - if (env->me_txn) - return MDBX_EINVAL; + /* env already mapped */ + if (!env->me_lck || (env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; - /* FIXME: lock/unlock */ - meta = mdbx_meta_head(env); - /* Silently round up to minimum if the size is too small */ - const size_t usedsize = pgno2bytes(env, meta->mm_last_pg + 1); - if (size < usedsize) - size = usedsize; + if (outside_txn) { + int err = mdbx_txn_lock(env); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + MDBX_meta *head = mdbx_meta_head(env); - mdbx_munmap(env->me_map, env->me_mapsize); -#ifdef USE_VALGRIND - VALGRIND_DISCARD(env->me_valgrind_handle); - env->me_valgrind_handle = -1; -#endif + if (pagesize < 0) + pagesize = env->me_psize; + if (pagesize != env->me_psize) { + rc = MDBX_EINVAL; + goto bailout; + } - rc = mdbx_ftruncate(env->me_fd, size); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - env->me_mapsize = size; - /* FIXME: update meta */ - rc = mdbx_env_map(env, NULL, usedsize); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + if (size_lower < 0) + size_lower = pgno2bytes(env, head->mm_geo.lower); + if (size_now < 0) + size_now = pgno2bytes(env, head->mm_geo.now); + if (size_upper < 0) + size_upper = pgno2bytes(env, head->mm_geo.upper); + if (growth_step < 0) + growth_step = pgno2bytes(env, head->mm_geo.grow); + if (shrink_threshold < 0) + shrink_threshold = pgno2bytes(env, head->mm_geo.shrink); + + const size_t used_size = pgno2bytes(env, head->mm_geo.next); + if ((size_t)size_upper < used_size) { + rc = MDBX_MAP_FULL; + goto bailout; + } + if ((size_t)size_now < used_size) + size_now = used_size; +#if defined(_WIN32) || defined(_WIN64) + if ((size_t)size_now < env->me_dbgeo.now || + (size_t)size_upper < env->me_dbgeo.upper) { + /* Windows is unable shrinking a mapped file */ + return ERROR_USER_MAPPED_FILE; + } +#endif /* Windows */ + } else { + /* env NOT yet mapped */ + if (!outside_txn) { + rc = 
MDBX_PANIC; + goto bailout; + } + + if (pagesize < 0) { + pagesize = env->me_os_psize; + if (pagesize > MAX_PAGESIZE) + pagesize = MAX_PAGESIZE; + mdbx_assert(env, pagesize >= MIN_PAGESIZE); + } } - env->me_mapsize = size; - env->me_maxpg = bytes2pgno(env, env->me_mapsize); - return MDBX_SUCCESS; + if (pagesize < MIN_PAGESIZE || pagesize > MAX_PAGESIZE || + !mdbx_is_power2(pagesize)) { + rc = MDBX_EINVAL; + goto bailout; + } + + if (size_lower < 0) { + size_lower = MIN_MAPSIZE; + if (MIN_MAPSIZE / pagesize < MIN_PAGENO) + size_lower = MIN_PAGENO * pagesize; + } + + if (size_now < 0) { + size_now = DEFAULT_MAPSIZE; + if (size_now < size_lower) + size_now = size_lower; + } + + if (size_upper < 0) { + if ((size_t)size_now >= MAX_MAPSIZE / 2) + size_upper = MAX_MAPSIZE; + else if (MAX_MAPSIZE != MAX_MAPSIZE32 && + (size_t)size_now >= MAX_MAPSIZE32 / 2) + size_upper = MAX_MAPSIZE32; + else { + size_upper = size_now + size_now; + if ((size_t)size_upper < DEFAULT_MAPSIZE * 2) + size_upper = DEFAULT_MAPSIZE * 2; + } + if ((size_t)size_upper / pagesize > MAX_PAGENO) + size_upper = pagesize * MAX_PAGENO; + } + + if (unlikely(size_lower < MIN_MAPSIZE || size_lower > size_upper)) { + rc = MDBX_EINVAL; + goto bailout; + } + + if ((uint64_t)size_lower / pagesize < MIN_PAGENO) { + rc = MDBX_EINVAL; + goto bailout; + } + + if (unlikely((size_t)size_upper > MAX_MAPSIZE || + (uint64_t)size_upper / pagesize > MAX_PAGENO)) { + rc = MDBX_TOO_LARGE; + goto bailout; + } + + size_lower = mdbx_roundup2(size_lower, env->me_os_psize); + size_upper = mdbx_roundup2(size_upper, env->me_os_psize); + size_now = mdbx_roundup2(size_now, env->me_os_psize); + + /* LY: подбираем значение size_upper: + * - кратное размеру системной страницы + * - без нарушения MAX_MAPSIZE или MAX_PAGENO */ + while (unlikely((size_t)size_upper > MAX_MAPSIZE || + (uint64_t)size_upper / pagesize > MAX_PAGENO)) { + if ((size_t)size_upper < env->me_os_psize + MIN_MAPSIZE || + (size_t)size_upper < env->me_os_psize * 
(MIN_PAGENO + 1)) { + /* паранойа на случай переполнения при невероятных значениях */ + rc = MDBX_EINVAL; + goto bailout; + } + size_upper -= env->me_os_psize; + if ((size_t)size_upper > (size_t)size_lower) + size_lower = size_upper; + } + mdbx_assert(env, (size_upper - size_lower) % env->me_os_psize == 0); + + if (size_now < size_lower) + size_now = size_lower; + if (size_now > size_upper) + size_now = size_upper; + + if (growth_step < 0) { + growth_step = ((size_t)(size_upper - size_lower)) / 42; + if (growth_step > size_lower) + growth_step = size_lower; + if (growth_step < 65536) + growth_step = 65536; + if ((size_t)growth_step > MEGABYTE * 16) + growth_step = MEGABYTE * 16; + } + growth_step = mdbx_roundup2(growth_step, env->me_os_psize); + if (bytes2pgno(env, growth_step) > UINT16_MAX) + growth_step = pgno2bytes(env, UINT16_MAX); + + if (shrink_threshold < 0) { + shrink_threshold = growth_step + growth_step; + if (shrink_threshold < growth_step) + shrink_threshold = growth_step; + } + shrink_threshold = mdbx_roundup2(shrink_threshold, env->me_os_psize); + if (bytes2pgno(env, shrink_threshold) > UINT16_MAX) + shrink_threshold = pgno2bytes(env, UINT16_MAX); + + /* save params for future open/create */ + env->me_dbgeo.lower = size_lower; + env->me_dbgeo.now = size_now; + env->me_dbgeo.upper = size_upper; + env->me_dbgeo.grow = growth_step; + env->me_dbgeo.shrink = shrink_threshold; + rc = MDBX_SUCCESS; + + if (env->me_map) { + /* apply new params */ + mdbx_assert(env, pagesize == env->me_psize); + + MDBX_meta *head = mdbx_meta_head(env); + MDBX_meta meta = *head; + meta.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); + meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); + meta.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); + meta.mm_geo.grow = (uint16_t)bytes2pgno(env, env->me_dbgeo.grow); + meta.mm_geo.shrink = (uint16_t)bytes2pgno(env, env->me_dbgeo.shrink); + + mdbx_assert(env, env->me_dbgeo.lower >= MIN_MAPSIZE); + mdbx_assert(env, 
meta.mm_geo.lower >= MIN_PAGENO); + mdbx_assert(env, env->me_dbgeo.upper <= MAX_MAPSIZE); + mdbx_assert(env, meta.mm_geo.upper <= MAX_PAGENO); + mdbx_assert(env, meta.mm_geo.now >= meta.mm_geo.next); + mdbx_assert(env, env->me_dbgeo.upper >= env->me_dbgeo.lower); + mdbx_assert(env, meta.mm_geo.upper >= meta.mm_geo.now); + mdbx_assert(env, meta.mm_geo.now >= meta.mm_geo.lower); + mdbx_assert(env, meta.mm_geo.grow == bytes2pgno(env, env->me_dbgeo.grow)); + mdbx_assert(env, + meta.mm_geo.shrink == bytes2pgno(env, env->me_dbgeo.shrink)); + + if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) { + if (meta.mm_geo.upper != head->mm_geo.upper) { + const size_t size = + mdbx_roundup2(pgno2bytes(env, meta.mm_geo.upper), env->me_os_psize); + + mdbx_mmap_param_t mmap; + mmap.address = env->me_map; +#ifdef MDBX_OSAL_SECTION + mmap.section = env->me_dxb_section; +#endif + mmap.fd = env->me_fd; + rc = mdbx_mremap(env->me_flags, &mmap, env->me_mapsize, size); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + env->me_mapsize = size; + env->me_map = mmap.address; +#ifdef USE_VALGRIND + VALGRIND_DISCARD(env->me_valgrind_handle); + env->me_valgrind_handle = + VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "mdbx"); +#endif + } + if (meta.mm_geo.now != head->mm_geo.now) { + const size_t size = + mdbx_roundup2(pgno2bytes(env, meta.mm_geo.now), env->me_os_psize); + + mdbx_mmap_param_t mmap; + mmap.address = env->me_map; +#ifdef MDBX_OSAL_SECTION + mmap.section = env->me_dxb_section; +#endif + mmap.fd = env->me_fd; + rc = mdbx_mresize(env->me_flags, &mmap, + pgno2bytes(env, head->mm_geo.now), size); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + mdbx_meta_set_txnid(env, &meta, mdbx_meta_txnid_stable(env, head) + 1); + rc = mdbx_sync_locked(env, env->me_flags, &meta); + } + } else { + mdbx_setup_pagesize(env, pagesize); + } + +bailout: + if (env->me_map && outside_txn) + mdbx_txn_unlock(env); + return rc; +} + +int __cold mdbx_env_set_mapsize(MDBX_env *env, 
size_t size) { + return mdbx_env_set_geometry(env, -1, size, -1, -1, -1, -1); } int __cold mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { @@ -4115,15 +4441,12 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { mdbx_debug("create new database"); rc = /* new database */ MDBX_RESULT_TRUE; - if (!env->me_psize) - env->me_psize = env->me_os_psize; - if (env->me_psize > MAX_PAGESIZE) - env->me_psize = MAX_PAGESIZE; - mdbx_ensure(env, is_power2(env->me_psize)); - mdbx_ensure(env, env->me_psize >= MIN_PAGESIZE); - - env->me_mapsize = roundup2( - env->me_mapsize ? env->me_mapsize : DEFAULT_MAPSIZE, env->me_os_psize); + if (!env->me_dbgeo.now) { + /* set defaults if not configured */ + err = mdbx_env_set_mapsize(env, DEFAULT_MAPSIZE); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } void *buffer = calloc(NUM_METAS, env->me_psize); if (!buffer) @@ -4135,60 +4458,95 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { if (unlikely(err != MDBX_SUCCESS)) return err; + err = mdbx_ftruncate(env->me_fd, env->me_dbgeo.now); + if (unlikely(err != MDBX_SUCCESS)) + return err; + #ifndef NDEBUG /* just for checking */ err = mdbx_read_header(env, &meta); if (unlikely(err != MDBX_SUCCESS)) return err; #endif - - err = mdbx_ftruncate(env->me_fd, env->me_mapsize); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } else { - env->me_psize = meta.mm_psize; - if (!is_power2(env->me_psize) || env->me_psize < MIN_PAGESIZE || - env->me_psize > MAX_PAGESIZE) { - mdbx_error("wrong pagesize %u (system %u)", env->me_psize, - env->me_os_psize); - return MDBX_WANNA_RECOVERY; - } - - /* Make sure mapsize >= committed data size. Even when using - * mm_mapsize, which could be broken in old files (ITS#7789). */ - const size_t usedsize = roundup2( - (meta.mm_last_pg + 1) * (size_t)env->me_psize, env->me_os_psize); - if (meta.mm_mapsize < usedsize) - meta.mm_mapsize = usedsize; - - /* Was a mapsize configured? 
*/ - if (!env->me_mapsize || (env->me_flags & MDBX_RDONLY) || - lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) - env->me_mapsize = (size_t)meta.mm_mapsize; - else if (env->me_mapsize < usedsize) - env->me_mapsize = usedsize; } - mdbx_setup_pagesize(env, env->me_psize); + mdbx_setup_pagesize(env, meta.mm_psize); + if ((env->me_flags & MDBX_RDONLY) /* readonly */ + || lck_rc != MDBX_RESULT_TRUE /* not exclusive */) { + /* use present params from db */ + err = mdbx_env_set_geometry( + env, meta.mm_geo.lower * meta.mm_psize, meta.mm_geo.now * meta.mm_psize, + meta.mm_geo.upper * meta.mm_psize, meta.mm_geo.grow * meta.mm_psize, + meta.mm_geo.shrink * meta.mm_psize, meta.mm_psize); + if (unlikely(err != MDBX_SUCCESS)) { + mdbx_error("could not use present dbsize-params from db"); + return MDBX_INCOMPATIBLE; + } + } else if (env->me_dbgeo.now) { + /* silently growth to last used page */ + /* TODO: compactification */ + size_t used_bytes = meta.mm_psize * (size_t)meta.mm_geo.next; + if (env->me_dbgeo.lower < used_bytes) + env->me_dbgeo.lower = used_bytes; + if (env->me_dbgeo.now < used_bytes) + env->me_dbgeo.now = used_bytes; + if (env->me_dbgeo.upper < used_bytes) + env->me_dbgeo.upper = used_bytes; - uint64_t size; - err = mdbx_filesize(env->me_fd, &size); + /* apply preconfigured params */ + err = mdbx_env_set_geometry(env, env->me_dbgeo.lower, env->me_dbgeo.now, + env->me_dbgeo.upper, env->me_dbgeo.grow, + env->me_dbgeo.shrink, meta.mm_psize); + if (unlikely(err != MDBX_SUCCESS)) { + mdbx_error("could not apply preconfigured dbsize-params to db"); + return MDBX_INCOMPATIBLE; + } + /* update meta fields */ + meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); + meta.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); + meta.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); + meta.mm_geo.grow = (uint16_t)bytes2pgno(env, env->me_dbgeo.grow); + meta.mm_geo.shrink = (uint16_t)bytes2pgno(env, env->me_dbgeo.shrink); + mdbx_ensure(env, meta.mm_geo.now >= 
meta.mm_geo.next); + } + env->me_mapsize = env->me_dbgeo.upper; + + uint64_t filesize; + err = mdbx_filesize(env->me_fd, &filesize); if (unlikely(err != MDBX_SUCCESS)) return err; - if (size != env->me_mapsize) { - mdbx_notice("filesize mismatch (wanna %" PRIuPTR ", have %" PRIu64 ")", - env->me_mapsize, size); - if ((env->me_flags & MDBX_RDONLY) || - lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { - mdbx_error("exclusive, but read-only, unable ftruncate/set-size"); - return MDBX_WANNA_RECOVERY /* LY: could not mdbx_ftruncate */; - } - err = mdbx_ftruncate(env->me_fd, env->me_mapsize); - if (unlikely(err != MDBX_SUCCESS)) - return err; + const size_t expected_bytes = pgno2bytes(env, meta.mm_geo.now); + const size_t used_bytes = pgno2bytes(env, meta.mm_geo.next); + if (filesize != expected_bytes) { + if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { + mdbx_info("filesize mismatch (expect %" PRIuPTR ", have %" PRIu64 "), " + "assume collision in non-exclusive mode", + expected_bytes, filesize); + } else { + mdbx_notice("filesize mismatch (expect %" PRIuPTR ", have %" PRIu64 ")", + expected_bytes, filesize); + if (filesize < used_bytes) { + mdbx_error("last-page beyond end-of-file (last %" PRIaPGNO + ", have %" PRIaPGNO ")", + meta.mm_geo.next, bytes2pgno(env, filesize)); + return MDBX_CORRUPTED; + } + + if (env->me_flags & MDBX_RDONLY) { + mdbx_notice("ignore filesize mismatch in readonly-mode"); + } else { + mdbx_info("resize datafile to %" PRIu64 " bytes", expected_bytes); + err = mdbx_ftruncate(env->me_fd, expected_bytes); + if (unlikely(err != MDBX_SUCCESS)) { + mdbx_error("error %d, while resize datafile to %" PRIu64 " bytes", rc, + expected_bytes); + return err; + } + } + } } - err = mdbx_env_map(env, NULL, env->me_mapsize); + err = mdbx_env_map(env, env->me_mapsize); if (err) return err; @@ -4198,9 +4556,12 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { return MDBX_WANNA_RECOVERY; } - const MDBX_meta *head = mdbx_meta_head(env); - const 
txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head); - if (head_txnid != meta.mm_txnid_a) { + while (1) { + const MDBX_meta *head = mdbx_meta_head(env); + const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head); + if (head_txnid == meta.mm_txnid_a) + break; + if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { assert(META_IS_STEADY(&meta) && !META_IS_STEADY(head)); if (env->me_flags & MDBX_RDONLY) { @@ -4213,6 +4574,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { /* LY: rollback weak checkpoint */ MDBX_meta rollback = *head; mdbx_meta_set_txnid(env, &rollback, 0); + rollback.mm_datasync_sign = MDBX_DATASIGN_WEAK; mdbx_trace("rollback: from %" PRIaTXN ", to %" PRIaTXN, head_txnid, meta.mm_txnid_a); mdbx_ensure(env, head_txnid == mdbx_meta_txnid_stable(env, head)); @@ -4220,37 +4582,74 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { (uint8_t *)head - (uint8_t *)env->me_map); if (err) return err; + + mdbx_invalidate_cache(env->me_map, pgno2bytes(env, NUM_METAS)); mdbx_ensure(env, 0 == mdbx_meta_txnid_fluid(env, head)); - } else if (!env->me_lck) { + mdbx_ensure(env, 0 == mdbx_meta_eq_mask(env)); + continue; + } + + if (!env->me_lck) { /* LY: without-lck (read-only) mode, so it is imposible that other * process made weak checkpoint. 
*/ mdbx_error("without-lck, unable recovery/rollback"); return MDBX_WANNA_RECOVERY; - } else { - /* LY: assume just have a collision with other running process, - * or someone make a weak checkpoint */ - mdbx_info("assume collision or online weak checkpoint"); } + + /* LY: assume just have a collision with other running process, + * or someone make a weak checkpoint */ + mdbx_info("assume collision or online weak checkpoint"); + break; } - head = mdbx_meta_head(env); - if (head->mm_mapsize != env->me_mapsize) { - mdbx_info("mismatch meta.mapsize: present %" PRIu64 ", should %" PRIuPTR, - head->mm_mapsize, env->me_mapsize); - if ((env->me_flags & MDBX_RDONLY) || - lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) - return MDBX_MAP_RESIZED; - - mdbx_trace("updating meta.mapsize: from %" PRIu64 " to %" PRIuPTR, - head->mm_mapsize, env->me_mapsize); - meta = *head; - meta.mm_mapsize = env->me_mapsize; - mdbx_meta_set_txnid(env, &meta, meta.mm_txnid_a + 1); - if (META_IS_STEADY(head)) - meta.mm_datasync_sign = mdbx_meta_sign(&meta); - err = mdbx_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); - if (err) + const MDBX_meta *head = mdbx_meta_head(env); + if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { + /* re-check file size after mmap */ + err = mdbx_filesize(env->me_fd, &filesize); + if (unlikely(err != MDBX_SUCCESS)) return err; + if (filesize != expected_bytes) { + mdbx_info("datafile resized by system to %" PRIu64 " bytes", filesize); + if (filesize % env->me_os_psize || filesize > env->me_dbgeo.upper || + filesize < used_bytes) { + mdbx_info("unacceptable/unexpected datafile size %" PRIu64, filesize); + return MDBX_PROBLEM; + } + meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now = filesize); + mdbx_info("update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO + " pages", + env->me_dbgeo.now, meta.mm_geo.now); + } + + if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) { + const txnid_t txnid = mdbx_meta_txnid_stable(env, head); + 
mdbx_info("updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, + head->mm_geo.shrink, head->mm_geo.grow, txnid, + meta.mm_geo.lower, meta.mm_geo.now, meta.mm_geo.upper, + meta.mm_geo.shrink, meta.mm_geo.grow, txnid + 1); + + mdbx_ensure(env, mdbx_meta_eq(env, &meta, head)); + mdbx_meta_set_txnid(env, &meta, txnid + 1); + env->me_sync_pending += env->me_psize; + err = mdbx_sync_locked(env, env->me_flags, &meta); + if (err) { + mdbx_info("error %d, while updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + err, head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, + head->mm_geo.shrink, head->mm_geo.grow, txnid, + meta.mm_geo.lower, meta.mm_geo.now, meta.mm_geo.upper, + meta.mm_geo.shrink, meta.mm_geo.grow, txnid + 1); + return err; + } + } } return rc; @@ -4286,9 +4685,9 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { return err; if (rc == MDBX_RESULT_TRUE) { - uint64_t wanna = roundup2((env->me_maxreaders - 1) * sizeof(MDBX_reader) + - sizeof(MDBX_lockinfo), - env->me_os_psize); + uint64_t wanna = mdbx_roundup2( + (env->me_maxreaders - 1) * sizeof(MDBX_reader) + sizeof(MDBX_lockinfo), + env->me_os_psize); #ifndef NDEBUG err = mdbx_ftruncate(env->me_lfd, size = 0); if (unlikely(err != MDBX_SUCCESS)) @@ -4316,12 +4715,15 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { } env->me_maxreaders = (unsigned)maxreaders; - void *addr = NULL; - err = mdbx_mmap(&addr, (size_t)size, true, env->me_lfd); + mdbx_mmap_param_t mmap; + mmap.fd = env->me_lfd; + err = mdbx_mmap(MDBX_WRITEMAP, &mmap, (size_t)size, (size_t)size); if (unlikely(err != MDBX_SUCCESS)) return err; - 
mdbx_assert(env, addr != nullptr); - env->me_lck = addr; + env->me_lck = mmap.address; +#ifdef MDBX_OSAL_SECTION + env->me_lck_section = mmap.section; +#endif #ifdef MADV_DODUMP (void)madvise(env->me_lck, size, MADV_DODUMP); @@ -4589,7 +4991,13 @@ static void __cold mdbx_env_close0(MDBX_env *env) { } if (env->me_map) { - mdbx_munmap(env->me_map, env->me_mapsize); + mdbx_mmap_param_t mmap; + mmap.address = env->me_map; +#ifdef MDBX_OSAL_SECTION + mmap.section = env->me_dxb_section; +#endif + mmap.fd = env->me_fd; + mdbx_munmap(&mmap, env->me_mapsize); #ifdef USE_VALGRIND VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = -1; @@ -4600,10 +5008,17 @@ static void __cold mdbx_env_close0(MDBX_env *env) { env->me_fd = INVALID_HANDLE_VALUE; } - mdbx_munmap((void *)env->me_lck, - (env->me_maxreaders - 1) * sizeof(MDBX_reader) + - sizeof(MDBX_lockinfo)); - env->me_lck = nullptr; + if (env->me_lck) { + mdbx_mmap_param_t mmap; + mmap.address = env->me_lck; +#ifdef MDBX_OSAL_SECTION + mmap.section = env->me_lck_section; +#endif + mmap.fd = env->me_lfd; + mdbx_munmap(&mmap, (env->me_maxreaders - 1) * sizeof(MDBX_reader) + + sizeof(MDBX_lockinfo)); + env->me_lck = nullptr; + } env->me_pid = 0; env->me_oldest = nullptr; @@ -5175,16 +5590,17 @@ static int mdbx_ovpage_free(MDBX_cursor *mc, MDBX_page *mp) { * so we should give it back to our current free list, if any. * Otherwise put it onto the list of pages we freed in this txn. * - * Won't create me_pghead: me_pglast must be inited along with it. + * Won't create me_reclaimed_pglist: me_last_reclaimed must be inited along + * with it. * Unsupported in nested txns: They would need to hide the page * range in ancestor txns' dirty and spilled lists. 
*/ - if (env->me_pghead && !txn->mt_parent && + if (env->me_reclaimed_pglist && !txn->mt_parent && ((mp->mp_flags & P_DIRTY) || (sl && (x = mdbx_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) { unsigned i, j; pgno_t *mop; MDBX_ID2 *dl, ix, iy; - rc = mdbx_midl_need(&env->me_pghead, ovpages); + rc = mdbx_midl_need(&env->me_reclaimed_pglist, ovpages); if (unlikely(rc)) return rc; if (!(mp->mp_flags & P_DIRTY)) { @@ -5217,8 +5633,8 @@ static int mdbx_ovpage_free(MDBX_cursor *mc, MDBX_page *mp) { if (!(env->me_flags & MDBX_WRITEMAP)) mdbx_dpage_free(env, mp); release: - /* Insert in me_pghead */ - mop = env->me_pghead; + /* Insert in me_reclaimed_pglist */ + mop = env->me_reclaimed_pglist; j = mop[0] + ovpages; for (i = mop[0]; i && mop[i] < pg; i--) mop[j--] = mop[i]; @@ -6712,7 +7128,7 @@ fail: * [out] mp Address of a page, or NULL on failure. * * Returns 0 on success, non-zero on failure. */ -static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, unsigned num, +static int mdbx_page_new(MDBX_cursor *mc, unsigned flags, unsigned num, MDBX_page **mp) { MDBX_page *np; int rc; @@ -8930,7 +9346,7 @@ static int __cold mdbx_env_compact(MDBX_env *env, mdbx_filehandle_t fd) { txn->mt_dbs[FREE_DBI].md_overflow_pages; new_root = txn->mt_next_pgno - 1 - freecount; - meta->mp_meta.mm_last_pg = new_root; + meta->mp_meta.mm_geo.next = meta->mp_meta.mm_geo.now = new_root + 1; meta->mp_meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; meta->mp_meta.mm_dbs[MAIN_DBI].md_root = new_root; } else { @@ -8996,6 +9412,9 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, mdbx_filehandle_t fd) { } rc = mdbx_write(fd, env->me_map, pgno2bytes(env, NUM_METAS)); + MDBX_meta *const head = mdbx_meta_head(env); + const uint64_t size = + mdbx_roundup2(pgno2bytes(env, head->mm_geo.now), env->me_os_psize); mdbx_txn_unlock(env); if (likely(rc == MDBX_SUCCESS)) @@ -9003,7 +9422,7 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, mdbx_filehandle_t fd) { pgno2bytes(env, txn->mt_next_pgno - 
NUM_METAS)); if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_ftruncate(fd, env->me_mapsize); + rc = mdbx_ftruncate(fd, size); bailout: mdbx_txn_abort(txn); @@ -9175,7 +9594,14 @@ int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { arg->me_meta1_sign = meta1->mm_datasync_sign; arg->me_meta2_txnid = mdbx_meta_txnid_fluid(env, meta2); arg->me_meta2_sign = meta2->mm_datasync_sign; - arg->me_recent_pgno = meta->mm_last_pg; + arg->me_last_pgno = meta->mm_geo.next - 1; + arg->me_geo.lower = pgno2bytes(env, meta->mm_geo.lower); + arg->me_geo.upper = pgno2bytes(env, meta->mm_geo.upper); + arg->me_geo.current = pgno2bytes(env, meta->mm_geo.now); + arg->me_geo.shrink = pgno2bytes(env, meta->mm_geo.shrink); + arg->me_geo.grow = pgno2bytes(env, meta->mm_geo.grow); + arg->me_mapsize = env->me_mapsize; + mdbx_compiler_barrier(); } while (unlikely(arg->me_meta0_txnid != mdbx_meta_txnid_fluid(env, meta0) || arg->me_meta0_sign != meta0->mm_datasync_sign || arg->me_meta1_txnid != mdbx_meta_txnid_fluid(env, meta1) || @@ -9185,18 +9611,21 @@ int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { meta != mdbx_meta_head(env) || arg->me_recent_txnid != mdbx_meta_txnid_fluid(env, meta))); - arg->me_mapsize = env->me_mapsize; arg->me_maxreaders = env->me_maxreaders; arg->me_numreaders = env->me_lck->mti_numreaders; - arg->me_latter_reader_txnid = 0; + arg->me_dxb_pagesize = env->me_psize; + arg->me_sys_pagesize = env->me_os_psize; - MDBX_reader *r = env->me_lck->mti_readers; - arg->me_latter_reader_txnid = arg->me_recent_txnid; - for (unsigned i = 0; i < arg->me_numreaders; ++i) { - if (r[i].mr_pid) { - txnid_t mr = r[i].mr_txnid; - if (arg->me_latter_reader_txnid > mr) - arg->me_latter_reader_txnid = mr; + arg->me_latter_reader_txnid = 0; + if (env->me_lck) { + MDBX_reader *r = env->me_lck->mti_readers; + arg->me_latter_reader_txnid = arg->me_recent_txnid; + for (unsigned i = 0; i < arg->me_numreaders; ++i) { + if (r[i].mr_pid) { + txnid_t mr = 
r[i].mr_txnid; + if (arg->me_latter_reader_txnid > mr) + arg->me_latter_reader_txnid = mr; + } } } @@ -9998,7 +10427,7 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) return MDBX_THREAD_MISMATCH; MDBX_env *env = txn->mt_env; - pgno_t maxpg = env->me_maxpg; + pgno_t maxpg = bytes2pgno(env, env->me_mapsize); if (unlikely((txn->mt_flags & MDBX_RDONLY) == 0)) { *percent = (int)((txn->mt_next_pgno * UINT64_C(100) + maxpg / 2) / maxpg); return -1; @@ -10009,10 +10438,8 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) do { meta = mdbx_meta_head(env); recent = mdbx_meta_txnid_fluid(env, meta); - if (percent) { - pgno_t last = meta->mm_last_pg + 1; - *percent = (int)((last * UINT64_C(100) + maxpg / 2) / maxpg); - } + if (percent) + *percent = (int)((meta->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); } while (unlikely(recent != mdbx_meta_txnid_fluid(env, meta))); txnid_t lag = recent - txn->mt_ro_reader->mr_txnid; diff --git a/src/osal.c b/src/osal.c index 02b6ce5c..71d17b7d 100644 --- a/src/osal.c +++ b/src/osal.c @@ -49,21 +49,57 @@ static int ntstatus2errcode(NTSTATUS status) { * conflict with the regular user-level headers, so we explicitly * declare them here. Using these APIs also means we must link to * ntdll.dll, which is not linked by default in user code. 
*/ -NTSTATUS WINAPI NtCreateSection(OUT PHANDLE sh, IN ACCESS_MASK acc, - IN void *oa OPTIONAL, - IN PLARGE_INTEGER ms OPTIONAL, IN ULONG pp, - IN ULONG aa, IN HANDLE fh OPTIONAL); + +#ifndef NT_SUCCESS +#define NT_SUCCESS(x) ((x) >= 0) +#define STATUS_SUCCESS ((NTSTATUS)0) +#endif +typedef struct _UNICODE_STRING { + USHORT Length; + USHORT MaximumLength; + PWSTR Buffer; +} UNICODE_STRING, *PUNICODE_STRING; + +typedef struct _OBJECT_ATTRIBUTES { + ULONG Length; + HANDLE RootDirectory; + PUNICODE_STRING ObjectName; + ULONG Attributes; + PVOID SecurityDescriptor; + PVOID SecurityQualityOfService; +} OBJECT_ATTRIBUTES, *POBJECT_ATTRIBUTES; + +extern NTSTATUS NTAPI NtCreateSection( + OUT PHANDLE SectionHandle, IN ACCESS_MASK DesiredAccess, + IN OPTIONAL POBJECT_ATTRIBUTES ObjectAttributes, + IN OPTIONAL PLARGE_INTEGER MaximumSize, IN ULONG SectionPageProtection, + IN ULONG AllocationAttributes, IN OPTIONAL HANDLE FileHandle); + +extern NTSTATUS NTAPI NtExtendSection(IN HANDLE SectionHandle, + IN PLARGE_INTEGER NewSectionSize); typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; -NTSTATUS WINAPI NtMapViewOfSection(IN PHANDLE sh, IN HANDLE ph, - IN OUT PVOID *addr, IN ULONG_PTR zbits, - IN SIZE_T cs, - IN OUT PLARGE_INTEGER off OPTIONAL, - IN OUT PSIZE_T vs, IN SECTION_INHERIT ih, - IN ULONG at, IN ULONG pp); +extern NTSTATUS NTAPI NtMapViewOfSection( + IN HANDLE SectionHandle, IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress, + IN ULONG_PTR ZeroBits, IN SIZE_T CommitSize, + IN OUT OPTIONAL PLARGE_INTEGER SectionOffset, IN OUT PSIZE_T ViewSize, + IN SECTION_INHERIT InheritDisposition, IN ULONG AllocationType, + IN ULONG Win32Protect); -NTSTATUS WINAPI NtClose(HANDLE h); +extern NTSTATUS NTAPI NtUnmapViewOfSection(IN HANDLE ProcessHandle, + IN OPTIONAL PVOID BaseAddress); + +extern NTSTATUS NTAPI NtClose(HANDLE Handle); + +extern NTSTATUS NTAPI NtAllocateVirtualMemory( + IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress, IN ULONG ZeroBits, + 
IN OUT PULONG RegionSize, IN ULONG AllocationType, IN ULONG Protect); + +extern NTSTATUS NTAPI NtFreeVirtualMemory(IN HANDLE ProcessHandle, + IN PVOID *BaseAddress, + IN OUT PULONG RegionSize, + IN ULONG FreeType); #endif /* _WIN32 || _WIN64 */ @@ -558,6 +594,16 @@ int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync) { #elif __GLIBC_PREREQ(2, 16) || _BSD_SOURCE || _XOPEN_SOURCE || \ (__GLIBC_PREREQ(2, 8) && _POSIX_C_SOURCE >= 200112L) for (;;) { +/* LY: It is no reason to use fdatasync() here, even in case + * no such bug in a kernel. Because "no-bug" mean that a kernel + * internally do nearly the same, e.g. fdatasync() == fsync() + * when no-kernel-bug and file size was changed. + * + * So, this code is always safe and without appreciable + * performance degradation. + * + * For more info about of a corresponding fdatasync() bug + * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ #if _POSIX_C_SOURCE >= 199309L || _XOPEN_SOURCE >= 500 || \ defined(_POSIX_SYNCHRONIZED_IO) if (!fullsync && fdatasync(fd) == 0) @@ -576,6 +622,22 @@ int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync) { #endif } +int mdbx_filesize_sync(mdbx_filehandle_t fd) { +#if defined(_WIN32) || defined(_WIN64) + (void)fd; + /* Nothing on Windows (i.e. 
newer 100% steady) */ + return MDBX_SUCCESS; +#else + for (;;) { + if (fsync(fd) == 0) + return MDBX_SUCCESS; + int rc = errno; + if (rc != EINTR) + return rc; + } +#endif +} + int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { #if defined(_WIN32) || defined(_WIN64) BY_HANDLE_FILE_INFORMATION info; @@ -678,51 +740,113 @@ int mdbx_msync(void *addr, size_t length, int async) { #endif } -int mdbx_mremap_size(void **address, size_t old_size, size_t new_size) { +int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit) { #if defined(_WIN32) || defined(_WIN64) - *address = MAP_FAILED; - (void)old_size; - (void)new_size; - return ERROR_CALL_NOT_IMPLEMENTED; + NTSTATUS rc = NtCreateSection( + &map->section, + /* DesiredAccess */ SECTION_MAP_READ | SECTION_EXTEND_SIZE | + ((flags & MDBX_WRITEMAP) ? SECTION_MAP_WRITE : 0), + /* ObjectAttributes */ NULL, /* MaximumSize */ NULL, + /* SectionPageProtection */ (flags & MDBX_RDONLY) ? PAGE_READONLY + : PAGE_READWRITE, + /* AllocationAttributes */ SEC_RESERVE, map->fd); + + if (!NT_SUCCESS(rc)) { + map->section = 0; + map->address = MAP_FAILED; + return ntstatus2errcode(rc); + } + + map->address = NULL; + size_t ViewSize = limit; + rc = NtMapViewOfSection( + map->section, GetCurrentProcess(), &map->address, + /* ZeroBits */ 0, + /* CommitSize */ length, + /* SectionOffset */ NULL, &ViewSize, + /* InheritDisposition */ ViewUnmap, + /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE, + /* Win32Protect */ (flags & MDBX_WRITEMAP) ? PAGE_READWRITE + : PAGE_READONLY); + + if (!NT_SUCCESS(rc)) { + NtClose(map->section); + map->section = 0; + map->address = MAP_FAILED; + return ntstatus2errcode(rc); + } + + assert(map->address != MAP_FAILED); + return MDBX_SUCCESS; #else - *address = mremap(*address, old_size, new_size, 0, address); - return (*address != MAP_FAILED) ? MDBX_SUCCESS : errno; + (void)length; + map->address = mmap( + NULL, limit, (flags & MDBX_WRITEMAP) ? 
PROT_READ | PROT_WRITE : PROT_READ, + MAP_SHARED, map->fd, 0); + return (map->address != MAP_FAILED) ? MDBX_SUCCESS : errno; #endif } -int mdbx_mmap(void **address, size_t length, int rw, mdbx_filehandle_t fd) { -#if defined(_WIN32) || defined(_WIN64) - HANDLE h = CreateFileMapping(fd, NULL, rw ? PAGE_READWRITE : PAGE_READONLY, - HIGH_DWORD(length), (DWORD)length, NULL); - if (!h) - return mdbx_get_errno_checked(); - *address = MapViewOfFileEx(h, rw ? FILE_MAP_WRITE : FILE_MAP_READ, 0, 0, - length, *address); - int rc = (*address != MAP_FAILED) ? MDBX_SUCCESS : mdbx_get_errno_checked(); - CloseHandle(h); - return rc; -#else - *address = mmap(NULL, length, rw ? PROT_READ | PROT_WRITE : PROT_READ, - MAP_SHARED, fd, 0); - return (*address != MAP_FAILED) ? MDBX_SUCCESS : errno; -#endif -} - -int mdbx_munmap(void *address, size_t length) { +int mdbx_munmap(mdbx_mmap_param_t *map, size_t length) { #if defined(_WIN32) || defined(_WIN64) (void)length; - return UnmapViewOfFile(address) ? MDBX_SUCCESS : mdbx_get_errno_checked(); + if (map->section) + NtClose(map->section); + NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->address); + return NT_SUCCESS(rc) ? MDBX_SUCCESS : ntstatus2errcode(rc); #else - return (munmap(address, length) == 0) ? MDBX_SUCCESS : errno; + return (munmap(map->address, length) == 0) ? MDBX_SUCCESS : errno; #endif } -int mdbx_mlock(const void *address, size_t length) { +int mdbx_mlock(mdbx_mmap_param_t *map, size_t length) { #if defined(_WIN32) || defined(_WIN64) - return VirtualLock((void *)address, length) ? MDBX_SUCCESS - : mdbx_get_errno_checked(); + return VirtualLock(map->address, length) ? MDBX_SUCCESS : GetLastError(); #else - return (mlock(address, length) == 0) ? MDBX_SUCCESS : errno; + return (mlock(map->address, length) == 0) ? 
MDBX_SUCCESS : errno; +#endif +} + +int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current, + size_t wanna) { +#if defined(_WIN32) || defined(_WIN64) + if (wanna > current) { + /* growth */ + uint8_t *ptr = (uint8_t *)map->address + current; + return (ptr == VirtualAlloc(ptr, wanna - current, MEM_COMMIT, + (flags & MDBX_WRITEMAP) ? PAGE_READWRITE + : PAGE_READONLY)) + ? MDBX_SUCCESS + : GetLastError(); + } + /* Windows is unable shrinking a mapped file */ + return MDBX_RESULT_TRUE; +#else + (void)flags; + (void)current; + return mdbx_ftruncate(map->fd, wanna); +#endif +} + +int mdbx_mremap(int flags, mdbx_mmap_param_t *map, size_t old_limit, + size_t new_limit) { +#if defined(_WIN32) || defined(_WIN64) + (void)flags; + if (old_limit > new_limit) { + /* Windows is unable shrinking a mapped section */ + return ERROR_USER_MAPPED_FILE; + } + LARGE_INTEGER new_size; + new_size.QuadPart = new_limit; + NTSTATUS rc = NtExtendSection(map->section, &new_size); + return NT_SUCCESS(rc) ? MDBX_SUCCESS : ntstatus2errcode(rc); +#else + (void)flags; + void *ptr = mremap(map->address, old_limit, new_limit, 0); + if (ptr == MAP_FAILED) + return errno; + map->address = ptr; + return MDBX_SUCCESS; #endif } diff --git a/src/osal.h b/src/osal.h index 0c62da1d..e4f13713 100644 --- a/src/osal.h +++ b/src/osal.h @@ -69,7 +69,7 @@ #define HAVE_SYS_TYPES_H typedef HANDLE mdbx_thread_t; typedef unsigned mdbx_thread_key_t; -typedef SSIZE_T ssize_t; +#define MDBX_OSAL_SECTION HANDLE #define MAP_FAILED NULL #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? 
((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI @@ -430,16 +430,27 @@ void *mdbx_thread_rthc_get(mdbx_thread_key_t key); void mdbx_thread_rthc_set(mdbx_thread_key_t key, const void *value); int mdbx_filesync(mdbx_filehandle_t fd, bool fullsync); +int mdbx_filesize_sync(mdbx_filehandle_t fd); int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); int mdbx_openfile(const char *pathname, int flags, mode_t mode, mdbx_filehandle_t *fd); int mdbx_closefile(mdbx_filehandle_t fd); -int mdbx_mremap_size(void **address, size_t old_size, size_t new_size); -int mdbx_mmap(void **address, size_t length, int rw, mdbx_filehandle_t fd); -int mdbx_munmap(void *address, size_t length); -int mdbx_mlock(const void *address, size_t length); +typedef struct mdbx_mmap_param { + void *address; +#ifdef MDBX_OSAL_SECTION + MDBX_OSAL_SECTION section; +#endif + mdbx_filehandle_t fd; +} mdbx_mmap_param_t; +int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit); +int mdbx_munmap(mdbx_mmap_param_t *map, size_t length); +int mdbx_mlock(mdbx_mmap_param_t *map, size_t length); +int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current, + size_t wanna); +int mdbx_mremap(int flags, mdbx_mmap_param_t *map, size_t old_limit, + size_t new_limit); static __inline mdbx_pid_t mdbx_getpid(void) { #if defined(_WIN32) || defined(_WIN64) diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index dabe2f82..ad8f85c8 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -345,10 +345,10 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, reclaimable_pages += number; for (i = number, prev = 1; --i >= 0;) { pg = iptr[i]; - if (pg < NUM_METAS || pg > envinfo.me_recent_pgno) + if (pg < NUM_METAS || pg > envinfo.me_last_pgno) problem_add("entry", record_number, "wrong idl entry", "%u < %" PRIiPTR " < %" PRIiPTR "", NUM_METAS, pg, - envinfo.me_recent_pgno); + 
envinfo.me_last_pgno); else if (pg <= prev) { bad = " [bad sequence]"; problem_add("entry", record_number, "bad sequence", @@ -727,6 +727,17 @@ static int check_meta_head(bool steady) { return 0; } +static void print_size(const char *prefix, const uint64_t value, + const char *suffix) { + const char sf[] = + "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ + double k = 1024.0; + size_t i; + for (i = 0; sf[i + 1] && value / k > 1000.0; ++i) + k *= 1024; + print("%s%" PRIu64 " (%.2f %cb)%s", prefix, value, value / k, sf[i], suffix); +} + int main(int argc, char *argv[]) { int i, rc; char *prog = argv[0]; @@ -858,21 +869,26 @@ int main(int argc, char *argv[]) { goto bailout; } - lastpgno = envinfo.me_recent_pgno + 1; + lastpgno = envinfo.me_last_pgno + 1; errno = 0; if (verbose) { - double k = 1024.0; - const char sf[] = - "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ - for (i = 0; sf[i + 1] && envinfo.me_mapsize / k > 1000.0; ++i) - k *= 1024; - print(" - map size %" PRIu64 " (%.2f %cb)\n", envinfo.me_mapsize, - envinfo.me_mapsize / k, sf[i]); - if (envinfo.me_mapaddr) - print(" - mapaddr %p\n", envinfo.me_mapaddr); - print(" - pagesize %u, max keysize %" PRIuPTR ", max readers %u\n", - envstat.ms_psize, maxkeysize, envinfo.me_maxreaders); + print(" - pagesize %u (%u system), max keysize %" PRIuPTR + ", max readers %u\n", + envinfo.me_dxb_pagesize, envinfo.me_sys_pagesize, maxkeysize, + envinfo.me_maxreaders); + print_size(" - mapsize ", envinfo.me_mapsize, "\n"); + if (envinfo.me_geo.lower == envinfo.me_geo.upper) + print_size(" - fixed datafile: ", envinfo.me_geo.current, ""); + else { + print_size(" - dynamic datafile: ", envinfo.me_geo.lower, ""); + print_size(" .. 
", envinfo.me_geo.upper, ", "); + print_size("+", envinfo.me_geo.grow, ", "); + print_size("-", envinfo.me_geo.shrink, "\n"); + print_size(" - current datafile: ", envinfo.me_geo.current, ""); + } + printf(", %" PRIu64 " pages\n", + envinfo.me_geo.current / envinfo.me_dxb_pagesize); print(" - transactions: recent %" PRIu64 ", latter reader %" PRIu64 ", lag %" PRIi64 "\n", envinfo.me_recent_txnid, envinfo.me_latter_reader_txnid, @@ -884,7 +900,7 @@ int main(int argc, char *argv[]) { } if (verbose) - print(" - performs check for meta-pages overlap\n"); + print(" - performs check for meta-pages clashes\n"); if (meta_eq(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, envinfo.me_meta1_txnid, envinfo.me_meta1_sign)) { print(" - meta-%d and meta-%d are clashed\n", 0, 1); @@ -1008,6 +1024,8 @@ int main(int argc, char *argv[]) { uint64_t value = envinfo.me_mapsize / envstat.ms_psize; double percent = value / 100.0; print(" - pages info: %" PRIu64 " total", value); + value = envinfo.me_geo.current / envinfo.me_dxb_pagesize; + print(", backed %" PRIu64 " (%.1f%%)", value, value / percent); print(", allocated %" PRIu64 " (%.1f%%)", lastpgno, lastpgno / percent); if (verbose > 1) { diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index 6cff2560..726cc5d0 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -110,8 +110,6 @@ static int dumpit(MDBX_txn *txn, MDBX_dbi dbi, char *name) { printf("database=%s\n", name); printf("type=btree\n"); printf("mapsize=%" PRIu64 "\n", info.me_mapsize); - if (info.me_mapaddr) - printf("mapaddr=%p\n", info.me_mapaddr); printf("maxreaders=%u\n", info.me_maxreaders); for (i = 0; dbflags[i].bit; i++) diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index 924cdaeb..5089efb0 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -107,8 +107,8 @@ static void readhdr(void) { ptr = memchr(dbuf.iov_base, '\n', dbuf.iov_len); if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.iov_base + STRLENOF("mapaddr="), "%p", - 
&envinfo.me_mapaddr); + void *unused; + i = sscanf((char *)dbuf.iov_base + STRLENOF("mapaddr="), "%p", &unused); if (i != 1) { fprintf(stderr, "%s: line %" PRIiPTR ": invalid mapaddr %s\n", prog, lineno, (char *)dbuf.iov_base + STRLENOF("mapaddr=")); diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index efa69b42..b1fa22bb 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -13,15 +13,17 @@ * top-level directory of the distribution or, alternatively, at * . */ -#include "../../mdbx.h" #include #include #include #include #include +#include "../../mdbx.h" +#include "../bits.h" + static void prstat(MDBX_stat *ms) { - printf(" Page size: %u\n", ms->ms_psize); + printf(" Pagesize: %u\n", ms->ms_psize); printf(" Tree depth: %u\n", ms->ms_depth); printf(" Branch pages: %" PRIu64 "\n", ms->ms_branch_pages); printf(" Leaf pages: %" PRIu64 "\n", ms->ms_leaf_pages); @@ -121,11 +123,24 @@ int main(int argc, char *argv[]) { (void)mdbx_env_stat(env, &mst, sizeof(mst)); (void)mdbx_env_info(env, &mei, sizeof(mei)); printf("Environment Info\n"); - printf(" Map address: %p\n", mei.me_mapaddr); - printf(" Map size: %" PRIu64 "\n", mei.me_mapsize); - printf(" Page size: %u\n", mst.ms_psize); - printf(" Max pages: %" PRIu64 "\n", mei.me_mapsize / mst.ms_psize); - printf(" Number of pages used: %" PRIu64 "\n", mei.me_recent_pgno + 1); + printf(" Pagesize: %u\n", mst.ms_psize); + if (mei.me_geo.lower != mei.me_geo.upper) { + printf(" Dynamic datafile: %" PRIu64 "..%" PRIu64 " bytes (+%" PRIu64 + "/-%" PRIu64 "), %" PRIu64 "..%" PRIu64 " pages (+%" PRIu64 + "/-%" PRIu64 ")\n", + mei.me_geo.lower, mei.me_geo.upper, mei.me_geo.grow, + mei.me_geo.shrink, mei.me_geo.lower / mst.ms_psize, + mei.me_geo.upper / mst.ms_psize, mei.me_geo.grow / mst.ms_psize, + mei.me_geo.shrink / mst.ms_psize); + printf(" Current datafile: %" PRIu64 " bytes, %" PRIu64 " pages\n", + mei.me_geo.current, mei.me_geo.current / mst.ms_psize); + } else { + printf(" Fixed datafile: %" PRIu64 " 
bytes, %" PRIu64 " pages\n", + mei.me_geo.current, mei.me_geo.current / mst.ms_psize); + } + printf(" Current mapsize: %" PRIu64 " bytes, %" PRIu64 " pages \n", + mei.me_mapsize, mei.me_mapsize / mst.ms_psize); + printf(" Number of pages used: %" PRIu64 "\n", mei.me_last_pgno + 1); printf(" Last transaction ID: %" PRIu64 "\n", mei.me_recent_txnid); printf(" Tail transaction ID: %" PRIu64 " (%" PRIi64 ")\n", mei.me_latter_reader_txnid, @@ -161,8 +176,8 @@ int main(int argc, char *argv[]) { if (freinfo) { MDBX_cursor *cursor; MDBX_val key, data; - size_t pages = 0, *iptr; - size_t reclaimable = 0; + pgno_t pages = 0, *iptr; + pgno_t reclaimable = 0; printf("Freelist Status\n"); dbi = 0; @@ -186,7 +201,7 @@ int main(int argc, char *argv[]) { reclaimable += *iptr; if (freinfo > 1) { char *bad = ""; - size_t pg, prev; + pgno_t pg, prev; ssize_t i, j, span = 0; j = *iptr++; for (i = j, prev = 1; --i >= 0;) { @@ -198,53 +213,52 @@ int main(int argc, char *argv[]) { for (; i >= span && iptr[i - span] == pg; span++, pg++) ; } - printf(" Transaction %" PRIuPTR ", %" PRIiPTR + printf(" Transaction %" PRIaTXN ", %" PRIiPTR " pages, maxspan %" PRIiPTR "%s\n", - *(size_t *)key.iov_base, j, span, bad); + *(txnid_t *)key.iov_base, j, span, bad); if (freinfo > 2) { for (--j; j >= 0;) { pg = iptr[j]; for (span = 1; --j >= 0 && iptr[j] == pg + span; span++) ; if (span > 1) - printf(" %9zu[%" PRIiPTR "]\n", pg, span); + printf(" %9" PRIaPGNO "[%" PRIiPTR "]\n", pg, span); else - printf(" %9zu\n", pg); + printf(" %9" PRIaPGNO "\n", pg); } } } } mdbx_cursor_close(cursor); if (envinfo) { - size_t value = mei.me_mapsize / mst.ms_psize; + uint64_t value = mei.me_mapsize / mst.ms_psize; double percent = value / 100.0; printf("Page Allocation Info\n"); - printf(" Max pages: %9zu 100%%\n", value); + printf(" Max pages: %" PRIu64 " 100%%\n", value); - value = mei.me_recent_pgno + 1; - printf(" Number of pages used: %" PRIuPTR " %.1f%%\n", value, - value / percent); + value = mei.me_last_pgno + 
1; + printf(" Pages used: %" PRIu64 " %.1f%%\n", value, value / percent); - value = mei.me_mapsize / mst.ms_psize - (mei.me_recent_pgno + 1); - printf(" Remained: %" PRIuPTR " %.1f%%\n", value, value / percent); + value = mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1); + printf(" Remained: %" PRIu64 " %.1f%%\n", value, value / percent); - value = mei.me_recent_pgno + 1 - pages; - printf(" Used now: %" PRIuPTR " %.1f%%\n", value, value / percent); + value = mei.me_last_pgno + 1 - pages; + printf(" Used now: %" PRIu64 " %.1f%%\n", value, value / percent); value = pages; - printf(" Unallocated: %" PRIuPTR " %.1f%%\n", value, value / percent); + printf(" Unallocated: %" PRIu64 " %.1f%%\n", value, value / percent); value = pages - reclaimable; - printf(" Detained: %" PRIuPTR " %.1f%%\n", value, value / percent); + printf(" Detained: %" PRIu64 " %.1f%%\n", value, value / percent); value = reclaimable; - printf(" Reclaimable: %" PRIuPTR " %.1f%%\n", value, value / percent); + printf(" Reclaimable: %" PRIu64 " %.1f%%\n", value, value / percent); - value = mei.me_mapsize / mst.ms_psize - (mei.me_recent_pgno + 1) + - reclaimable; - printf(" Available: %" PRIuPTR " %.1f%%\n", value, value / percent); + value = + mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1) + reclaimable; + printf(" Available: %" PRIu64 " %.1f%%\n", value, value / percent); } else - printf(" Free pages: %" PRIuPTR "\n", pages); + printf(" Free pages: %" PRIaPGNO "\n", pages); } rc = mdbx_dbi_open(txn, subname, 0, &dbi); From 28ba127db238b224919eb92da378a98d8fe9a162 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Tue, 20 Jun 2017 07:18:09 +0300 Subject: [PATCH 255/303] test: typo fix. 
--- test/config.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/config.cc b/test/config.cc index 3c6ac224..81b26b83 100644 --- a/test/config.cc +++ b/test/config.cc @@ -231,8 +231,8 @@ const struct option_verb mode_bits[] = { {"nosync", MDBX_NOSYNC}, {"nometasync", MDBX_NOMETASYNC}, {"writemap", MDBX_WRITEMAP}, {"notls", MDBX_NOTLS}, {"nordahead", MDBX_NORDAHEAD}, {"nomeminit", MDBX_NOMEMINIT}, - {"coasesce", MDBX_COALESCE}, {"lifo", MDBX_LIFORECLAIM}, - {"parturb", MDBX_PAGEPERTURB}, {nullptr, 0}}; + {"coalesce", MDBX_COALESCE}, {"lifo", MDBX_LIFORECLAIM}, + {"perturb", MDBX_PAGEPERTURB}, {nullptr, 0}}; const struct option_verb table_bits[] = { {"key.reverse", MDBX_REVERSEKEY}, From ca83480c344f51c17819768c98e84001daba5547 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Tue, 20 Jun 2017 07:25:31 +0300 Subject: [PATCH 256/303] mdbx: pragma lib ntdll.lib --- src/osal.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/osal.c b/src/osal.c index 71d17b7d..9dcba59b 100644 --- a/src/osal.c +++ b/src/osal.c @@ -49,6 +49,7 @@ static int ntstatus2errcode(NTSTATUS status) { * conflict with the regular user-level headers, so we explicitly * declare them here. Using these APIs also means we must link to * ntdll.dll, which is not linked by default in user code. */ +#pragma comment(lib, "ntdll.lib") #ifndef NT_SUCCESS #define NT_SUCCESS(x) ((x) >= 0) From 5ab319bbb11795e5fc3619bf13f50a9d14bbe958 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 16 Jun 2017 14:29:37 +0300 Subject: [PATCH 257/303] mdbx: update TODO. --- TODO.md | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/TODO.md b/TODO.md index c0170135..1e8ffdd7 100644 --- a/TODO.md +++ b/TODO.md @@ -28,6 +28,8 @@ - [x] Сделать список для txnid_t, кода sizeof(txnid_t) > sizeof(pgno_t) и вернуть размер pgno_t - [x] Избавиться от умножения на размер страницы (заменить на сдвиг). - [x] Устранение всех предупреждений (в том числе под Windows). 
+- [ ] добавить 'mti_reader_finished_flag' +- [ ] отрефакторить mdbx_freelist_save() - [ ] Перевод mdbx-tools на С++ и сборка для Windows - [ ] Заменить заглушки mdbx_version и mdbx_build - [ ] Актуализация README.md @@ -37,21 +39,22 @@ CI - [ ] Добавить в CI linux сборки для 32-битных таргетов Доработки API -- [ ] Добавить возможность "подбора" режима для mdbx_env_open() -- [ ] Дать возможность задавать размер страницы при создании БД. -- [ ] Изменение mapsize через API с блокировкой и увеличением txn, плюс поправить доку. -- [ ] Контроль размера страницы полного размера и кол-ва страниц при создании и обновлении. -- [ ] Инкрементальный mmap. +- [x] Дать возможность задавать размер страницы при создании БД. +- [x] Изменение mapsize через API с блокировкой и увеличением txn +- [x] Контроль размера страницы полного размера и кол-ва страниц при создании и обновлении. +- [x] Инкрементальный mmap. +- [x] Инкрементальное приращение размера (колбэк стратегии?). +- [ ] Поправить/Добавить описание нового API. - [ ] Возврат выделенных страниц в unallocated tail-pool. -- [ ] Инкрементальное приращение размера (колбэк стратегии?). +- [ ] Добавить возможность "подбора" режима для mdbx_env_open() - [ ] Переименовать в API: env->db, db->tbl Тест +- [ ] Додумать имя и размещение тестовой БД по-умолчанию. - [ ] Реализовать cleanup в тесте - [ ] usage для теста - [ ] Логирование в файл, плюс более полный progress bar - [ ] Опция игнорирования (пропуска части теста) при переполнении БД -- [ ] Додумать имя и размещение тестовой БД по-умолчанию. - [ ] Базовый бенчмарк Отладка @@ -63,7 +66,8 @@ CI - [ ] Валидатор страниц БД по номеру транзакции: ~0 при переработке и номер транзакции при выделении, проверять что этот номер больше головы реклайминга и не-больше текущей транзакции. -- [ ] Добавить free_backlog в meta +- [ ] Размещение overflow-pages в отдельном mmap/файле с собственной геометрией. 
+- [ ] Разместить free_backlog в конце meta - [ ] Валидатор страниц по CRC32, плюс контроль номер транзакии под модулю 2^32. - [ ] Валидатор страниц по t1ha c контролем снимков/версий БД на основе Merkle Tree. - [ ] Возможность хранения ключей внутри data (libfptu) From 5fbb8d1018c87a0b94329e28b64bd125c28fa59b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sat, 24 Jun 2017 19:33:34 +0300 Subject: [PATCH 258/303] mdbx: fix C11 warnings about atomic. --- src/osal.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/osal.h b/src/osal.h index e4f13713..3fd869f0 100644 --- a/src/osal.h +++ b/src/osal.h @@ -531,8 +531,9 @@ int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid); #endif static __inline uint32_t mdbx_atomic_add32(volatile uint32_t *p, uint32_t v) { -#ifdef ATOMIC_VAR_INIT - return atomic_fetch_add(p, v); +#if defined(ATOMIC_VAR_INIT) + assert(atomic_is_lock_free(p)); + return atomic_fetch_add((_Atomic uint32_t *)p, v); #elif defined(__GNUC__) || defined(__clang__) return __sync_fetch_and_add(p, v); #else @@ -547,7 +548,8 @@ static __inline uint32_t mdbx_atomic_add32(volatile uint32_t *p, uint32_t v) { static __inline uint64_t mdbx_atomic_add64(volatile uint64_t *p, uint64_t v) { #ifdef ATOMIC_VAR_INIT - return atomic_fetch_add(p, v); + assert(atomic_is_lock_free(p)); + return atomic_fetch_add((_Atomic uint64_t *)p, v); #elif defined(__GNUC__) || defined(__clang__) return __sync_fetch_and_add(p, v); #else @@ -566,7 +568,8 @@ static __inline uint64_t mdbx_atomic_add64(volatile uint64_t *p, uint64_t v) { static __inline bool mdbx_atomic_compare_and_swap32(volatile uint32_t *p, uint32_t c, uint32_t v) { #ifdef ATOMIC_VAR_INIT - return atomic_compare_exchange_strong(p, &c, v); + assert(atomic_is_lock_free(p)); + return atomic_compare_exchange_strong((_Atomic uint32_t *)p, &c, v); #elif defined(__GNUC__) || defined(__clang__) return __sync_bool_compare_and_swap(p, c, v); #else @@ -582,7 +585,8 @@ static __inline bool
mdbx_atomic_compare_and_swap32(volatile uint32_t *p, static __inline bool mdbx_atomic_compare_and_swap64(volatile uint64_t *p, uint64_t c, uint64_t v) { #ifdef ATOMIC_VAR_INIT - return atomic_compare_exchange_strong(p, &c, v); + assert(atomic_is_lock_free(p)); + return atomic_compare_exchange_strong((_Atomic uint64_t *)p, &c, v); #elif defined(__GNUC__) || defined(__clang__) return __sync_bool_compare_and_swap(p, c, v); #else From bfa8e439ba70558e65c7a03663fac8b0c822bc25 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sat, 24 Jun 2017 19:34:29 +0300 Subject: [PATCH 259/303] mdbx: use GNU C11 by default. --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index bdcaf3e3..fcf8bdc3 100644 --- a/Makefile +++ b/Makefile @@ -26,8 +26,8 @@ CC ?= gcc CXX ?= g++ XCFLAGS ?= -DNDEBUG=1 -DMDBX_DEBUG=0 -DLIBMDBX_EXPORTS=1 CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden -CFLAGS += -D_GNU_SOURCE=1 -std=gnu99 -pthread $(XCFLAGS) -CXXFLAGS = -std=c++11 $(filter-out -std=gnu99,$(CFLAGS)) +CFLAGS += -D_GNU_SOURCE=1 -std=gnu11 -pthread $(XCFLAGS) +CXXFLAGS = -std=c++11 $(filter-out -std=gnu11,$(CFLAGS)) TESTDB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-check.db # LY: '--no-as-needed,-lrt' for ability to built with modern glibc, but then run with the old From acfa096abadfc98013cdaab55daa821326876491 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 30 Jun 2017 00:16:16 +0300 Subject: [PATCH 260/303] mdbx: more for lockless/readonly mode (testing needed). 
Change-Id: I28f46f5a373212ebb5df780f4d2b5e216d8cdae2 --- src/mdbx.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 269769d0..0568663c 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -2097,13 +2097,13 @@ int mdbx_env_sync(MDBX_env *env, int force) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - if (unlikely(!env->me_lck)) - return MDBX_PANIC; - unsigned flags = env->me_flags & ~MDBX_NOMETASYNC; if (unlikely(flags & (MDBX_RDONLY | MDBX_FATAL_ERROR))) return MDBX_EACCESS; + if (unlikely(!env->me_lck)) + return MDBX_PANIC; + const bool outside_txn = (!env->me_txn0 || env->me_txn0->mt_owner != mdbx_thread_self()); @@ -4668,6 +4668,10 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { return err; /* LY: without-lck mode (e.g. on read-only filesystem) */ env->me_lfd = INVALID_HANDLE_VALUE; + env->me_oldest = &env->me_oldest_stub; + env->me_maxreaders = UINT_MAX; + mdbx_debug("lck-setup: %s ", "lockless mode (readonly)"); + return MDBX_SUCCESS; } /* Try to get exclusive lock. 
If we succeed, then @@ -4861,7 +4865,6 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, goto bailout; } - env->me_oldest = &env->me_oldest_stub; const int dxb_rc = mdbx_setup_dxb(env, lck_rc); if (MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; @@ -9622,9 +9625,9 @@ int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { arg->me_latter_reader_txnid = arg->me_recent_txnid; for (unsigned i = 0; i < arg->me_numreaders; ++i) { if (r[i].mr_pid) { - txnid_t mr = r[i].mr_txnid; - if (arg->me_latter_reader_txnid > mr) - arg->me_latter_reader_txnid = mr; + const txnid_t txnid = r[i].mr_txnid; + if (arg->me_latter_reader_txnid > txnid) + arg->me_latter_reader_txnid = txnid; } } } From 186d2ee06596263067f076bb954e6d6179592748 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 30 Jun 2017 00:20:33 +0300 Subject: [PATCH 261/303] mdbx: add 'mti_reader_finished_flag' for speedup find_oldest(). Change-Id: I4a2c8b80efad0cfc12918969125d258043cbffba --- src/bits.h | 8 +++++- src/defs.h | 2 ++ src/mdbx.c | 84 ++++++++++++++++++++++++++++++------------------------ 3 files changed, 56 insertions(+), 38 deletions(-) diff --git a/src/bits.h b/src/bits.h index 69689bb3..4b8e1485 100644 --- a/src/bits.h +++ b/src/bits.h @@ -402,7 +402,13 @@ typedef struct MDBX_lockinfo { volatile txnid_t mti_oldest; uint64_t align_oldest; }; - uint8_t pad_align[MDBX_CACHELINE_SIZE - sizeof(uint64_t) * 6]; + + union { + volatile uint32_t mti_reader_finished_flag; + uint64_t align_reader_finished_flag; + }; + + uint8_t pad_align[MDBX_CACHELINE_SIZE - sizeof(uint64_t) * 7]; MDBX_reader __cache_aligned mti_readers[1]; } MDBX_lockinfo; diff --git a/src/defs.h b/src/defs.h index 6ef6e35a..ed1a87e8 100644 --- a/src/defs.h +++ b/src/defs.h @@ -367,6 +367,8 @@ #define MDBX_TETRAD(a, b, c, d) \ ((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 | (d)) +#define MDBX_STRING_TETRAD(str) MDBX_TETRAD(str[0], str[1], str[2], str[3]) + #define FIXME "FIXME: " __FILE__
", " STRINGIFY(__LINE__) #ifndef STATIC_ASSERT_MSG diff --git a/src/mdbx.c b/src/mdbx.c index 0568663c..0756b3e4 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1481,27 +1481,33 @@ static const char *mdbx_durable_str(const MDBX_meta *const meta) { /* Find oldest txnid still referenced. */ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { - MDBX_env *env = txn->mt_env; - mdbx_assert(env, (txn->mt_flags & MDBX_RDONLY) == 0); + mdbx_tassert(txn, (txn->mt_flags & MDBX_RDONLY) == 0); + MDBX_lockinfo *const lck = txn->mt_env->me_lck; - const txnid_t last_oldest = env->me_oldest[0]; + const txnid_t last_oldest = lck->mti_oldest; txnid_t oldest = txn->mt_txnid - 1; - mdbx_assert(env, oldest >= last_oldest); + mdbx_tassert(txn, oldest >= last_oldest); + if (last_oldest == oldest || + lck->mti_reader_finished_flag == MDBX_STRING_TETRAD("None")) + return last_oldest; - const MDBX_reader *const rtbl = env->me_lck->mti_readers; - for (int i = env->me_lck->mti_numreaders; - oldest != last_oldest && --i >= 0;) { - if (rtbl[i].mr_pid) { + const unsigned snap_nreaders = lck->mti_numreaders; + lck->mti_reader_finished_flag = MDBX_STRING_TETRAD("None"); + for (unsigned i = 0; i < snap_nreaders; ++i) { + if (lck->mti_readers[i].mr_pid) { mdbx_jitter4testing(true); - const txnid_t snap = rtbl[i].mr_txnid; - if (oldest > snap && last_oldest <= /* ignore pending updates */ snap) + const txnid_t snap = lck->mti_readers[i].mr_txnid; + if (oldest > snap && last_oldest <= /* ignore pending updates */ snap) { oldest = snap; + if (oldest == last_oldest) + break; + } } } if (oldest != last_oldest) { - mdbx_assert(env, oldest >= env->me_oldest[0]); - env->me_oldest[0] = oldest; + mdbx_tassert(txn, oldest >= lck->mti_oldest); + lck->mti_oldest = oldest; } return oldest; } @@ -2251,7 +2257,6 @@ static void mdbx_cursors_eot(MDBX_txn *txn, unsigned merge) { /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). 
*/ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { MDBX_env *env = txn->mt_env; - unsigned i, nr; int rc; if (unlikely(env->me_pid != mdbx_getpid())) { @@ -2277,6 +2282,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { if (unlikely(r->mr_pid != env->me_pid || r->mr_txnid != ~(txnid_t)0)) return MDBX_BAD_RSLOT; } else if (env->me_lck) { + unsigned slot, nreaders; const mdbx_pid_t pid = env->me_pid; const mdbx_tid_t tid = mdbx_thread_self(); mdbx_assert(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); @@ -2297,12 +2303,12 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { } while (1) { - nr = env->me_lck->mti_numreaders; - for (i = 0; i < nr; i++) - if (env->me_lck->mti_readers[i].mr_pid == 0) + nreaders = env->me_lck->mti_numreaders; + for (slot = 0; slot < nreaders; slot++) + if (env->me_lck->mti_readers[slot].mr_pid == 0) break; - if (likely(i < env->me_maxreaders)) + if (likely(slot < env->me_maxreaders)) break; rc = mdbx_reader_check0(env, true, NULL); @@ -2315,7 +2321,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { STATIC_ASSERT(sizeof(MDBX_reader) == MDBX_CACHELINE_SIZE); STATIC_ASSERT( offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE == 0); - r = &env->me_lck->mti_readers[i]; + r = &env->me_lck->mti_readers[slot]; /* Claim the reader slot, carefully since other code * uses the reader table un-mutexed: First reset the * slot, next publish it in mtb.mti_numreaders. 
After @@ -2325,10 +2331,10 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { r->mr_txnid = ~(txnid_t)0; r->mr_tid = tid; mdbx_coherent_barrier(); - if (i == nr) - env->me_lck->mti_numreaders = ++nr; - if (env->me_close_readers < nr) - env->me_close_readers = nr; + if (slot == nreaders) + env->me_lck->mti_numreaders = ++nreaders; + if (env->me_close_readers < nreaders) + env->me_close_readers = nreaders; r->mr_pid = pid; mdbx_rdt_unlock(env); @@ -2412,7 +2418,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { /* Setup db info */ txn->mt_numdbs = env->me_numdbs; - for (i = CORE_DBS; i < txn->mt_numdbs; i++) { + for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) { unsigned x = env->me_dbflags[i]; txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS; txn->mt_dbflags[i] = @@ -2647,6 +2653,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { if (txn->mt_ro_reader) { txn->mt_ro_reader->mr_txnid = ~(txnid_t)0; + env->me_lck->mti_reader_finished_flag = true; if (mode & MDBX_END_SLOT) { if ((env->me_flags & MDBX_ENV_TXKEY) == 0) txn->mt_ro_reader->mr_pid = 0; @@ -10127,17 +10134,19 @@ int __cold mdbx_reader_list(MDBX_env *env, MDBX_msg_func *func, void *ctx) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - unsigned snap_nreaders = env->me_lck->mti_numreaders; - MDBX_reader *mr = env->me_lck->mti_readers; + const MDBX_lockinfo *const lck = env->me_lck; + const unsigned snap_nreaders = lck->mti_numreaders; for (unsigned i = 0; i < snap_nreaders; i++) { - if (mr[i].mr_pid) { - txnid_t txnid = mr[i].mr_txnid; + if (lck->mti_readers[i].mr_pid) { + const txnid_t txnid = lck->mti_readers[i].mr_txnid; if (txnid == ~(txnid_t)0) snprintf(buf, sizeof(buf), "%10" PRIuPTR " %" PRIxPTR " -\n", - (size_t)mr[i].mr_pid, (size_t)mr[i].mr_tid); + (size_t)lck->mti_readers[i].mr_pid, + (size_t)lck->mti_readers[i].mr_tid); else snprintf(buf, sizeof(buf), "%10" PRIuPTR " %" PRIxPTR " %" 
PRIaTXN "\n", - (size_t)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); + (size_t)lck->mti_readers[i].mr_pid, + (size_t)lck->mti_readers[i].mr_tid, txnid); if (first) { first = 0; @@ -10211,15 +10220,14 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { return MDBX_PANIC; } - unsigned snap_nreaders = env->me_lck->mti_numreaders; + MDBX_lockinfo *const lck = env->me_lck; + const unsigned snap_nreaders = lck->mti_numreaders; mdbx_pid_t *pids = alloca((snap_nreaders + 1) * sizeof(mdbx_pid_t)); pids[0] = 0; int rc = MDBX_SUCCESS, count = 0; - MDBX_reader *mr = env->me_lck->mti_readers; - for (unsigned i = 0; i < snap_nreaders; i++) { - const mdbx_pid_t pid = mr[i].mr_pid; + const mdbx_pid_t pid = lck->mti_readers[i].mr_pid; if (pid == 0) continue /* skip empty */; if (pid == env->me_pid) @@ -10252,7 +10260,7 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { } /* a other process may have clean and reused slot, recheck */ - if (mr[i].mr_pid != pid) + if (lck->mti_readers[i].mr_pid != pid) continue; err = mdbx_rpid_check(env, pid); @@ -10267,10 +10275,11 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { /* clean it */ for (unsigned j = i; j < snap_nreaders; j++) { - if (mr[j].mr_pid == pid) { + if (lck->mti_readers[j].mr_pid == pid) { mdbx_debug("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN "", - (size_t)pid, mr[j].mr_txnid); - mr[j].mr_pid = 0; + (size_t)pid, lck->mti_readers[j].mr_txnid); + lck->mti_readers[j].mr_pid = 0; + lck->mti_reader_finished_flag = true; count++; } } @@ -10371,6 +10380,7 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) { if (rc) { asleep->mr_txnid = ~(txnid_t)0; + env->me_lck->mti_reader_finished_flag = true; if (rc > 1) { asleep->mr_tid = 0; asleep->mr_pid = 0; From 9cb3abf37994d76be5de3295f01aa49f5724ba48 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 30 Jun 2017 00:21:28 +0300 Subject: [PATCH 262/303] mdbx: update TODO. 
Change-Id: Ia5d9e39276845361b8f3fe0c01ce0521bebf5c0d --- TODO.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/TODO.md b/TODO.md index 1e8ffdd7..aaeec666 100644 --- a/TODO.md +++ b/TODO.md @@ -28,10 +28,11 @@ - [x] Сделать список для txnid_t, кода sizeof(txnid_t) > sizeof(pgno_t) и вернуть размер pgno_t - [x] Избавиться от умножения на размер страницы (заменить на сдвиг). - [x] Устранение всех предупреждений (в том числе под Windows). -- [ ] добавить 'mti_reader_finished_flag' -- [ ] отрефакторить mdbx_freelist_save() -- [ ] Перевод mdbx-tools на С++ и сборка для Windows -- [ ] Заменить заглушки mdbx_version и mdbx_build +- [x] Добавить 'mti_reader_finished_flag'. +- [ ] Отрефакторить mdbx_freelist_save(). +- [ ] Хранить "свободный хвост" не связанный с freeDB в META. +- [ ] Перевод mdbx-tools на С++ и сборка для Windows. +- [ ] Заменить заглушки mdbx_version и mdbx_build. - [ ] Актуализация README.md CI @@ -50,6 +51,7 @@ CI - [ ] Переименовать в API: env->db, db->tbl Тест +- [ ] Тестирование поддержки lockless-режима. - [ ] Додумать имя и размещение тестовой БД по-умолчанию. - [ ] Реализовать cleanup в тесте - [ ] usage для теста From 2ed74ee78f17c141ff70147150aa8a02b84270d3 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 30 Jun 2017 07:25:56 +0300 Subject: [PATCH 263/303] mdbx: fix MSVC size_t/uint64_t warnings (minor). 
Change-Id: I580eea87e9c557c61b1ff0b66feaafce139d2c45 --- src/mdbx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 0756b3e4..229ae86f 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4535,7 +4535,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { if (filesize < used_bytes) { mdbx_error("last-page beyond end-of-file (last %" PRIaPGNO ", have %" PRIaPGNO ")", - meta.mm_geo.next, bytes2pgno(env, filesize)); + meta.mm_geo.next, bytes2pgno(env, (size_t)filesize)); return MDBX_CORRUPTED; } @@ -4622,7 +4622,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { mdbx_info("unacceptable/unexpected datafile size %" PRIu64, filesize); return MDBX_PROBLEM; } - meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now = filesize); + meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now = (size_t)filesize); mdbx_info("update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO " pages", env->me_dbgeo.now, meta.mm_geo.now); From 3e6a6722860241c26c62c5c9dd7a8f5dc129e586 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 2 Jul 2017 09:07:57 +0300 Subject: [PATCH 264/303] mdbx: fix MSVC warnings, add uint16-range-asserts. 
Change-Id: Ie67a728035eeae250efbf962270b5c17c974db23 --- src/bits.h | 40 +++++++++-------- src/mdbx.c | 116 ++++++++++++++++++++++++++++++++----------------- src/osal.c | 6 ++- test/base.h | 26 +++++++++-- test/keygen.cc | 12 +++-- test/log.cc | 4 +- test/test.cc | 8 ++-- test/test.h | 7 +-- test/utils.cc | 2 +- 9 files changed, 142 insertions(+), 79 deletions(-) diff --git a/src/bits.h b/src/bits.h index 4b8e1485..f5fc18a3 100644 --- a/src/bits.h +++ b/src/bits.h @@ -32,15 +32,25 @@ /* Should be defined before any includes */ #ifndef _GNU_SOURCE -#define _GNU_SOURCE 1 +# define _GNU_SOURCE 1 #endif #ifndef _FILE_OFFSET_BITS -#define _FILE_OFFSET_BITS 64 +# define _FILE_OFFSET_BITS 64 #endif -#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS) -#define _CRT_SECURE_NO_WARNINGS -#endif +#ifdef _MSC_VER +# ifndef _CRT_SECURE_NO_WARNINGS +# define _CRT_SECURE_NO_WARNINGS +# endif +#pragma warning(disable : 4464) /* relative include path contains '..' */ +#pragma warning(disable : 4710) /* 'xyz': function not inlined */ +#pragma warning(disable : 4711) /* function 'xyz' selected for automatic inline expansion */ +#pragma warning(disable : 4201) /* nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4706) /* assignment within conditional expression */ +#pragma warning(disable : 4127) /* conditional expression is constant */ +#pragma warning(disable : 4324) /* 'xyz': structure was padded due to alignment specifier */ +#pragma warning(disable : 4310) /* cast truncates constant value */ +#endif /* _MSC_VER (warnings) */ #include "../mdbx.h" #include "./defs.h" @@ -65,17 +75,6 @@ # warning "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues." #endif /* __SANITIZE_THREAD__ */ -#ifdef _MSC_VER -#pragma warning(disable : 4464) /* relative include path contains '..' 
*/ -#pragma warning(disable : 4710) /* 'xyz': function not inlined */ -#pragma warning(disable : 4711) /* function 'xyz' selected for automatic inline expansion */ -#pragma warning(disable : 4201) /* nonstandard extension used : nameless struct / union */ -#pragma warning(disable : 4706) /* assignment within conditional expression */ -#pragma warning(disable : 4127) /* conditional expression is constant */ -#pragma warning(disable : 4324) /* 'xyz': structure was padded due to alignment specifier */ -#pragma warning(disable : 4310) /* cast truncates constant value */ -#endif /* _MSC_VER (warnings) */ - #include "./osal.h" /* *INDENT-ON* */ @@ -361,7 +360,10 @@ typedef struct MDBX_page { #define MAX_MAPSIZE ((sizeof(size_t) < 8) ? MAX_MAPSIZE32 : MAX_MAPSIZE64) -#pragma pack(pop) +#ifdef _MSC_VER +#pragma warning(disable : 4820) /* bytes padding added after data member \ + for aligment */ +#endif /* The header for the reader table (a memory-mapped lock file). */ typedef struct MDBX_lockinfo { @@ -413,6 +415,10 @@ typedef struct MDBX_lockinfo { MDBX_reader __cache_aligned mti_readers[1]; } MDBX_lockinfo; +#ifdef _MSC_VER +#pragma pack(pop) +#endif + #define MDBX_LOCKINFO_WHOLE_SIZE \ ((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \ ~((size_t)MDBX_CACHELINE_SIZE - 1)) diff --git a/src/mdbx.c b/src/mdbx.c index 229ae86f..d7d76607 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -37,6 +37,16 @@ #include "./bits.h" +#ifdef _MSC_VER +#if _MSC_VER < 1910 +/* LY: MSVC has buggy/inconsistent PRIuPTR/PRIxPTR macros and format-arg + checker for size_t typedef. 
*/ +#pragma warning(disable : 4777) /* format string '%10u' requires an argument \ + of type 'unsigned int', but variadic \ + argument 1 has type 'std::size_t' */ +#endif +#endif /* _MSC_VER (warnings) */ + /*----------------------------------------------------------------------------*/ /* rthc (tls keys and destructors) */ @@ -603,10 +613,10 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, static void mdbx_env_close0(MDBX_env *env); static MDBX_node *mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, int *exactp); -static int mdbx_node_add(MDBX_cursor *mc, indx_t indx, MDBX_val *key, +static int mdbx_node_add(MDBX_cursor *mc, unsigned indx, MDBX_val *key, MDBX_val *data, pgno_t pgno, unsigned flags); static void mdbx_node_del(MDBX_cursor *mc, size_t ksize); -static void mdbx_node_shrink(MDBX_page *mp, indx_t indx); +static void mdbx_node_shrink(MDBX_page *mp, unsigned indx); static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft); static int mdbx_node_read(MDBX_cursor *mc, MDBX_node *leaf, MDBX_val *data); static size_t mdbx_leaf_size(MDBX_env *env, MDBX_val *key, MDBX_val *data); @@ -1809,11 +1819,11 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, mdbx_meta_txnid_stable(env, steady), mdbx_durable_str(steady), oldest); - const unsigned flags = F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) - ? env->me_flags - : env->me_flags & MDBX_WRITEMAP; + const unsigned syncflags = F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) + ? 
env->me_flags + : env->me_flags & MDBX_WRITEMAP; MDBX_meta meta = *head; - if (mdbx_sync_locked(env, flags, &meta) == MDBX_SUCCESS) { + if (mdbx_sync_locked(env, syncflags, &meta) == MDBX_SUCCESS) { txnid_t snap = mdbx_find_oldest(txn); if (snap > oldest) continue; @@ -4162,7 +4172,7 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower, if (pagesize < 0) pagesize = env->me_psize; - if (pagesize != env->me_psize) { + if (pagesize != (ssize_t)env->me_psize) { rc = MDBX_EINVAL; goto bailout; } @@ -4314,7 +4324,7 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower, if (env->me_map) { /* apply new params */ - mdbx_assert(env, pagesize == env->me_psize); + mdbx_assert(env, pagesize == (ssize_t)env->me_psize); MDBX_meta *head = mdbx_meta_head(env); MDBX_meta meta = *head; @@ -5282,7 +5292,8 @@ static MDBX_node *__hot mdbx_node_search(MDBX_cursor *mc, MDBX_val *key, if (exactp) *exactp = (rc == 0 && nkeys > 0); /* store the key index */ - mc->mc_ki[mc->mc_top] = i; + mdbx_cassert(mc, i <= UINT16_MAX); + mc->mc_ki[mc->mc_top] = (indx_t)i; if (i >= nkeys) /* There is no entry larger or equal to the key. 
*/ return NULL; @@ -5329,6 +5340,7 @@ static int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { return MDBX_CURSOR_FULL; } + mdbx_cassert(mc, mc->mc_snum < UINT16_MAX); mc->mc_top = mc->mc_snum++; mc->mc_pg[mc->mc_top] = mp; mc->mc_ki[mc->mc_top] = 0; @@ -6013,7 +6025,8 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc = mc->mc_dbx->md_cmp(key, &nodekey); if (rc == 0) { /* last node was the one we wanted */ - mc->mc_ki[mc->mc_top] = nkeys - 1; + mdbx_cassert(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1); + mc->mc_ki[mc->mc_top] = (indx_t)(nkeys - 1); if (exactp) *exactp = 1; goto set1; @@ -6048,7 +6061,8 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, break; if (i == mc->mc_top) { /* There are no other pages */ - mc->mc_ki[mc->mc_top] = nkeys; + mdbx_cassert(mc, nkeys <= UINT16_MAX); + mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; return MDBX_NOTFOUND; } } @@ -6258,7 +6272,8 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_page *mp = mc->mc_pg[mc->mc_top]; unsigned nkeys = NUMKEYS(mp); if (mc->mc_ki[mc->mc_top] >= nkeys) { - mc->mc_ki[mc->mc_top] = nkeys; + mdbx_cassert(mc, nkeys <= UINT16_MAX); + mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; return MDBX_NOTFOUND; } assert(nkeys > 0); @@ -6645,7 +6660,8 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc2 = mdbx_update_key(mc, key); else rc2 = MDBX_SUCCESS; - mc->mc_top += dtop; + mdbx_cassert(mc, mc->mc_top + dtop < UINT16_MAX); + mc->mc_top += (uint16_t)dtop; if (rc2) return rc2; } @@ -6764,15 +6780,18 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mp->mp_flags = fp_flags | P_DIRTY; mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; mp->mp_lower = fp->mp_lower; - mp->mp_upper = fp->mp_upper + offset; + mdbx_cassert(mc, fp->mp_upper + offset <= UINT16_MAX); + mp->mp_upper = (indx_t)(fp->mp_upper + offset); if (fp_flags & P_LEAF2) { memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize); 
} else { memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ, (char *)fp + fp->mp_upper + PAGEHDRSZ, olddata.iov_len - fp->mp_upper - PAGEHDRSZ); - for (i = 0; i < NUMKEYS(fp); i++) - mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; + for (i = 0; i < NUMKEYS(fp); i++) { + mdbx_cassert(mc, fp->mp_ptrs[i] + offset <= UINT16_MAX); + mp->mp_ptrs[i] = (indx_t)(fp->mp_ptrs[i] + offset); + } } } @@ -7147,9 +7166,9 @@ static int mdbx_page_new(MDBX_cursor *mc, unsigned flags, unsigned num, return rc; mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize); - np->mp_flags = flags | P_DIRTY; + np->mp_flags = (uint16_t)(flags | P_DIRTY); np->mp_lower = 0; - np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ; + np->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); if (IS_BRANCH(np)) mc->mc_db->md_branch_pages++; @@ -7233,7 +7252,7 @@ static __inline size_t mdbx_branch_size(MDBX_env *env, MDBX_val *key) { * MDBX_PAGE_FULL - there is insufficient room in the page. This error * should never happen since all callers already calculate * the page's free space before calling this function. */ -static int mdbx_node_add(MDBX_cursor *mc, indx_t indx, MDBX_val *key, +static int mdbx_node_add(MDBX_cursor *mc, unsigned indx, MDBX_val *key, MDBX_val *data, pgno_t pgno, unsigned flags) { unsigned i; size_t node_size = NODESIZE; @@ -7319,7 +7338,7 @@ update: /* Write the node data. */ node = NODEPTR(mp, indx); node->mn_ksize = (key == NULL) ? 0 : (uint16_t)key->iov_len; - node->mn_flags = flags; + node->mn_flags = (uint16_t)flags; if (IS_LEAF(mp)) SETDSZ(node, data->iov_len); else @@ -7423,7 +7442,7 @@ static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { /* Compact the main page after deleting a node on a subpage. * [in] mp The main page to operate on. * [in] indx The index of the subpage on the main page. 
*/ -static void mdbx_node_shrink(MDBX_page *mp, indx_t indx) { +static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { MDBX_node *node; MDBX_page *sp, *xp; char *base; @@ -7801,14 +7820,17 @@ static int mdbx_update_key(MDBX_cursor *mc, MDBX_val *key) { numkeys = NUMKEYS(mp); for (i = 0; i < numkeys; i++) { - if (mp->mp_ptrs[i] <= ptr) - mp->mp_ptrs[i] -= delta; + if (mp->mp_ptrs[i] <= ptr) { + mdbx_cassert(mc, mp->mp_ptrs[i] >= delta); + mp->mp_ptrs[i] -= (indx_t)delta; + } } base = (char *)mp + mp->mp_upper + PAGEHDRSZ; len = ptr - mp->mp_upper + NODESIZE; memmove(base - delta, base, len); - mp->mp_upper -= delta; + mdbx_cassert(mc, mp->mp_upper >= delta); + mp->mp_upper -= (indx_t)delta; node = NODEPTR(mp, indx); } @@ -7887,8 +7909,9 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { key.iov_len = NODEKSZ(s2); key.iov_base = NODEKEY(s2); } - csrc->mc_snum = snum--; - csrc->mc_top = snum; + mdbx_cassert(csrc, snum >= 1 && snum <= UINT16_MAX); + csrc->mc_snum = (uint16_t)snum--; + csrc->mc_top = (uint16_t)snum; } else { key.iov_len = NODEKSZ(srcnode); key.iov_base = NODEKEY(srcnode); @@ -7914,8 +7937,9 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, int fromleft) { bkey.iov_len = NODEKSZ(s2); bkey.iov_base = NODEKEY(s2); } - mn.mc_snum = snum--; - mn.mc_top = snum; + mdbx_cassert(csrc, snum >= 1 && snum <= UINT16_MAX); + mn.mc_snum = (uint16_t)snum--; + mn.mc_top = (uint16_t)snum; mn.mc_ki[snum] = 0; rc = mdbx_update_key(&mn, &bkey); if (unlikely(rc)) @@ -8075,7 +8099,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { MDBX_val key, data; unsigned nkeys; int rc; - indx_t i, j; + unsigned i, j; psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; @@ -8182,7 +8206,8 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { continue; if (m3->mc_pg[top] == psrc) { m3->mc_pg[top] = pdst; - m3->mc_ki[top] += nkeys; + mdbx_cassert(m3, nkeys + m3->mc_ki[top] <= 
UINT16_MAX); + m3->mc_ki[top] += (indx_t)nkeys; m3->mc_ki[top - 1] = cdst->mc_ki[top - 1]; } else if (m3->mc_pg[top - 1] == csrc->mc_pg[top - 1] && m3->mc_ki[top - 1] > csrc->mc_ki[top - 1]) { @@ -8200,8 +8225,9 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { /* Did the tree height change? */ if (depth != cdst->mc_db->md_depth) snum += cdst->mc_db->md_depth - depth; - cdst->mc_snum = snum; - cdst->mc_top = snum - 1; + mdbx_cassert(cdst, snum >= 1 && snum <= UINT16_MAX); + cdst->mc_snum = (uint16_t)snum; + cdst->mc_top = (uint16_t)(snum - 1); } return rc; } @@ -8660,10 +8686,14 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, split = LEAF2KEY(mp, split_indx, ksize); rsize = (nkeys - split_indx) * ksize; lsize = (nkeys - split_indx) * sizeof(indx_t); - mp->mp_lower -= lsize; - rp->mp_lower += lsize; - mp->mp_upper += rsize - lsize; - rp->mp_upper -= rsize - lsize; + mdbx_cassert(mc, mp->mp_lower >= lsize); + mp->mp_lower -= (indx_t)lsize; + mdbx_cassert(mc, rp->mp_lower + lsize <= UINT16_MAX); + rp->mp_lower += (indx_t)lsize; + mdbx_cassert(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX); + mp->mp_upper += (indx_t)(rsize - lsize); + mdbx_cassert(mc, rp->mp_upper >= rsize - lsize); + rp->mp_upper -= (indx_t)(rsize - lsize); sepkey.iov_len = ksize; if (newindx == split_indx) { sepkey.iov_base = newkey->iov_base; @@ -8691,7 +8721,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, rp->mp_lower += sizeof(indx_t); mdbx_cassert(mc, rp->mp_upper >= ksize - sizeof(indx_t)); rp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); - mc->mc_ki[mc->mc_top] = x; + mdbx_cassert(mc, x <= UINT16_MAX); + mc->mc_ki[mc->mc_top] = (indx_t)x; } } else { size_t psize, nsize, k; @@ -8712,7 +8743,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, copy->mp_pgno = mp->mp_pgno; copy->mp_flags = mp->mp_flags; copy->mp_lower = 0; - copy->mp_upper = env->me_psize - PAGEHDRSZ; + 
mdbx_cassert(mc, env->me_psize - PAGEHDRSZ <= UINT16_MAX); + copy->mp_upper = (indx_t)(env->me_psize - PAGEHDRSZ); /* prepare to insert */ for (unsigned j = i = 0; i < nkeys; i++) { @@ -8966,7 +8998,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, m3->mc_ki[mc->mc_top]++; if (m3->mc_ki[mc->mc_top] >= nkeys) { m3->mc_pg[mc->mc_top] = rp; - m3->mc_ki[mc->mc_top] -= nkeys; + mdbx_cassert(mc, m3->mc_ki[mc->mc_top] >= nkeys); + m3->mc_ki[mc->mc_top] -= (indx_t)nkeys; for (i = 0; i < mc->mc_top; i++) { m3->mc_ki[i] = mn.mc_ki[i]; m3->mc_pg[i] = mn.mc_pg[i]; @@ -9810,7 +9843,7 @@ int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, unsigned user_flags, txn->mt_dbxs[slot].md_name.iov_len = len; txn->mt_dbxs[slot].md_cmp = nullptr; txn->mt_dbxs[slot].md_dcmp = nullptr; - txn->mt_dbflags[slot] = dbflag; + txn->mt_dbflags[slot] = (uint8_t)dbflag; txn->mt_dbiseqs[slot] = (env->me_dbiseqs[slot] += 1); txn->mt_dbs[slot] = *(MDBX_db *)data.iov_base; @@ -9985,7 +10018,8 @@ static int mdbx_drop0(MDBX_cursor *mc, int subs) { } if (!mc->mc_top) break; - mc->mc_ki[mc->mc_top] = i; + mdbx_cassert(mc, i <= UINT16_MAX); + mc->mc_ki[mc->mc_top] = (indx_t)i; rc = mdbx_cursor_sibling(mc, 1); if (rc) { if (unlikely(rc != MDBX_NOTFOUND)) diff --git a/src/osal.c b/src/osal.c index 9dcba59b..08ae9fef 100644 --- a/src/osal.c +++ b/src/osal.c @@ -37,7 +37,9 @@ static int waitstatus2errcode(DWORD result) { /* Map a result from an NTAPI call to WIN32 error code. */ static int ntstatus2errcode(NTSTATUS status) { DWORD dummy; - OVERLAPPED ov = {status}; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + ov.Internal = status; return GetOverlappedResult(NULL, &ov, &dummy, FALSE) ? 
MDBX_SUCCESS : GetLastError(); } @@ -759,7 +761,7 @@ int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit) { } map->address = NULL; - size_t ViewSize = limit; + SIZE_T ViewSize = limit; rc = NtMapViewOfSection( map->section, GetCurrentProcess(), &map->address, /* ZeroBits */ 0, diff --git a/test/base.h b/test/base.h index fe09aa89..e87f0240 100644 --- a/test/base.h +++ b/test/base.h @@ -19,11 +19,22 @@ #endif #if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) +#ifdef _MSC_VER +#pragma warning(push, 1) +#pragma warning(disable : 4548) /* expression before comma has no effect; \ + expected expression with side - effect */ +#pragma warning(disable : 4530) /* C++ exception handler used, but unwind \ + semantics are not enabled. Specify /EHsc */ +#pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \ + mode specified; termination on exception \ + is not guaranteed. Specify /EHsc */ +#endif /* _MSC_VER (warnings) */ + /* If you wish to build your application for a previous Windows platform, - * include WinSDKVer.h and set the _WIN32_WINNT macro to the platform you - * wish to support before including SDKDDKVer.h. - * - * TODO: #define _WIN32_WINNT WIN32_MUSTDIE */ +* include WinSDKVer.h and set the _WIN32_WINNT macro to the platform you +* wish to support before including SDKDDKVer.h. 
+* +* TODO: #define _WIN32_WINNT WIN32_MUSTDIE */ #include #endif /* WINDOWS */ @@ -74,3 +85,10 @@ #include "../mdbx.h" #include "../src/defs.h" + +#ifdef _MSC_VER +#pragma warning(pop) +#pragma warning(disable : 4201) /* nonstandard extension used : \ + nameless struct / union */ +#pragma warning(disable : 4127) /* conditional expression is constant */ +#endif diff --git a/test/keygen.cc b/test/keygen.cc index 6cfaed8c..806d4ba8 100644 --- a/test/keygen.cc +++ b/test/keygen.cc @@ -122,13 +122,17 @@ void maker::setup(const config::actor_params_pod &actor, unsigned thread_number) { key_essentials.flags = actor.table_flags & (MDBX_INTEGERKEY | MDBX_REVERSEKEY); - key_essentials.minlen = actor.keylen_min; - key_essentials.maxlen = actor.keylen_max; + assert(actor.keylen_min < UINT8_MAX); + key_essentials.minlen = (uint8_t)actor.keylen_min; + assert(actor.keylen_max < UINT16_MAX); + key_essentials.maxlen = (uint16_t)actor.keylen_max; value_essentials.flags = actor.table_flags & (MDBX_INTEGERDUP | MDBX_REVERSEDUP); - value_essentials.minlen = actor.datalen_min; - value_essentials.maxlen = actor.datalen_max; + assert(actor.datalen_min < UINT8_MAX); + value_essentials.minlen = (uint8_t)actor.datalen_min; + assert(actor.datalen_max < UINT16_MAX); + value_essentials.maxlen = (uint16_t)actor.datalen_max; assert(thread_number < 2); (void)thread_number; diff --git a/test/log.cc b/test/log.cc index ebb859b6..eee0fffe 100644 --- a/test/log.cc +++ b/test/log.cc @@ -51,8 +51,8 @@ void setup(loglevel _level, const std::string &_prefix) { void setup(const std::string &_prefix) { prefix = _prefix; } -const char *level2str(const loglevel level) { - switch (level) { +const char *level2str(const loglevel alevel) { + switch (alevel) { default: return "invalid/unknown"; case extra: diff --git a/test/test.cc b/test/test.cc index 0874c4bb..50426014 100644 --- a/test/test.cc +++ b/test/test.cc @@ -241,20 +241,18 @@ bool testcase::wait4start() { } void testcase::kick_progress(bool active) 
const { - static chrono::time last; chrono::time now = chrono::now_motonic(); - if (active) { static int last_point = -1; int point = (now.fixedpoint >> 29) & 3; if (point != last_point) { - last = now; + last.progress_timestamp = now; fprintf(stderr, "%c\b", "-\\|/"[last_point = point]); fflush(stderr); } - } else if (now.fixedpoint - last.fixedpoint > + } else if (now.fixedpoint - last.progress_timestamp.fixedpoint > chrono::from_seconds(2).fixedpoint) { - last = now; + last.progress_timestamp = now; fprintf(stderr, "%c\b", "@*"[now.utc & 1]); fflush(stderr); } diff --git a/test/test.h b/test/test.h index 939fc8b3..98b65801 100644 --- a/test/test.h +++ b/test/test.h @@ -95,6 +95,7 @@ protected: struct { mdbx_canary canary; + mutable chrono::time progress_timestamp; } last; static int oom_callback(MDBX_env *env, int pid, mdbx_tid_t tid, uint64_t txn, @@ -119,9 +120,9 @@ protected: void signal(); bool should_continue(bool check_timeout_only = false) const; - void generate_pair(const keygen::serial_t serial, keygen::buffer &key, - keygen::buffer &value, keygen::serial_t data_age = 0) { - keyvalue_maker.pair(serial, key, value, data_age); + void generate_pair(const keygen::serial_t serial, keygen::buffer &out_key, + keygen::buffer &out_value, keygen::serial_t data_age = 0) { + keyvalue_maker.pair(serial, out_key, out_value, data_age); } void generate_pair(const keygen::serial_t serial, diff --git a/test/utils.cc b/test/utils.cc index fd2162f1..56085178 100644 --- a/test/utils.cc +++ b/test/utils.cc @@ -86,7 +86,7 @@ bool hex2data(const char *hex_begin, const char *hex_end, void *ptr, uint32_t c = l + (h << 4); checksum.push(c); - *data = c; + *data = (uint8_t)c; } return true; } From 5dd68f70fa1e7bf8b9dac11d29ad57c0fd541552 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 2 Jul 2017 16:48:09 +0300 Subject: [PATCH 265/303] mdbx: use /W4 (level4 for warnings) and /WX (treat warnings as errors). 
Change-Id: I04872c15c295c67eccb0d6379447aab7ac7ca163 --- dll.vcxproj | 7 ++++++- test/test.vcxproj | 8 ++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/dll.vcxproj b/dll.vcxproj index 2866a857..4f9b4f50 100644 --- a/dll.vcxproj +++ b/dll.vcxproj @@ -33,6 +33,7 @@ DynamicLibrary false v140 + true DynamicLibrary @@ -43,6 +44,7 @@ DynamicLibrary false v140 + true @@ -83,6 +85,7 @@ ProgramDatabase Disabled true + true MachineX86 @@ -94,7 +97,7 @@ WIN32;NDEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions) MultiThreadedDLL - Level3 + Level4 ProgramDatabase true Full @@ -124,6 +127,7 @@ WIN64;_DEBUG;_WINDOWS;_USRDLL;LIBMDBX_EXPORTS;%(PreprocessorDefinitions);MDBX_DEBUG=1 MultiThreadedDebugDLL true + true @@ -137,6 +141,7 @@ Size true true + Level4 UseLinkTimeCodeGeneration diff --git a/test/test.vcxproj b/test/test.vcxproj index 400090b5..20535ff4 100644 --- a/test/test.vcxproj +++ b/test/test.vcxproj @@ -98,6 +98,7 @@ WIN32;_DEBUG;_CONSOLE;MDBX_DEBUG=1;%(PreprocessorDefinitions) true test.h + true Console @@ -113,6 +114,7 @@ _DEBUG;_CONSOLE;MDBX_DEBUG=1;%(PreprocessorDefinitions) true test.h + true Console @@ -122,7 +124,7 @@ - Level3 + Level4 Use MaxSpeed true @@ -130,6 +132,7 @@ WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true test.h + true Console @@ -141,7 +144,7 @@ - Level3 + Level4 Use MaxSpeed true @@ -149,6 +152,7 @@ NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true test.h + true Console From 229514c93ea22e82db1d158633466ff18b89ea25 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sun, 2 Jul 2017 16:54:18 +0300 Subject: [PATCH 266/303] mdbx: update TODO. Change-Id: Ib812146ccb78496dc28610cbd2a6597b09ee238f --- TODO.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/TODO.md b/TODO.md index aaeec666..1508c944 100644 --- a/TODO.md +++ b/TODO.md @@ -29,11 +29,13 @@ - [x] Избавиться от умножения на размер страницы (заменить на сдвиг). - [x] Устранение всех предупреждений (в том числе под Windows). 
- [x] Добавить 'mti_reader_finished_flag'. +- [x] Погасить все level4-warnings от MSVC, включить /WX. - [ ] Отрефакторить mdbx_freelist_save(). - [ ] Хранить "свободный хвост" не связанный с freeDB в META. - [ ] Перевод mdbx-tools на С++ и сборка для Windows. - [ ] Заменить заглушки mdbx_version и mdbx_build. - [ ] Актуализация README.md +- [ ] Переход на C++11, добавление #pramga detect_mismatch(). CI - [ ] Прикрутить проверку coverity From 283eb0aff055887eda218f3c4c45ef6b2ad68b36 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 3 Jul 2017 05:11:52 +0300 Subject: [PATCH 267/303] test: add 'volatile' to rdtsc() and friends (fix for modern clang). Change-Id: I82d531f66b1aa44c173485d1955611ed46767998 --- test/utils.cc | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/test/utils.cc b/test/utils.cc index 56085178..9a6338cc 100644 --- a/test/utils.cc +++ b/test/utils.cc @@ -110,35 +110,35 @@ uint64_t entropy_ticks(void) { #if defined(__GNUC__) || defined(__clang__) #if defined(__ia64__) uint64_t ticks; - __asm("mov %0=ar.itc" : "=r"(ticks)); + __asm __volatile("mov %0=ar.itc" : "=r"(ticks)); return ticks; #elif defined(__hppa__) uint64_t ticks; - __asm("mfctl 16, %0" : "=r"(ticks)); + __asm __volatile("mfctl 16, %0" : "=r"(ticks)); return ticks; #elif defined(__s390__) uint64_t ticks; - __asm("stck 0(%0)" : : "a"(&(ticks)) : "memory", "cc"); + __asm __volatile("stck 0(%0)" : : "a"(&(ticks)) : "memory", "cc"); return ticks; #elif defined(__alpha__) uint64_t ticks; - __asm("rpcc %0" : "=r"(ticks)); + __asm __volatile("rpcc %0" : "=r"(ticks)); return ticks; #elif defined(__sparc_v9__) uint64_t ticks; - __asm("rd %%tick, %0" : "=r"(ticks)); + __asm __volatile("rd %%tick, %0" : "=r"(ticks)); return ticks; #elif defined(__powerpc64__) || defined(__ppc64__) uint64_t ticks; - __asm("mfspr %0, 268" : "=r"(ticks)); + __asm __volatile("mfspr %0, 268" : "=r"(ticks)); return ticks; #elif defined(__ppc__) || defined(__powerpc__) unsigned 
tbl, tbu; /* LY: Here not a problem if a high-part (tbu) * would been updated during reading. */ - __asm("mftb %0" : "=r"(tbl)); - __asm("mftbu %0" : "=r"(tbu)); + __asm __volatile("mftb %0" : "=r"(tbl)); + __asm __volatile("mftbu %0" : "=r"(tbu)); return (((uin64_t)tbu0) << 32) | tbl; #elif defined(__mips__) @@ -153,12 +153,16 @@ uint64_t entropy_ticks(void) { return *mips_tsc_addr; } #elif defined(__x86_64__) || defined(__i386__) +#if __GNUC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_rdtsc) + return __builtin_ia32_rdtsc(); +#else unsigned lo, hi; /* LY: Using the "a" and "d" constraints is important for correct code. */ - __asm("rdtsc" : "=a"(lo), "=d"(hi)); + __asm __volatile("rdtsc" : "=a"(lo), "=d"(hi)); return (((uint64_t)hi) << 32) + lo; +#endif #endif /* arch selector */ #elif defined(_M_IX86) || defined(_M_X64) From 1d702aa94f89531702e9d07b165715fc0e0d50af Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 3 Jul 2017 06:30:43 +0300 Subject: [PATCH 268/303] mdbx: more warning control for MSVC. --- src/bits.h | 4 +++- src/mdbx.c | 6 ++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bits.h b/src/bits.h index f5fc18a3..446b1e19 100644 --- a/src/bits.h +++ b/src/bits.h @@ -42,7 +42,9 @@ # ifndef _CRT_SECURE_NO_WARNINGS # define _CRT_SECURE_NO_WARNINGS # endif -#pragma warning(disable : 4464) /* relative include path contains '..' */ +#if _MSC_VER > 1800 +# pragma warning(disable : 4464) /* relative include path contains '..' 
*/ +#endif #pragma warning(disable : 4710) /* 'xyz': function not inlined */ #pragma warning(disable : 4711) /* function 'xyz' selected for automatic inline expansion */ #pragma warning(disable : 4201) /* nonstandard extension used : nameless struct / union */ diff --git a/src/mdbx.c b/src/mdbx.c index d7d76607..1ee9790c 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -37,14 +37,12 @@ #include "./bits.h" -#ifdef _MSC_VER -#if _MSC_VER < 1910 -/* LY: MSVC has buggy/inconsistent PRIuPTR/PRIxPTR macros and format-arg +#if defined(_MSC_VER) && _MSC_VER == 1900 +/* LY: MSVC 2015 has buggy/inconsistent PRIuPTR/PRIxPTR macros and format-arg checker for size_t typedef. */ #pragma warning(disable : 4777) /* format string '%10u' requires an argument \ of type 'unsigned int', but variadic \ argument 1 has type 'std::size_t' */ -#endif #endif /* _MSC_VER (warnings) */ /*----------------------------------------------------------------------------*/ From 78ae12aa101bb2abe9e41a39dbe00c894cfc9feb Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 3 Jul 2017 09:56:46 +0300 Subject: [PATCH 269/303] mdbx: fix MSVC 'padding' warnings, minor refine fields ordering. --- src/bits.h | 20 ++++++++------------ src/mdbx.c | 14 +++++++------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/bits.h b/src/bits.h index 446b1e19..e72bdfa2 100644 --- a/src/bits.h +++ b/src/bits.h @@ -52,6 +52,7 @@ #pragma warning(disable : 4127) /* conditional expression is constant */ #pragma warning(disable : 4324) /* 'xyz': structure was padded due to alignment specifier */ #pragma warning(disable : 4310) /* cast truncates constant value */ +#pragma warning(disable : 4820) /* bytes padding added after data member for aligment */ #endif /* _MSC_VER (warnings) */ #include "../mdbx.h" @@ -362,11 +363,6 @@ typedef struct MDBX_page { #define MAX_MAPSIZE ((sizeof(size_t) < 8) ? 
MAX_MAPSIZE32 : MAX_MAPSIZE64) -#ifdef _MSC_VER -#pragma warning(disable : 4820) /* bytes padding added after data member \ - for aligment */ -#endif - /* The header for the reader table (a memory-mapped lock file). */ typedef struct MDBX_lockinfo { /* Stamp identifying this as an MDBX file. @@ -419,7 +415,7 @@ typedef struct MDBX_lockinfo { #ifdef _MSC_VER #pragma pack(pop) -#endif +#endif /* MSVC: Enable aligment */ #define MDBX_LOCKINFO_WHOLE_SIZE \ ((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \ @@ -492,7 +488,7 @@ typedef struct MDBX_dbx { * Every operation requires a transaction handle. */ struct MDBX_txn { #define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) - uint32_t mt_signature; + size_t mt_signature; MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ MDBX_txn *mt_child; @@ -568,8 +564,8 @@ struct MDBX_txn { * dirty/spilled pages. Thus commit(nested txn) has room to merge * dirtylist into mt_parent after freeing hidden mt_parent pages. */ unsigned mt_dirtyroom; - mdbx_canary mt_canary; mdbx_tid_t mt_owner; /* thread ID that owns this transaction */ + mdbx_canary mt_canary; }; /* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. 
@@ -591,6 +587,8 @@ struct MDBX_cursor { #define MDBX_MC_READY4CLOSE UINT32_C(0x2817A047) #define MDBX_MC_WAIT4EOT UINT32_C(0x90E297A7) uint32_t mc_signature; + /* The database handle this cursor operates on */ + MDBX_dbi mc_dbi; /* Next cursor on this DB in this txn */ MDBX_cursor *mc_next; /* Backup of the original cursor if this cursor is a shadow */ @@ -599,8 +597,6 @@ struct MDBX_cursor { struct MDBX_xcursor *mc_xcursor; /* The transaction that owns this cursor */ MDBX_txn *mc_txn; - /* The database handle this cursor operates on */ - MDBX_dbi mc_dbi; /* The database record for this cursor */ MDBX_db *mc_db; /* The database auxiliary record for this cursor */ @@ -661,7 +657,7 @@ typedef struct MDBX_pgstate { /* The database environment. */ struct MDBX_env { #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) - uint32_t me_signature; + size_t me_signature; mdbx_filehandle_t me_fd; /* The main data file */ mdbx_filehandle_t me_lfd; /* The lock file */ #ifdef MDBX_OSAL_SECTION @@ -685,6 +681,7 @@ struct MDBX_env { MDBX_dbi me_numdbs; /* number of DBs opened */ MDBX_dbi me_maxdbs; /* size of the DB table */ mdbx_pid_t me_pid; /* process ID of this env */ + mdbx_thread_key_t me_txkey; /* thread-key for readers */ char *me_path; /* path to the DB files */ char *me_map; /* the memory map of the data file */ MDBX_lockinfo *me_lck; /* the memory map of the lock file, never NULL */ @@ -695,7 +692,6 @@ struct MDBX_env { MDBX_dbx *me_dbxs; /* array of static DB info */ uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */ MDBX_pgstate me_pgstate; /* state of old pages from freeDB */ #define me_last_reclaimed me_pgstate.mf_last_reclaimed diff --git a/src/mdbx.c b/src/mdbx.c index 1ee9790c..ae1a71ff 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -43,7 +43,7 @@ #pragma 
warning(disable : 4777) /* format string '%10u' requires an argument \ of type 'unsigned int', but variadic \ argument 1 has type 'std::size_t' */ -#endif /* _MSC_VER (warnings) */ +#endif /* _MSC_VER (warnings) */ /*----------------------------------------------------------------------------*/ /* rthc (tls keys and destructors) */ @@ -9088,13 +9088,13 @@ typedef struct mdbx_copy { char *mc_over[2]; size_t mc_wlen[2]; size_t mc_olen[2]; - pgno_t mc_next_pgno; mdbx_filehandle_t mc_fd; - int mc_toggle; /* Buffer number in provider */ - int mc_new; /* (0-2 buffers to write) | (MDBX_EOF at end) */ + volatile int mc_error; + pgno_t mc_next_pgno; + short mc_toggle; /* Buffer number in provider */ + short mc_new; /* (0-2 buffers to write) | (MDBX_EOF at end) */ /* Error code. Never cleared if set. Both threads can set nonzero * to fail the copy. Not mutex-protected, MDBX expects atomic int. */ - volatile int mc_error; } mdbx_copy; /* Dedicated writer thread for compacting copy. */ @@ -9168,7 +9168,7 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { * [in] adjust (1 to hand off 1 buffer) | (MDBX_EOF when ending). 
*/ static int __cold mdbx_env_cthr_toggle(mdbx_copy *my, int adjust) { mdbx_condmutex_lock(&my->mc_condmutex); - my->mc_new += adjust; + my->mc_new += (short)adjust; mdbx_condmutex_signal(&my->mc_condmutex); while (my->mc_new & 2) /* both buffers in use */ mdbx_condmutex_wait(&my->mc_condmutex); @@ -9279,7 +9279,7 @@ static int __cold mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { } memcpy(&db, NODEDATA(ni), sizeof(db)); - my->mc_toggle = toggle; + my->mc_toggle = (short)toggle; rc = mdbx_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA); if (rc) goto done; From 3c5ae4cc9cedc84b8f2d88ae49909ba5e7d97181 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 3 Jul 2017 12:50:48 +0300 Subject: [PATCH 270/303] mdbx: disable C4548 as workaround for buggy WinSDK 10.0.10240.0 --- mdbx.h | 2 ++ src/bits.h | 1 + src/osal.h | 2 ++ 3 files changed, 5 insertions(+) diff --git a/mdbx.h b/mdbx.h index 65ff15e4..d01bd4f3 100644 --- a/mdbx.h +++ b/mdbx.h @@ -53,6 +53,8 @@ #ifdef _MSC_VER #pragma warning(push, 1) +#pragma warning(disable : 4548) /* expression before comma has no effect; \ + expected expression with side - effect */ #pragma warning(disable : 4530) /* C++ exception handler used, but unwind \ * semantics are not enabled. 
Specify /EHsc */ #pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \ diff --git a/src/bits.h b/src/bits.h index e72bdfa2..dcdafb0a 100644 --- a/src/bits.h +++ b/src/bits.h @@ -53,6 +53,7 @@ #pragma warning(disable : 4324) /* 'xyz': structure was padded due to alignment specifier */ #pragma warning(disable : 4310) /* cast truncates constant value */ #pragma warning(disable : 4820) /* bytes padding added after data member for aligment */ +#pragma warning(disable : 4548) /* expression before comma has no effect; expected expression with side - effect */ #endif /* _MSC_VER (warnings) */ #include "../mdbx.h" diff --git a/src/osal.h b/src/osal.h index 3fd869f0..731c86b5 100644 --- a/src/osal.h +++ b/src/osal.h @@ -21,6 +21,8 @@ #ifdef _MSC_VER #pragma warning(push, 1) +#pragma warning(disable : 4548) /* expression before comma has no effect; \ + expected expression with side - effect */ #pragma warning(disable : 4530) /* C++ exception handler used, but unwind \ * semantics are not enabled. Specify /EHsc */ #pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \ From b34e92d308215945d195f3d77423fdd7aae4c06d Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Jul 2017 09:24:10 +0300 Subject: [PATCH 271/303] mdbx: cleanup/reformat after the merge. Change-Id: I3e0fe8f1292a6387e8d3ff8b904170f05d8e1770 --- mdbx.h | 463 ++++++++++++++++++++++++++--------------------------- src/mdbx.c | 194 +++++++++++----------- 2 files changed, 319 insertions(+), 338 deletions(-) diff --git a/mdbx.h b/mdbx.h index 867ec5e5..96e6e63e 100644 --- a/mdbx.h +++ b/mdbx.h @@ -493,12 +493,12 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); * If this function fails, mdbx_env_close() must be called to discard * the MDBX_env handle. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] path The directory in which the database files reside. - * This directory must already exist and be writable. 
- * [in] flags Special options for this environment. This parameter - * must be set to 0 or by bitwise OR'ing together one - * or more of the values described here. + * [in] env An environment handle returned by mdbx_env_create() + * [in] path The directory in which the database files reside. + * This directory must already exist and be writable. + * [in] flags Special options for this environment. This parameter + * must be set to 0 or by bitwise OR'ing together one + * or more of the values described here. * * Flags set by mdbx_env_set_flags() are also used: * - MDBX_NOSUBDIR @@ -617,13 +617,13 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_VERSION_MISMATCH - the version of the MDBX library doesn't match the - * version that created the database environment. + * version that created the database environment. * - MDBX_INVALID - the environment file headers are corrupted. - * - MDBX_ENOENT - the directory specified by the path parameter - * doesn't exist. - * - MDBX_EACCES - the user didn't have permission to access - * the environment files. - * - MDBX_EAGAIN - the environment was locked by another process. */ + * - MDBX_ENOENT - the directory specified by the path parameter + * doesn't exist. + * - MDBX_EACCES - the user didn't have permission to access + * the environment files. + * - MDBX_EAGAIN - the environment was locked by another process. */ LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, mode_t mode); LIBMDBX_API int mdbx_env_open_ex(MDBX_env *env, const char *path, @@ -637,13 +637,13 @@ LIBMDBX_API int mdbx_env_open_ex(MDBX_env *env, const char *path, * parallel with write transactions, because it employs a read-only * transaction. See long-lived transactions under "Caveats" section. * - * [in] env An environment handle returned by mdbx_env_create(). It must - * have already been opened successfully. 
- * [in] path The directory in which the copy will reside. This directory must - * already exist and be writable but must otherwise be empty. - * [in] flags Special options for this operation. This parameter must be set - * to 0 or by bitwise OR'ing together one or more of the values - * described here: + * [in] env An environment handle returned by mdbx_env_create(). It must + * have already been opened successfully. + * [in] path The directory in which the copy will reside. This directory + * must already exist and be writable but must otherwise be empty. + * [in] flags Special options for this operation. This parameter must be set + * to 0 or by bitwise OR'ing together one or more of the values + * described here: * * - MDBX_CP_COMPACT * Perform compaction while copying: omit free pages and sequentially @@ -720,13 +720,13 @@ LIBMDBX_API int mdbx_env_sync(MDBX_env *env, int force); * The environment handle will be freed and must not be used again after this * call. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] dont_sync A dont'sync flag, if non-zero the last checkpoint (meta-page - * update) will be kept "as is" and may be still "weak" in the - * NOSYNC/MAPASYNC modes. Such "weak" checkpoint will be ignored - * on opening next time, and transactions since the last non-weak - * checkpoint (meta-page update) will rolledback for consistency - * guarantee. */ + * [in] env An environment handle returned by mdbx_env_create() + * [in] dont_sync A dont'sync flag, if non-zero the last checkpoint (meta-page + * update) will be kept "as is" and may be still "weak" in the + * NOSYNC/MAPASYNC modes. Such "weak" checkpoint will be + * ignored on opening next time, and transactions since the + * last non-weak checkpoint (meta-page update) will rolledback + * for consistency guarantee. */ LIBMDBX_API void mdbx_env_close(MDBX_env *env); /* Set environment flags. 
@@ -870,7 +870,7 @@ LIBMDBX_API int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs); /* Get the maximum size of keys and MDBX_DUPSORT data we can write. * - * [in] env An environment handle returned by mdbx_env_create() + * [in] env An environment handle returned by mdbx_env_create() * * Returns The maximum size of a key we can write. */ LIBMDBX_API int mdbx_env_get_maxkeysize(MDBX_env *env); @@ -878,8 +878,8 @@ LIBMDBX_API int mdbx_get_maxkeysize(size_t pagesize); /* Set application information associated with the MDBX_env. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] ctx An arbitrary pointer for whatever the application needs. + * [in] env An environment handle returned by mdbx_env_create() + * [in] ctx An arbitrary pointer for whatever the application needs. * * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_set_userctx(MDBX_env *env, void *ctx); @@ -893,8 +893,8 @@ LIBMDBX_API void *mdbx_env_get_userctx(MDBX_env *env); /* A callback function for most MDBX assert() failures, * called before printing the message and aborting. * - * [in] env An environment handle returned by mdbx_env_create(). - * [in] msg The assertion message, not including newline. */ + * [in] env An environment handle returned by mdbx_env_create(). + * [in] msg The assertion message, not including newline. */ typedef void MDBX_assert_func(const MDBX_env *env, const char *msg, const char *function, unsigned line); @@ -903,8 +903,8 @@ typedef void MDBX_assert_func(const MDBX_env *env, const char *msg, * Disabled if libmdbx is buillt with MDBX_DEBUG=0. * NOTE: This hack should become obsolete as mdbx's error handling matures. * - * [in] env An environment handle returned by mdbx_env_create(). - * [in] func An MDBX_assert_func function, or 0. + * [in] env An environment handle returned by mdbx_env_create(). + * [in] func An MDBX_assert_func function, or 0. * * Returns A non-zero error value on failure and 0 on success. 
*/ LIBMDBX_API int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func); @@ -937,20 +937,19 @@ LIBMDBX_API int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func); * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_PANIC - a fatal error occurred earlier and the environment - * must be shut down. + * must be shut down. * - MDBX_MAP_RESIZED - another process wrote data beyond this MDBX_env's - * mapsize and this environment's map must be resized - * as well. See mdbx_env_set_mapsize(). + * mapsize and this environment's map must be resized + * as well. See mdbx_env_set_mapsize(). * - MDBX_READERS_FULL - a read-only transaction was requested and the reader - * lock table is full. See mdbx_env_set_maxreaders(). - * - MDBX_ENOMEM - out of memory. */ + * lock table is full. See mdbx_env_set_maxreaders(). + * - MDBX_ENOMEM - out of memory. */ LIBMDBX_API int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, MDBX_txn **txn); /* Returns the transaction's MDBX_env * - * [in] txn A transaction handle returned by mdbx_txn_begin() - */ + * [in] txn A transaction handle returned by mdbx_txn_begin() */ LIBMDBX_API MDBX_env *mdbx_txn_env(MDBX_txn *txn); /* Return the transaction's ID. @@ -973,7 +972,7 @@ LIBMDBX_API uint64_t mdbx_txn_id(MDBX_txn *txn); * or after its transaction ends. It can be reused with * mdbx_cursor_renew() before finally closing it. * - * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] txn A transaction handle returned by mdbx_txn_begin() * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: @@ -991,7 +990,7 @@ LIBMDBX_API int mdbx_txn_commit(MDBX_txn *txn); * A cursor must be closed explicitly always, before or after its transaction * ends. It can be reused with mdbx_cursor_renew() before finally closing it. * - * [in] txn A transaction handle returned by mdbx_txn_begin(). 
*/ + * [in] txn A transaction handle returned by mdbx_txn_begin(). */ LIBMDBX_API int mdbx_txn_abort(MDBX_txn *txn); /* Reset a read-only transaction. @@ -1012,7 +1011,7 @@ LIBMDBX_API int mdbx_txn_abort(MDBX_txn *txn); * from being reused when writers commit new data, and so under heavy load * the database size may grow much more rapidly than otherwise. * - * [in] txn A transaction handle returned by mdbx_txn_begin() */ + * [in] txn A transaction handle returned by mdbx_txn_begin() */ LIBMDBX_API int mdbx_txn_reset(MDBX_txn *txn); /* Renew a read-only transaction. @@ -1021,13 +1020,13 @@ LIBMDBX_API int mdbx_txn_reset(MDBX_txn *txn); * released by mdbx_txn_reset(). It must be called before a reset transaction * may be used again. * - * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] txn A transaction handle returned by mdbx_txn_begin() * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_PANIC - a fatal error occurred earlier and the environment - * must be shut down. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * must be shut down. + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_txn_renew(MDBX_txn *txn); /* Open a table in the environment. @@ -1052,12 +1051,12 @@ LIBMDBX_API int mdbx_txn_renew(MDBX_txn *txn); * must be called before opening the environment. Table names are * keys in the internal unnamed table, and may be read but not written. * - * [in] txn transaction handle returned by mdbx_txn_begin() - * [in] name The name of the table to open. If only a single - * table is needed in the environment, this value may be NULL. - * [in] flags Special options for this table. This parameter must be set - * to 0 or by bitwise OR'ing together one or more of the values - * described here: + * [in] txn transaction handle returned by mdbx_txn_begin() + * [in] name The name of the table to open. 
If only a single + * table is needed in the environment, this value may be NULL. + * [in] flags Special options for this table. This parameter must be set + * to 0 or by bitwise OR'ing together one or more of the values + * described here: * - MDBX_REVERSEKEY * Keys are strings to be compared in reverse order, from the end * of the strings to the beginning. By default, Keys are treated as @@ -1093,9 +1092,9 @@ LIBMDBX_API int mdbx_txn_renew(MDBX_txn *txn); * Returns A non-zero error value on failure and 0 on success, some * possible errors are: * - MDBX_NOTFOUND - the specified database doesn't exist in the - * environment and MDBX_CREATE was not specified. - * - MDBX_DBS_FULL - too many databases have been opened. - * See mdbx_env_set_maxdbs(). */ + * environment and MDBX_CREATE was not specified. + * - MDBX_DBS_FULL - too many databases have been opened. + * See mdbx_env_set_maxdbs(). */ LIBMDBX_API int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, unsigned flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); @@ -1144,8 +1143,8 @@ LIBMDBX_API int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags); * reuse the handle value. Usually it's better to set a bigger * mdbx_env_set_maxdbs(), unless that value would be large. * - * [in] env An environment handle returned by mdbx_env_create() - * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] env An environment handle returned by mdbx_env_create() + * [in] dbi A database handle returned by mdbx_dbi_open() */ LIBMDBX_API int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi); @@ -1153,10 +1152,10 @@ LIBMDBX_API int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi); * * See mdbx_dbi_close() for restrictions about closing the DB handle. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] del 0 to empty the DB, 1 to delete it from the environment - * and close the DB handle. 
+ * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] del 0 to empty the DB, 1 to delete it from the environment + * and close the DB handle. * * Returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del); @@ -1178,10 +1177,10 @@ LIBMDBX_API int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, int del); * NOTE: Values returned from the database are valid only until a * subsequent update operation, or the end of the transaction. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] key The key to search for in the database - * [out] data The data corresponding to the key + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] key The key to search for in the database + * [in,out] data The data corresponding to the key * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: @@ -1197,13 +1196,13 @@ LIBMDBX_API int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, * if duplicates are disallowed, or adding a duplicate data item if * duplicates are allowed (MDBX_DUPSORT). * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [in] key The key to store in the database - * [in,out] data The data to store - * [in] flags Special options for this operation. This parameter must be - * set to 0 or by bitwise OR'ing together one or more of the - * values described here. + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] key The key to store in the database + * [in,out] data The data to store + * [in] flags Special options for this operation. 
This parameter must be + * set to 0 or by bitwise OR'ing together one or more of the + * values described here. * * - MDBX_NODUPDATA * Enter the new key/data pair only if it does not already appear @@ -1243,10 +1242,11 @@ LIBMDBX_API int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: + * - MDBX_KEYEXIST * - MDBX_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). * - MDBX_TXN_FULL - the transaction has too many dirty pages. - * - MDBX_EACCES - an attempt was made to write in a read-only transaction. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EACCES - an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, unsigned flags); @@ -1284,9 +1284,9 @@ LIBMDBX_API int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, * or after its transaction ends. It can be reused with * mdbx_cursor_renew() before finally closing it. * - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] dbi A database handle returned by mdbx_dbi_open() - * [out] cursor Address where the new MDBX_cursor handle will be stored + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [out] cursor Address where the new MDBX_cursor handle will be stored * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: @@ -1299,7 +1299,7 @@ LIBMDBX_API int mdbx_cursor_open(MDBX_txn *txn, MDBX_dbi dbi, * The cursor handle will be freed and must not be used again after this call. * Its transaction must still be live if it is a write-transaction. 
* - * [in] cursor A cursor handle returned by mdbx_cursor_open() */ + * [in] cursor A cursor handle returned by mdbx_cursor_open() */ LIBMDBX_API void mdbx_cursor_close(MDBX_cursor *cursor); /* Renew a cursor handle. @@ -1311,8 +1311,8 @@ LIBMDBX_API void mdbx_cursor_close(MDBX_cursor *cursor); * as it was created with. * * This may be done whether the previous transaction is live or dead. - * [in] txn A transaction handle returned by mdbx_txn_begin() - * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] cursor A cursor handle returned by mdbx_cursor_open() * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: @@ -1326,7 +1326,7 @@ LIBMDBX_API MDBX_txn *mdbx_cursor_txn(MDBX_cursor *cursor); /* Return the cursor's database handle. * - * [in] cursor A cursor handle returned by mdbx_cursor_open() */ + * [in] cursor A cursor handle returned by mdbx_cursor_open() */ LIBMDBX_API MDBX_dbi mdbx_cursor_dbi(MDBX_cursor *cursor); /* Retrieve by cursor. @@ -1344,8 +1344,8 @@ LIBMDBX_API MDBX_dbi mdbx_cursor_dbi(MDBX_cursor *cursor); * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_NOTFOUND - no matching key found. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_NOTFOUND - no matching key found. + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op); @@ -1358,7 +1358,7 @@ LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, * [in] key The key operated on. * [in] data The data operated on. * [in] flags Options for this operation. This parameter - * must be set to 0 or one of the values described here. + * must be set to 0 or one of the values described here: * * - MDBX_CURRENT * Replace the item at the current cursor position. 
The key parameter @@ -1413,8 +1413,8 @@ LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, * - MDBX_EKEYMISMATCH * - MDBX_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). * - MDBX_TXN_FULL - the transaction has too many dirty pages. - * - MDBX_EACCES - an attempt was made to write in a read-only transaction. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EACCES - an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_put(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, unsigned flags); @@ -1432,8 +1432,8 @@ LIBMDBX_API int mdbx_cursor_put(MDBX_cursor *cursor, MDBX_val *key, * * Returns A non-zero error value on failure and 0 on success, some * possible errors are: - * - MDBX_EACCES - an attempt was made to write in a read-only transaction. - * - MDBX_EINVAL - an invalid parameter was specified. */ + * - MDBX_EACCES - an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL - an invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_del(MDBX_cursor *cursor, unsigned flags); /* Return count of duplicates for current key. @@ -1635,162 +1635,155 @@ LIBMDBX_API int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, /* attribute support functions for Nexenta */ typedef uint64_t mdbx_attr_t; - /** @brief Store by cursor with attribute. - * - * This function stores key/data pairs into the database. - * The cursor is positioned at the new item, or on failure usually near it. - * @note Internally based on #MDBX_RESERVE feature, therefore doesn't support #MDBX_DUPSORT. - * @note Earlier documentation incorrectly said errors would leave the - * state of the cursor unchanged. - * @param[in] cursor A cursor handle returned by #mdb_cursor_open() - * @param[in] key The key operated on. - * @param[in] data The data operated on. - * @param[in] attr The attribute. 
- * @param[in] flags Options for this operation. This parameter - * must be set to 0 or one of the values described here. - *
      - *
    • #MDBX_CURRENT - replace the item at the current cursor position. - * The \b key parameter must still be provided, and must match it. - * This is intended to be used when the - * new data is the same size as the old. Otherwise it will simply - * perform a delete of the old record followed by an insert. - *
    • #MDBX_NOOVERWRITE - enter the new key/data pair only if the key - * does not already appear in the database. The function will return - * #MDBX_KEYEXIST if the key already appears in the database. - *
    • #MDBX_RESERVE - reserve space for data of the given size, but - * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later. This saves - * an extra memcpy if the data is being generated later. - *
    • #MDBX_APPEND - append the given key/data pair to the end of the - * database. No key comparisons are performed. This option allows - * fast bulk loading when keys are already known to be in the - * correct order. Loading unsorted keys with this flag will cause - * data corruption. - *
    - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
      - *
    • #MDBX_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). - *
    • #MDBX_TXN_FULL - the transaction has too many dirty pages. - *
    • EACCES - an attempt was made to write in a read-only transaction. - *
    • EINVAL - an invalid parameter was specified. - *
    - */ +/* Store by cursor with attribute. + * + * This function stores key/data pairs into the database. The cursor is + * positioned at the new item, or on failure usually near it. + * + * NOTE: Internally based on MDBX_RESERVE feature, + * therefore doesn't support MDBX_DUPSORT. + * + * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [in] key The key operated on. + * [in] data The data operated on. + * [in] attr The attribute. + * [in] flags Options for this operation. This parameter must be set to 0 + * or one of the values described here: + * + * - MDBX_CURRENT + * Replace the item at the current cursor position. The key parameter + * must still be provided, and must match it, otherwise the function + * return MDBX_EKEYMISMATCH. + * + * - MDBX_APPEND + * Append the given key/data pair to the end of the database. No key + * comparisons are performed. This option allows fast bulk loading when + * keys are already known to be in the correct order. Loading unsorted + * keys with this flag will cause a MDBX_KEYEXIST error. + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDBX_EKEYMISMATCH + * - MDBX_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). + * - MDBX_TXN_FULL - the transaction has too many dirty pages. + * - MDBX_EACCES - an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL - an invalid parameter was specified. */ int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr, unsigned flags); + mdbx_attr_t attr, unsigned flags); - /** @brief Store items and attributes into a database. - * - * This function stores key/data pairs in the database. The default behavior - * is to enter the new key/data pair, replacing any previously existing key - * if duplicates are disallowed. - * @note Internally based on #MDBX_RESERVE feature, therefore doesn't support #MDBX_DUPSORT. 
- * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] key The key to store in the database - * @param[in] attr The attribute to store in the database - * @param[in,out] data The data to store - * @param[in] flags Special options for this operation. This parameter - * must be set to 0 or by bitwise OR'ing together one or more of the - * values described here. - *
      - *
    • #MDBX_NOOVERWRITE - enter the new key/data pair only if the key - * does not already appear in the database. The function will return - * #MDBX_KEYEXIST if the key already appears in the database. The \b data - * parameter will be set to point to the existing item. - *
    • #MDBX_RESERVE - reserve space for data of the given size, but - * don't copy the given data. Instead, return a pointer to the - * reserved space, which the caller can fill in later - before - * the next update operation or the transaction ends. This saves - * an extra memcpy if the data is being generated later. - * LMDB does nothing else with this memory, the caller is expected - * to modify all of the space requested. - *
    • #MDBX_APPEND - append the given key/data pair to the end of the - * database. This option allows fast bulk loading when keys are - * already known to be in the correct order. Loading unsorted keys - * with this flag will cause a #MDBX_KEYEXIST error. - *
    - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
      - *
    • #MDBX_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). - *
    • #MDBX_TXN_FULL - the transaction has too many dirty pages. - *
    • EACCES - an attempt was made to write in a read-only transaction. - *
    • EINVAL - an invalid parameter was specified. - *
    - */ +/* Store items and attributes into a database. + * + * This function stores key/data pairs in the database. The default behavior + * is to enter the new key/data pair, replacing any previously existing key + * if duplicates are disallowed. + * + * NOTE: Internally based on MDBX_RESERVE feature, + * therefore doesn't support MDBX_DUPSORT. + * + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [in] key The key to store in the database. + * [in] attr The attribute to store in the database. + * [in,out] data The data to store. + * [in] flags Special options for this operation. This parameter must be + * set to 0 or by bitwise OR'ing together one or more of the + * values described here: + * + * - MDBX_NOOVERWRITE + * Enter the new key/data pair only if the key does not already appear + * in the database. The function will return MDBX_KEYEXIST if the key + * already appears in the database. The data parameter will be set to + * point to the existing item. + * + * - MDBX_CURRENT + * Update an single existing entry, but not add new ones. The function + * will return MDBX_NOTFOUND if the given key not exist in the database. + * Or the MDBX_EMULTIVAL in case duplicates for the given key. + * + * - MDBX_APPEND + * Append the given key/data pair to the end of the database. This option + * allows fast bulk loading when keys are already known to be in the + * correct order. Loading unsorted keys with this flag will cause + * a MDBX_EKEYMISMATCH error. + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDBX_KEYEXIST + * - MDBX_MAP_FULL - the database is full, see mdbx_env_set_mapsize(). + * - MDBX_TXN_FULL - the transaction has too many dirty pages. + * - MDBX_EACCES - an attempt was made to write in a read-only transaction. + * - MDBX_EINVAL - an invalid parameter was specified. 
*/ int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr, unsigned flags); + mdbx_attr_t attr, unsigned flags); - /** @brief Set items attribute from a database. - * - * This function stores key/data pairs attribute to the database. - * @note Internally based on #MDBX_RESERVE feature, therefore doesn't support #MDBX_DUPSORT. - * - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] key The key to search for in the database - * @param[in] data The data to be stored or NULL to save previous value. - * @param[in] attr The attribute to be stored - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
      - *
    • #MDBX_NOTFOUND - the key-value pair was not in the database. - *
    • EINVAL - an invalid parameter was specified. - *
    - */ +/* Set items attribute from a database. + * + * This function stores key/data pairs attribute to the database. + * + * NOTE: Internally based on MDBX_RESERVE feature, + * therefore doesn't support MDBX_DUPSORT. + * + * [in] txn A transaction handle returned by mdbx_txn_begin(). + * [in] dbi A database handle returned by mdbx_dbi_open(). + * [in] key The key to search for in the database. + * [in] data The data to be stored or NULL to save previous value. + * [in] attr The attribute to be stored. + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDBX_NOTFOUND - the key-value pair was not in the database. + * - MDBX_EINVAL - an invalid parameter was specified. */ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr); + mdbx_attr_t attr); - /** @brief Get items attribute from a database cursor. - * - * This function retrieves key/data pairs attribute from the database. - * The attribute of the specified key-value pair is returned in - * uint64_t to which \b attrptr refers. - * If the database supports duplicate keys (#MDBX_DUPSORT) then both - * key and data parameters are required, otherwise data could be NULL. - * - * @note Values returned from the database are valid only until a - * subsequent update operation, or the end of the transaction. - * @param[in] mc A database cursor pointing at the node - * @param[in] key The key to search for in the database - * @param[in,out] data The data for #MDBX_DUPSORT databases - * @param[out] attrptr The pointer to the result - * @param[in] op A cursor operation #MDBX_cursor_op - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
      - *
    • #MDBX_NOTFOUND - the key-value pair was not in the database. - *
    • EINVAL - an invalid parameter was specified. - *
    - */ +/* Get items attribute from a database cursor. + * + * This function retrieves key/data pairs from the database. The address and + * length of the key are returned in the object to which key refers (except + * for the case of the MDBX_SET option, in which the key object is unchanged), + * and the address and length of the data are returned in the object to which + * data refers. See mdbx_get() for restrictions on using the output values. + * + * [in] cursor A cursor handle returned by mdbx_cursor_open() + * [in,out] key The key for a retrieved item + * [in,out] data The data of a retrieved item + * [in] op A cursor operation MDBX_cursor_op + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDBX_NOTFOUND - no matching key found. + * - MDBX_EINVAL - an invalid parameter was specified. */ int mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - mdbx_attr_t *attrptr, MDBX_cursor_op op); + mdbx_attr_t *attrptr, MDBX_cursor_op op); - /** @brief Get items attribute from a database. - * - * This function retrieves key/data pairs attribute from the database. - * The attribute of the specified key-value pair is returned in - * uint64_t to which \b attrptr refers. - * If the database supports duplicate keys (#MDBX_DUPSORT) then both - * key and data parameters are required, otherwise data is ignored. - * - * @note Values returned from the database are valid only until a - * subsequent update operation, or the end of the transaction. - * @param[in] txn A transaction handle returned by #mdb_txn_begin() - * @param[in] dbi A database handle returned by #mdb_dbi_open() - * @param[in] key The key to search for in the database - * @param[in] data The data for #MDBX_DUPSORT databases - * @param[out] attrptr The pointer to the result - * @return A non-zero error value on failure and 0 on success. Some possible - * errors are: - *
      - *
    • #MDBX_NOTFOUND - the key-value pair was not in the database. - *
    • EINVAL - an invalid parameter was specified. - *
    - */ +/* Get items attribute from a database. + * + * This function retrieves key/data pairs from the database. The address + * and length of the data associated with the specified key are returned + * in the structure to which data refers. + * If the database supports duplicate keys (MDBX_DUPSORT) then the + * first data item for the key will be returned. Retrieval of other + * items requires the use of mdbx_cursor_get(). + * + * NOTE: The memory pointed to by the returned values is owned by the + * database. The caller need not dispose of the memory, and may not + * modify it in any way. For values returned in a read-only transaction + * any modification attempts will cause a SIGSEGV. + * + * NOTE: Values returned from the database are valid only until a + * subsequent update operation, or the end of the transaction. + * + * [in] txn A transaction handle returned by mdbx_txn_begin() + * [in] dbi A database handle returned by mdbx_dbi_open() + * [in] key The key to search for in the database + * [in,out] data The data corresponding to the key + * + * Returns A non-zero error value on failure and 0 on success, some + * possible errors are: + * - MDBX_NOTFOUND - the key was not in the database. + * - MDBX_EINVAL - an invalid parameter was specified. 
*/ int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - mdbx_attr_t *attrptr); + mdbx_attr_t *attrptr); #ifdef __cplusplus } diff --git a/src/mdbx.c b/src/mdbx.c index 57ebe42d..df7bec11 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -11101,137 +11101,125 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, /*----------------------------------------------------------------------------*/ /* attribute support functions for Nexenta */ -static __inline int -mdbx_attr_peek(MDBX_val *data, mdbx_attr_t *attrptr) -{ - if (unlikely(data->iov_len < sizeof(mdbx_attr_t))) - return MDBX_INCOMPATIBLE; +static __inline int mdbx_attr_peek(MDBX_val *data, mdbx_attr_t *attrptr) { + if (unlikely(data->iov_len < sizeof(mdbx_attr_t))) + return MDBX_INCOMPATIBLE; - if (likely(attrptr != NULL)) - *attrptr = *(mdbx_attr_t*) data->iov_base; - data->iov_len -= sizeof(mdbx_attr_t); - data->iov_base = likely(data->iov_len > 0) - ? ((mdbx_attr_t*) data->iov_base) + 1 : NULL; + if (likely(attrptr != NULL)) + *attrptr = *(mdbx_attr_t *)data->iov_base; + data->iov_len -= sizeof(mdbx_attr_t); + data->iov_base = + likely(data->iov_len > 0) ? ((mdbx_attr_t *)data->iov_base) + 1 : NULL; - return MDBX_SUCCESS; + return MDBX_SUCCESS; } -static __inline int -mdbx_attr_poke(MDBX_val *reserved, MDBX_val *data, mdbx_attr_t attr, unsigned flags) -{ - mdbx_attr_t *space = reserved->iov_base; - if (flags & MDBX_RESERVE) { - if (likely(data != NULL)) { - data->iov_base = data->iov_len ? space + 1 : NULL; - } - } else { - *space = attr; - if (likely(data != NULL)) { - memcpy(space + 1, data->iov_base, data->iov_len ); - } - } +static __inline int mdbx_attr_poke(MDBX_val *reserved, MDBX_val *data, + mdbx_attr_t attr, unsigned flags) { + mdbx_attr_t *space = reserved->iov_base; + if (flags & MDBX_RESERVE) { + if (likely(data != NULL)) { + data->iov_base = data->iov_len ? 
space + 1 : NULL; + } + } else { + *space = attr; + if (likely(data != NULL)) { + memcpy(space + 1, data->iov_base, data->iov_len); + } + } - return MDBX_SUCCESS; + return MDBX_SUCCESS; } -int -mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - mdbx_attr_t *attrptr, MDBX_cursor_op op) -{ - int rc = mdbx_cursor_get(mc, key, data, op); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +int mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + mdbx_attr_t *attrptr, MDBX_cursor_op op) { + int rc = mdbx_cursor_get(mc, key, data, op); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - return mdbx_attr_peek(data, attrptr); + return mdbx_attr_peek(data, attrptr); } -int -mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, - MDBX_val *key, MDBX_val *data, uint64_t *attrptr) -{ - int rc = mdbx_get(txn, dbi, key, data); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, + uint64_t *attrptr) { + int rc = mdbx_get(txn, dbi, key, data); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - return mdbx_attr_peek(data, attrptr); + return mdbx_attr_peek(data, attrptr); } -int -mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, - MDBX_val *key, MDBX_val *data, mdbx_attr_t attr, unsigned flags) -{ - MDBX_val reserve = { - .iov_base = NULL, - .iov_len = (data ? data->iov_len : 0) + sizeof(mdbx_attr_t) - }; +int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, + mdbx_attr_t attr, unsigned flags) { + MDBX_val reserve = {.iov_base = NULL, + .iov_len = + (data ? 
data->iov_len : 0) + sizeof(mdbx_attr_t)}; - int rc = mdbx_put(txn, dbi, key, &reserve, flags | MDBX_RESERVE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + int rc = mdbx_put(txn, dbi, key, &reserve, flags | MDBX_RESERVE); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - return mdbx_attr_poke(&reserve, data, attr, flags); + return mdbx_attr_poke(&reserve, data, attr, flags); } int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr, unsigned flags) -{ - MDBX_val reserve = { - .iov_base = NULL, - .iov_len = (data ? data->iov_len : 0) + sizeof(mdbx_attr_t) - }; + mdbx_attr_t attr, unsigned flags) { + MDBX_val reserve = {.iov_base = NULL, + .iov_len = + (data ? data->iov_len : 0) + sizeof(mdbx_attr_t)}; - int rc = mdbx_cursor_put(cursor, key, &reserve, flags | MDBX_RESERVE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + int rc = mdbx_cursor_put(cursor, key, &reserve, flags | MDBX_RESERVE); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - return mdbx_attr_poke(&reserve, data, attr, flags); + return mdbx_attr_poke(&reserve, data, attr, flags); } -int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, - MDBX_val *key, MDBX_val *data, mdbx_attr_t attr) -{ - MDBX_cursor mc; - MDBX_xcursor mx; - MDBX_val old_data; - mdbx_attr_t old_attr; - int rc; +int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, + mdbx_attr_t attr) { + MDBX_cursor mc; + MDBX_xcursor mx; + MDBX_val old_data; + mdbx_attr_t old_attr; + int rc; - if (unlikely(!key || !txn)) - return EINVAL; + if (unlikely(!key || !txn)) + return EINVAL; - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_VERSION_MISMATCH; + if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_VERSION_MISMATCH; - if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) + return EINVAL; - if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY|MDBX_TXN_BLOCKED))) - return 
(txn->mt_flags & MDBX_TXN_RDONLY) ? EACCES : MDBX_BAD_TXN; + if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (txn->mt_flags & MDBX_TXN_RDONLY) ? EACCES : MDBX_BAD_TXN; - mdbx_cursor_init(&mc, txn, dbi, &mx); - rc = mdbx_cursor_set(&mc, key, &old_data, MDBX_SET, NULL); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_NOTFOUND && data) { - mc.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &mc; - rc = mdbx_cursor_put_attr(&mc, key, data, attr, 0); - txn->mt_cursors[dbi] = mc.mc_next; - } - return rc; - } + mdbx_cursor_init(&mc, txn, dbi, &mx); + rc = mdbx_cursor_set(&mc, key, &old_data, MDBX_SET, NULL); + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc == MDBX_NOTFOUND && data) { + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdbx_cursor_put_attr(&mc, key, data, attr, 0); + txn->mt_cursors[dbi] = mc.mc_next; + } + return rc; + } - rc = mdbx_attr_peek(&old_data, &old_attr); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + rc = mdbx_attr_peek(&old_data, &old_attr); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (old_attr == attr && (!data || - (data->iov_len == old_data.iov_len - && memcpy(data->iov_base, old_data.iov_base, old_data.iov_len) == 0))) - return MDBX_SUCCESS; + if (old_attr == attr && (!data || (data->iov_len == old_data.iov_len && + memcpy(data->iov_base, old_data.iov_base, + old_data.iov_len) == 0))) + return MDBX_SUCCESS; - mc.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &mc; - rc = mdbx_cursor_put_attr(&mc, key, data ? data : &old_data, attr, MDBX_CURRENT); - txn->mt_cursors[dbi] = mc.mc_next; - return rc; + mc.mc_next = txn->mt_cursors[dbi]; + txn->mt_cursors[dbi] = &mc; + rc = mdbx_cursor_put_attr(&mc, key, data ? 
data : &old_data, attr, + MDBX_CURRENT); + txn->mt_cursors[dbi] = mc.mc_next; + return rc; } From aec35300c442bbc1e6ffc3e6a07a56797470d09f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Jul 2017 10:46:46 +0300 Subject: [PATCH 272/303] mdbx: fix new API function after the merge. Change-Id: Ic929444ceb137ccaa2ab4fe82b4f93a3cb5a92aa --- mdbx.h | 24 +++++++++++++----------- src/mdbx.c | 24 +++++++++++------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/mdbx.h b/mdbx.h index 96e6e63e..160bef64 100644 --- a/mdbx.h +++ b/mdbx.h @@ -1633,7 +1633,7 @@ LIBMDBX_API int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, /*----------------------------------------------------------------------------*/ /* attribute support functions for Nexenta */ -typedef uint64_t mdbx_attr_t; +typedef uint_fast64_t mdbx_attr_t; /* Store by cursor with attribute. * @@ -1668,8 +1668,9 @@ typedef uint64_t mdbx_attr_t; * - MDBX_TXN_FULL - the transaction has too many dirty pages. * - MDBX_EACCES - an attempt was made to write in a read-only transaction. * - MDBX_EINVAL - an invalid parameter was specified. */ -int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr, unsigned flags); +LIBMDBX_API int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, + MDBX_val *data, mdbx_attr_t attr, + unsigned flags); /* Store items and attributes into a database. * @@ -1713,8 +1714,8 @@ int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, * - MDBX_TXN_FULL - the transaction has too many dirty pages. * - MDBX_EACCES - an attempt was made to write in a read-only transaction. * - MDBX_EINVAL - an invalid parameter was specified. 
*/ -int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr, unsigned flags); +LIBMDBX_API int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, + MDBX_val *data, mdbx_attr_t attr, unsigned flags); /* Set items attribute from a database. * @@ -1733,8 +1734,8 @@ int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, * possible errors are: * - MDBX_NOTFOUND - the key-value pair was not in the database. * - MDBX_EINVAL - an invalid parameter was specified. */ -int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr); +LIBMDBX_API int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, + MDBX_val *data, mdbx_attr_t attr); /* Get items attribute from a database cursor. * @@ -1753,8 +1754,9 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, * possible errors are: * - MDBX_NOTFOUND - no matching key found. * - MDBX_EINVAL - an invalid parameter was specified. */ -int mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - mdbx_attr_t *attrptr, MDBX_cursor_op op); +LIBMDBX_API int mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, mdbx_attr_t *attrptr, + MDBX_cursor_op op); /* Get items attribute from a database. * @@ -1782,8 +1784,8 @@ int mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, * possible errors are: * - MDBX_NOTFOUND - the key was not in the database. * - MDBX_EINVAL - an invalid parameter was specified. 
*/ -int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - mdbx_attr_t *attrptr); +LIBMDBX_API int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, + MDBX_val *data, mdbx_attr_t *attrptr); #ifdef __cplusplus } diff --git a/src/mdbx.c b/src/mdbx.c index df7bec11..63c4b958 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -11151,9 +11151,9 @@ int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, mdbx_attr_t attr, unsigned flags) { - MDBX_val reserve = {.iov_base = NULL, - .iov_len = - (data ? data->iov_len : 0) + sizeof(mdbx_attr_t)}; + MDBX_val reserve; + reserve.iov_base = NULL; + reserve.iov_len = (data ? data->iov_len : 0) + sizeof(mdbx_attr_t); int rc = mdbx_put(txn, dbi, key, &reserve, flags | MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) @@ -11164,9 +11164,9 @@ int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, mdbx_attr_t attr, unsigned flags) { - MDBX_val reserve = {.iov_base = NULL, - .iov_len = - (data ? data->iov_len : 0) + sizeof(mdbx_attr_t)}; + MDBX_val reserve; + reserve.iov_base = NULL; + reserve.iov_len = (data ? data->iov_len : 0) + sizeof(mdbx_attr_t); int rc = mdbx_cursor_put(cursor, key, &reserve, flags | MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) @@ -11177,12 +11177,6 @@ int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, mdbx_attr_t attr) { - MDBX_cursor mc; - MDBX_xcursor mx; - MDBX_val old_data; - mdbx_attr_t old_attr; - int rc; - if (unlikely(!key || !txn)) return EINVAL; @@ -11195,8 +11189,11 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) return (txn->mt_flags & MDBX_TXN_RDONLY) ? 
EACCES : MDBX_BAD_TXN; + MDBX_cursor mc; + MDBX_xcursor mx; + MDBX_val old_data; mdbx_cursor_init(&mc, txn, dbi, &mx); - rc = mdbx_cursor_set(&mc, key, &old_data, MDBX_SET, NULL); + int rc = mdbx_cursor_set(&mc, key, &old_data, MDBX_SET, NULL); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_NOTFOUND && data) { mc.mc_next = txn->mt_cursors[dbi]; @@ -11207,6 +11204,7 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, return rc; } + mdbx_attr_t old_attr; rc = mdbx_attr_peek(&old_data, &old_attr); if (unlikely(rc != MDBX_SUCCESS)) return rc; From bf8507f3ecab436e8f60c7d76a30441fde0f8145 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Jul 2017 11:41:50 +0300 Subject: [PATCH 273/303] mdbx: fix mdbx_env_info() to avoid null-deref in lck-free mode (coverity). Change-Id: Ica8fe6c7f5a18af3a4d7d38ce8a1a092d5f1b2f7 --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 63c4b958..531fb3a8 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -9653,7 +9653,7 @@ int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { arg->me_recent_txnid != mdbx_meta_txnid_fluid(env, meta))); arg->me_maxreaders = env->me_maxreaders; - arg->me_numreaders = env->me_lck->mti_numreaders; + arg->me_numreaders = env->me_lck ? env->me_lck->mti_numreaders : INT32_MAX; arg->me_dxb_pagesize = env->me_psize; arg->me_sys_pagesize = env->me_os_psize; From 93d92db43dad6bf2d03a47cac19c68e50f604e97 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Jul 2017 11:51:40 +0300 Subject: [PATCH 274/303] mdbx: fix mischecking in mdbx_reader_check0() (coverity). 
Change-Id: I2e3aaba1426c3b152a39b90f6d171380948da0a7 --- src/mdbx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 531fb3a8..0adaa626 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -10279,7 +10279,7 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { /* stale reader found */ if (!rdt_locked) { err = mdbx_rdt_lock(env); - if (MDBX_IS_ERROR(rc)) { + if (MDBX_IS_ERROR(err)) { rc = err; break; } @@ -10296,7 +10296,7 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { continue; err = mdbx_rpid_check(env, pid); - if (MDBX_IS_ERROR(rc)) { + if (MDBX_IS_ERROR(err)) { rc = err; break; } From beda6902271549e91304fdc2b1d6bf59e381e371 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Jul 2017 12:17:09 +0300 Subject: [PATCH 275/303] mdbx: fix resource-leak in mdbx_env_open_ex() in case of error (coverity). Change-Id: I9e808e2de4a64d05205a79c9d90feacc87ec930d --- src/mdbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdbx.c b/src/mdbx.c index 0adaa626..4382be23 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4924,7 +4924,7 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, rc = mdbx_rthc_alloc(&env->me_txkey, &env->me_lck->mti_readers[0], &env->me_lck->mti_readers[env->me_maxreaders]); if (unlikely(rc != MDBX_SUCCESS)) - return rc; + goto bailout; env->me_flags |= MDBX_ENV_TXKEY; } From 136e98fb934ba048ab031763c94eac895ca84aa8 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Jul 2017 12:24:14 +0300 Subject: [PATCH 276/303] test: fix initialization 'signalled' field (coverity). 
Change-Id: Ie6dae6c419bd81f203968bf07f2cf472ea7344a5 --- test/test.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test.h b/test/test.h index 98b65801..0a9b8a38 100644 --- a/test/test.h +++ b/test/test.h @@ -136,7 +136,7 @@ protected: public: testcase(const actor_config &config, const mdbx_pid_t pid) - : config(config), pid(pid), nops_completed(0) { + : config(config), pid(pid), signalled(false), nops_completed(0) { start_timestamp.reset(); memset(&last, 0, sizeof(last)); } From 66f8327642386291103d78985aca68e33a6a564f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Jul 2017 12:36:00 +0300 Subject: [PATCH 277/303] test: allow null as 'function name' for logging (coverity). Change-Id: I047a4c372514e85d19dd3d3719f8ad3be046171e --- test/test.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test.cc b/test/test.cc index 50426014..9a265e33 100644 --- a/test/test.cc +++ b/test/test.cc @@ -76,9 +76,10 @@ static void mdbx_debug_logger(int type, const char *function, int line, if (type & MDBX_DBG_PRINT) level = logging::verbose; + if (!function) + function = "unknown"; if (type & MDBX_DBG_ASSERT) { - log_error("mdbx: assertion failure: %s, %d", - function ? function : "unknown", line); + log_error("mdbx: assertion failure: %s, %d", function, line); level = logging::failure; } From 14484a6f32722e7c20ccb34bad5fb85e3adc585f Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Jul 2017 13:06:32 +0300 Subject: [PATCH 278/303] mdbx: fix minor defects (coverity). 
Change-Id: I1a5b0788a87ab2a138b342140648642fd5855ae3 --- src/mdbx.c | 6 +++++- src/osal.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 4382be23..5182c2bf 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -3849,8 +3849,10 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target = mdbx_meta_ancient(env, meta1, meta2, true); else if (head == meta1) target = mdbx_meta_ancient(env, meta0, meta2, true); - else if (head == meta2) + else { + mdbx_assert(env, head == meta2); target = mdbx_meta_ancient(env, meta0, meta1, true); + } /* LY: step#2 - update meta-page. */ mdbx_debug( @@ -6190,6 +6192,7 @@ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { if (likely(data)) { if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_cassert(mc, mc->mc_xcursor != nullptr); mdbx_xcursor_init1(mc, leaf); rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) @@ -6233,6 +6236,7 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { if (likely(data)) { if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + mdbx_cassert(mc, mc->mc_xcursor != nullptr); mdbx_xcursor_init1(mc, leaf); rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) diff --git a/src/osal.c b/src/osal.c index 08ae9fef..ae28d191 100644 --- a/src/osal.c +++ b/src/osal.c @@ -549,7 +549,7 @@ int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { sigset_t set, old; sigemptyset(&set); sigaddset(&set, SIGPIPE); - int rc = rc = pthread_sigmask(SIG_BLOCK, &set, &old); + int rc = pthread_sigmask(SIG_BLOCK, &set, &old); if (rc != 0) return rc; #endif From d01a97f729e53931fae62954904be4abc8bf2ea0 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Jul 2017 13:06:54 +0300 Subject: [PATCH 279/303] tools: fix minor defects (coverity). 
Change-Id: I3a80dcb31c54718d22cdca6272aa028685956243 --- src/tools/mdbx_chk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index ad8f85c8..ccc0592b 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -229,7 +229,7 @@ static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, if (type) { uint64_t page_bytes = payload_bytes + header_bytes + unused_bytes; - uint64_t page_size = pgnumber * envstat.ms_psize; + size_t page_size = (size_t)pgnumber * envstat.ms_psize; int index = pagemap_lookup_dbi(dbi); if (index < 0) return ENOMEM; From e4a8a144b64de12b1aea7dbe9fc700c740228d03 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Jul 2017 13:07:08 +0300 Subject: [PATCH 280/303] test: fix minor defects (coverity). Change-Id: I87165ca771a717815a2c81c36fcf1e9add2536bb --- test/log.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/log.cc b/test/log.cc index eee0fffe..5ab99513 100644 --- a/test/log.cc +++ b/test/log.cc @@ -125,9 +125,11 @@ bool output(const logging::loglevel priority, const char *format, va_list ap) { switch (end) { default: putc('\n', last); + // fall through case '\n': fflush(last); last = nullptr; + // fall through case ' ': case '_': case ':': From 36ef355332e27f82dc44c4bdaa432e9a7af9d7cc Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 4 Jul 2017 12:07:05 +0300 Subject: [PATCH 281/303] mdbx: update README (add performance comparison). 
Change-Id: I0cea926d37b83dbe787b72031ecae28095d98160 --- README.md | 223 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 195 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index a9ef1ba6..c7cf6431 100644 --- a/README.md +++ b/README.md @@ -16,14 +16,15 @@ and [by Yandex](https://translate.yandex.ru/translate?url=https%3A%2F%2Fgithub.c _libmdbx_ - это встраиваемый key-value движок хранения со специфическим набором возможностей, которые при правильном применении позволяют создавать уникальные решения с чемпионской производительностью, идеально -сочетаясь с технологией [MRAM](https://en.wikipedia.org/wiki/Magnetoresistive_random-access_memory). +сочетаясь с технологией +[MRAM](https://en.wikipedia.org/wiki/Magnetoresistive_random-access_memory). -_libmdbx_ обновляет совместно используемый набор данных, никак не мешая -при этом параллельным операциям чтения, не применяя атомарных операций к -самим данным, и обеспечивая согласованность при аварийной остановке в -любой момент. Поэтому _libmdbx_ позволяя строить системы с линейным -масштабированием производительности чтения/поиска по ядрам CPU и -амортизационной стоимостью любых операций Olog(N). +_libmdbx_ умеет обновлять совместно используемый набор данных, никак не +мешая при этом параллельным операциям чтения, не применяя атомарных +операций к самим данным, и обеспечивая согласованность при аварийной +остановке в любой момент. Поэтому _libmdbx_ позволяя строить системы с +линейным масштабированием производительности чтения/поиска по ядрам CPU +и амортизационной стоимостью любых операций Olog(N). ### История @@ -37,6 +38,12 @@ _libmdbx_ является потомком "Lightning Memory-Mapped Database", [представлен на конференции Highload++ 2015](http://www.highload.ru/2015/abstracts/1831.html). 
+В начале 2017 года движок _libmdbx_ получил новый импульс развития, +благодаря использованию в [Fast Positive +Tables](https://github.com/leo-yuriev/libfpta), aka ["Позитивные +Таблицы"](https://github.com/leo-yuriev/libfpta) by [Positive +Technologies](https://www.ptsecurity.ru). + Характеристики и ключевые особенности ===================================== @@ -45,14 +52,12 @@ _libmdbx_ наследует все ключевые возможности и своего прародителя [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database), с устранением описанных далее проблем и архитектурных недочетов. -### Общее для оригинальной _LMDB_ и _libmdbx_ - 1. Данные хранятся в упорядоченном отображении (ordered map), ключи всегда отсортированы, поддерживается выборка диапазонов (range lookups). 2. Данные отображается в память каждого работающего с БД процесса. - Ключам и данным обеспечивается прямой доступ без необходимости их - копирования, так как они защищены транзакцией чтения и не изменяются. + К данным и ключам обеспечивается прямой доступ без необходимости их + копирования. 3. Транзакции согласно [ACID](https://ru.wikipedia.org/wiki/ACID), посредством @@ -71,10 +76,10 @@ _libmdbx_ наследует все ключевые возможности и значениями), без дублирования ключей, с сортировкой значений, в том числе целочисленных (для вторичных индексов). -6. Эффективная поддержка ключей фиксированной длины, в том числе целочисленных. +6. Эффективная поддержка коротких ключей фиксированной длины, в том числе целочисленных. 7. Амортизационная стоимость любой операции Olog(N), - [WAF](https://en.wikipedia.org/wiki/Write_amplification) и RAF также Olog(N). + [WAF](https://en.wikipedia.org/wiki/Write_amplification) (Write Amplification Factor) и RAF (Read Amplification Factor) также Olog(N). 8. Нет [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) и журнала транзакций, после сбоев не требуется восстановление. 
Не требуется компактификация @@ -82,10 +87,151 @@ _libmdbx_ наследует все ключевые возможности и "по горячему", на работающей БД без приостановки изменения данных. 9. Отсутствует какое-либо внутреннее управление памятью или кэшированием. Всё - необходимое штатно выполняет ядро ОС. + необходимое штатно выполняет ядро ОС! -### Недостатки и Компромиссы +Сравнение производительности +============================ +Все данные получены многократным прогоном тестов на ноутбуке Lenovo +Carbon-2, i7-4600U 2.1 ГГц, 8 Гб ОЗУ, с SSD-диском SAMSUNG +MZNTD512HAGL-000L1 (DXT23L0Q) 512 Гб. + +Исходный код бенчмарка [_IOArena_](https://github.com/pmwkaa/ioarena) и +сценарии тестирования [доступны на +github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). + +-------------------------------------------------------------------------------- + +### Интегральная производительность +![Comparison #1: Integral Performance](https://raw.githubusercontent.com/wiki/ReOpen/libmdbx/img/perf-slide-1.png) + +Показана соотнесенная сумма показателей производительности в трёх +бенчмарках: + + - Чтение/Поиск на машине с 4-мя процессорами; + + - Транзакции с [CRUD](https://ru.wikipedia.org/wiki/CRUD)-операциями + (вставка, чтение, обновление, удаление) в режиме **синхронной фиксации** + данных (fdatasync при завершении каждой транзакции или аналог); + + - Транзакции с [CRUD](https://ru.wikipedia.org/wiki/CRUD)-операциями + (вставка, чтение, обновление, удаление) в режиме **отложенной фиксации** + данных (отложенная запись посредством файловой системы или аналог); + +*Бенчмарк в режиме асинхронной записи не включен по двум причинам:* + + 1. Такое сравнение не совсем правомочно, его следует делать с движками, + ориентированными на хранение данных в памяти (Tarantool, Redis). + + 2. Превосходство libmdbx становится еще более подавляющим, что мешает + восприятию информации. 
+ +-------------------------------------------------------------------------------- + +### Масштабируемость чтения +![Comparison #2: Read Scalability](https://raw.githubusercontent.com/wiki/ReOpen/libmdbx/img/perf-slide-2.png) + +Для каждого движка показана суммарная производительность при +одновременном выполнении запросов чтения/поиска в 1-2-4-8 потоков на +машине с 4-мя процессорами. + +-------------------------------------------------------------------------------- + +### Синхронная фиксация +![Comparison #3: Sync-write mode](https://raw.githubusercontent.com/wiki/ReOpen/libmdbx/img/perf-slide-3.png) + + - Линейная шкала слева и темные прямоугольники соответствуют количеству + транзакций в секунду, усредненному за все время теста. + + - Логарифмическая шкала справа и желтые интервальные отрезки + соответствуют времени выполнения транзакций. При этом каждый отрезок + показывает минимальное и максимальное время, затраченное на выполнение + транзакций, а крестик показывает среднеквадратичное значение. + +Выполняется **10.000 транзакций в режиме синхронной фиксации данных** на +диске. При этом требуется гарантия, что при аварийном выключении питания +(или другом подобном сбое) все данные будут консистентны и полностью +соответствовать последней завершенной транзакции. В _libmdbx_ в этом +режиме при фиксации каждой транзакции выполняется системный вызов +[fdatasync](https://linux.die.net/man/2/fdatasync). + +В каждой транзакции выполняется CRUD-операция (две вставки, одно +чтение, одно обновление, одно удаление). Бенчмарк стартует на пустой +базе и в результате выполняемых действий при завершении в базе +насчитывается 10.000 небольших key-value записей.
+ +-------------------------------------------------------------------------------- + +### Отложенная фиксация +![Comparison #4: Lazy-write mode](https://raw.githubusercontent.com/wiki/ReOpen/libmdbx/img/perf-slide-4.png) + + - Линейная шкала слева и темные прямоугольники соответствуют количеству транзакций в секунду, усредненному за все время теста. + - Логарифмическая шкала справа и желтые интервальные отрезки соответствуют времени выполнения транзакций. При этом каждый отрезок показывает минимальное и максимальное время, затраченное на выполнение транзакций, а крестик показывает среднеквадратичное значение. + +Выполняется **100.000 транзакций в режиме отложенной фиксации данных** на диске. При этом требуется гарантия, что при аварийном выключении питания (или другом подобном сбое) все данные будут консистентны на момент завершения одной из транзакций, но допускается потеря изменений из некоторого количества последних транзакций, что для многих движков предполагает включение [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) (write-ahead logging) либо журнала транзакций, который в свою очередь опирается на гарантию упорядоченности данных в журналируемой файловой системе. _libmdbx_ при этом не ведет WAL, а передает весь контроль файловой системе и ядру ОС. + +В каждой транзакции выполняется CRUD-операция (две вставки, одно чтение, одно обновление, одно удаление). +Бенчмарк стартует на пустой базе и в результате выполняемых действий при завершении в базе насчитывается 100.000 небольших key-value записей. + +-------------------------------------------------------------------------------- + +### Асинхронная фиксация +![Comparison #5: Async-write mode](https://raw.githubusercontent.com/wiki/ReOpen/libmdbx/img/perf-slide-5.png) + + - Линейная шкала слева и темные прямоугольники соответствуют количеству + транзакций в секунду, усредненному за все время теста.
+ + - Логарифмическая шкала справа и желтые интервальные отрезки + соответствуют времени выполнения транзакций. При этом каждый отрезок + показывает минимальное и максимальное время, затраченное на выполнение + транзакций, а крестик показывает среднеквадратичное значение. + +Выполняется **1.000.000 транзакций в режиме асинхронной фиксации +данных** на диске. При этом требуется гарантия, что при аварийном +выключении питания (или другом подобном сбое) все данные будут +консистентны на момент завершения одной из транзакций, но допускается +потеря изменений из любого количества последних транзакций. Во всех +движках при этом включался режим, предполагающий минимальную нагрузку +записи на диск, и соответственно минимальную гарантию сохранности +данных. В _libmdbx_ при этом используется режим асинхронной записи +измененных страниц на диск силами ядра ОС посредством системного вызова +[msync(MS_ASYNC)](https://linux.die.net/man/2/msync). + +В каждой транзакции выполняется CRUD-операция (две вставки, одно +чтение, одно обновление, одно удаление). Бенчмарк стартует на пустой +базе и в результате выполняемых действий при завершении в базе +насчитывается 1.000.000 небольших key-value записей. + +-------------------------------------------------------------------------------- + +### Стоимость как потребление ресурсов +![Comparison #6: Cost comparison](https://raw.githubusercontent.com/wiki/ReOpen/libmdbx/img/perf-slide-6.png) + +Показана соотнесенная сумма использованных ресурсов в ходе бенчмарка в +режиме отложенной фиксации: + + - суммарное количество операций ввода-вывода (IOPS), как записи, так и + чтения. + + - суммарное затраченное время процессора, как в режиме пользователя, + так и в режиме ядра ОС. + + - максимальный объем места на диске, который требовался во время работы + теста.
+ +Движок _ForestDB_ был исключен при окончательном формировании диаграммы, +так как многократно превысил потребление каждого из ресурсов (потратил +процессорное время на генерацию IOPS для заполнения диска). Что не +позволяло наглядно сравнить показатели остальных движков на одной +диаграмме. + +Все данные собирались посредством системного вызова +[getrusage()](http://man7.org/linux/man-pages/man2/getrusage.2.html) и +сканированием директорий с данными. + +-------------------------------------------------------------------------------- + +## Недостатки и Компромиссы 1. Единовременно может выполняться не более одной транзакция изменения данных (один писатель). Зато все изменения всегда последовательны, не может быть @@ -93,20 +239,22 @@ _libmdbx_ наследует все ключевые возможности и 2. Отсутствие [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) обуславливает относительно большой - [WAF](https://en.wikipedia.org/wiki/Write_amplification). Поэтому фиксация - изменений на диске может быть дорогой и является главным ограничителем для - производительности по записи. В качестве компромисса предлагается несколько - режимов ленивой и/или периодической фиксации. В том числе режим `MAPASYNC`, - при котором изменения происходят только в памяти и асинхронно фиксируются на - диске ядром ОС. + [WAF](https://en.wikipedia.org/wiki/Write_amplification) (Write + Amplification Factor). Поэтому фиксация изменений на диске может быть + дорогой и является главным ограничителем для производительности по + записи. В качестве компромисса предлагается несколько режимов ленивой + и/или периодической фиксации. В том числе режим `MAPASYNC`, при котором + изменения происходят только в памяти и асинхронно фиксируются на диске + ядром ОС. 3.
[COW](https://ru.wikipedia.org/wiki/%D0%9A%D0%BE%D0%BF%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%BF%D1%80%D0%B8_%D0%B7%D0%B0%D0%BF%D0%B8%D1%81%D0%B8) для реализации [MVCC](https://ru.wikipedia.org/wiki/MVCC) выполняется на - уровне страниц в [B+ дереве](https://ru.wikipedia.org/wiki/B-%D0%B4%D0%B5%D1%80%D0%B5%D0%B2%D0%BE). - Поэтому изменение данных амортизационно требует копирования Olog(N) страниц, - что расходует [пропускную способность оперативной - памяти](https://en.wikipedia.org/wiki/Memory_bandwidth) и является основным - ограничителем производительности в режиме `MAPASYNC`. + уровне страниц в [B+ + дереве](https://ru.wikipedia.org/wiki/B-%D0%B4%D0%B5%D1%80%D0%B5%D0%B2%D0%BE). + Поэтому изменение данных амортизационно требует копирования Olog(N) + страниц, что расходует [пропускную способность оперативной + памяти](https://en.wikipedia.org/wiki/Memory_bandwidth) и является + основным ограничителем производительности в режиме `MAPASYNC`. 4. В _LMDB_ существует проблема долгих чтений (приостановленных читателей), которая приводит к деградации производительности и переполнению БД. @@ -121,6 +269,12 @@ _libmdbx_ наследует все ключевые возможности и #### Проблема долгих чтений +*Следует отметить*, что проблема "сборки мусора" так или иначе +существует во всех СУБД (Vacuum в PostgreSQL). Однако в случае _libmdbx_ +и LMDB она проявляется более остро, прежде всего из-за высокой +производительности, а также из-за намеренного упрощения внутренних +механизмов ради производительности. + Понимание проблемы требует некоторых пояснений, которые изложены ниже, но могут быть сложны для быстрого восприятия. Поэтому, тезисно: @@ -202,9 +356,10 @@ RECLAIM` в _libmdbx_. В _libmdbx_ эта проблема устранена, подробности ниже. +-------------------------------------------------------------------------------- -Доработки _libmdbx_ -=================== +Дополнительные "фичи" _libmdbx_ относительно LMDB +================================================= 1. Режим `LIFO RECLAIM`.
@@ -377,3 +532,15 @@ RECLAIM` в _libmdbx_. закрываются и не теряются при завершении таких транзакций посредством mdbx_txn_abort() или mdbx_txn_reset(). Что позволяет избавится от ряда сложно обнаруживаемых ошибок. + +26. Генерация последовательностей посредством `mdbx_dbi_sequence()`. + +27. Обновление данных с одновременным получением старых значений, +а также адресное изменение конкретного multi-значения посредством `mdbx_replace()`. + +28. Расширенное динамическое управление размером БД, включая выбор размера страницы +посредством `mdbx_env_set_geometry()`. + +29. Три мета-страницы вместо двух, что позволяет гарантированно консистентно +обновлять слабые контрольные точки фиксации без риска повредить крайнюю сильную +точку фиксации. From 84b4883f00a8e6f857b88f6ddb3c7fd3831232e6 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 5 Jul 2017 19:38:22 +0300 Subject: [PATCH 282/303] mdbx: fix returning MDBX-errors after the merge. Change-Id: Id49b7f341749709cdee476c88c1066e76b6937c1 --- src/mdbx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 5182c2bf..3a168702 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -11182,16 +11182,16 @@ int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, mdbx_attr_t attr) { if (unlikely(!key || !txn)) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDBX_VERSION_MISMATCH; if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))) - return EINVAL; + return MDBX_EINVAL; if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (txn->mt_flags & MDBX_TXN_RDONLY) ? EACCES : MDBX_BAD_TXN; + return (txn->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS : MDBX_BAD_TXN; MDBX_cursor mc; MDBX_xcursor mx; From a9faaaaf214a10492771be9a5f4031387bacf4d0 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 5 Jul 2017 21:40:18 +0300 Subject: [PATCH 283/303] mdbx: ci-appveyor - complete matrix MSVC 2013/2015/2017. Change-Id: Ie7984960e83bb8e4366531665c2f5195eca6cc41 --- appveyor.yml | 47 +++++++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index f8b9393c..bfaca865 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,33 +1,44 @@ -image: Visual Studio 2015 +version: 0.1.2.{build} environment: matrix: -# - Toolset: v141 - - Toolset: v140 - - Toolset: v120 - - Toolset: v110 - - Toolset: v100 - -platform: - - x86 - - x64 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + TOOLSET: v141 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + TOOLSET: v140 + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2013 + TOOLSET: v120 configuration: - - Release - - Debug +- Debug +- Release -build: - verbosity: minimal - project: mdbx.sln +platform: +- x86 +- x64 +#- ARM + +build_script: +- ps: > + msbuild "C:\projects\libmdbx\mdbx.sln" /verbosity:minimal + /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" + /property:PlatformToolset=$env:TOOLSET + /property:Configuration=$env:CONFIGURATION + /property:Platform=$env:PLATFORM test_script: - ps: | if (($env:PLATFORM -eq "x86") -and (Test-Path "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" -PathType Leaf)) { $test = "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" - } else { + } elseif (($env:PLATFORM -ne "ARM") -and ($env:PLATFORM -ne "ARM64")) { $test = "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\test.exe" + } else { + $test = "" + } + + if ($test -ne "") { + & "$test" --pathname=tmp.db --dont-cleanup-after basic | Tee-Object -file test.log | Select-Object -last 42 } - & "$test" --pathname=tmp.db --dont-cleanup-after basic | Tee-Object -file 
test.log | Select-Object -last 42 on_failure: - - ps: Push-AppveyorArtifact test.log +- ps: Push-AppveyorArtifact test.log From e3fcc4754e27bb8323f8487c58598e5edf1b7c76 Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Wed, 5 Jul 2017 23:08:45 +0300 Subject: [PATCH 284/303] mdbx-test: fix warnings from MSVC 2013. --- test/base.h | 9 +++++++++ test/keygen.h | 2 +- test/test.h | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/test/base.h b/test/base.h index e87f0240..0ed927da 100644 --- a/test/base.h +++ b/test/base.h @@ -20,6 +20,7 @@ #if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) #ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS #pragma warning(push, 1) #pragma warning(disable : 4548) /* expression before comma has no effect; \ expected expression with side - effect */ @@ -91,4 +92,12 @@ #pragma warning(disable : 4201) /* nonstandard extension used : \ nameless struct / union */ #pragma warning(disable : 4127) /* conditional expression is constant */ +#if _MSC_VER < 1900 +#pragma warning(disable : 4510) /* default constructor could not be generated */ +#pragma warning(disable : 4512) /* assignment operator could not be generated */ +#pragma warning(disable : 4610) /* user-defined constructor required */ +#define snprintf _snprintf +#pragma warning(disable : 4996) /* 'vsnprintf': This function or variable \ + may be unsafe */ #endif +#endif /* _MSC_VER */ diff --git a/test/keygen.h b/test/keygen.h index f109bf40..911ea6d8 100644 --- a/test/keygen.h +++ b/test/keygen.h @@ -71,7 +71,7 @@ namespace keygen { typedef uint64_t serial_t; -enum { +enum : serial_t { serial_minwith = 8, serial_maxwith = sizeof(serial_t) * 8, serial_allones = ~(serial_t)0 diff --git a/test/test.h b/test/test.h index 0a9b8a38..bae5eb0d 100644 --- a/test/test.h +++ b/test/test.h @@ -63,7 +63,7 @@ struct txn_deleter : public std::unary_function { void operator()(MDBX_txn *txn) const { int rc = mdbx_txn_abort(txn); if (rc) - log_trouble(__func__, 
"mdbx_txn_abort()", rc); + log_trouble(mdbx_func_, "mdbx_txn_abort()", rc); } }; From 0047ce4bd6f7b13eddbe46e64852451ec04ae599 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 7 Jul 2017 00:02:56 +0300 Subject: [PATCH 285/303] mdbx: add default AddressSanitizer options. Change-Id: I99b7aee5010f1d43008b2a61efe3fb9bb023ab31 --- src/mdbx.c | 22 ++++++++++++++++++++++ test/base.h | 6 ++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 3a168702..00ba1e0d 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -11225,3 +11225,25 @@ int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, txn->mt_cursors[dbi] = mc.mc_next; return rc; } + +//---------------------------------------------------------------------------- + +#ifdef __SANITIZE_ADDRESS__ +LIBMDBX_API __attribute__((weak)) const char *__asan_default_options() { + return "symbolize=1:allow_addr2line=1:" +#ifdef _DEBUG + "debug=1:" +#endif /* _DEBUG */ + "report_globals=1:" + "replace_str=1:replace_intrin=1:" + "malloc_context_size=9:" + "detect_leaks=1:" + "check_printf=1:" + "detect_deadlocks=1:" + "check_initialization_order=1:" + "detect_stack_use_after_return=1:" + "intercept_tls_get_addr=1:" + "decorate_proc_maps=1:" + "abort_on_error=1"; +} +#endif /* __SANITIZE_ADDRESS__ */ diff --git a/test/base.h b/test/base.h index 0ed927da..8557787d 100644 --- a/test/base.h +++ b/test/base.h @@ -93,8 +93,10 @@ nameless struct / union */ #pragma warning(disable : 4127) /* conditional expression is constant */ #if _MSC_VER < 1900 -#pragma warning(disable : 4510) /* default constructor could not be generated */ -#pragma warning(disable : 4512) /* assignment operator could not be generated */ +#pragma warning(disable : 4510) /* default constructor could \ + not be generated */ +#pragma warning(disable : 4512) /* assignment operator could \ + not be generated */ #pragma warning(disable : 4610) /* user-defined constructor required */ #define snprintf _snprintf 
#pragma warning(disable : 4996) /* 'vsnprintf': This function or variable \ From 6f00854034f9161e6acdfe13fa264224735c0e64 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 7 Jul 2017 00:45:52 +0300 Subject: [PATCH 286/303] mdbx: take LTO_ENABLED in account for AddressSanitizer default options. Change-Id: I691eae23ca2cb60c6e2a9260ab41e3c80c8aaeb8 --- src/mdbx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mdbx.c b/src/mdbx.c index 00ba1e0d..e07655a3 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -11240,7 +11240,9 @@ LIBMDBX_API __attribute__((weak)) const char *__asan_default_options() { "detect_leaks=1:" "check_printf=1:" "detect_deadlocks=1:" +#ifndef LTO_ENABLED "check_initialization_order=1:" +#endif "detect_stack_use_after_return=1:" "intercept_tls_get_addr=1:" "decorate_proc_maps=1:" From 9121aaf7119b9b8a03ba8e24eb3acae3bdca2e34 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 10 Jul 2017 17:45:50 +0300 Subject: [PATCH 287/303] mdbx: update TODO. --- TODO.md | 132 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 71 insertions(+), 61 deletions(-) diff --git a/TODO.md b/TODO.md index 1508c944..b713830b 100644 --- a/TODO.md +++ b/TODO.md @@ -1,79 +1,89 @@ Допеределки -- [x] разделение errno и GetLastError() -- [x] CI посредством AppVeyor +=========== +- [ ] Перевод mdbx-tools на С++ и сборка для Windows. +- [ ] Переход на CMake, замена заглушек mdbx_version и mdbx_build. +- [ ] Актуализация README.md +- [ ] Переход на C++11, добавление #pragma detect_mismatch(). +- [ ] Убрать MDB_DEBUG (всегда: логирование важных ситуаций и ошибок, опционально: включение ассертов и трассировка). +- [ ] Заменить mdbx_debug на mdbx_trace, и почистить... +- [ ] Заменить максимум assert() на mdbx_assert(env, ...). + +Качество и CI +============= +- [ ] Добавить в CI linux сборки для 32-битных таргетов. + +Доработки API +============= +- [ ] Поправить/Добавить описание нового API. +- [ ] Добавить возможность "подбора" режима для mdbx_env_open().
+- [ ] Переименовать в API: env->db, db->tbl. + +Тесты +===== +- [ ] Тестирование поддержки lockless-режима. +- [ ] Додумать имя и размещение тестовой БД по-умолчанию. +- [ ] Реализовать cleanup в тесте. +- [ ] usage для теста. +- [ ] Логирование в файл, плюс более полный progress bar. +- [ ] Опция игнорирования (пропуска части теста) при переполнении БД. +- [ ] Базовый бенчмарк. + +Развитие +======== +- [ ] Отслеживание времени жизни DBI-хендлов. +- [ ] Отрефакторить mdbx_freelist_save(). +- [ ] Хранить "свободный хвост" не связанный с freeDB в META. +- [ ] Возврат выделенных страниц в unallocated tail-pool. +- [ ] Валидатор страниц БД по номеру транзакции: + ~0 при переработке и номер транзакции при выделении, + проверять что этот номер больше головы реклайминга и не-больше текущей транзакции. +- [ ] Размещение overflow-pages в отдельном mmap/файле с собственной геометрией. +- [ ] Зафиксировать формат БД. +- [ ] Валидатор страниц по CRC32, плюс контроль номера транзакции по модулю 2^32. +- [ ] Валидатор страниц по t1ha c контролем снимков/версий БД на основе Merkle Tree. +- [ ] Возможность хранения ключей внутри data (libfptu). +- [ ] Асинхронная фиксация (https://github.com/ReOpen/libmdbx/issues/5). +- [ ] (Пере)Выделять память под IDL-списки с учетом реального кол-ва страниц, т.е. max(MDB_IDL_UM_MAX/MDB_IDL_UM_MAX, npages). + +----------------------------------------------------------------------- + +Сделано +======= +- [x] разделение errno и GetLastError(). +- [x] CI посредством AppVeyor. - [x] тест конкурентного доступа. - [x] тест основного функционала (заменить текущий треш). -- [x] uint32/uint64 в структурах -- [x] Завершить переименование -- [x] Макросы версионности, сделать как в fpta (cmake?) -- [x] Попробовать убрать yield (или что там с местом?)
-- [x] trinity для copy/compaction -- [x] trinity для mdbx_chk и mdbx_stat -- [x] проверки с mdbx_meta_eq -- [x] Не проверять режим при открытии в readonly -- [x] Поправить выбор tail в mdbx_chk -- [x] Там-же проверять позицию реклайминга +- [x] uint32/uint64 в структурах. +- [x] Завершить переименование. +- [x] Макросы версионности, сделать как в fpta (cmake?). +- [x] Попробовать убрать yield (или что там с местом?). +- [x] trinity для copy/compaction. +- [x] trinity для mdbx_chk и mdbx_stat. +- [x] проверки с mdbx_meta_eq. +- [x] Не проверять режим при открытии в readonly. +- [x] Поправить выбор tail в mdbx_chk. +- [x] Там-же проверять позицию реклайминга. - [x] поправить проблему открытия после READ-ONLY. - [x] static-assertы на размер/выравнивание lck, meta и т.п. -- [x] Зачистить size_t -- [x] Добавить локи вокруг dbi -- [x] Привести в порядок volatile -- [x] контроль meta.mapsize +- [x] Зачистить size_t. +- [x] Добавить локи вокруг dbi. +- [x] Привести в порядок volatile. +- [x] контроль meta.mapsize. - [x] переработка формата: заголовки страниц, meta, clk... - [x] зачистка Doxygen и бесполезных коментариев. - [x] Добавить поле типа контрольной суммы. - [x] Добавить поле/флаг размера pgno_t. - [x] Поменять сигнатуры. -- [x] Добавить мета-страницы в coredump, проверить lck -- [x] Сделать список для txnid_t, кода sizeof(txnid_t) > sizeof(pgno_t) и вернуть размер pgno_t +- [x] Добавить мета-страницы в coredump, проверить lck. +- [x] Сделать список для txnid_t, кода sizeof(txnid_t) > sizeof(pgno_t) и вернуть размер pgno_t. - [x] Избавиться от умножения на размер страницы (заменить на сдвиг). - [x] Устранение всех предупреждений (в том числе под Windows). - [x] Добавить 'mti_reader_finished_flag'. - [x] Погасить все level4-warnings от MSVC, включить /WX. -- [ ] Отрефакторить mdbx_freelist_save(). -- [ ] Хранить "свободный хвост" не связанный с freeDB в META. -- [ ] Перевод mdbx-tools на С++ и сборка для Windows. -- [ ] Заменить заглушки mdbx_version и mdbx_build. 
-- [ ] Актуализация README.md -- [ ] Переход на C++11, добавление #pramga detect_mismatch(). - -CI -- [ ] Прикрутить проверку coverity -- [ ] Добавить в CI linux сборки для 32-битных таргетов - -Доработки API +- [x] Проверка посредством Coverity с гашением всех дефектов. +- [x] Полная матрица Windows-сборок (2013/2015/2017). - [x] Дать возможность задавать размер страницы при создании БД. -- [x] Изменение mapsize через API с блокировкой и увеличением txn +- [x] Изменение mapsize через API с блокировкой и увеличением txn. - [x] Контроль размера страницы полного размера и кол-ва страниц при создании и обновлении. - [x] Инкрементальный mmap. - [x] Инкрементальное приращение размера (колбэк стратегии?). -- [ ] Поправить/Добавить описание нового API. -- [ ] Возврат выделенных страниц в unallocated tail-pool. -- [ ] Добавить возможность "подбора" режима для mdbx_env_open() -- [ ] Переименовать в API: env->db, db->tbl - -Тест -- [ ] Тестирование поддержки lockless-режима. -- [ ] Додумать имя и размещение тестовой БД по-умолчанию. -- [ ] Реализовать cleanup в тесте -- [ ] usage для теста -- [ ] Логирование в файл, плюс более полный progress bar -- [ ] Опция игнорирования (пропуска части теста) при переполнении БД -- [ ] Базовый бенчмарк - -Отладка -- [ ] Убрать MDB_DEBUG (всегда: логирование важный ситуаций и ошибок, опционально: включение ассертов и трассировка) -- [ ] Заменить mdbx_debug на mdbx_trace, и почистить... -- [ ] Заметить максимум assert() на mdbx_assert(env, ...) - -Развитие -- [ ] Валидатор страниц БД по номеру транзакции: - ~0 при переработке и номер транзакции при выделении, - проверять что этот номер больше головы реклайминга и не-больше текущей транзакции. -- [ ] Размещение overflow-pages в отдельном mmap/файле с собственной геометрией. -- [ ] Разместить free_backlog в конце meta -- [ ] Валидатор страниц по CRC32, плюс контроль номер транзакии под модулю 2^32. -- [ ] Валидатор страниц по t1ha c контролем снимков/версий БД на основе Merkle Tree. 
-- [ ] Возможность хранения ключей внутри data (libfptu) -- [ ] Асинхронная фиксация (https://github.com/ReOpen/libmdbx/issues/5) -- [ ] (Пере)Выделять память под IDL-списки с учетом реального кол-ва страниц, т.е. max(MDB_IDL_UM_MAX/MDB_IDL_UM_MAX, npages) From 01676944571953088e3fe233c6aef38c18957485 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 10 Jul 2017 20:45:24 +0300 Subject: [PATCH 288/303] mdbx: building mdbx-tools for Windows. Change-Id: I9019c4382b7108ec7c442d2b0d4be044a3cb136a --- dll.vcxproj | 10 +++ mdbx.h | 2 + mdbx.sln | 75 +++++++++++++++-- src/tools/mdbx_chk.c | 111 ++++++++++++++---------- src/tools/mdbx_chk.vcxproj | 162 ++++++++++++++++++++++++++++++++++++ src/tools/mdbx_copy.c | 57 ++++++++++--- src/tools/mdbx_copy.vcxproj | 162 ++++++++++++++++++++++++++++++++++++ src/tools/mdbx_dump.c | 71 ++++++++++------ src/tools/mdbx_dump.vcxproj | 162 ++++++++++++++++++++++++++++++++++++ src/tools/mdbx_load.c | 87 ++++++++++++++----- src/tools/mdbx_load.vcxproj | 162 ++++++++++++++++++++++++++++++++++++ src/tools/mdbx_stat.c | 57 ++++++++++--- src/tools/mdbx_stat.vcxproj | 162 ++++++++++++++++++++++++++++++++++++ src/tools/wingetopt.c | 75 +++++++++++++++++ src/tools/wingetopt.h | 26 ++++++ test/test.vcxproj | 16 ++-- 16 files changed, 1267 insertions(+), 130 deletions(-) create mode 100644 src/tools/mdbx_chk.vcxproj create mode 100644 src/tools/mdbx_copy.vcxproj create mode 100644 src/tools/mdbx_dump.vcxproj create mode 100644 src/tools/mdbx_load.vcxproj create mode 100644 src/tools/mdbx_stat.vcxproj create mode 100644 src/tools/wingetopt.c create mode 100644 src/tools/wingetopt.h diff --git a/dll.vcxproj b/dll.vcxproj index 4f9b4f50..4443f553 100644 --- a/dll.vcxproj +++ b/dll.vcxproj @@ -28,23 +28,27 @@ DynamicLibrary true v140 + MultiByte DynamicLibrary false v140 true + MultiByte DynamicLibrary true v140 + MultiByte DynamicLibrary false v140 true + MultiByte @@ -68,14 +72,20 @@ true $(Platform)\$(Configuration)\ 
$(SolutionDir)$(Platform)\$(Configuration)\ + mdbx false $(Platform)\$(Configuration)\ $(SolutionDir)$(Platform)\$(Configuration)\ + mdbx false + mdbx + + + mdbx diff --git a/mdbx.h b/mdbx.h index 160bef64..3f010c4a 100644 --- a/mdbx.h +++ b/mdbx.h @@ -83,6 +83,7 @@ typedef SSIZE_T ssize_t; #define MDBX_ENOSYS ERROR_NOT_SUPPORTED #define MDBX_EIO ERROR_WRITE_FAULT #define MDBX_EPERM ERROR_INVALID_FUNCTION +#define MDBX_EINTR ERROR_CANCELLED #else @@ -102,6 +103,7 @@ typedef pthread_t mdbx_tid_t; #define MDBX_ENOSYS ENOSYS #define MDBX_EIO EIO #define MDBX_EPERM EPERM +#define MDBX_EINTR EINTR #endif #ifdef _MSC_VER diff --git a/mdbx.sln b/mdbx.sln index aa2025d8..a32249d7 100644 --- a/mdbx.sln +++ b/mdbx.sln @@ -7,6 +7,18 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "test\test.vcxproj", EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "dll", "dll.vcxproj", "{6D19209B-ECE7-4B9C-941C-0AA2B484F199}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tools", "tools", "{0A147F9F-22D5-44E6-B389-218CFFB0C524}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mdbx_load", "src\tools\mdbx_load.vcxproj", "{15030120-5F7F-48F9-ABE5-DFC814F2A4BB}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mdbx_dump", "src\tools\mdbx_dump.vcxproj", "{15030120-5F7F-48F9-ABE5-DFC814F2A4BC}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mdbx_copy", "src\tools\mdbx_copy.vcxproj", "{15030120-5F7F-48F9-ABE5-DFC814F2A4BD}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mdbx_chk", "src\tools\mdbx_chk.vcxproj", "{15030120-5F7F-48F9-ABE5-DFC814F2A4BE}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mdbx_stat", "src\tools\mdbx_stat.vcxproj", "{15030120-5F7F-48F9-ABE5-DFC814F2A4BF}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -15,14 +27,6 @@ Global Release|x86 = Release|x86 EndGlobalSection 
GlobalSection(ProjectConfigurationPlatforms) = postSolution - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x64.ActiveCfg = Debug|x64 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x64.Build.0 = Debug|x64 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x86.ActiveCfg = Debug|Win32 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x86.Build.0 = Debug|Win32 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x64.ActiveCfg = Release|x64 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x64.Build.0 = Release|x64 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x86.ActiveCfg = Release|Win32 - {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x86.Build.0 = Release|Win32 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x64.ActiveCfg = Debug|x64 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x64.Build.0 = Debug|x64 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Debug|x86.ActiveCfg = Debug|Win32 @@ -31,8 +35,63 @@ Global {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x64.Build.0 = Release|x64 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x86.ActiveCfg = Release|Win32 {30E29CE6-E6FC-4D32-AA07-46A55FAF3A31}.Release|x86.Build.0 = Release|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x64.ActiveCfg = Debug|x64 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x64.Build.0 = Debug|x64 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x86.ActiveCfg = Debug|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Debug|x86.Build.0 = Debug|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x64.ActiveCfg = Release|x64 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x64.Build.0 = Release|x64 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x86.ActiveCfg = Release|Win32 + {6D19209B-ECE7-4B9C-941C-0AA2B484F199}.Release|x86.Build.0 = Release|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BB}.Debug|x64.ActiveCfg = Debug|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BB}.Debug|x64.Build.0 = Debug|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BB}.Debug|x86.ActiveCfg = Debug|Win32 + 
{15030120-5F7F-48F9-ABE5-DFC814F2A4BB}.Debug|x86.Build.0 = Debug|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BB}.Release|x64.ActiveCfg = Release|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BB}.Release|x64.Build.0 = Release|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BB}.Release|x86.ActiveCfg = Release|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BB}.Release|x86.Build.0 = Release|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BC}.Debug|x64.ActiveCfg = Debug|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BC}.Debug|x64.Build.0 = Debug|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BC}.Debug|x86.ActiveCfg = Debug|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BC}.Debug|x86.Build.0 = Debug|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BC}.Release|x64.ActiveCfg = Release|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BC}.Release|x64.Build.0 = Release|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BC}.Release|x86.ActiveCfg = Release|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BC}.Release|x86.Build.0 = Release|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BD}.Debug|x64.ActiveCfg = Debug|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BD}.Debug|x64.Build.0 = Debug|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BD}.Debug|x86.ActiveCfg = Debug|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BD}.Debug|x86.Build.0 = Debug|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BD}.Release|x64.ActiveCfg = Release|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BD}.Release|x64.Build.0 = Release|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BD}.Release|x86.ActiveCfg = Release|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BD}.Release|x86.Build.0 = Release|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BE}.Debug|x64.ActiveCfg = Debug|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BE}.Debug|x64.Build.0 = Debug|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BE}.Debug|x86.ActiveCfg = Debug|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BE}.Debug|x86.Build.0 = Debug|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BE}.Release|x64.ActiveCfg = Release|x64 + 
{15030120-5F7F-48F9-ABE5-DFC814F2A4BE}.Release|x64.Build.0 = Release|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BE}.Release|x86.ActiveCfg = Release|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BE}.Release|x86.Build.0 = Release|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BF}.Debug|x64.ActiveCfg = Debug|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BF}.Debug|x64.Build.0 = Debug|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BF}.Debug|x86.ActiveCfg = Debug|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BF}.Debug|x86.Build.0 = Debug|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BF}.Release|x64.ActiveCfg = Release|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BF}.Release|x64.Build.0 = Release|x64 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BF}.Release|x86.ActiveCfg = Release|Win32 + {15030120-5F7F-48F9-ABE5-DFC814F2A4BF}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {15030120-5F7F-48F9-ABE5-DFC814F2A4BB} = {0A147F9F-22D5-44E6-B389-218CFFB0C524} + {15030120-5F7F-48F9-ABE5-DFC814F2A4BC} = {0A147F9F-22D5-44E6-B389-218CFFB0C524} + {15030120-5F7F-48F9-ABE5-DFC814F2A4BD} = {0A147F9F-22D5-44E6-B389-218CFFB0C524} + {15030120-5F7F-48F9-ABE5-DFC814F2A4BE} = {0A147F9F-22D5-44E6-B389-218CFFB0C524} + {15030120-5F7F-48F9-ABE5-DFC814F2A4BF} = {0A147F9F-22D5-44E6-B389-218CFFB0C524} + EndGlobalSection EndGlobal diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index ccc0592b..92eff1aa 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -13,19 +13,13 @@ * top-level directory of the distribution or, alternatively, at * . */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#ifdef _MSC_VER +#if _MSC_VER > 1800 +#pragma warning(disable : 4464) /* relative include path contains '..' */ +#endif +#pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ +#endif /* _MSC_VER */ -#include "../../mdbx.h" #include "../bits.h" typedef struct flagbit { @@ -41,13 +35,26 @@ flagbit dbflags[] = {{MDBX_DUPSORT, "dupsort"}, {MDBX_INTEGERDUP, "integerdup"}, {0, NULL}}; -static volatile sig_atomic_t gotsignal; +#if defined(_WIN32) || defined(_WIN64) +#include "wingetopt.h" +static volatile BOOL user_break; +static BOOL WINAPI ConsoleBreakHandlerRoutine(DWORD dwCtrlType) { + (void)dwCtrlType; + user_break = true; + return true; +} + +#else /* WINDOWS */ + +static volatile sig_atomic_t user_break; static void signal_handler(int sig) { (void)sig; - gotsignal = 1; + user_break = 1; } +#endif /* !WINDOWS */ + #define EXIT_INTERRUPTED (EXIT_FAILURE + 4) #define EXIT_FAILURE_SYS (EXIT_FAILURE + 3) #define EXIT_FAILURE_MDB (EXIT_FAILURE + 2) @@ -65,10 +72,6 @@ struct { uint64_t pgcount; } walk; -static __attribute__((constructor)) void init_walk(void) { - walk.dbi_names[0] = "@gc"; -} - uint64_t total_unused_bytes; int exclusive = 2; int envflags = MDBX_RDONLY; @@ -91,7 +94,11 @@ struct problem { struct problem *problems_list; uint64_t total_problems; -static void __attribute__((format(printf, 1, 2))) print(const char *msg, ...) { +static void +#ifdef __GNU__ + __attribute__((format(printf, 1, 2))) +#endif + print(const char *msg, ...) { if (!quiet) { va_list args; @@ -102,7 +109,11 @@ static void __attribute__((format(printf, 1, 2))) print(const char *msg, ...) { } } -static void __attribute__((format(printf, 1, 2))) error(const char *msg, ...) { +static void +#ifdef __GNU__ + __attribute__((format(printf, 1, 2))) +#endif + error(const char *msg, ...) 
{ total_problems++; if (!quiet) { @@ -232,7 +243,7 @@ static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, size_t page_size = (size_t)pgnumber * envstat.ms_psize; int index = pagemap_lookup_dbi(dbi); if (index < 0) - return ENOMEM; + return MDBX_ENOMEM; if (verbose > 2 && (!only_subdb || strcmp(only_subdb, dbi) == 0)) { if (pgnumber == 1) @@ -290,7 +301,7 @@ static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, problem_add("page", pgno, "already used", "in %s", walk.dbi_names[walk.pagemap[pgno]]); else { - walk.pagemap[pgno] = index; + walk.pagemap[pgno] = (short)index; walk.dbi_pages[index] += 1; } ++pgno; @@ -298,12 +309,12 @@ static int pgvisitor(uint64_t pgno, unsigned pgnumber, void *ctx, } } - return gotsignal ? EINTR : MDBX_SUCCESS; + return user_break ? MDBX_EINTR : MDBX_SUCCESS; } typedef int(visitor)(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data); -static int process_db(MDBX_dbi dbi, char *name, visitor *handler, int silent); +static int process_db(MDBX_dbi dbi, char *name, visitor *handler, bool silent); static int handle_userdb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { @@ -398,7 +409,7 @@ static int handle_maindb(const uint64_t record_number, const MDBX_val *key, name[key->iov_len] = '\0'; userdb_count++; - rc = process_db(-1, name, handle_userdb, 0); + rc = process_db(~0u, name, handle_userdb, false); free(name); if (rc != MDBX_INCOMPATIBLE) return rc; @@ -406,7 +417,7 @@ static int handle_maindb(const uint64_t record_number, const MDBX_val *key, return handle_userdb(record_number, key, data); } -static int process_db(MDBX_dbi dbi, char *name, visitor *handler, int silent) { +static int process_db(MDBX_dbi dbi, char *name, visitor *handler, bool silent) { MDBX_cursor *mc; MDBX_stat ms; MDBX_val key, data; @@ -419,7 +430,7 @@ static int process_db(MDBX_dbi dbi, char *name, visitor *handler, int silent) { uint64_t record_count = 0, dups = 0; uint64_t key_bytes = 0, 
data_bytes = 0; - if (0 > (int)dbi) { + if (dbi == ~0u) { rc = mdbx_dbi_open(txn, name, 0, &dbi); if (rc) { if (!name || @@ -489,10 +500,10 @@ static int process_db(MDBX_dbi dbi, char *name, visitor *handler, int silent) { prev_data.iov_len = 0; rc = mdbx_cursor_get(mc, &key, &data, MDBX_FIRST); while (rc == MDBX_SUCCESS) { - if (gotsignal) { + if (user_break) { print(" - interrupted by signal\n"); fflush(NULL); - rc = EINTR; + rc = MDBX_EINTR; goto bailout; } @@ -744,16 +755,22 @@ int main(int argc, char *argv[]) { char *envname; int problems_maindb = 0, problems_freedb = 0, problems_meta = 0; int dont_traversal = 0; - struct timespec timestamp_start, timestamp_finish; + double elapsed; - - atexit(pagemap_cleanup); - +#if defined(_WIN32) || defined(_WIN64) + uint64_t timestamp_start, timestamp_finish; + timestamp_start = GetTickCount64(); +#else + struct timespec timestamp_start, timestamp_finish; if (clock_gettime(CLOCK_MONOTONIC, &timestamp_start)) { rc = errno; error("clock_gettime failed, error %d %s\n", rc, mdbx_strerror(rc)); return EXIT_FAILURE_SYS; } +#endif + + walk.dbi_names[0] = "@gc"; + atexit(pagemap_cleanup); if (argc < 2) { usage(prog); @@ -797,6 +814,9 @@ int main(int argc, char *argv[]) { if (optind != argc - 1) usage(prog); +#if defined(_WIN32) || defined(_WIN64) + SetConsoleCtrlHandler(ConsoleBreakHandlerRoutine, true); +#else #ifdef SIGPIPE signal(SIGPIPE, signal_handler); #endif @@ -805,6 +825,7 @@ int main(int argc, char *argv[]) { #endif signal(SIGINT, signal_handler); signal(SIGTERM, signal_handler); +#endif /* !WINDOWS */ envname = argv[optind]; print("Running mdbx_chk for '%s' in %s mode...\n", envname, @@ -933,14 +954,14 @@ int main(int argc, char *argv[]) { if (!dont_traversal) { struct problem *saved_list; - size_t traversal_problems; - size_t empty_pages, lost_bytes; + uint64_t traversal_problems; + uint64_t empty_pages, lost_bytes; print("Traversal b-tree by txn#%" PRIaTXN "...\n", txn->mt_txnid); fflush(NULL); - walk.pagemap = 
calloc(lastpgno, sizeof(*walk.pagemap)); + walk.pagemap = calloc((size_t)lastpgno, sizeof(*walk.pagemap)); if (!walk.pagemap) { - rc = errno ? errno : ENOMEM; + rc = errno ? errno : MDBX_ENOMEM; error("calloc failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } @@ -950,7 +971,7 @@ int main(int argc, char *argv[]) { traversal_problems = problems_pop(saved_list); if (rc) { - if (rc == EINTR && gotsignal) { + if (rc == MDBX_EINTR && user_break) { print(" - interrupted by signal\n"); fflush(NULL); } else { @@ -1017,8 +1038,8 @@ int main(int argc, char *argv[]) { if (!verbose) print("Iterating DBIs...\n"); - problems_maindb = process_db(-1, /* MAIN_DBI */ NULL, NULL, 0); - problems_freedb = process_db(0 /* FREE_DBI */, "free", handle_freedb, 0); + problems_maindb = process_db(~0u, /* MAIN_DBI */ NULL, NULL, false); + problems_freedb = process_db(FREE_DBI, "free", handle_freedb, false); if (verbose) { uint64_t value = envinfo.me_mapsize / envstat.ms_psize; @@ -1064,7 +1085,7 @@ int main(int argc, char *argv[]) { "monopolistic or write-lock mode only)\n"); } - if (!process_db(-1, NULL, handle_maindb, 1)) { + if (!process_db(MAIN_DBI, NULL, handle_maindb, true)) { if (!userdb_count && verbose) print(" - does not contain multiple databases\n"); } @@ -1080,18 +1101,22 @@ bailout: fflush(NULL); if (rc) { if (rc < 0) - return gotsignal ? EXIT_INTERRUPTED : EXIT_FAILURE_SYS; + return (user_break) ? 
EXIT_INTERRUPTED : EXIT_FAILURE_SYS; return EXIT_FAILURE_MDB; } +#if defined(_WIN32) || defined(_WIN64) + timestamp_finish = GetTickCount64(); + elapsed = (timestamp_finish - timestamp_start) * 1e-3; +#else if (clock_gettime(CLOCK_MONOTONIC, &timestamp_finish)) { rc = errno; error("clock_gettime failed, error %d %s\n", rc, mdbx_strerror(rc)); return EXIT_FAILURE_SYS; } - elapsed = timestamp_finish.tv_sec - timestamp_start.tv_sec + (timestamp_finish.tv_nsec - timestamp_start.tv_nsec) * 1e-9; +#endif /* !WINDOWS */ total_problems += problems_meta; if (total_problems || problems_maindb || problems_freedb) { diff --git a/src/tools/mdbx_chk.vcxproj b/src/tools/mdbx_chk.vcxproj new file mode 100644 index 00000000..bece9d2b --- /dev/null +++ b/src/tools/mdbx_chk.vcxproj @@ -0,0 +1,162 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + {15030120-5F7F-48F9-ABE5-DFC814F2A4BE} + Win32Proj + mdbx_chk + 8.1 + + + + Application + true + v140 + MultiByte + + + Application + false + v140 + true + MultiByte + + + Application + true + v140 + MultiByte + + + Application + false + v140 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + true + $(Platform)\$(Configuration)\$(ProjectName)\ + + + true + $(Platform)\$(Configuration)\$(ProjectName)\ + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + + + + + + Level4 + Disabled + _DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + true + + + Console + + + + + + + Level4 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + true + + + Console + + + + + Level4 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + + + Console + true + true + + + + + Level4 + + + MaxSpeed + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + + + Console + true + true + + + + + {6d19209b-ece7-4b9c-941c-0aa2b484f199} + + + + + + + + + + + + 
+ + diff --git a/src/tools/mdbx_copy.c b/src/tools/mdbx_copy.c index 9295c7ff..2e384cac 100644 --- a/src/tools/mdbx_copy.c +++ b/src/tools/mdbx_copy.c @@ -13,13 +13,34 @@ * top-level directory of the distribution or, alternatively, at * . */ -#include "../../mdbx.h" -#include -#include -#include -#include +#ifdef _MSC_VER +#if _MSC_VER > 1800 +#pragma warning(disable : 4464) /* relative include path contains '..' */ +#endif +#pragma warning(disable : 4996) /* The POSIX name is deprecated... */ +#endif /* _MSC_VER */ -static void sighandle(int sig) { (void)sig; } +#include "../bits.h" + +#if defined(_WIN32) || defined(_WIN64) +#include "wingetopt.h" + +static volatile BOOL user_break; +static BOOL WINAPI ConsoleBreakHandlerRoutine(DWORD dwCtrlType) { + (void)dwCtrlType; + user_break = true; + return true; +} + +#else /* WINDOWS */ + +static volatile sig_atomic_t user_break; +static void signal_handler(int sig) { + (void)sig; + user_break = 1; +} + +#endif /* !WINDOWS */ int main(int argc, char *argv[]) { int rc; @@ -46,14 +67,18 @@ int main(int argc, char *argv[]) { exit(EXIT_FAILURE); } +#if defined(_WIN32) || defined(_WIN64) + SetConsoleCtrlHandler(ConsoleBreakHandlerRoutine, true); +#else #ifdef SIGPIPE - signal(SIGPIPE, sighandle); + signal(SIGPIPE, signal_handler); #endif #ifdef SIGHUP - signal(SIGHUP, sighandle); + signal(SIGHUP, signal_handler); #endif - signal(SIGINT, sighandle); - signal(SIGTERM, sighandle); + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); +#endif /* !WINDOWS */ act = "opening environment"; rc = mdbx_env_create(&env); @@ -62,9 +87,15 @@ int main(int argc, char *argv[]) { } if (rc == MDBX_SUCCESS) { act = "copying"; - if (argc == 2) - rc = mdbx_env_copy2fd(env, STDOUT_FILENO, cpflags); - else + if (argc == 2) { + mdbx_filehandle_t fd; +#if defined(_WIN32) || defined(_WIN64) + fd = GetStdHandle(STD_OUTPUT_HANDLE); +#else + fd = fileno(stdout); +#endif + rc = mdbx_env_copy2fd(env, fd, cpflags); + } else rc = 
mdbx_env_copy(env, argv[2], cpflags); } if (rc) diff --git a/src/tools/mdbx_copy.vcxproj b/src/tools/mdbx_copy.vcxproj new file mode 100644 index 00000000..b3b52dc2 --- /dev/null +++ b/src/tools/mdbx_copy.vcxproj @@ -0,0 +1,162 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + {15030120-5F7F-48F9-ABE5-DFC814F2A4BD} + Win32Proj + mdbx_copy + 8.1 + + + + Application + true + v140 + MultiByte + + + Application + false + v140 + true + MultiByte + + + Application + true + v140 + MultiByte + + + Application + false + v140 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + true + $(Platform)\$(Configuration)\$(ProjectName)\ + + + true + $(Platform)\$(Configuration)\$(ProjectName)\ + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + + + + + + Level4 + Disabled + _DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + true + + + Console + + + + + + + Level4 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + true + + + Console + + + + + Level4 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + + + Console + true + true + + + + + Level4 + + + MaxSpeed + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + + + Console + true + true + + + + + {6d19209b-ece7-4b9c-941c-0aa2b484f199} + + + + + + + + + + + + + + diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index 726cc5d0..c9fb6f20 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -13,15 +13,15 @@ * top-level directory of the distribution or, alternatively, at * . */ -#include "../../mdbx.h" +#ifdef _MSC_VER +#if _MSC_VER > 1800 +#pragma warning(disable : 4464) /* relative include path contains '..' */ +#endif +#pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ +#endif /* _MSC_VER */ + +#include "../bits.h" #include -#include -#include -#include -#include -#include -#include -#include #define PRINT 1 static int mode; @@ -39,16 +39,29 @@ flagbit dbflags[] = {{MDBX_REVERSEKEY, "reversekey"}, {MDBX_REVERSEDUP, "reversedup"}, {0, NULL}}; -static volatile sig_atomic_t gotsig; +#if defined(_WIN32) || defined(_WIN64) +#include "wingetopt.h" -static void dumpsig(int sig) { - (void)sig; - gotsig = 1; +static volatile BOOL user_break; +static BOOL WINAPI ConsoleBreakHandlerRoutine(DWORD dwCtrlType) { + (void)dwCtrlType; + user_break = true; + return true; } +#else /* WINDOWS */ + +static volatile sig_atomic_t user_break; +static void signal_handler(int sig) { + (void)sig; + user_break = 1; +} + +#endif /* !WINDOWS */ + static const char hexc[] = "0123456789abcdef"; -static void hex(unsigned char c) { +static void dumpbyte(unsigned char c) { putchar(hexc[c >> 4]); putchar(hexc[c & 0xf]); } @@ -60,25 +73,25 @@ static void text(MDBX_val *v) { c = v->iov_base; end = c + v->iov_len; while (c < end) { - if (isprint(*c)) { + if (isprint(*c) && *c != '\\') { putchar(*c); } else { putchar('\\'); - hex(*c); + dumpbyte(*c); } c++; } putchar('\n'); } -static void byte(MDBX_val *v) { +static void dumpval(MDBX_val *v) { unsigned char *c, *end; putchar(' '); c = v->iov_base; end = c + v->iov_len; while (c < end) { - hex(*c++); + dumpbyte(*c++); } putchar('\n'); } @@ -124,16 +137,16 @@ static int dumpit(MDBX_txn *txn, MDBX_dbi dbi, char *name) { return rc; while ((rc = mdbx_cursor_get(mc, &key, &data, MDBX_NEXT)) == MDBX_SUCCESS) { - if (gotsig) { - rc = EINTR; + if (user_break) { + rc = MDBX_EINTR; break; } if (mode & PRINT) { text(&key); text(&data); } else { - byte(&key); - byte(&data); + dumpval(&key); + dumpval(&data); } } printf("DATA=END\n"); @@ -212,14 +225,18 @@ int main(int argc, char *argv[]) { if (optind != argc - 1) usage(prog); +#if defined(_WIN32) || defined(_WIN64) + SetConsoleCtrlHandler(ConsoleBreakHandlerRoutine, true); 
+#else #ifdef SIGPIPE - signal(SIGPIPE, dumpsig); + signal(SIGPIPE, signal_handler); #endif #ifdef SIGHUP - signal(SIGHUP, dumpsig); + signal(SIGHUP, signal_handler); #endif - signal(SIGINT, dumpsig); - signal(SIGTERM, dumpsig); + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); +#endif /* !WINDOWS */ envname = argv[optind]; rc = mdbx_env_create(&env); @@ -265,6 +282,10 @@ int main(int argc, char *argv[]) { goto txn_abort; } while ((rc = mdbx_cursor_get(cursor, &key, NULL, MDBX_NEXT_NODUP)) == 0) { + if (user_break) { + rc = MDBX_EINTR; + break; + } char *str; MDBX_dbi db2; if (memchr(key.iov_base, '\0', key.iov_len)) diff --git a/src/tools/mdbx_dump.vcxproj b/src/tools/mdbx_dump.vcxproj new file mode 100644 index 00000000..8d4280c7 --- /dev/null +++ b/src/tools/mdbx_dump.vcxproj @@ -0,0 +1,162 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + {15030120-5F7F-48F9-ABE5-DFC814F2A4BC} + Win32Proj + mdbx_dump + 8.1 + + + + Application + true + v140 + MultiByte + + + Application + false + v140 + true + MultiByte + + + Application + true + v140 + MultiByte + + + Application + false + v140 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + true + $(Platform)\$(Configuration)\$(ProjectName)\ + + + true + $(Platform)\$(Configuration)\$(ProjectName)\ + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + + + + + + Level4 + Disabled + _DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + true + + + Console + + + + + + + Level4 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + true + + + Console + + + + + Level4 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + + + Console + true + true + + + + + Level4 + + + MaxSpeed + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + + + Console + true + true + + + + + 
{6d19209b-ece7-4b9c-941c-0aa2b484f199} + + + + + + + + + + + + + + diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index 5089efb0..91c1bfae 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -13,32 +13,49 @@ * top-level directory of the distribution or, alternatively, at * . */ -#include "../../mdbx.h" +#ifdef _MSC_VER +#if _MSC_VER > 1800 +#pragma warning(disable : 4464) /* relative include path contains '..' */ +#endif +#pragma warning(disable : 4996) /* The POSIX name is deprecated... */ +#endif /* _MSC_VER */ + +#include "../bits.h" #include -#include -#include -#include -#include -#include -#include + +#if defined(_WIN32) || defined(_WIN64) +#include "wingetopt.h" + +static volatile BOOL user_break; +static BOOL WINAPI ConsoleBreakHandlerRoutine(DWORD dwCtrlType) { + (void)dwCtrlType; + user_break = true; + return true; +} + +#else /* WINDOWS */ + +static volatile sig_atomic_t user_break; +static void signal_handler(int sig) { + (void)sig; + user_break = 1; +} + +#endif /* !WINDOWS */ #define PRINT 1 #define NOHDR 2 static int mode; static char *subname = NULL; - static size_t lineno; static int version; static int dbi_flags; - static char *prog; - static int Eof; static MDBX_envinfo envinfo; - static MDBX_val kbuf, dbuf; #define STRLENOF(s) (sizeof(s) - 1) @@ -63,7 +80,7 @@ static void readhdr(void) { char *ptr; dbi_flags = 0; - while (fgets(dbuf.iov_base, dbuf.iov_len, stdin) != NULL) { + while (fgets(dbuf.iov_base, (int)dbuf.iov_len, stdin) != NULL) { lineno++; if (!strncmp(dbuf.iov_base, "db_pagesize=", STRLENOF("db_pagesize=")) || !strncmp(dbuf.iov_base, "duplicates=", STRLENOF("duplicates="))) { @@ -197,7 +214,7 @@ static int readline(MDBX_val *out, MDBX_val *buf) { } if (c != ' ') { lineno++; - if (fgets(buf->iov_base, buf->iov_len, stdin) == NULL) { + if (fgets(buf->iov_base, (int)buf->iov_len, stdin) == NULL) { badend: Eof = 1; badend(); @@ -208,7 +225,7 @@ static int readline(MDBX_val *out, MDBX_val *buf) { goto badend; } } 
- if (fgets(buf->iov_base, buf->iov_len, stdin) == NULL) { + if (fgets(buf->iov_base, (int)buf->iov_len, stdin) == NULL) { Eof = 1; return EOF; } @@ -229,7 +246,7 @@ static int readline(MDBX_val *out, MDBX_val *buf) { } c1 = buf->iov_base; c1 += l2; - if (fgets((char *)c1, buf->iov_len + 1, stdin) == NULL) { + if (fgets((char *)c1, (int)buf->iov_len + 1, stdin) == NULL) { Eof = 1; badend(); return EOF; @@ -255,7 +272,7 @@ static int readline(MDBX_val *out, MDBX_val *buf) { badend(); return EOF; } - *c1++ = unhex(++c2); + *c1++ = (char)unhex(++c2); c2 += 2; } } else { @@ -276,7 +293,7 @@ static int readline(MDBX_val *out, MDBX_val *buf) { badend(); return EOF; } - *c1++ = unhex(c2); + *c1++ = (char)unhex(c2); c2 += 2; } } @@ -294,11 +311,11 @@ static void usage(void) { int main(int argc, char *argv[]) { int i, rc; - MDBX_env *env; - MDBX_txn *txn; - MDBX_cursor *mc; + MDBX_env *env = NULL; + MDBX_txn *txn = NULL; + MDBX_cursor *mc = NULL; MDBX_dbi dbi; - char *envname; + char *envname = NULL; int envflags = 0, putflags = 0; prog = argv[0]; @@ -347,6 +364,19 @@ int main(int argc, char *argv[]) { if (optind != argc - 1) usage(); +#if defined(_WIN32) || defined(_WIN64) + SetConsoleCtrlHandler(ConsoleBreakHandlerRoutine, true); +#else +#ifdef SIGPIPE + signal(SIGPIPE, signal_handler); +#endif +#ifdef SIGHUP + signal(SIGHUP, signal_handler); +#endif + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); +#endif /* !WINDOWS */ + dbuf.iov_len = 4096; dbuf.iov_base = malloc(dbuf.iov_len); @@ -366,8 +396,14 @@ int main(int argc, char *argv[]) { if (envinfo.me_maxreaders) mdbx_env_set_maxreaders(env, envinfo.me_maxreaders); - if (envinfo.me_mapsize) - mdbx_env_set_mapsize(env, envinfo.me_mapsize); + if (envinfo.me_mapsize) { + if (envinfo.me_mapsize > SIZE_MAX) { + fprintf(stderr, "mdbx_env_set_mapsize failed, error %d %s\n", rc, + mdbx_strerror(MDBX_TOO_LARGE)); + return EXIT_FAILURE; + } + mdbx_env_set_mapsize(env, (size_t)envinfo.me_mapsize); + } #ifdef 
MDBX_FIXEDMAP if (info.me_mapaddr) @@ -385,6 +421,11 @@ int main(int argc, char *argv[]) { kbuf.iov_base = malloc(kbuf.iov_len); while (!Eof) { + if (user_break) { + rc = MDBX_EINTR; + break; + } + MDBX_val key, data; int batch = 0; diff --git a/src/tools/mdbx_load.vcxproj b/src/tools/mdbx_load.vcxproj new file mode 100644 index 00000000..b0043ef3 --- /dev/null +++ b/src/tools/mdbx_load.vcxproj @@ -0,0 +1,162 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + {15030120-5F7F-48F9-ABE5-DFC814F2A4BB} + Win32Proj + mdbx_load + 8.1 + + + + Application + true + v140 + MultiByte + + + Application + false + v140 + true + MultiByte + + + Application + true + v140 + MultiByte + + + Application + false + v140 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + true + $(Platform)\$(Configuration)\$(ProjectName)\ + + + true + $(Platform)\$(Configuration)\$(ProjectName)\ + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + + + + + + Level4 + Disabled + _DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + true + + + Console + + + + + + + Level4 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + true + + + Console + + + + + Level4 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + + + Console + true + true + + + + + Level4 + + + MaxSpeed + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + + + Console + true + true + + + + + {6d19209b-ece7-4b9c-941c-0aa2b484f199} + + + + + + + + + + + + + + diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index b1fa22bb..eaf0371a 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -13,15 +13,35 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#include -#include -#include -#include -#include +#ifdef _MSC_VER +#if _MSC_VER > 1800 +#pragma warning(disable : 4464) /* relative include path contains '..' */ +#endif +#pragma warning(disable : 4996) /* The POSIX name is deprecated... */ +#endif /* _MSC_VER */ -#include "../../mdbx.h" #include "../bits.h" +#if defined(_WIN32) || defined(_WIN64) +#include "wingetopt.h" + +static volatile BOOL user_break; +static BOOL WINAPI ConsoleBreakHandlerRoutine(DWORD dwCtrlType) { + (void)dwCtrlType; + user_break = true; + return true; +} + +#else /* WINDOWS */ + +static volatile sig_atomic_t user_break; +static void signal_handler(int sig) { + (void)sig; + user_break = 1; +} + +#endif /* !WINDOWS */ + static void prstat(MDBX_stat *ms) { printf(" Pagesize: %u\n", ms->ms_psize); printf(" Tree depth: %u\n", ms->ms_depth); @@ -39,7 +59,7 @@ static void usage(char *prog) { } int main(int argc, char *argv[]) { - int i, rc; + int o, rc; MDBX_env *env; MDBX_txn *txn; MDBX_dbi dbi; @@ -63,8 +83,8 @@ int main(int argc, char *argv[]) { * -V: print version and exit * (default) print stat of only the main DB */ - while ((i = getopt(argc, argv, "Vaefnrs:")) != EOF) { - switch (i) { + while ((o = getopt(argc, argv, "Vaefnrs:")) != EOF) { + switch (o) { case 'V': printf("%s (%s, build %s)\n", mdbx_version.git.describe, mdbx_version.git.datetime, mdbx_build.datetime); @@ -100,6 +120,19 @@ int main(int argc, char *argv[]) { if (optind != argc - 1) usage(prog); +#if defined(_WIN32) || defined(_WIN64) + SetConsoleCtrlHandler(ConsoleBreakHandlerRoutine, true); +#else +#ifdef SIGPIPE + signal(SIGPIPE, signal_handler); +#endif +#ifdef SIGHUP + signal(SIGHUP, signal_handler); +#endif + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); +#endif /* !WINDOWS */ + envname = argv[optind]; rc = mdbx_env_create(&env); if (rc) { @@ -195,6 +228,10 @@ int main(int argc, char *argv[]) { } prstat(&mst); while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) == 0) { + if 
(user_break) { + rc = MDBX_EINTR; + break; + } iptr = data.iov_base; pages += *iptr; if (envinfo && mei.me_latter_reader_txnid > *(size_t *)key.iov_base) @@ -209,7 +246,7 @@ int main(int argc, char *argv[]) { if (pg <= prev) bad = " [bad sequence]"; prev = pg; - pg += span; + pg += (unsigned)span; for (; i >= span && iptr[i - span] == pg; span++, pg++) ; } diff --git a/src/tools/mdbx_stat.vcxproj b/src/tools/mdbx_stat.vcxproj new file mode 100644 index 00000000..caf425ed --- /dev/null +++ b/src/tools/mdbx_stat.vcxproj @@ -0,0 +1,162 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + {15030120-5F7F-48F9-ABE5-DFC814F2A4BF} + Win32Proj + mdbx_stat + 8.1 + + + + Application + true + v140 + MultiByte + + + Application + false + v140 + true + MultiByte + + + Application + true + v140 + MultiByte + + + Application + false + v140 + true + MultiByte + + + + + + + + + + + + + + + + + + + + + true + $(Platform)\$(Configuration)\$(ProjectName)\ + + + true + $(Platform)\$(Configuration)\$(ProjectName)\ + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + + + false + $(Platform)\$(Configuration)\$(ProjectName)\ + + + + + + Level4 + Disabled + _DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + true + + + Console + + + + + + + Level4 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + true + + + Console + + + + + Level4 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + + + Console + true + true + + + + + Level4 + + + MaxSpeed + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 + + + Console + true + true + + + + + {6d19209b-ece7-4b9c-941c-0aa2b484f199} + + + + + + + + + + + + + + diff --git a/src/tools/wingetopt.c b/src/tools/wingetopt.c new file mode 100644 index 00000000..3762ecae --- /dev/null +++ b/src/tools/wingetopt.c @@ -0,0 +1,75 @@ +/* + * POSIX getopt for Windows + * + * AT&T Public License + * + * Code 
given out at the 1985 UNIFORUM conference in Dallas. + */ + +#include "wingetopt.h" +#include <stdio.h> +#include <string.h> + +#ifndef NULL +#define NULL 0 +#endif + +#ifndef EOF +#define EOF (-1) +#endif + +#define ERR(s, c) \ + if (opterr) { \ + char errbuf[2]; \ + errbuf[0] = (char)c; \ + errbuf[1] = '\n'; \ + fputs(argv[0], stderr); \ + fputs(s, stderr); \ + fputc(c, stderr); \ + } + +int opterr = 1; +int optind = 1; +int optopt; +char *optarg; + +int getopt(int argc, char *const argv[], const char *opts) { + static int sp = 1; + int c; + char *cp; + + if (sp == 1) + if (optind >= argc || argv[optind][0] != '-' || argv[optind][1] == '\0') + return EOF; + else if (strcmp(argv[optind], "--") == 0) { + optind++; + return EOF; + } + optopt = c = argv[optind][sp]; + if (c == ':' || (cp = strchr(opts, c)) == NULL) { + ERR(": illegal option -- ", c); + if (argv[optind][++sp] == '\0') { + optind++; + sp = 1; + } + return '?'; + } + if (*++cp == ':') { + if (argv[optind][sp + 1] != '\0') + optarg = &argv[optind++][sp + 1]; + else if (++optind >= argc) { + ERR(": option requires an argument -- ", c); + sp = 1; + return '?'; + } else + optarg = argv[optind++]; + sp = 1; + } else { + if (argv[optind][++sp] == '\0') { + sp = 1; + optind++; + } + optarg = NULL; + } + return c; +} diff --git a/src/tools/wingetopt.h b/src/tools/wingetopt.h new file mode 100644 index 00000000..fdff3683 --- /dev/null +++ b/src/tools/wingetopt.h @@ -0,0 +1,26 @@ +/* + * POSIX getopt for Windows + * + * AT&T Public License + * + * Code given out at the 1985 UNIFORUM conference in Dallas. 
+ */ + +#ifndef _WINGETOPT_H_ +#define _WINGETOPT_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern int opterr; +extern int optind; +extern int optopt; +extern char *optarg; +int getopt(int argc, char *const argv[], const char *optstring); + +#ifdef __cplusplus +} +#endif + +#endif /* _WINGETOPT_H_ */ diff --git a/test/test.vcxproj b/test/test.vcxproj index 20535ff4..8df56351 100644 --- a/test/test.vcxproj +++ b/test/test.vcxproj @@ -34,27 +34,27 @@ Application true v140 - Unicode + MultiByte Application false v140 true - Unicode + MultiByte Application true v140 - Unicode + MultiByte Application false v140 true - Unicode + MultiByte @@ -95,7 +95,7 @@ Use Level4 Disabled - WIN32;_DEBUG;_CONSOLE;MDBX_DEBUG=1;%(PreprocessorDefinitions) + WIN32;_DEBUG;_CONSOLE;MDBX_DEBUG=1;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 true test.h true @@ -111,7 +111,7 @@ Use Level4 Disabled - _DEBUG;_CONSOLE;MDBX_DEBUG=1;%(PreprocessorDefinitions) + _DEBUG;_CONSOLE;MDBX_DEBUG=1;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 true test.h true @@ -129,7 +129,7 @@ MaxSpeed true true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 true test.h true @@ -149,7 +149,7 @@ MaxSpeed true true - NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + NDEBUG;_CONSOLE;%(PreprocessorDefinitions);LIBMDBX_IMPORTS=1 true test.h true From 90d33a12e89051395e27b7ee1161f5bc8633734b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Jul 2017 12:37:39 +0300 Subject: [PATCH 289/303] mdbx-make: add '-Wno-constant-logical-operand' for CLANG. 
--- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fcf8bdc3..3a9e5df4 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ suffix ?= CC ?= gcc CXX ?= g++ XCFLAGS ?= -DNDEBUG=1 -DMDBX_DEBUG=0 -DLIBMDBX_EXPORTS=1 -CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden +CFLAGS ?= -O2 -g3 -Wall -Wno-constant-logical-operand -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden CFLAGS += -D_GNU_SOURCE=1 -std=gnu11 -pthread $(XCFLAGS) CXXFLAGS = -std=c++11 $(filter-out -std=gnu11,$(CFLAGS)) TESTDB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-check.db From 294a53663c809da4b7befca02cfb1170f06ba6d7 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Jul 2017 12:40:16 +0300 Subject: [PATCH 290/303] mdbx-make: add 'all' as a prereq of the 'check' target. --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 3a9e5df4..253cc479 100644 --- a/Makefile +++ b/Makefile @@ -72,7 +72,7 @@ install: $(LIBRARIES) $(TOOLS) $(HEADERS) clean: rm -rf $(TOOLS) test/test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err src/*.o test/*.o -check: test/test mdbx_chk +check: all rm -f $(TESTDB) test.log && (set -o pipefail; test/test --pathname=$(TESTDB) --dont-cleanup-after basic | tee -a test.log | tail -n 42) && ./mdbx_chk -vn $(TESTDB) define core-rule @@ -132,7 +132,7 @@ endif ci-rule = ( CC=$$(which $1); if [ -n "$$CC" ]; then \ echo -n "probe by $2 ($$(readlink -f $$(which $$CC))): " && \ $(MAKE) clean >$1.log 2>$1.err && \ - $(MAKE) CC=$$(readlink -f $$CC) XCFLAGS="-UNDEBUG -DMDBX_DEBUG=2" all check 1>$1.log 2>$1.err && echo "OK" \ + $(MAKE) CC=$$(readlink -f $$CC) XCFLAGS="-UNDEBUG -DMDBX_DEBUG=2" check 1>$1.log 2>$1.err && echo "OK" \ || ( echo "FAILED"; cat $1.err >&2; exit 1 ); \ else echo "no $2 ($1) for probe"; fi; ) ci: From 70d54f6f2cb69532918ed2e9ca5be16ac9eb8b0e Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: 
Tue, 11 Jul 2017 13:22:48 +0300 Subject: [PATCH 291/303] mdbx-msvc: refine IntDir/OutDir for msbuild projects. --- dll.vcxproj | 18 ++++++++++++------ src/tools/mdbx_chk.vcxproj | 12 ++++++++---- src/tools/mdbx_copy.vcxproj | 12 ++++++++---- src/tools/mdbx_dump.vcxproj | 12 ++++++++---- src/tools/mdbx_load.vcxproj | 12 ++++++++---- src/tools/mdbx_stat.vcxproj | 12 ++++++++---- test/test.vcxproj | 14 +++++++++----- 7 files changed, 61 insertions(+), 31 deletions(-) diff --git a/dll.vcxproj b/dll.vcxproj index 4443f553..82f89ca5 100644 --- a/dll.vcxproj +++ b/dll.vcxproj @@ -70,22 +70,28 @@ true - $(Platform)\$(Configuration)\ - $(SolutionDir)$(Platform)\$(Configuration)\ mdbx + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false - $(Platform)\$(Configuration)\ $(SolutionDir)$(Platform)\$(Configuration)\ mdbx + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + + + true + mdbx + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false mdbx - - - mdbx + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ diff --git a/src/tools/mdbx_chk.vcxproj b/src/tools/mdbx_chk.vcxproj index bece9d2b..b680471d 100644 --- a/src/tools/mdbx_chk.vcxproj +++ b/src/tools/mdbx_chk.vcxproj @@ -71,19 +71,23 @@ true - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ true - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false - $(Platform)\$(Configuration)\$(ProjectName)\ + 
$(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ diff --git a/src/tools/mdbx_copy.vcxproj b/src/tools/mdbx_copy.vcxproj index b3b52dc2..d2ff8254 100644 --- a/src/tools/mdbx_copy.vcxproj +++ b/src/tools/mdbx_copy.vcxproj @@ -71,19 +71,23 @@ true - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ true - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ diff --git a/src/tools/mdbx_dump.vcxproj b/src/tools/mdbx_dump.vcxproj index 8d4280c7..5f8b0b24 100644 --- a/src/tools/mdbx_dump.vcxproj +++ b/src/tools/mdbx_dump.vcxproj @@ -71,19 +71,23 @@ true - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ true - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ diff --git a/src/tools/mdbx_load.vcxproj b/src/tools/mdbx_load.vcxproj index b0043ef3..93a88f93 100644 --- a/src/tools/mdbx_load.vcxproj +++ b/src/tools/mdbx_load.vcxproj @@ -71,19 +71,23 @@ true - 
$(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ true - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ diff --git a/src/tools/mdbx_stat.vcxproj b/src/tools/mdbx_stat.vcxproj index caf425ed..3475165b 100644 --- a/src/tools/mdbx_stat.vcxproj +++ b/src/tools/mdbx_stat.vcxproj @@ -71,19 +71,23 @@ true - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ true - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false - $(Platform)\$(Configuration)\$(ProjectName)\ + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ diff --git a/test/test.vcxproj b/test/test.vcxproj index 8df56351..19f272b8 100644 --- a/test/test.vcxproj +++ b/test/test.vcxproj @@ -77,18 +77,22 @@ true $(SolutionDir)$(Platform)\$(Configuration)\ - $(Platform)\$(Configuration)\ - - - true + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false $(SolutionDir)$(Platform)\$(Configuration)\ - $(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + + + true + $(SolutionDir)$(Platform)\$(Configuration)\ + 
$(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ false + $(SolutionDir)$(Platform)\$(Configuration)\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ From 455de97d362984fc9101fcc03d0929e81abc3911 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Jul 2017 14:10:24 +0300 Subject: [PATCH 292/303] mdbx: rework using of mdbx_mmap_t and mdbx_msync(). --- src/bits.h | 15 ++++---- src/mdbx.c | 105 +++++++++++++++++------------------------------------ src/osal.c | 20 +++++----- src/osal.h | 25 +++++++------ 4 files changed, 64 insertions(+), 101 deletions(-) diff --git a/src/bits.h b/src/bits.h index dcdafb0a..3c35c8e3 100644 --- a/src/bits.h +++ b/src/bits.h @@ -659,12 +659,13 @@ typedef struct MDBX_pgstate { struct MDBX_env { #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) size_t me_signature; - mdbx_filehandle_t me_fd; /* The main data file */ - mdbx_filehandle_t me_lfd; /* The lock file */ -#ifdef MDBX_OSAL_SECTION - MDBX_OSAL_SECTION me_dxb_section; - MDBX_OSAL_SECTION me_lck_section; -#endif + mdbx_mmap_t me_dxb_mmap; /* The main data file */ + mdbx_mmap_t me_lck_mmap; /* The lock file */ +#define me_map me_dxb_mmap.dxb +#define me_lck me_lck_mmap.lck +#define me_fd me_dxb_mmap.fd +#define me_lfd me_lck_mmap.fd + /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) /* Some fields are initialized. 
*/ @@ -684,8 +685,6 @@ struct MDBX_env { mdbx_pid_t me_pid; /* process ID of this env */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ char *me_path; /* path to the DB files */ - char *me_map; /* the memory map of the data file */ - MDBX_lockinfo *me_lck; /* the memory map of the lock file, never NULL */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ diff --git a/src/mdbx.c b/src/mdbx.c index e07655a3..2705602e 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1852,14 +1852,8 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, "), %" PRIuPTR " bytes", growth_pgno, growth_pgno - txn->mt_end_pgno, growth_bytes); - mdbx_mmap_param_t mmap; - mmap.address = env->me_map; -#ifdef MDBX_OSAL_SECTION - mmap.section = env->me_dxb_section; -#endif - mmap.fd = env->me_fd; - rc = - mdbx_mresize(env->me_flags, &mmap, env->me_dbgeo.now, growth_bytes); + rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, + growth_bytes); if (rc == MDBX_SUCCESS) { txn->mt_end_pgno = growth_pgno; env->me_dbgeo.now = growth_bytes; @@ -2146,7 +2140,8 @@ int mdbx_env_sync(MDBX_env *env, int force) { /* LY: pre-sync without holding lock to reduce latency for writer(s) */ int rc = (flags & MDBX_WRITEMAP) - ? mdbx_msync(env->me_map, used_size, flags & MDBX_MAPASYNC) + ? 
mdbx_msync(&env->me_dxb_mmap, 0, used_size, + flags & MDBX_MAPASYNC) : mdbx_filesync(env->me_fd, false); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -3783,7 +3778,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); MDBX_meta *const steady = mdbx_meta_steady(env); if (flags & MDBX_WRITEMAP) { - rc = mdbx_msync(env->me_map, usedbytes, flags & MDBX_MAPASYNC); + rc = mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, flags & MDBX_MAPASYNC); if (unlikely(rc != MDBX_SUCCESS)) goto fail; if ((flags & MDBX_MAPASYNC) == 0) { @@ -3885,7 +3880,6 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, !mdbx_meta_eq(env, pending, meta1)); mdbx_assert(env, !mdbx_meta_eq(env, pending, meta2)); - const size_t offset = (char *)target - env->me_map; mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); mdbx_ensure(env, target == head || @@ -3932,16 +3926,18 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_jitter4testing(true); } else { pending->mm_magic_and_version = MDBX_DATA_MAGIC; - rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), offset); + rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), + (uint8_t *)target - env->me_map); if (unlikely(rc != MDBX_SUCCESS)) { undo: mdbx_debug("write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Try write some old data back, to prevent it from being used. */ - mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDBX_meta), offset); + mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDBX_meta), + (uint8_t *)target - env->me_map); goto fail; } - mdbx_invalidate_cache(env->me_map + offset, sizeof(MDBX_meta)); + mdbx_invalidate_cache(target, sizeof(MDBX_meta)); } /* LY: step#3 - sync meta-pages. 
*/ @@ -3949,8 +3945,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if ((flags & (MDBX_NOSYNC | MDBX_NOMETASYNC)) == 0) { mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & MDBX_WRITEMAP) { - char *ptr = env->me_map + (offset & ~(env->me_os_psize - 1)); - rc = mdbx_msync(ptr, env->me_os_psize, flags & MDBX_MAPASYNC); + const size_t offset = (uint8_t *)container_of(head, MDBX_page, mp_meta) - + env->me_dxb_mmap.dxb; + const size_t paged_offset = offset & ~(env->me_os_psize - 1); + const size_t paged_length = mdbx_roundup2( + env->me_psize + offset - paged_offset, env->me_os_psize); + rc = mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length, + flags & MDBX_MAPASYNC); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } else { @@ -3965,13 +3966,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, #else /* LY: shrink datafile if needed */ if (shrink_pgno_delta) { - mdbx_mmap_param_t mmap; - mmap.address = env->me_map; -#ifdef MDBX_OSAL_SECTION - mmap.section = env->me_dxb_section; -#endif - mmap.fd = env->me_fd; - rc = mdbx_mresize(env->me_flags, &mmap, env->me_dbgeo.now, shrink_bytes); + rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, + shrink_bytes); if (rc == MDBX_SUCCESS) env->me_dbgeo.now = shrink_bytes; else if (rc != MDBX_RESULT_TRUE) @@ -4082,19 +4078,13 @@ bailout: } static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) { - mdbx_mmap_param_t mmap; - mmap.fd = env->me_fd; - int rc = mdbx_mmap(env->me_flags, &mmap, env->me_dbgeo.now, env->me_mapsize); + int rc = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, + env->me_mapsize); if (unlikely(rc != MDBX_SUCCESS)) { env->me_map = NULL; return rc; } - env->me_map = mmap.address; -#ifdef MDBX_OSAL_SECTION - env->me_dxb_section = mmap.section; -#endif - #ifdef MADV_DONTFORK if (madvise(env->me_map, env->me_mapsize, MADV_DONTFORK)) return errno; @@ -4130,7 +4120,7 @@ static int __cold mdbx_env_map(MDBX_env *env, size_t 
usedsize) { /* Lock meta pages to avoid unexpected write, * before the data pages would be synchronized. */ if (env->me_flags & MDBX_WRITEMAP) { - rc = mdbx_mlock(&mmap, pgno2bytes(env, NUM_METAS)); + rc = mdbx_mlock(&env->me_dxb_mmap, pgno2bytes(env, NUM_METAS)); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -4351,17 +4341,11 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower, const size_t size = mdbx_roundup2(pgno2bytes(env, meta.mm_geo.upper), env->me_os_psize); - mdbx_mmap_param_t mmap; - mmap.address = env->me_map; -#ifdef MDBX_OSAL_SECTION - mmap.section = env->me_dxb_section; -#endif - mmap.fd = env->me_fd; - rc = mdbx_mremap(env->me_flags, &mmap, env->me_mapsize, size); + rc = mdbx_mremap(env->me_flags, &env->me_dxb_mmap, env->me_mapsize, + size); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; env->me_mapsize = size; - env->me_map = mmap.address; #ifdef USE_VALGRIND VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = @@ -4372,13 +4356,7 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower, const size_t size = mdbx_roundup2(pgno2bytes(env, meta.mm_geo.now), env->me_os_psize); - mdbx_mmap_param_t mmap; - mmap.address = env->me_map; -#ifdef MDBX_OSAL_SECTION - mmap.section = env->me_dxb_section; -#endif - mmap.fd = env->me_fd; - rc = mdbx_mresize(env->me_flags, &mmap, + rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, pgno2bytes(env, head->mm_geo.now), size); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -4736,15 +4714,9 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { } env->me_maxreaders = (unsigned)maxreaders; - mdbx_mmap_param_t mmap; - mmap.fd = env->me_lfd; - err = mdbx_mmap(MDBX_WRITEMAP, &mmap, (size_t)size, (size_t)size); + err = mdbx_mmap(MDBX_WRITEMAP, &env->me_lck_mmap, (size_t)size, (size_t)size); if (unlikely(err != MDBX_SUCCESS)) return err; - env->me_lck = mmap.address; -#ifdef MDBX_OSAL_SECTION - env->me_lck_section = mmap.section; 
-#endif #ifdef MADV_DODUMP (void)madvise(env->me_lck, size, MADV_DODUMP); @@ -5011,13 +4983,7 @@ static void __cold mdbx_env_close0(MDBX_env *env) { } if (env->me_map) { - mdbx_mmap_param_t mmap; - mmap.address = env->me_map; -#ifdef MDBX_OSAL_SECTION - mmap.section = env->me_dxb_section; -#endif - mmap.fd = env->me_fd; - mdbx_munmap(&mmap, env->me_mapsize); + mdbx_munmap(&env->me_dxb_mmap, env->me_mapsize); #ifdef USE_VALGRIND VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = -1; @@ -5029,14 +4995,9 @@ static void __cold mdbx_env_close0(MDBX_env *env) { } if (env->me_lck) { - mdbx_mmap_param_t mmap; - mmap.address = env->me_lck; -#ifdef MDBX_OSAL_SECTION - mmap.section = env->me_lck_section; -#endif - mmap.fd = env->me_lfd; - mdbx_munmap(&mmap, (env->me_maxreaders - 1) * sizeof(MDBX_reader) + - sizeof(MDBX_lockinfo)); + mdbx_munmap(&env->me_lck_mmap, + (env->me_maxreaders - 1) * sizeof(MDBX_reader) + + sizeof(MDBX_lockinfo)); env->me_lck = nullptr; } env->me_pid = 0; @@ -11027,9 +10988,9 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { * Тем не менее, однозначно страница "не грязная" (не будет переписана * во время транзакции) если адрес находится внутри mmap-диапазона * и в заголовке страницы нет флажка P_DIRTY. */ - if (env->me_map < (char *)page) { + if (env->me_map < (uint8_t *)page) { const size_t used_size = pgno2bytes(env, txn->mt_next_pgno); - if ((char *)page < env->me_map + used_size) { + if ((uint8_t *)page < env->me_map + used_size) { /* страница внутри диапазона, смотрим на флажки */ return (page->mp_flags & (P_DIRTY | P_LOOSE | P_KEEP)) ? MDBX_RESULT_TRUE @@ -11040,7 +11001,7 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { * ошибка, к которой не возможно прийти без каких-то больших нарушений. * Поэтому не проверяем этот случай кроме как assert-ом, на то что * страница вне mmap-диаппазона. 
*/ - mdbx_tassert(txn, (char *)page >= env->me_map + env->me_mapsize); + mdbx_tassert(txn, (uint8_t *)page >= env->me_map + env->me_mapsize); } /* Страница вне используемого mmap-диапазона, т.е. либо в функцию был diff --git a/src/osal.c b/src/osal.c index ae28d191..80728283 100644 --- a/src/osal.c +++ b/src/osal.c @@ -732,18 +732,19 @@ int mdbx_thread_join(mdbx_thread_t thread) { /*----------------------------------------------------------------------------*/ -int mdbx_msync(void *addr, size_t length, int async) { +int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async) { + uint8_t *ptr = (uint8_t *)map->address + offset; #if defined(_WIN32) || defined(_WIN64) - if (async) + if (FlushViewOfFile(ptr, length) && (async || FlushFileBuffers(map->fd))) return MDBX_SUCCESS; - return FlushViewOfFile(addr, length) ? MDBX_SUCCESS : GetLastError(); + return GetLastError(); #else const int mode = async ? MS_ASYNC : MS_SYNC; - return (msync(addr, length, mode) == 0) ? MDBX_SUCCESS : errno; + return (msync(ptr, length, mode) == 0) ? MDBX_SUCCESS : errno; #endif } -int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit) { +int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit) { #if defined(_WIN32) || defined(_WIN64) NTSTATUS rc = NtCreateSection( &map->section, @@ -790,7 +791,7 @@ int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit) { #endif } -int mdbx_munmap(mdbx_mmap_param_t *map, size_t length) { +int mdbx_munmap(mdbx_mmap_t *map, size_t length) { #if defined(_WIN32) || defined(_WIN64) (void)length; if (map->section) @@ -802,7 +803,7 @@ int mdbx_munmap(mdbx_mmap_param_t *map, size_t length) { #endif } -int mdbx_mlock(mdbx_mmap_param_t *map, size_t length) { +int mdbx_mlock(mdbx_mmap_t *map, size_t length) { #if defined(_WIN32) || defined(_WIN64) return VirtualLock(map->address, length) ? 
MDBX_SUCCESS : GetLastError(); #else @@ -810,8 +811,7 @@ int mdbx_mlock(mdbx_mmap_param_t *map, size_t length) { #endif } -int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current, - size_t wanna) { +int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, size_t wanna) { #if defined(_WIN32) || defined(_WIN64) if (wanna > current) { /* growth */ @@ -831,7 +831,7 @@ int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current, #endif } -int mdbx_mremap(int flags, mdbx_mmap_param_t *map, size_t old_limit, +int mdbx_mremap(int flags, mdbx_mmap_t *map, size_t old_limit, size_t new_limit) { #if defined(_WIN32) || defined(_WIN64) (void)flags; diff --git a/src/osal.h b/src/osal.h index 731c86b5..413e7d9b 100644 --- a/src/osal.h +++ b/src/osal.h @@ -420,8 +420,6 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t count); -int mdbx_msync(void *addr, size_t length, int async); - int mdbx_thread_create(mdbx_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); @@ -440,19 +438,24 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode, int mdbx_closefile(mdbx_filehandle_t fd); typedef struct mdbx_mmap_param { - void *address; + union { + void *address; + uint8_t *dxb; + struct MDBX_lockinfo *lck; + }; + mdbx_filehandle_t fd; #ifdef MDBX_OSAL_SECTION MDBX_OSAL_SECTION section; #endif - mdbx_filehandle_t fd; -} mdbx_mmap_param_t; -int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit); -int mdbx_munmap(mdbx_mmap_param_t *map, size_t length); -int mdbx_mlock(mdbx_mmap_param_t *map, size_t length); -int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current, - size_t wanna); -int mdbx_mremap(int flags, mdbx_mmap_param_t *map, size_t old_limit, +} mdbx_mmap_t; + +int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit); +int mdbx_munmap(mdbx_mmap_t *map, size_t length); +int 
mdbx_mlock(mdbx_mmap_t *map, size_t length); +int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, size_t wanna); +int mdbx_mremap(int flags, mdbx_mmap_t *map, size_t old_limit, size_t new_limit); +int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async); static __inline mdbx_pid_t mdbx_getpid(void) { #if defined(_WIN32) || defined(_WIN64) From 4d1df6ea11ab4b42eed1a70311f9b2b2438403c8 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Jul 2017 15:04:38 +0300 Subject: [PATCH 293/303] mdbx: rework rollback on dxb-setup to avoid troubles under Windows. --- src/mdbx.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 2705602e..68702420 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4552,7 +4552,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { } while (1) { - const MDBX_meta *head = mdbx_meta_head(env); + MDBX_meta *head = mdbx_meta_head(env); const txnid_t head_txnid = mdbx_meta_txnid_fluid(env, head); if (head_txnid == meta.mm_txnid_a) break; @@ -4567,14 +4567,28 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { } /* LY: rollback weak checkpoint */ - MDBX_meta rollback = *head; - mdbx_meta_set_txnid(env, &rollback, 0); - rollback.mm_datasync_sign = MDBX_DATASIGN_WEAK; mdbx_trace("rollback: from %" PRIaTXN ", to %" PRIaTXN, head_txnid, meta.mm_txnid_a); mdbx_ensure(env, head_txnid == mdbx_meta_txnid_stable(env, head)); - err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta), - (uint8_t *)head - (uint8_t *)env->me_map); + + if (env->me_flags & MDBX_WRITEMAP) { + head->mm_txnid_a = 0; + head->mm_datasync_sign = MDBX_DATASIGN_WEAK; + head->mm_txnid_b = 0; + const size_t offset = + (uint8_t *)container_of(head, MDBX_page, mp_meta) - + env->me_dxb_mmap.dxb; + const size_t paged_offset = offset & ~(env->me_os_psize - 1); + const size_t paged_length = mdbx_roundup2( + env->me_psize + offset - paged_offset, env->me_os_psize); + err = 
mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length, false); + } else { + MDBX_meta rollback = *head; + mdbx_meta_set_txnid(env, &rollback, 0); + rollback.mm_datasync_sign = MDBX_DATASIGN_WEAK; + err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta), + (uint8_t *)head - (uint8_t *)env->me_map); + } if (err) return err; From 53c2b0abe4c892dce80096c9ef6f88819e154528 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 11 Jul 2017 19:05:40 +0300 Subject: [PATCH 294/303] mdbx: checking and reject network/remote files. Change-Id: I77e8b8bc94785d705461d162cbc40ad58ead67ca --- src/osal.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 113 insertions(+), 5 deletions(-) diff --git a/src/osal.c b/src/osal.c index 80728283..655f104f 100644 --- a/src/osal.c +++ b/src/osal.c @@ -104,6 +104,35 @@ extern NTSTATUS NTAPI NtFreeVirtualMemory(IN HANDLE ProcessHandle, IN OUT PULONG RegionSize, IN ULONG FreeType); +typedef struct _IO_STATUS_BLOCK { + union { + NTSTATUS Status; + PVOID Pointer; + }; + ULONG_PTR Information; +} IO_STATUS_BLOCK, *PIO_STATUS_BLOCK; + +typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 { + ULONG Version; + ULONG Algorithm; + ULONG Flags; +} FILE_PROVIDER_EXTERNAL_INFO_V1, *PFILE_PROVIDER_EXTERNAL_INFO_V1; + +#ifndef STATUS_OBJECT_NOT_EXTERNALLY_BACKED +#define STATUS_OBJECT_NOT_EXTERNALLY_BACKED ((NTSTATUS)0xC000046DL) +#endif +#ifndef STATUS_INVALID_DEVICE_REQUEST +#define STATUS_INVALID_DEVICE_REQUEST ((NTSTATUS)0xC0000010L) +#endif + +extern NTSTATUS +NtFsControlFile(IN HANDLE FileHandle, IN OUT HANDLE Event, + IN OUT PVOID /* PIO_APC_ROUTINE */ ApcRoutine, + IN OUT PVOID ApcContext, OUT PIO_STATUS_BLOCK IoStatusBlock, + IN ULONG FsControlCode, IN OUT PVOID InputBuffer, + IN ULONG InputBufferLength, OUT OPTIONAL PVOID OutputBuffer, + IN ULONG OutputBufferLength); + #endif /* _WIN32 || _WIN64 */ /*----------------------------------------------------------------------------*/ @@ -746,7 +775,89 @@ int mdbx_msync(mdbx_mmap_t 
*map, size_t offset, size_t length, int async) { int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit) { #if defined(_WIN32) || defined(_WIN64) - NTSTATUS rc = NtCreateSection( + map->section = 0; + map->address = MAP_FAILED; + + if (GetFileType(map->fd) != FILE_TYPE_DISK) + return ERROR_FILE_OFFLINE; + + FILE_REMOTE_PROTOCOL_INFO RemoteProtocolInfo; + if (GetFileInformationByHandleEx(map->fd, FileRemoteProtocolInfo, + &RemoteProtocolInfo, + sizeof(RemoteProtocolInfo))) { + if ((RemoteProtocolInfo.Flags & (REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK | + REMOTE_PROTOCOL_INFO_FLAG_OFFLINE)) != + REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK) + return ERROR_FILE_OFFLINE; + } + + NTSTATUS rc; +#ifdef _WIN64 + struct { + WOF_EXTERNAL_INFO wof_info; + union { + WIM_PROVIDER_EXTERNAL_INFO wim_info; + FILE_PROVIDER_EXTERNAL_INFO_V1 file_info; + }; + size_t reserved_for_microsoft_madness[42]; + } GetExternalBacking_OutputBuffer; + IO_STATUS_BLOCK StatusBlock; + rc = NtFsControlFile(map->fd, NULL, NULL, NULL, &StatusBlock, + FSCTL_GET_EXTERNAL_BACKING, NULL, 0, + &GetExternalBacking_OutputBuffer, + sizeof(GetExternalBacking_OutputBuffer)); + if (rc != STATUS_OBJECT_NOT_EXTERNALLY_BACKED && + rc != STATUS_INVALID_DEVICE_REQUEST) + return NT_SUCCESS(rc) ? 
ERROR_FILE_OFFLINE : ntstatus2errcode(rc); +#endif + + WCHAR PathBuffer[INT16_MAX]; + DWORD VolumeSerialNumber, FileSystemFlags; + if (!GetVolumeInformationByHandleW(map->fd, PathBuffer, INT16_MAX, + &VolumeSerialNumber, NULL, + &FileSystemFlags, NULL, 0)) + return GetLastError(); + + if ((flags & MDBX_RDONLY) == 0) { + if (FileSystemFlags & (FILE_SEQUENTIAL_WRITE_ONCE | FILE_READ_ONLY_VOLUME | + FILE_VOLUME_IS_COMPRESSED)) + return ERROR_FILE_OFFLINE; + } + + if (!GetFinalPathNameByHandleW(map->fd, PathBuffer, INT16_MAX, + FILE_NAME_NORMALIZED | VOLUME_NAME_NT)) + return GetLastError(); + + if (_wcsnicmp(PathBuffer, L"\\Device\\Mup\\", 12) == 0) + return ERROR_FILE_OFFLINE; + + if (GetFinalPathNameByHandleW(map->fd, PathBuffer, INT16_MAX, + FILE_NAME_NORMALIZED | VOLUME_NAME_DOS)) { + UINT DriveType = GetDriveTypeW(PathBuffer); + if (DriveType == DRIVE_NO_ROOT_DIR && + wcsncmp(PathBuffer, L"\\\\?\\", 4) == 0 && + wcsncmp(PathBuffer + 5, L":\\", 2) == 0) { + PathBuffer[7] = 0; + DriveType = GetDriveTypeW(PathBuffer + 4); + } + switch (DriveType) { + case DRIVE_CDROM: + if (flags & MDBX_RDONLY) + break; + // fall through + case DRIVE_UNKNOWN: + case DRIVE_NO_ROOT_DIR: + case DRIVE_REMOTE: + default: + return ERROR_FILE_OFFLINE; + case DRIVE_REMOVABLE: + case DRIVE_FIXED: + case DRIVE_RAMDISK: + break; + } + } + + rc = NtCreateSection( &map->section, /* DesiredAccess */ SECTION_MAP_READ | SECTION_EXTEND_SIZE | ((flags & MDBX_WRITEMAP) ? 
SECTION_MAP_WRITE : 0), @@ -755,11 +866,8 @@ int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit) { : PAGE_READWRITE, /* AllocationAttributes */ SEC_RESERVE, map->fd); - if (!NT_SUCCESS(rc)) { - map->section = 0; - map->address = MAP_FAILED; + if (!NT_SUCCESS(rc)) return ntstatus2errcode(rc); - } map->address = NULL; SIZE_T ViewSize = limit; From 700ec68d063795a756911adb172694ab7ad66986 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 19 Jul 2017 14:13:18 +0300 Subject: [PATCH 295/303] mdbx-ci: add mdbx_chk info appveyor. Change-Id: Ic620a2dddfa6ea973c9f7b37bdc801282283db8a --- appveyor.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index bfaca865..2b1da729 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -30,15 +30,19 @@ test_script: - ps: | if (($env:PLATFORM -eq "x86") -and (Test-Path "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" -PathType Leaf)) { $test = "C:\projects\libmdbx\Win32\$env:CONFIGURATION\test.exe" + $mdbx_chk = "C:\projects\libmdbx\Win32\$env:CONFIGURATION\mdbx_chk.exe" } elseif (($env:PLATFORM -ne "ARM") -and ($env:PLATFORM -ne "ARM64")) { $test = "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\test.exe" + $mdbx_chk = "C:\projects\libmdbx\$env:PLATFORM\$env:CONFIGURATION\mdbx_chk.exe" } else { $test = "" + $mdbx_chk = "" } if ($test -ne "") { & "$test" --pathname=tmp.db --dont-cleanup-after basic | Tee-Object -file test.log | Select-Object -last 42 + & "$mdbx_chk" -nvv tmp.db | Tee-Object -file chk.log | Select-Object -last 42 } on_failure: -- ps: Push-AppveyorArtifact test.log +- ps: Push-AppveyorArtifact test.log chk.log From 17e8429a294c19b0363da4867ec5ceb991505f04 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 12 Jul 2017 21:13:17 +0300 Subject: [PATCH 296/303] mdbx: rework mmap-functions for osal. 
- add 'length' and 'current' fields to mmap-object; - drop mdbx_mremap(); - do remap on-demand inside mdbx_mresize(); - add mdbx_mapresize() which re-creates Valgrind's region. - call resize on txn-begin. Change-Id: I82780f92c4947804e3f14fb7cb71ee655382f9bb --- mdbx.h | 24 ++--- src/bits.h | 7 +- src/mdbx.c | 227 +++++++++++++++++++++++------------------- src/osal.c | 104 +++++++++++-------- src/osal.h | 10 +- src/tools/mdbx_chk.c | 102 +++++++++---------- src/tools/mdbx_dump.c | 4 +- src/tools/mdbx_load.c | 16 +-- src/tools/mdbx_stat.c | 40 ++++---- 9 files changed, 290 insertions(+), 244 deletions(-) diff --git a/mdbx.h b/mdbx.h index 3f010c4a..a7bba16a 100644 --- a/mdbx.h +++ b/mdbx.h @@ -448,18 +448,18 @@ typedef struct MDBX_envinfo { uint64_t current; /* current datafile size */ uint64_t shrink; /* shrink theshold for datafile */ uint64_t grow; /* growth step for datafile */ - } me_geo; - uint64_t me_mapsize; /* Size of the data memory map */ - uint64_t me_last_pgno; /* ID of the last used page */ - uint64_t me_recent_txnid; /* ID of the last committed transaction */ - uint64_t me_latter_reader_txnid; /* ID of the last reader transaction */ - uint64_t me_meta0_txnid, me_meta0_sign; - uint64_t me_meta1_txnid, me_meta1_sign; - uint64_t me_meta2_txnid, me_meta2_sign; - uint32_t me_maxreaders; /* max reader slots in the environment */ - uint32_t me_numreaders; /* max reader slots used in the environment */ - uint32_t me_dxb_pagesize; /* database pagesize */ - uint32_t me_sys_pagesize; /* system pagesize */ + } mi_geo; + uint64_t mi_mapsize; /* Size of the data memory map */ + uint64_t mi_last_pgno; /* ID of the last used page */ + uint64_t mi_recent_txnid; /* ID of the last committed transaction */ + uint64_t mi_latter_reader_txnid; /* ID of the last reader transaction */ + uint64_t mi_meta0_txnid, mi_meta0_sign; + uint64_t mi_meta1_txnid, mi_meta1_sign; + uint64_t mi_meta2_txnid, mi_meta2_sign; + uint32_t mi_maxreaders; /* max reader slots in the environment */ + 
uint32_t mi_numreaders; /* max reader slots used in the environment */ + uint32_t mi_dxb_pagesize; /* database pagesize */ + uint32_t mi_sys_pagesize; /* system pagesize */ } MDBX_envinfo; /* Return a string describing a given error code. diff --git a/src/bits.h b/src/bits.h index 3c35c8e3..d5cb9f73 100644 --- a/src/bits.h +++ b/src/bits.h @@ -660,11 +660,12 @@ struct MDBX_env { #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) size_t me_signature; mdbx_mmap_t me_dxb_mmap; /* The main data file */ - mdbx_mmap_t me_lck_mmap; /* The lock file */ #define me_map me_dxb_mmap.dxb -#define me_lck me_lck_mmap.lck #define me_fd me_dxb_mmap.fd +#define me_mapsize me_dxb_mmap.length + mdbx_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd +#define me_lck me_lck_mmap.lck /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) @@ -688,7 +689,6 @@ struct MDBX_env { void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ - size_t me_mapsize; /* size of the data memory map */ MDBX_dbx *me_dbxs; /* array of static DB info */ uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */ @@ -719,6 +719,7 @@ struct MDBX_env { #ifdef USE_VALGRIND int me_valgrind_handle; #endif + struct { size_t lower; /* minimal size of datafile */ size_t upper; /* maximal size of datafile */ diff --git a/src/mdbx.c b/src/mdbx.c index 68702420..445d948e 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1537,6 +1537,54 @@ static void mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { txn->mt_dirtyroom--; } +static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno, + const pgno_t limit_pgno) { +#ifdef USE_VALGRIND + const size_t prev_mapsize = env->me_mapsize; + void *const prev_mapaddr = env->me_map; +#endif + + const size_t limit_bytes = + mdbx_roundup2(pgno2bytes(env, 
limit_pgno), env->me_os_psize); + const size_t size_bytes = + mdbx_roundup2(pgno2bytes(env, size_pgno), env->me_os_psize); + + mdbx_info("resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR, + env->me_dbgeo.now, size_bytes, env->me_dbgeo.upper, limit_bytes); + + mdbx_assert(env, limit_bytes >= size_bytes); + mdbx_assert(env, bytes2pgno(env, size_bytes) == size_pgno); + mdbx_assert(env, bytes2pgno(env, limit_bytes) == limit_pgno); + const int rc = + mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); + + if (rc == MDBX_SUCCESS) { + if (env->me_txn0) + env->me_txn0->mt_end_pgno = size_pgno; + env->me_dbgeo.now = size_bytes; + env->me_dbgeo.upper = limit_bytes; + } else { + mdbx_error("failed resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", + env->me_dbgeo.now, size_bytes, env->me_dbgeo.upper, limit_bytes, + rc); + } + +#ifdef USE_VALGRIND + if (prev_mapsize != env->me_mapsize || prev_mapaddr != env->me_map) { + VALGRIND_DISCARD(env->me_valgrind_handle); + env->me_valgrind_handle = 0; + if (env->me_mapsize) + env->me_valgrind_handle = + VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "mdbx"); + } +#endif + return rc; +} + /* Allocate page numbers and memory for writing. Maintain me_last_reclaimed, * me_reclaimed_pglist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. 
* @@ -1835,36 +1883,24 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, } } - if (rc == MDBX_MAP_FULL) { + if (rc == MDBX_MAP_FULL && next < head->mm_geo.upper) { mdbx_assert(env, next > txn->mt_end_pgno); - if (unlikely(pgno2bytes(env, next) <= env->me_mapsize)) { - pgno_t growth_pgno = txn->mt_next_pgno + head->mm_geo.grow; - if (growth_pgno > MAX_PAGENO) - growth_pgno = MAX_PAGENO; - size_t growth_bytes = - mdbx_roundup2(pgno2bytes(env, growth_pgno), env->me_os_psize); - if (growth_bytes > env->me_mapsize) - growth_bytes = env->me_mapsize; - growth_pgno = bytes2pgno(env, growth_bytes); - mdbx_assert(env, growth_pgno <= head->mm_geo.upper); - mdbx_assert(env, growth_pgno > txn->mt_end_pgno); - mdbx_info("growth datafile to %" PRIaPGNO " pages (+%" PRIaPGNO - "), %" PRIuPTR " bytes", - growth_pgno, growth_pgno - txn->mt_end_pgno, growth_bytes); + pgno_t growth_pgno = bytes2pgno( + env, + mdbx_roundup2(pgno2bytes(env, txn->mt_next_pgno + head->mm_geo.grow), + env->me_os_psize)); + if (growth_pgno > head->mm_geo.upper) + growth_pgno = head->mm_geo.upper; - rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, - growth_bytes); - if (rc == MDBX_SUCCESS) { - txn->mt_end_pgno = growth_pgno; - env->me_dbgeo.now = growth_bytes; - continue; - } - mdbx_error("error while growth datafile to %" PRIaPGNO - "pages (+%" PRIaPGNO "), %" PRIuPTR " bytes, errcode %d", - growth_pgno, growth_pgno - txn->mt_end_pgno, growth_bytes, - rc); - } else if (next < head->mm_geo.upper) - rc = MDBX_MAP_RESIZED; + mdbx_info("try growth datafile to %" PRIaPGNO " pages (+%" PRIaPGNO ")", + growth_pgno, growth_pgno - txn->mt_end_pgno); + rc = mdbx_mapresize(env, growth_pgno, head->mm_geo.upper); + if (rc == MDBX_SUCCESS) + continue; + + mdbx_warning("unable growth datafile to %" PRIaPGNO "pages (+%" PRIaPGNO + "), errcode %d", + growth_pgno, growth_pgno - txn->mt_end_pgno, rc); } fail: @@ -2267,6 +2303,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, 
unsigned flags) { return MDBX_PANIC; } + pgno_t upper_pgno = 0; if (flags & MDBX_TXN_RDONLY) { txn->mt_flags = MDBX_TXN_RDONLY; MDBX_reader *r = txn->mt_ro_reader; @@ -2364,6 +2401,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { txn->mt_txnid = snap; txn->mt_next_pgno = meta->mm_geo.next; txn->mt_end_pgno = meta->mm_geo.now; + upper_pgno = meta->mm_geo.upper; memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); txn->mt_canary = meta->mm_canary; @@ -2417,6 +2455,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { /* Moved to here to avoid a data race in read TXNs */ txn->mt_next_pgno = meta->mm_geo.next; txn->mt_end_pgno = meta->mm_geo.now; + upper_pgno = meta->mm_geo.upper; } /* Setup db info */ @@ -2433,9 +2472,18 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { mdbx_debug("environment had fatal error, must shutdown!"); rc = MDBX_PANIC; - } else if (unlikely(env->me_mapsize < pgno2bytes(env, txn->mt_next_pgno))) { - rc = MDBX_MAP_RESIZED; } else { + const size_t size = pgno2bytes(env, txn->mt_end_pgno); + if (unlikely(size > env->me_mapsize)) { + if (upper_pgno > MAX_PAGENO || + bytes2pgno(env, pgno2bytes(env, upper_pgno)) != upper_pgno) { + rc = MDBX_MAP_RESIZED; + goto bailout; + } + rc = mdbx_mapresize(env, txn->mt_end_pgno, upper_pgno); + if (rc != MDBX_SUCCESS) + goto bailout; + } return MDBX_SUCCESS; } bailout: @@ -3801,10 +3849,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, /* Windows is unable shrinking a mapped file */ #else /* LY: check conditions to shrink datafile */ + pgno_t shrink_pgno_delta = 0; const pgno_t shrink_pgno = pending->mm_geo.next /* + pending->mm_geo.grow */; - const size_t shrink_bytes = - mdbx_roundup2(pgno2bytes(env, shrink_pgno), env->me_os_psize); - size_t shrink_pgno_delta = 0; if (pending->mm_geo.now > shrink_pgno && pending->mm_geo.shrink && unlikely(pending->mm_geo.now - pending->mm_geo.shrink >= 
shrink_pgno)) { if (pending->mm_geo.now > shrink_pgno && @@ -3965,12 +4011,9 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, /* Windows is unable shrinking a mapped file */ #else /* LY: shrink datafile if needed */ - if (shrink_pgno_delta) { - rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, - shrink_bytes); - if (rc == MDBX_SUCCESS) - env->me_dbgeo.now = shrink_bytes; - else if (rc != MDBX_RESULT_TRUE) + if (unlikely(shrink_pgno_delta)) { + rc = mdbx_mapresize(env, pending->mm_geo.now, pending->mm_geo.upper); + if (MDBX_IS_ERROR(rc)) goto fail; } #endif /* not a Windows */ @@ -4079,7 +4122,7 @@ bailout: static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) { int rc = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, - env->me_mapsize); + env->me_dbgeo.upper); if (unlikely(rc != MDBX_SUCCESS)) { env->me_map = NULL; return rc; @@ -4304,7 +4347,7 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower, if (bytes2pgno(env, shrink_threshold) > UINT16_MAX) shrink_threshold = pgno2bytes(env, UINT16_MAX); - /* save params for future open/create */ + /* save user's geo-params for future open/create */ env->me_dbgeo.lower = size_lower; env->me_dbgeo.now = size_now; env->me_dbgeo.upper = size_upper; @@ -4337,27 +4380,9 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower, meta.mm_geo.shrink == bytes2pgno(env, env->me_dbgeo.shrink)); if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) { - if (meta.mm_geo.upper != head->mm_geo.upper) { - const size_t size = - mdbx_roundup2(pgno2bytes(env, meta.mm_geo.upper), env->me_os_psize); - - rc = mdbx_mremap(env->me_flags, &env->me_dxb_mmap, env->me_mapsize, - size); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - env->me_mapsize = size; -#ifdef USE_VALGRIND - VALGRIND_DISCARD(env->me_valgrind_handle); - env->me_valgrind_handle = - VALGRIND_CREATE_BLOCK(env->me_map, env->me_mapsize, "mdbx"); -#endif - } - if (meta.mm_geo.now 
!= head->mm_geo.now) { - const size_t size = - mdbx_roundup2(pgno2bytes(env, meta.mm_geo.now), env->me_os_psize); - - rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, - pgno2bytes(env, head->mm_geo.now), size); + if (meta.mm_geo.now != head->mm_geo.now || + meta.mm_geo.upper != head->mm_geo.upper) { + rc = mdbx_mapresize(env, meta.mm_geo.now, meta.mm_geo.upper); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -4503,7 +4528,6 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { meta.mm_geo.shrink = (uint16_t)bytes2pgno(env, env->me_dbgeo.shrink); mdbx_ensure(env, meta.mm_geo.now >= meta.mm_geo.next); } - env->me_mapsize = env->me_dbgeo.upper; uint64_t filesize; err = mdbx_filesize(env->me_fd, &filesize); @@ -4541,7 +4565,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { } } - err = mdbx_env_map(env, env->me_mapsize); + err = mdbx_env_map(env, expected_bytes); if (err) return err; @@ -4997,7 +5021,7 @@ static void __cold mdbx_env_close0(MDBX_env *env) { } if (env->me_map) { - mdbx_munmap(&env->me_dxb_mmap, env->me_mapsize); + mdbx_munmap(&env->me_dxb_mmap); #ifdef USE_VALGRIND VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = -1; @@ -5009,9 +5033,7 @@ static void __cold mdbx_env_close0(MDBX_env *env) { } if (env->me_lck) { - mdbx_munmap(&env->me_lck_mmap, - (env->me_maxreaders - 1) * sizeof(MDBX_reader) + - sizeof(MDBX_lockinfo)); + mdbx_munmap(&env->me_lck_mmap); env->me_lck = nullptr; } env->me_pid = 0; @@ -9607,44 +9629,44 @@ int __cold mdbx_env_info(MDBX_env *env, MDBX_envinfo *arg, size_t bytes) { const MDBX_meta *meta; do { meta = mdbx_meta_head(env); - arg->me_recent_txnid = mdbx_meta_txnid_fluid(env, meta); - arg->me_meta0_txnid = mdbx_meta_txnid_fluid(env, meta0); - arg->me_meta0_sign = meta0->mm_datasync_sign; - arg->me_meta1_txnid = mdbx_meta_txnid_fluid(env, meta1); - arg->me_meta1_sign = meta1->mm_datasync_sign; - arg->me_meta2_txnid = mdbx_meta_txnid_fluid(env, meta2); - arg->me_meta2_sign = 
meta2->mm_datasync_sign; - arg->me_last_pgno = meta->mm_geo.next - 1; - arg->me_geo.lower = pgno2bytes(env, meta->mm_geo.lower); - arg->me_geo.upper = pgno2bytes(env, meta->mm_geo.upper); - arg->me_geo.current = pgno2bytes(env, meta->mm_geo.now); - arg->me_geo.shrink = pgno2bytes(env, meta->mm_geo.shrink); - arg->me_geo.grow = pgno2bytes(env, meta->mm_geo.grow); - arg->me_mapsize = env->me_mapsize; + arg->mi_recent_txnid = mdbx_meta_txnid_fluid(env, meta); + arg->mi_meta0_txnid = mdbx_meta_txnid_fluid(env, meta0); + arg->mi_meta0_sign = meta0->mm_datasync_sign; + arg->mi_meta1_txnid = mdbx_meta_txnid_fluid(env, meta1); + arg->mi_meta1_sign = meta1->mm_datasync_sign; + arg->mi_meta2_txnid = mdbx_meta_txnid_fluid(env, meta2); + arg->mi_meta2_sign = meta2->mm_datasync_sign; + arg->mi_last_pgno = meta->mm_geo.next - 1; + arg->mi_geo.lower = pgno2bytes(env, meta->mm_geo.lower); + arg->mi_geo.upper = pgno2bytes(env, meta->mm_geo.upper); + arg->mi_geo.current = pgno2bytes(env, meta->mm_geo.now); + arg->mi_geo.shrink = pgno2bytes(env, meta->mm_geo.shrink); + arg->mi_geo.grow = pgno2bytes(env, meta->mm_geo.grow); + arg->mi_mapsize = env->me_mapsize; mdbx_compiler_barrier(); - } while (unlikely(arg->me_meta0_txnid != mdbx_meta_txnid_fluid(env, meta0) || - arg->me_meta0_sign != meta0->mm_datasync_sign || - arg->me_meta1_txnid != mdbx_meta_txnid_fluid(env, meta1) || - arg->me_meta1_sign != meta1->mm_datasync_sign || - arg->me_meta2_txnid != mdbx_meta_txnid_fluid(env, meta2) || - arg->me_meta2_sign != meta2->mm_datasync_sign || + } while (unlikely(arg->mi_meta0_txnid != mdbx_meta_txnid_fluid(env, meta0) || + arg->mi_meta0_sign != meta0->mm_datasync_sign || + arg->mi_meta1_txnid != mdbx_meta_txnid_fluid(env, meta1) || + arg->mi_meta1_sign != meta1->mm_datasync_sign || + arg->mi_meta2_txnid != mdbx_meta_txnid_fluid(env, meta2) || + arg->mi_meta2_sign != meta2->mm_datasync_sign || meta != mdbx_meta_head(env) || - arg->me_recent_txnid != mdbx_meta_txnid_fluid(env, meta))); + 
arg->mi_recent_txnid != mdbx_meta_txnid_fluid(env, meta))); - arg->me_maxreaders = env->me_maxreaders; - arg->me_numreaders = env->me_lck ? env->me_lck->mti_numreaders : INT32_MAX; - arg->me_dxb_pagesize = env->me_psize; - arg->me_sys_pagesize = env->me_os_psize; + arg->mi_maxreaders = env->me_maxreaders; + arg->mi_numreaders = env->me_lck ? env->me_lck->mti_numreaders : INT32_MAX; + arg->mi_dxb_pagesize = env->me_psize; + arg->mi_sys_pagesize = env->me_os_psize; - arg->me_latter_reader_txnid = 0; + arg->mi_latter_reader_txnid = 0; if (env->me_lck) { MDBX_reader *r = env->me_lck->mti_readers; - arg->me_latter_reader_txnid = arg->me_recent_txnid; - for (unsigned i = 0; i < arg->me_numreaders; ++i) { + arg->mi_latter_reader_txnid = arg->mi_recent_txnid; + for (unsigned i = 0; i < arg->mi_numreaders; ++i) { if (r[i].mr_pid) { const txnid_t txnid = r[i].mr_txnid; - if (arg->me_latter_reader_txnid > txnid) - arg->me_latter_reader_txnid = txnid; + if (arg->mi_latter_reader_txnid > txnid) + arg->mi_latter_reader_txnid = txnid; } } } @@ -10451,9 +10473,10 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) return MDBX_THREAD_MISMATCH; MDBX_env *env = txn->mt_env; - pgno_t maxpg = bytes2pgno(env, env->me_mapsize); if (unlikely((txn->mt_flags & MDBX_RDONLY) == 0)) { - *percent = (int)((txn->mt_next_pgno * UINT64_C(100) + maxpg / 2) / maxpg); + *percent = + (int)((txn->mt_next_pgno * UINT64_C(100) + txn->mt_end_pgno / 2) / + txn->mt_end_pgno); return -1; } @@ -10462,8 +10485,10 @@ int mdbx_txn_straggler(MDBX_txn *txn, int *percent) do { meta = mdbx_meta_head(env); recent = mdbx_meta_txnid_fluid(env, meta); - if (percent) + if (percent) { + const pgno_t maxpg = meta->mm_geo.now; *percent = (int)((meta->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); + } } while (unlikely(recent != mdbx_meta_txnid_fluid(env, meta))); txnid_t lag = recent - txn->mt_ro_reader->mr_txnid; diff --git a/src/osal.c b/src/osal.c index 655f104f..d7547370 100644 --- a/src/osal.c +++ b/src/osal.c @@ 
-773,9 +773,12 @@ int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async) { #endif } -int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit) { +int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t must, size_t limit) { + assert(must <= limit); #if defined(_WIN32) || defined(_WIN64) - map->section = 0; + map->length = 0; + map->current = 0; + map->section = NULL; map->address = MAP_FAILED; if (GetFileType(map->fd) != FILE_TYPE_DISK) @@ -870,11 +873,11 @@ int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit) { return ntstatus2errcode(rc); map->address = NULL; - SIZE_T ViewSize = limit; + SIZE_T ViewSize = (flags & MDBX_RDONLY) ? must : limit; rc = NtMapViewOfSection( map->section, GetCurrentProcess(), &map->address, /* ZeroBits */ 0, - /* CommitSize */ length, + /* CommitSize */ must, /* SectionOffset */ NULL, &ViewSize, /* InheritDisposition */ ViewUnmap, /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE, @@ -889,26 +892,40 @@ int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit) { } assert(map->address != MAP_FAILED); + map->current = must; + map->length = ViewSize; return MDBX_SUCCESS; #else - (void)length; + (void)must; map->address = mmap( NULL, limit, (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ, MAP_SHARED, map->fd, 0); - return (map->address != MAP_FAILED) ? MDBX_SUCCESS : errno; + if (likely(map->address != MAP_FAILED)) { + map->length = limit; + return MDBX_SUCCESS; + } + map->length = 0; + return errno; #endif } -int mdbx_munmap(mdbx_mmap_t *map, size_t length) { +int mdbx_munmap(mdbx_mmap_t *map) { #if defined(_WIN32) || defined(_WIN64) - (void)length; if (map->section) NtClose(map->section); NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->address); - return NT_SUCCESS(rc) ? 
MDBX_SUCCESS : ntstatus2errcode(rc); + if (!NT_SUCCESS(rc)) + ntstatus2errcode(rc); + map->length = 0; + map->current = 0; + map->address = nullptr; #else - return (munmap(map->address, length) == 0) ? MDBX_SUCCESS : errno; + if (unlikely(munmap(map->address, map->length))) + return errno; + map->length = 0; + map->address = nullptr; #endif + return MDBX_SUCCESS; } int mdbx_mlock(mdbx_mmap_t *map, size_t length) { @@ -919,45 +936,46 @@ int mdbx_mlock(mdbx_mmap_t *map, size_t length) { #endif } -int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, size_t wanna) { +int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t must, size_t limit) { + assert(must <= limit); #if defined(_WIN32) || defined(_WIN64) - if (wanna > current) { - /* growth */ - uint8_t *ptr = (uint8_t *)map->address + current; - return (ptr == VirtualAlloc(ptr, wanna - current, MEM_COMMIT, - (flags & MDBX_WRITEMAP) ? PAGE_READWRITE - : PAGE_READONLY)) - ? MDBX_SUCCESS - : GetLastError(); - } - /* Windows is unable shrinking a mapped file */ - return MDBX_RESULT_TRUE; -#else - (void)flags; - (void)current; - return mdbx_ftruncate(map->fd, wanna); -#endif -} - -int mdbx_mremap(int flags, mdbx_mmap_t *map, size_t old_limit, - size_t new_limit) { -#if defined(_WIN32) || defined(_WIN64) - (void)flags; - if (old_limit > new_limit) { + if (limit < map->length) { /* Windows is unable shrinking a mapped section */ return ERROR_USER_MAPPED_FILE; } - LARGE_INTEGER new_size; - new_size.QuadPart = new_limit; - NTSTATUS rc = NtExtendSection(map->section, &new_size); - return NT_SUCCESS(rc) ? 
MDBX_SUCCESS : ntstatus2errcode(rc); + if (limit > map->length) { + /* extend */ + LARGE_INTEGER new_size; + new_size.QuadPart = limit; + NTSTATUS rc = NtExtendSection(map->section, &new_size); + if (!NT_SUCCESS(rc)) + return ntstatus2errcode(rc); + map->length = limit; + } + if (must < map->current) { + /* Windows is unable shrinking a mapped file */ + return MDBX_RESULT_TRUE; + } + if (must > map->current) { + /* growth */ + uint8_t *ptr = (uint8_t *)map->address + map->current; + if (ptr != + VirtualAlloc(ptr, must - map->current, MEM_COMMIT, + (flags & MDBX_WRITEMAP) ? PAGE_READWRITE : PAGE_READONLY)) + return GetLastError(); + map->current = must; + } + return MDBX_SUCCESS; #else (void)flags; - void *ptr = mremap(map->address, old_limit, new_limit, 0); - if (ptr == MAP_FAILED) - return errno; - map->address = ptr; - return MDBX_SUCCESS; + if (limit != map->length) { + void *ptr = mremap(map->address, map->length, limit, MREMAP_MAYMOVE); + if (ptr == MAP_FAILED) + return errno; + map->address = ptr; + map->length = limit; + } + return mdbx_ftruncate(map->fd, must); #endif } diff --git a/src/osal.h b/src/osal.h index 413e7d9b..c35ffa3e 100644 --- a/src/osal.h +++ b/src/osal.h @@ -444,17 +444,19 @@ typedef struct mdbx_mmap_param { struct MDBX_lockinfo *lck; }; mdbx_filehandle_t fd; + size_t length; /* mapping length, but NOT a size of file or DB */ +#if defined(_WIN32) || defined(_WIN64) + size_t current; /* mapped region size, e.g. 
file and DB */ +#endif #ifdef MDBX_OSAL_SECTION MDBX_OSAL_SECTION section; #endif } mdbx_mmap_t; -int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit); -int mdbx_munmap(mdbx_mmap_t *map, size_t length); +int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t must, size_t limit); +int mdbx_munmap(mdbx_mmap_t *map); int mdbx_mlock(mdbx_mmap_t *map, size_t length); int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, size_t wanna); -int mdbx_mremap(int flags, mdbx_mmap_t *map, size_t old_limit, - size_t new_limit); int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async); static __inline mdbx_pid_t mdbx_getpid(void) { diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 92eff1aa..7a93d70d 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -335,7 +335,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, if (key->iov_len != sizeof(txnid_t)) problem_add("entry", record_number, "wrong txn-id size", "key-size %" PRIiPTR "", key->iov_len); - else if (txnid < 1 || txnid > envinfo.me_recent_txnid) + else if (txnid < 1 || txnid > envinfo.mi_recent_txnid) problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN "", txnid); if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t)) @@ -352,14 +352,14 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, data->iov_len); else { freedb_pages += number; - if (envinfo.me_latter_reader_txnid > txnid) + if (envinfo.mi_latter_reader_txnid > txnid) reclaimable_pages += number; for (i = number, prev = 1; --i >= 0;) { pg = iptr[i]; - if (pg < NUM_METAS || pg > envinfo.me_last_pgno) + if (pg < NUM_METAS || pg > envinfo.mi_last_pgno) problem_add("entry", record_number, "wrong idl entry", "%u < %" PRIiPTR " < %" PRIiPTR "", NUM_METAS, pg, - envinfo.me_last_pgno); + envinfo.mi_last_pgno); else if (pg <= prev) { bad = " [bad sequence]"; problem_add("entry", record_number, "bad sequence", @@ -636,16 +636,16 @@ 
static __inline bool meta_eq(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b, static __inline int meta_recent(const bool roolback2steady) { - if (meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, - envinfo.me_meta1_txnid, envinfo.me_meta1_sign, roolback2steady)) - return meta_ot(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, - envinfo.me_meta1_txnid, envinfo.me_meta1_sign, + if (meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, + envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, roolback2steady)) + return meta_ot(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, + envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, roolback2steady) ? 1 : 2; - return meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, - envinfo.me_meta2_txnid, envinfo.me_meta2_sign, roolback2steady) + return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, + envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, roolback2steady) ? 2 : 0; } @@ -653,18 +653,18 @@ static __inline int meta_recent(const bool roolback2steady) { static __inline int meta_tail(int head) { if (head == 0) - return meta_ot(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, - envinfo.me_meta2_txnid, envinfo.me_meta2_sign, true) + return meta_ot(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, + envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, true) ? 1 : 2; if (head == 1) - return meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, - envinfo.me_meta2_txnid, envinfo.me_meta2_sign, true) + return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, + envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, true) ? 0 : 2; if (head == 2) - return meta_ot(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, - envinfo.me_meta1_txnid, envinfo.me_meta1_sign, true) + return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, + envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, true) ? 
0 : 1; assert(false); @@ -698,10 +698,10 @@ void verbose_meta(int num, txnid_t txnid, uint64_t sign) { if (stay) print(", stay"); - if (txnid > envinfo.me_recent_txnid && + if (txnid > envinfo.mi_recent_txnid && (exclusive || (envflags & MDBX_RDONLY) == 0)) print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", - txnid - envinfo.me_recent_txnid, txnid, envinfo.me_recent_txnid); + txnid - envinfo.mi_recent_txnid, txnid, envinfo.mi_recent_txnid); print("\n"); } @@ -712,26 +712,26 @@ static int check_meta_head(bool steady) { error(" - unexpected internal error (%s)\n", steady ? "meta_steady_head" : "meta_weak_head"); case 0: - if (envinfo.me_meta0_txnid != envinfo.me_recent_txnid) { + if (envinfo.mi_meta0_txnid != envinfo.mi_recent_txnid) { print(" - meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 ")\n", - 0, envinfo.me_meta0_txnid, envinfo.me_recent_txnid); + 0, envinfo.mi_meta0_txnid, envinfo.mi_recent_txnid); return 1; } break; case 1: - if (envinfo.me_meta1_txnid != envinfo.me_recent_txnid) { + if (envinfo.mi_meta1_txnid != envinfo.mi_recent_txnid) { print(" - meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 ")\n", - 1, envinfo.me_meta1_txnid, envinfo.me_recent_txnid); + 1, envinfo.mi_meta1_txnid, envinfo.mi_recent_txnid); return 1; } break; case 2: - if (envinfo.me_meta2_txnid != envinfo.me_recent_txnid) { + if (envinfo.mi_meta2_txnid != envinfo.mi_recent_txnid) { print(" - meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 ")\n", - 2, envinfo.me_meta2_txnid, envinfo.me_recent_txnid); + 2, envinfo.mi_meta2_txnid, envinfo.mi_recent_txnid); return 1; } } @@ -890,50 +890,50 @@ int main(int argc, char *argv[]) { goto bailout; } - lastpgno = envinfo.me_last_pgno + 1; + lastpgno = envinfo.mi_last_pgno + 1; errno = 0; if (verbose) { print(" - pagesize %u (%u system), max keysize %" PRIuPTR ", max readers %u\n", - envinfo.me_dxb_pagesize, envinfo.me_sys_pagesize, maxkeysize, - envinfo.me_maxreaders); - print_size(" - 
mapsize ", envinfo.me_mapsize, "\n"); - if (envinfo.me_geo.lower == envinfo.me_geo.upper) - print_size(" - fixed datafile: ", envinfo.me_geo.current, ""); + envinfo.mi_dxb_pagesize, envinfo.mi_sys_pagesize, maxkeysize, + envinfo.mi_maxreaders); + print_size(" - mapsize ", envinfo.mi_mapsize, "\n"); + if (envinfo.mi_geo.lower == envinfo.mi_geo.upper) + print_size(" - fixed datafile: ", envinfo.mi_geo.current, ""); else { - print_size(" - dynamic datafile: ", envinfo.me_geo.lower, ""); - print_size(" .. ", envinfo.me_geo.upper, ", "); - print_size("+", envinfo.me_geo.grow, ", "); - print_size("-", envinfo.me_geo.shrink, "\n"); - print_size(" - current datafile: ", envinfo.me_geo.current, ""); + print_size(" - dynamic datafile: ", envinfo.mi_geo.lower, ""); + print_size(" .. ", envinfo.mi_geo.upper, ", "); + print_size("+", envinfo.mi_geo.grow, ", "); + print_size("-", envinfo.mi_geo.shrink, "\n"); + print_size(" - current datafile: ", envinfo.mi_geo.current, ""); } printf(", %" PRIu64 " pages\n", - envinfo.me_geo.current / envinfo.me_dxb_pagesize); + envinfo.mi_geo.current / envinfo.mi_dxb_pagesize); print(" - transactions: recent %" PRIu64 ", latter reader %" PRIu64 ", lag %" PRIi64 "\n", - envinfo.me_recent_txnid, envinfo.me_latter_reader_txnid, - envinfo.me_recent_txnid - envinfo.me_latter_reader_txnid); + envinfo.mi_recent_txnid, envinfo.mi_latter_reader_txnid, + envinfo.mi_recent_txnid - envinfo.mi_latter_reader_txnid); - verbose_meta(0, envinfo.me_meta0_txnid, envinfo.me_meta0_sign); - verbose_meta(1, envinfo.me_meta1_txnid, envinfo.me_meta1_sign); - verbose_meta(2, envinfo.me_meta2_txnid, envinfo.me_meta2_sign); + verbose_meta(0, envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign); + verbose_meta(1, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign); + verbose_meta(2, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign); } if (verbose) print(" - performs check for meta-pages clashes\n"); - if (meta_eq(envinfo.me_meta0_txnid, envinfo.me_meta0_sign, - envinfo.me_meta1_txnid, 
envinfo.me_meta1_sign)) { + if (meta_eq(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, + envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign)) { print(" - meta-%d and meta-%d are clashed\n", 0, 1); ++problems_meta; } - if (meta_eq(envinfo.me_meta1_txnid, envinfo.me_meta1_sign, - envinfo.me_meta2_txnid, envinfo.me_meta2_sign)) { + if (meta_eq(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, + envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign)) { print(" - meta-%d and meta-%d are clashed\n", 1, 2); ++problems_meta; } - if (meta_eq(envinfo.me_meta2_txnid, envinfo.me_meta2_sign, - envinfo.me_meta0_txnid, envinfo.me_meta0_sign)) { + if (meta_eq(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, + envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign)) { print(" - meta-%d and meta-%d are clashed\n", 2, 0); ++problems_meta; } @@ -1042,15 +1042,15 @@ int main(int argc, char *argv[]) { problems_freedb = process_db(FREE_DBI, "free", handle_freedb, false); if (verbose) { - uint64_t value = envinfo.me_mapsize / envstat.ms_psize; + uint64_t value = envinfo.mi_mapsize / envstat.ms_psize; double percent = value / 100.0; print(" - pages info: %" PRIu64 " total", value); - value = envinfo.me_geo.current / envinfo.me_dxb_pagesize; + value = envinfo.mi_geo.current / envinfo.mi_dxb_pagesize; print(", backed %" PRIu64 " (%.1f%%)", value, value / percent); print(", allocated %" PRIu64 " (%.1f%%)", lastpgno, lastpgno / percent); if (verbose > 1) { - value = envinfo.me_mapsize / envstat.ms_psize - lastpgno; + value = envinfo.mi_mapsize / envstat.ms_psize - lastpgno; print(", remained %" PRIu64 " (%.1f%%)", value, value / percent); value = lastpgno - freedb_pages; @@ -1066,7 +1066,7 @@ int main(int argc, char *argv[]) { } value = - envinfo.me_mapsize / envstat.ms_psize - lastpgno + reclaimable_pages; + envinfo.mi_mapsize / envstat.ms_psize - lastpgno + reclaimable_pages; print(", available %" PRIu64 " (%.1f%%)\n", value, value / percent); } diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index 
c9fb6f20..71c300dd 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -122,8 +122,8 @@ static int dumpit(MDBX_txn *txn, MDBX_dbi dbi, char *name) { if (name) printf("database=%s\n", name); printf("type=btree\n"); - printf("mapsize=%" PRIu64 "\n", info.me_mapsize); - printf("maxreaders=%u\n", info.me_maxreaders); + printf("mapsize=%" PRIu64 "\n", info.mi_mapsize); + printf("maxreaders=%u\n", info.mi_maxreaders); for (i = 0; dbflags[i].bit; i++) if (flags & dbflags[i].bit) diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index 91c1bfae..16180e86 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -137,7 +137,7 @@ static void readhdr(void) { if (ptr) *ptr = '\0'; i = sscanf((char *)dbuf.iov_base + STRLENOF("mapsize="), "%" PRIu64 "", - &envinfo.me_mapsize); + &envinfo.mi_mapsize); if (i != 1) { fprintf(stderr, "%s: line %" PRIiPTR ": invalid mapsize %s\n", prog, lineno, (char *)dbuf.iov_base + STRLENOF("mapsize=")); @@ -150,7 +150,7 @@ static void readhdr(void) { if (ptr) *ptr = '\0'; i = sscanf((char *)dbuf.iov_base + STRLENOF("maxreaders="), "%u", - &envinfo.me_maxreaders); + &envinfo.mi_maxreaders); if (i != 1) { fprintf(stderr, "%s: line %" PRIiPTR ": invalid maxreaders %s\n", prog, lineno, (char *)dbuf.iov_base + STRLENOF("maxreaders=")); @@ -393,20 +393,20 @@ int main(int argc, char *argv[]) { mdbx_env_set_maxdbs(env, 2); - if (envinfo.me_maxreaders) - mdbx_env_set_maxreaders(env, envinfo.me_maxreaders); + if (envinfo.mi_maxreaders) + mdbx_env_set_maxreaders(env, envinfo.mi_maxreaders); - if (envinfo.me_mapsize) { - if (envinfo.me_mapsize > SIZE_MAX) { + if (envinfo.mi_mapsize) { + if (envinfo.mi_mapsize > SIZE_MAX) { fprintf(stderr, "mdbx_env_set_mapsize failed, error %d %s\n", rc, mdbx_strerror(MDBX_TOO_LARGE)); return EXIT_FAILURE; } - mdbx_env_set_mapsize(env, (size_t)envinfo.me_mapsize); + mdbx_env_set_mapsize(env, (size_t)envinfo.mi_mapsize); } #ifdef MDBX_FIXEDMAP - if (info.me_mapaddr) + if (info.mi_mapaddr) 
envflags |= MDBX_FIXEDMAP; #endif diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index eaf0371a..7fbe924b 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -157,29 +157,29 @@ int main(int argc, char *argv[]) { (void)mdbx_env_info(env, &mei, sizeof(mei)); printf("Environment Info\n"); printf(" Pagesize: %u\n", mst.ms_psize); - if (mei.me_geo.lower != mei.me_geo.upper) { + if (mei.mi_geo.lower != mei.mi_geo.upper) { printf(" Dynamic datafile: %" PRIu64 "..%" PRIu64 " bytes (+%" PRIu64 "/-%" PRIu64 "), %" PRIu64 "..%" PRIu64 " pages (+%" PRIu64 "/-%" PRIu64 ")\n", - mei.me_geo.lower, mei.me_geo.upper, mei.me_geo.grow, - mei.me_geo.shrink, mei.me_geo.lower / mst.ms_psize, - mei.me_geo.upper / mst.ms_psize, mei.me_geo.grow / mst.ms_psize, - mei.me_geo.shrink / mst.ms_psize); + mei.mi_geo.lower, mei.mi_geo.upper, mei.mi_geo.grow, + mei.mi_geo.shrink, mei.mi_geo.lower / mst.ms_psize, + mei.mi_geo.upper / mst.ms_psize, mei.mi_geo.grow / mst.ms_psize, + mei.mi_geo.shrink / mst.ms_psize); printf(" Current datafile: %" PRIu64 " bytes, %" PRIu64 " pages\n", - mei.me_geo.current, mei.me_geo.current / mst.ms_psize); + mei.mi_geo.current, mei.mi_geo.current / mst.ms_psize); } else { printf(" Fixed datafile: %" PRIu64 " bytes, %" PRIu64 " pages\n", - mei.me_geo.current, mei.me_geo.current / mst.ms_psize); + mei.mi_geo.current, mei.mi_geo.current / mst.ms_psize); } printf(" Current mapsize: %" PRIu64 " bytes, %" PRIu64 " pages \n", - mei.me_mapsize, mei.me_mapsize / mst.ms_psize); - printf(" Number of pages used: %" PRIu64 "\n", mei.me_last_pgno + 1); - printf(" Last transaction ID: %" PRIu64 "\n", mei.me_recent_txnid); + mei.mi_mapsize, mei.mi_mapsize / mst.ms_psize); + printf(" Number of pages used: %" PRIu64 "\n", mei.mi_last_pgno + 1); + printf(" Last transaction ID: %" PRIu64 "\n", mei.mi_recent_txnid); printf(" Tail transaction ID: %" PRIu64 " (%" PRIi64 ")\n", - mei.me_latter_reader_txnid, - mei.me_latter_reader_txnid - mei.me_recent_txnid); - 
printf(" Max readers: %u\n", mei.me_maxreaders); - printf(" Number of readers used: %u\n", mei.me_numreaders); + mei.mi_latter_reader_txnid, + mei.mi_latter_reader_txnid - mei.mi_recent_txnid); + printf(" Max readers: %u\n", mei.mi_maxreaders); + printf(" Number of readers used: %u\n", mei.mi_numreaders); } else { /* LY: zap warnings from gcc */ memset(&mst, 0, sizeof(mst)); @@ -234,7 +234,7 @@ int main(int argc, char *argv[]) { } iptr = data.iov_base; pages += *iptr; - if (envinfo && mei.me_latter_reader_txnid > *(size_t *)key.iov_base) + if (envinfo && mei.mi_latter_reader_txnid > *(size_t *)key.iov_base) reclaimable += *iptr; if (freinfo > 1) { char *bad = ""; @@ -268,18 +268,18 @@ int main(int argc, char *argv[]) { } mdbx_cursor_close(cursor); if (envinfo) { - uint64_t value = mei.me_mapsize / mst.ms_psize; + uint64_t value = mei.mi_mapsize / mst.ms_psize; double percent = value / 100.0; printf("Page Allocation Info\n"); printf(" Max pages: %" PRIu64 " 100%%\n", value); - value = mei.me_last_pgno + 1; + value = mei.mi_last_pgno + 1; printf(" Pages used: %" PRIu64 " %.1f%%\n", value, value / percent); - value = mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1); + value = mei.mi_mapsize / mst.ms_psize - (mei.mi_last_pgno + 1); printf(" Remained: %" PRIu64 " %.1f%%\n", value, value / percent); - value = mei.me_last_pgno + 1 - pages; + value = mei.mi_last_pgno + 1 - pages; printf(" Used now: %" PRIu64 " %.1f%%\n", value, value / percent); value = pages; @@ -292,7 +292,7 @@ int main(int argc, char *argv[]) { printf(" Reclaimable: %" PRIu64 " %.1f%%\n", value, value / percent); value = - mei.me_mapsize / mst.ms_psize - (mei.me_last_pgno + 1) + reclaimable; + mei.mi_mapsize / mst.ms_psize - (mei.mi_last_pgno + 1) + reclaimable; printf(" Available: %" PRIu64 " %.1f%%\n", value, value / percent); } else printf(" Free pages: %" PRIaPGNO "\n", pages); From c8a5df650b61f86b64de3e6bbe75fc3dfa4d927b Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Wed, 19 Jul 2017 
09:12:46 +0300 Subject: [PATCH 297/303] mdbx: fix/rework exclusive-locking for Windows. Change-Id: I1b129a10ed7523024481480647317f1643f2ea70 --- src/lck-posix.c | 4 +++- src/lck-windows.c | 52 +++++++++++++++++++++++++++++------------------ src/mdbx.c | 11 ++++++---- src/osal.h | 2 +- 4 files changed, 43 insertions(+), 26 deletions(-) diff --git a/src/lck-posix.c b/src/lck-posix.c index c1f23b38..5b882b0d 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -91,7 +91,9 @@ static __inline int mdbx_lck_shared(int lfd) { return mdbx_lck_op(lfd, F_SETLKW, F_RDLCK, 0, 1); } -int mdbx_lck_downgrade(MDBX_env *env) { return mdbx_lck_shared(env->me_lfd); } +int mdbx_lck_downgrade(MDBX_env *env, bool complete) { + return complete ? mdbx_lck_shared(env->me_lfd) : MDBX_SUCCESS; +} int mdbx_lck_upgrade(MDBX_env *env) { return mdbx_lck_exclusive(env->me_lfd); } diff --git a/src/lck-windows.c b/src/lck-windows.c index 7bbef4d9..66591db9 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -152,7 +152,7 @@ int mdbx_rdt_lock(MDBX_env *env) { if (env->me_lfd == INVALID_HANDLE_VALUE) return MDBX_SUCCESS; /* readonly database in readonly filesystem */ - /* transite from S-? (used) to S-E (locked), e.g. exlcusive lock upper-part */ + /* transite from S-? (used) to S-E (locked), e.g. exclusive lock upper-part */ if (flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) return MDBX_SUCCESS; return GetLastError(); @@ -168,18 +168,18 @@ void mdbx_rdt_unlock(MDBX_env *env) { /*----------------------------------------------------------------------------*/ /* global `initial` lock for lockfile initialization, -* exclusive/shared locking first cacheline */ + * exclusive/shared locking first cacheline */ /* FIXME: locking schema/algo descritpion. ?-? = free S-? = used - E-? + E-? 
= exclusive-read ?-S ?-E = middle S-S S-E = locked E-S - E-E = exclusive + E-E = exclusive-write */ int mdbx_lck_init(MDBX_env *env) { @@ -187,8 +187,9 @@ int mdbx_lck_init(MDBX_env *env) { return MDBX_SUCCESS; } -/* Seize state as exclusive (E-E and returns MDBX_RESULT_TRUE) -* or used (S-? and returns MDBX_RESULT_FALSE), otherwise returns an error */ +/* Seize state as 'exclusive-write' (E-E and returns MDBX_RESULT_TRUE) + * or as 'used' (S-? and returns MDBX_RESULT_FALSE), otherwise returns an error + */ static int internal_seize_lck(HANDLE lfd) { int rc; assert(lfd != INVALID_HANDLE_VALUE); @@ -202,10 +203,10 @@ static int internal_seize_lck(HANDLE lfd) { return rc; } - /* 3) now on ?-E (middle), try E-E (exclusive) */ + /* 3) now on ?-E (middle), try E-E (exclusive-write) */ mdbx_jitter4testing(false); if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) - return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive), done */ + return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */; /* 5) still on ?-E (middle) */ rc = GetLastError(); @@ -279,32 +280,43 @@ int mdbx_lck_seize(MDBX_env *env) { return rc; } -int mdbx_lck_downgrade(MDBX_env *env) { - /* Transite from exclusive state (E-E) to used (S-?) */ +int mdbx_lck_downgrade(MDBX_env *env, bool complete) { + /* Transite from exclusive state (E-?) to used (S-?) */ assert(env->me_fd != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE); - /* 1) must be at E-E (exclusive), transite to ?_E (middle) */ + /* 1) must be at E-E (exclusive-write) */ + if (!complete) { + /* transite from E-E to E_? (exclusive-read) */ + if (!funlock(env->me_lfd, LCK_UPPER)) + mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, + "E-E(exclusive-write) >> E-?(exclusive-read)", GetLastError()); + return MDBX_SUCCESS /* 2) now at E-? 
(exclusive-read), done */; + } + + /* 3) now at E-E (exclusive-write), transite to ?_E (middle) */ if (!funlock(env->me_lfd, LCK_LOWER)) mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, - "E-E(exclusive) >> ?-E(middle)", GetLastError()); + "E-E(exclusive-write) >> ?-E(middle)", GetLastError()); - /* 2) now at ?-E (middle), transite to S-E (locked) */ + /* 4) now at ?-E (middle), transite to S-E (locked) */ if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { - int rc = GetLastError() /* 3) something went wrong, give up */; + int rc = GetLastError() /* 5) something went wrong, give up */; + mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, + "?-E(middle) >> S-E(locked)", rc); return rc; } - /* 4) got S-E (locked), continue transition to S-? (used) */ + /* 6) got S-E (locked), continue transition to S-? (used) */ if (!funlock(env->me_lfd, LCK_UPPER)) mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, "S-E(locked) >> S-?(used)", GetLastError()); - return MDBX_SUCCESS /* 5) now at S-? (used), done */; + return MDBX_SUCCESS /* 7) now at S-? 
(used), done */; } int mdbx_lck_upgrade(MDBX_env *env) { - /* Transite from locked state (S-E) to exclusive (E-E) */ + /* Transite from locked state (S-E) to exclusive-write (E-E) */ assert(env->me_fd != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE); @@ -313,10 +325,10 @@ int mdbx_lck_upgrade(MDBX_env *env) { mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, "S-E(locked) >> ?-E(middle)", GetLastError()); - /* 3) now on ?-E (middle), try E-E (exclusive) */ + /* 3) now on ?-E (middle), try E-E (exclusive-write) */ mdbx_jitter4testing(false); if (flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) - return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive), done */ + return MDBX_RESULT_TRUE; /* 4) got E-E (exclusive-write), done */ /* 5) still on ?-E (middle) */ int rc = GetLastError(); @@ -324,7 +336,7 @@ int mdbx_lck_upgrade(MDBX_env *env) { if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, report but continue */ mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, - "?-E(middle) >> E-E(exclusive)", rc); + "?-E(middle) >> E-E(exclusive-write)", rc); } /* 7) still on ?-E (middle), try restore S-E (locked) */ diff --git a/src/mdbx.c b/src/mdbx.c index 445d948e..3173fe6f 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4906,11 +4906,14 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags, if (exclusive == NULL || *exclusive < 2) { /* LY: downgrade lock only if exclusive access not requested. * in case exclusive==1, just leave value as is. */ - rc = mdbx_lck_downgrade(env); - mdbx_debug("lck-downgrade: rc %i ", rc); - if (rc != MDBX_SUCCESS) - goto bailout; + rc = mdbx_lck_downgrade(env, true); + mdbx_debug("lck-downgrade-full: rc %i ", rc); + } else { + rc = mdbx_lck_downgrade(env, false); + mdbx_debug("lck-downgrade-partial: rc %i ", rc); } + if (rc != MDBX_SUCCESS) + goto bailout; } else { if (exclusive) { /* LY: just indicate that is not an exclusive access. 
*/ diff --git a/src/osal.h b/src/osal.h index c35ffa3e..c6cad301 100644 --- a/src/osal.h +++ b/src/osal.h @@ -491,7 +491,7 @@ void mdbx_osal_jitter(bool tiny); int mdbx_lck_init(MDBX_env *env); int mdbx_lck_seize(MDBX_env *env); -int mdbx_lck_downgrade(MDBX_env *env); +int mdbx_lck_downgrade(MDBX_env *env, bool complete); int mdbx_lck_upgrade(MDBX_env *env); void mdbx_lck_destroy(MDBX_env *env); From 109be210b413af2c440bf38c165221d366a67773 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Jul 2017 01:59:05 +0300 Subject: [PATCH 298/303] mdbx: refine README. Change-Id: I91192a5ac1464677432956a0dfd7038bac9b021f --- .gitignore | 1 + README.md | 313 +++++++++++++++++++++++++++++++---------------------- 2 files changed, 183 insertions(+), 131 deletions(-) diff --git a/.gitignore b/.gitignore index 0222b199..117141cb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ *[~#] *.[ao] *.bak +.le.ini core core.* *.exe diff --git a/README.md b/README.md index c7cf6431..7201448d 100644 --- a/README.md +++ b/README.md @@ -14,21 +14,29 @@ and [by Yandex](https://translate.yandex.ru/translate?url=https%3A%2F%2Fgithub.c ## Кратко _libmdbx_ - это встраиваемый key-value движок хранения со специфическим -набором возможностей, которые при правильном применении позволяют -создавать уникальные решения с чемпионской производительностью, идеально -сочетаясь с технологией -[MRAM](https://en.wikipedia.org/wiki/Magnetoresistive_random-access_memory). +набором свойств и возможностей, ориентированный на создание уникальных +легковесных решений с предельной производительностью. + +_libmdbx_ позволяет множеству процессов совместно читать и обновлять +несколько key-value таблиц с соблюдением [ACID](https://ru.wikipedia.org/wiki/ACID), +при минимальных накладных расходах и амортизационной стоимости любых операций Olog(N). + +_libmdbx_ обеспечивает +[serializability](https://en.wikipedia.org/wiki/Serializability) +изменений и согласованность данных после аварий. 
При этом транзакции +изменяющие данные никак не мешают операциям чтения и выполняются строго +последовательно с использованием единственного +[мьютекса](https://en.wikipedia.org/wiki/Mutual_exclusion). + +_libmdbx_ позволяет выполнять операции чтения с гарантиями +[wait-free](https://en.wikipedia.org/wiki/Non-blocking_algorithm#Wait-freedom), +параллельно на каждом ядре CPU, без использования атомарных операций +и/или примитивов синхронизации. -_libmdbx_ умеет обновлять совместно используемый набор данных, никак не -мешая при этом параллельным операциям чтения, не применяя атомарных -операций к самим данным, и обеспечивая согласованность при аварийной -остановке в любой момент. Поэтому _libmdbx_ позволяя строить системы с -линейным масштабированием производительности чтения/поиска по ядрам CPU -и амортизационной стоимостью любых операций Olog(N). ### История -_libmdbx_ является потомком "Lightning Memory-Mapped Database", +_libmdbx_ является развитием "Lightning Memory-Mapped Database", известной под аббревиатурой [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). Изначально доработка производилась в составе проекта @@ -50,13 +58,13 @@ Technologies](https://www.ptsecurity.ru). _libmdbx_ наследует все ключевые возможности и особенности от своего прародителя [LMDB](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database), -с устранением описанных далее проблем и архитектурных недочетов. +но с устранением ряда описываемых далее проблем и архитектурных недочетов. 1. Данные хранятся в упорядоченном отображении (ordered map), ключи всегда отсортированы, поддерживается выборка диапазонов (range lookups). 2. Данные отображается в память каждого работающего с БД процесса. - К данным и ключам обеспечивается прямой доступ без необходимости их + К данным и ключам обеспечивается прямой доступ в памяти без необходимости их копирования. 3. 
Транзакции согласно @@ -71,6 +79,10 @@ _libmdbx_ наследует все ключевые возможности и без [атомарных операций](https://ru.wikipedia.org/wiki/%D0%90%D1%82%D0%BE%D0%BC%D0%B0%D1%80%D0%BD%D0%B0%D1%8F_%D0%BE%D0%BF%D0%B5%D1%80%D0%B0%D1%86%D0%B8%D1%8F). Читатели не блокируются операциями записи и не конкурируют между собой, чтение масштабируется линейно по ядрам CPU. + > Для точности следует отметить, что "подключение к БД" (старт первой + > читающей транзакции в потоке) и "отключение от БД" (закрытие БД или + > завершение потока) требуют краткосрочного захвата блокировки для + > регистрации/дерегистрации текущего потока в "таблице читателей". 5. Эффективное хранение дубликатов (ключей с несколькими значениями), без дублирования ключей, с сортировкой значений, в @@ -79,7 +91,8 @@ _libmdbx_ наследует все ключевые возможности и 6. Эффективная поддержка коротких ключей фиксированной длины, в том числе целочисленных. 7. Амортизационная стоимость любой операции Olog(N), - [WAF](https://en.wikipedia.org/wiki/Write_amplification) (Write Amplification Factor) и RAF (Read Amplification Factor) также Olog(N). + [WAF](https://en.wikipedia.org/wiki/Write_amplification) (Write + Amplification Factor) и RAF (Read Amplification Factor) также Olog(N). 8. Нет [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) и журнала транзакций, после сбоев не требуется восстановление. Не требуется компактификация @@ -92,9 +105,9 @@ _libmdbx_ наследует все ключевые возможности и Сравнение производительности ============================ -Все данные получены многократным прогоном тестов на ноутбуке Lenovo -Carbon-2, i7-4600U 2.1 ГГц, 8 Гб ОЗУ, с SSD-диском SAMSUNG -MZNTD512HAGL-000L1 (DXT23L0Q) 512 Гб. +Все представленные ниже данные получены многократным прогоном тестов на +ноутбуке Lenovo Carbon-2, i7-4600U 2.1 ГГц, 8 Гб ОЗУ, с SSD-диском +SAMSUNG MZNTD512HAGL-000L1 (DXT23L0Q) 512 Гб. 
Исходный код бенчмарка [_IOArena_](https://github.com/pmwkaa/ioarena) и сценарии тестирования [доступны на @@ -105,7 +118,7 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). ### Интегральная производительность ![Comparison #1: Integral Performance](https://raw.githubusercontent.com/wiki/ReOpen/libmdbx/img/perf-slide-1.png) -Показана соотнесенная сумма показателей производительности в трёх +Показана соотнесенная сумма ключевых показателей производительности в трёх бенчмарках: - Чтение/Поиск на машине с 4-мя процессорами; @@ -121,7 +134,7 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). *Бенчмарк в режиме асинхронной записи не включен по двум причинам:* 1. Такое сравнение не совсем правомочно, его следует делать с движками - ориентированными на хранение данных в памяти (Tarantool, Redis). + ориентированными на хранение данных в памяти ([Tarantool](https://tarantool.io/), [Redis](https://redis.io/)). 2. Превосходство libmdbx становится еще более подавляющем, что мешает восприятию информации. @@ -133,7 +146,7 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). Для каждого движка показана суммарная производительность при одновременном выполнении запросов чтения/поиска в 1-2-4-8 потоков на -машине с 4-мя процессорами. +машине с 4-мя физическими процессорами. -------------------------------------------------------------------------------- @@ -141,12 +154,12 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). ![Comparison #3: Sync-write mode](https://raw.githubusercontent.com/wiki/ReOpen/libmdbx/img/perf-slide-3.png) - Линейная шкала слева и темные прямоугольники соответствуют количеству - транзакций в секунду, усредненному за все время теста. + транзакций в секунду, усредненному за всё время теста. - Логарифмическая шкала справа и желтые интервальные отрезки соответствуют времени выполнения транзакций. 
При этом каждый отрезок - показывает минимальное и максимальное время затраченной на выполнения - транзакций, а крестик показывает среднеквадратичное значение. + показывает минимальное и максимальное время затраченное на выполнение + транзакций, а крестиком отмечено среднеквадратичное значение. Выполняется **10.000 транзакций в режиме синхронной фиксации данных** на диске. При этом требуется гарантия, что при аварийном выключении питания @@ -155,23 +168,40 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). режиме при фиксации каждой транзакции выполняется системный вызов [fdatasync](https://linux.die.net/man/2/fdatasync). -В каждой транзакции выполняется CRUD-операция (две вставки, одной -чтение, одно обновление, одно удаление). Бенчмарк стартует на пустой -базе и в результате выполняемых действий при завершении в базе -насчитывается 10.000 небольших key-value записей. +В каждой транзакции выполняется комбинированная CRUD-операция (две +вставки, одно чтение, одно обновление, одно удаление). Бенчмарк стартует +на пустой базе, а при завершении, в результате выполняемых действий, в +базе насчитывается 10.000 небольших key-value записей. -------------------------------------------------------------------------------- ### Отложенная фиксация ![Comparison #4: Lazy-write mode](https://raw.githubusercontent.com/wiki/ReOpen/libmdbx/img/perf-slide-4.png) - - Линейная шкала слева и темные прямоугольники соответствуют количеству транзакций в секунду, усредненному за все время теста. - - Логарифмическая шкала справа и желтые интервальные отрезки соответствуют времени выполнения транзакций. При этом каждый отрезок показывает минимальное и максимальное время затраченной на выполнения транзакций, а крестик показывает среднеквадратичное значение. + - Линейная шкала слева и темные прямоугольники соответствуют количеству + транзакций в секунду, усредненному за всё время теста. -Выполняется **100.000 транзакций в режиме отложенной фиксации данных** на диске. 
При этом требуется гарантия, что при аварийном выключении питания (или другом подобном сбое) все данные будут консистентны на момент завершения одной из транзакций, но допускается потеря изменений из некоторого количества последних транзакций, что для многих движков предполагает включение [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) (write-ahead logging) либо журнала транзакций, который в свою очередь опирается на гарантию упорядоченности данных в журналируемой файловой системе. _libmdbx_ при этом не ведет WAL, а передает весь контроль файловой системе и ядру ОС. + - Логарифмическая шкала справа и желтые интервальные отрезки + соответствуют времени выполнения транзакций. При этом каждый отрезок + показывает минимальное и максимальное время затраченное на выполнение + транзакций, а крестиком отмечено среднеквадратичное значение. -В каждой транзакции выполняется CRUD-операция (две вставки, одной чтение, одно обновление, одно удаление). -Бенчмарк стартует на пустой базе и в результате выполняемых действий при завершении в базе насчитывается 100.000 небольших key-value записей. +Выполняется **100.000 транзакций в режиме отложенной фиксации данных** +на диске. При этом требуется гарантия, что при аварийном выключении +питания (или другом подобном сбое) все данные будут консистентны на +момент завершения одной из транзакций, но допускается потеря изменений +из некоторого количества последних транзакций, что для многих движков +предполагает включение +[WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) (write-ahead +logging) либо журнала транзакций, который в свою очередь опирается на +гарантию упорядоченности данных в журналируемой файловой системе. +_libmdbx_ при этом не ведет WAL, а передает весь контроль файловой +системе и ядру ОС. + +В каждой транзакции выполняется комбинированная CRUD-операция (две +вставки, одно чтение, одно обновление, одно удаление). 
Бенчмарк стартует +на пустой базе, а при завершении, в результате выполняемых действий, в +базе насчитывается 100.000 небольших key-value записей. -------------------------------------------------------------------------------- @@ -179,28 +209,28 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). ![Comparison #5: Async-write mode](https://raw.githubusercontent.com/wiki/ReOpen/libmdbx/img/perf-slide-5.png) - Линейная шкала слева и темные прямоугольники соответствуют количеству - транзакций в секунду, усредненному за все время теста. + транзакций в секунду, усредненному за всё время теста. - Логарифмическая шкала справа и желтые интервальные отрезки соответствуют времени выполнения транзакций. При этом каждый отрезок - показывает минимальное и максимальное время затраченной на выполнения - транзакций, а крестик показывает среднеквадратичное значение. + показывает минимальное и максимальное время затраченное на выполнение + транзакций, а крестиком отмечено среднеквадратичное значение. Выполняется **1.000.000 транзакций в режиме асинхронной фиксации данных** на диске. При этом требуется гарантия, что при аварийном выключении питания (или другом подобном сбое) все данные будут консистентны на момент завершения одной из транзакций, но допускается -потеря изменений из любого количества последних транзакций. Во всех -движках при этом включался режим предполагающий минимальную нагрузку -записи на диск, и соответственно минимальную гарантию сохранности -данных. В _libmdbx_ при этом используется режим асинхронной записи -измененных страниц на диск силами ядра ОС посредством системрго вызова -[msync(MS_ASYNC)](https://linux.die.net/man/2/msync). +потеря изменений из значительного количества последних транзакций. Во +всех движках при этом включался режим предполагающий минимальную +нагрузку на диск по-записи, и соответственно минимальную гарантию +сохранности данных. 
В _libmdbx_ при этом используется режим асинхронной +записи измененных страниц на диск посредством ядра ОС и системного +вызова [msync(MS_ASYNC)](https://linux.die.net/man/2/msync). -В каждой транзакции выполняется CRUD-операция (две вставки, одной -чтение, одно обновление, одно удаление). Бенчмарк стартует на пустой -базе и в результате выполняемых действий при завершении в базе -насчитывается 1.000.000 небольших key-value записей. +В каждой транзакции выполняется комбинированная CRUD-операция (две +вставки, одно чтение, одно обновление, одно удаление). Бенчмарк стартует +на пустой базе, а при завершении, в результате выполняемых действий, в +базе насчитывается 1.000.000 небольших key-value записей. -------------------------------------------------------------------------------- @@ -213,19 +243,19 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). - суммарное количество операций ввода-вывода (IOPS), как записи, так и чтения. - - суммарное затраченное время процессора, как в режиме пользователя, + - суммарное затраченное время процессора, как в режиме пользовательских процессов, так и в режиме ядра ОС. - - максимальный объем места на диске. который требовался во время работы - теста. + - использованное место на диске при завершении теста, после закрытия БД из тестирующего процесса, + но без ожидания всех внутренних операций обслуживания (компактификации LSM и т.п.). -Движок _ForestDB_ был исключен при окончательном формировании диаграммы, -так как многократно превысил потребление каждого из ресурсов (потратил -процессорное время на генерацию IOPS для заполнения диска). Что не -позволяло наглядно сравнить показатели остальных движков на одной -диаграмме. +Движок _ForestDB_ был исключен при оформлении результатов, так как +относительно конкурентов многократно превысил потребление каждого из +ресурсов (потратил процессорное время на генерацию IOPS для заполнения +диска), что не позволяло наглядно сравнить показатели остальных движков +на одной диаграмме. 
-Все данные собирались посредством системного вывова +Все данные собирались посредством системного вызова [getrusage()](http://man7.org/linux/man-pages/man2/getrusage.2.html) и сканированием директорий с данными. @@ -235,17 +265,27 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). 1. Единовременно может выполняться не более одной транзакция изменения данных (один писатель). Зато все изменения всегда последовательны, не может быть - конфликтов или ошибок при откате транзакций. + конфликтов или логических ошибок при откате транзакций. 2. Отсутствие [WAL](https://en.wikipedia.org/wiki/Write-ahead_logging) обуславливает относительно большой [WAF](https://en.wikipedia.org/wiki/Write_amplification) (Write Amplification Factor). Поэтому фиксация изменений на диске может быть - дорогой и является главным ограничителем для производительности по - записи. В качестве компромисса предлагается несколько режимов ленивой - и/или периодической фиксации. В том числе режим `MAPASYNC`, при котором - изменения происходят только в памяти и асинхронно фиксируются на диске - ядром ОС. + достаточно дорогой и являться главным ограничением производительности + при интенсивном изменении данных. + > В качестве компромисса _libmdbx_ предлагает несколько режимов ленивой + > и/или периодической фиксации. В том числе режим `MAPASYNC`, при котором + > изменения происходят только в памяти и асинхронно фиксируются на диске + > ядром ОС. + > + > Однако, следует воспринимать это свойство аккуратно и взвешенно. + > Например, полная фиксация транзакции в БД с журналом потребует минимум 2 + > IOPS (скорее всего 3-4) из-за накладных расходов в файловой системе. В + > _libmdbx_ фиксация транзакции также требует от 2 IOPS. Однако, в БД с + > журналом кол-во IOPS будет меняться в зависимости от файловой системы, + > но не от кол-ва записей или их объема. Тогда как в _libmdbx_ кол-во IOPS + > будет расти логарифмически от кол-ва записей/строк в БД (по высоте + > b+tree). 3. 
[COW](https://ru.wikipedia.org/wiki/%D0%9A%D0%BE%D0%BF%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D0%B5_%D0%BF%D1%80%D0%B8_%D0%B7%D0%B0%D0%BF%D0%B8%D1%81%D0%B8) для реализации [MVCC](https://ru.wikipedia.org/wiki/MVCC) выполняется на @@ -255,16 +295,26 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). страниц, что расходует [пропускную способность оперативной памяти](https://en.wikipedia.org/wiki/Memory_bandwidth) и является основным ограничителем производительности в режиме `MAPASYNC`. + > Этот недостаток неустраним, тем не менее следует дать некоторые пояснения. + > Дело в том, что фиксация изменений на диске потребует гораздо более + > значительного копирования данных в памяти и массы других затратных операций. + > Поэтому обусловленное этим недостатком падение производительности становится + > заметным только при отказе от фиксации изменений на диске. + > Соответственно, корректнее сказать что _libmdbx_ позволяет + > получить персистентность ценой минимального падения производительности. + > Если же нет необходимости оперативно сохранять данные, то логичнее + > использовать `std::map`. 4. В _LMDB_ существует проблема долгих чтений (приостановленных читателей), которая приводит к деградации производительности и переполнению БД. - В _libmdbx_ предложены средства для предотвращения, выхода из проблемной - ситуации и устранения её последствий. Подробности ниже. + > В _libmdbx_ предложены средства для предотвращения, быстрого выхода из + > некомфортной ситуации и устранения её последствий. Подробности ниже. 5. В _LMDB_ есть вероятность разрушения БД в режиме `WRITEMAP+MAPASYNC`. В _libmdbx_ для `WRITEMAP+MAPASYNC` гарантируется как сохранность базы, - так и согласованность данных. При этом также, в качестве альтернативы, - предложен режим `UTTERLY_NOSYNC`. Подробности ниже. + так и согласованность данных. + > Дополнительно, в качестве альтернативы, предложен режим `UTTERLY_NOSYNC`. + > Подробности ниже. 
#### Проблема долгих чтений @@ -299,60 +349,55 @@ github](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015). деградацию производительности. Операции чтения выполняются в контексте снимка данных (версии -БД), который был актуальным на момент старта транзакции чтения. -Такой читаемый снимок поддерживается неизменным до завершения -операции. В свою очередь, это не позволяет повторно -использовать страницы БД в последующих версиях (снимках БД). +БД), который был актуальным на момент старта транзакции чтения. Такой +читаемый снимок поддерживается неизменным до завершения операции. В свою +очередь, это не позволяет повторно использовать страницы БД в +последующих версиях (снимках БД). -Другими словами, если обновление данных выполняется на фоне -долгой операции чтения, то вместо повторного использования -"старых" ненужных страниц будут выделяться новые, так как -"старые" страницы составляют снимок БД, который еще -используется долгой операцией чтения. +Другими словами, если обновление данных выполняется на фоне долгой +операции чтения, то вместо повторного использования "старых" ненужных +страниц будут выделяться новые, так как "старые" страницы составляют +снимок БД, который еще используется долгой операцией чтения. -В результате, при интенсивном изменении данных и достаточно -длительной операции чтения, в БД могут быть исчерпаны свободные -страницы, что не позволит создавать новые снимки/версии БД. -Такая ситуация будет сохраняться до завершения операции чтения, -которая использует старый снимок данных и препятствует -повторному использованию страниц БД. +В результате, при интенсивном изменении данных и достаточно длительной +операции чтения, в БД могут быть исчерпаны свободные страницы, что не +позволит создавать новые снимки/версии БД. Такая ситуация будет +сохраняться до завершения операции чтения, которая использует старый +снимок данных и препятствует повторному использованию страниц БД. -Однако, на этом проблемы не заканчиваются. 
После описанной -ситуации, все дополнительные страницы, которые были выделены -пока переработка старых была невозможна, будут участвовать в -цикле выделения/освобождения до конца жизни экземпляра БД. В -оригинальной _LMDB_ этот цикл использования страниц работает по -принципу [FIFO](https://ru.wikipedia.org/wiki/FIFO). Поэтому -увеличение количества циркулирующий страниц, с точки зрения -механизмов кэширования и/или обратной записи, выглядит как -увеличение рабочего набор данных. Проще говоря, однократное -попадание в ситуацию "уснувшего читателя" приводит к -устойчивому эффекту вымывания I/O кэша при всех последующих -изменениях данных. +Однако, на этом проблемы не заканчиваются. После описанной ситуации, все +дополнительные страницы, которые были выделены пока переработка старых +была невозможна, будут участвовать в цикле выделения/освобождения до +конца жизни экземпляра БД. В оригинальной _LMDB_ этот цикл использования +страниц работает по принципу [FIFO](https://ru.wikipedia.org/wiki/FIFO). +Поэтому увеличение количества циркулирующих страниц, с точки зрения +механизмов кэширования и/или обратной записи, выглядит как увеличение +рабочего набора данных. Проще говоря, однократное попадание в ситуацию +"уснувшего читателя" приводит к устойчивому эффекту вымывания I/O кэша +при всех последующих изменениях данных. -Для устранения описанных проблемы в _libmdbx_ сделаны -существенные доработки, подробности ниже. Иллюстрации к -проблеме "долгих чтений" можно найти в [слайдах -презентации](http://www.slideshare.net/leoyuriev/lmdb). -Там же приведен пример количественной оценки прироста -производительности за счет эффективной работы -[BBWC](https://en.wikipedia.org/wiki/BBWC) при включении `LIFO -RECLAIM` в _libmdbx_. +Для устранения описанных проблем в _libmdbx_ сделаны существенные +доработки, подробности ниже. Иллюстрации к проблеме "долгих чтений" +можно найти в [слайдах презентации](http://www.slideshare.net/leoyuriev/lmdb). 
+ +Там же приведен пример количественной оценки прироста производительности +за счет эффективной работы [BBWC](https://en.wikipedia.org/wiki/BBWC) +при включении `LIFO RECLAIM` в _libmdbx_. #### Вероятность разрушения БД в режиме `WRITEMAP+MAPASYNC` -При работе в режиме `WRITEMAP+MAPSYNC` запись измененных -страниц выполняется ядром ОС, что имеет ряд преимуществ. Так -например, при крахе приложения, ядро ОС сохранит все изменения. +При работе в режиме `WRITEMAP+MAPASYNC` запись измененных страниц +выполняется ядром ОС, что имеет ряд преимуществ. Так например, при крахе +приложения, ядро ОС сохранит все изменения. -Однако, при аварийном отключении питания или сбое в ядре ОС, на -диске будет сохранена только часть измененных страниц БД. При -этом с большой вероятностью может оказаться так, что будут -сохранены мета-страницы со ссылками на страницы с новыми -версиями данных, но не сами новые данные. В этом случае БД -будет безвозвратна разрушена, даже если до аварии производилась -полная синхронизация данных (посредством `mdbx_env_sync()`). +Однако, при аварийном отключении питания или сбое в ядре ОС, на диске +будет сохранена только часть измененных страниц БД. При этом с большой +вероятностью может оказаться так, что будут сохранены мета-страницы со +ссылками на страницы с новыми версиями данных, но не сами новые данные. +В этом случае БД будет безвозвратно разрушена, даже если до аварии +производилась полная синхронизация данных (посредством +`mdbx_env_sync()`). В _libmdbx_ эта проблема устранена, подробности ниже. @@ -380,11 +425,11 @@ RECLAIM` в _libmdbx_. Посредством `mdbx_env_set_oomfunc()` может быть установлен внешний обработчик (callback), который будет вызван при исчерпания свободных страниц из-за долгой операцией чтения. - Обработчику будет передан PID и pthread_id. В свою очередь - обработчик может предпринять одно из действий: + Обработчику будет передан PID и pthread_id виновника. 
+ В свою очередь обработчик может предпринять одно из действий: - * отправить сигнал kill (#9), если долгое чтение выполняется - сторонним процессом; + * нейтрализовать виновника (отправить сигнал kill #9), если + долгое чтение выполняется сторонним процессом; * отменить или перезапустить проблемную операцию чтения, если операция выполняется одним из потоков текущего процесса; @@ -423,6 +468,14 @@ RECLAIM` в _libmdbx_. `WRITEMAP+MAPSYNC` завершаемые транзакции помечаются как слабые, а при явной синхронизации данных как сильные. + * В _libmdbx_ поддерживается не две, а три отдельные мета-страницы. + Это позволяет выполнять фиксацию транзакций с формированием как + сильной, так и слабой точки фиксации, без потери двух предыдущих + точек фиксации (из которых одна может быть сильной, а вторая слабой). + В результате, _libmdbx_ позволяет в произвольном порядке чередовать + сильные и слабые точки фиксации без нарушения соответствующих + гарантий в случае неожиданной системной аварии во время фиксации. + * При открытии БД выполняется автоматический откат к последней сильной фиксации. Этим обеспечивается гарантия сохранности БД. @@ -463,12 +516,12 @@ RECLAIM` в _libmdbx_. 10. Возможность явно запросить обновление существующей записи, без создания новой посредством флажка `MDBX_CURRENT` для `mdbx_put()`. -11. Возможность обновить или удалить запись с получением предыдущего -значения данных посредством `mdbx_replace()`. +11. Возможность посредством `mdbx_replace()` обновить или удалить запись +с получением предыдущего значения данных, а также адресно изменить +конкретное multi-значение. -12. Поддержка ключей и значений нулевой длины. Включая сортированные -дубликаты, в том числе вне зависимости от порядка их добавления или -обновления. +12. Поддержка ключей и значений нулевой длины, включая сортированные +дубликаты. 13. Исправленный вариант `mdbx_cursor_count()`, возвращающий корректное количество дубликатов для всех типов таблиц и любого положения курсора. 
@@ -492,13 +545,14 @@ RECLAIM` в _libmdbx_. компараторов для ключей и данных, посредством `mdbx_dbi_open_ex()`. 19. Возможность посредством `mdbx_is_dirty()` определить находятся ли -некоторый ключ или данные в "грязной" странице БД. Таким образом избегаю -лишнего копирования данных перед выполнением модифицирующих операций -(значения в размещенные "грязных" страницах могут быть перезаписаны при -изменениях, иначе они будут неизменны). +некоторый ключ или данные в "грязной" странице БД. Это позволяет +избежать лишнего копирования данных перед выполнением модифицирующих +операций (значения, размещенные в "грязных" страницах, могут быть +перезаписаны при изменениях, иначе они будут неизменны). 20. Корректное обновление текущей записи, в том числе сортированного -дубликата, при использовании режима `MDBX_CURRENT` в `mdbx_cursor_put()`. +дубликата, при использовании режима `MDBX_CURRENT` в +`mdbx_cursor_put()`. 21. Все курсоры, как в транзакциях только для чтения, так и в пишущих, могут быть переиспользованы посредством `mdbx_cursor_renew()` и ДОЛЖНЫ @@ -535,12 +589,9 @@ mdbx_txn_abort() или mdbx_txn_reset(). Что позволяет избави 26. Генерация последовательностей посредством `mdbx_dbi_sequence()`. -27. Обновление данных с одновременным получением старых значений, -а также адресное изменение конкретного multi-значения посредством `mdbx_replace()`. +27. Расширенное динамическое управление размером БД, включая выбор +размера страницы посредством `mdbx_env_set_geometry()`. -28. Расширенное динамическое управление размером БД, включая выбор размера страницы -посредством `mdbx_env_set_geometry()`. - -29. Три мета-страницы вместо двух, что позволяет гарантированно консистентно -обновлять слабые контрольные точки фиксации без риска повредить крайнюю сильную -точку фиксации. +28. Три мета-страницы вместо двух, что позволяет гарантированно +консистентно обновлять слабые контрольные точки фиксации без риска +повредить крайнюю сильную точку фиксации. 
From 599711a0072de2b9f5436180e601104e6b58a010 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Jul 2017 12:16:07 +0300 Subject: [PATCH 299/303] mdbx: fix/avoid meta-update from setup_dxb() in read-only mode. Change-Id: I60359f92aba31bb5a22e9a39ea4658d2393e957d --- src/mdbx.c | 45 ++++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/src/mdbx.c b/src/mdbx.c index 3173fe6f..5df54716 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -4529,25 +4529,27 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { mdbx_ensure(env, meta.mm_geo.now >= meta.mm_geo.next); } - uint64_t filesize; - err = mdbx_filesize(env->me_fd, &filesize); + uint64_t filesize_before_mmap; + err = mdbx_filesize(env->me_fd, &filesize_before_mmap); if (unlikely(err != MDBX_SUCCESS)) return err; const size_t expected_bytes = pgno2bytes(env, meta.mm_geo.now); const size_t used_bytes = pgno2bytes(env, meta.mm_geo.next); - if (filesize != expected_bytes) { + mdbx_ensure(env, expected_bytes >= used_bytes); + if (filesize_before_mmap != expected_bytes) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { mdbx_info("filesize mismatch (expect %" PRIuPTR ", have %" PRIu64 "), " "assume collision in non-exclusive mode", - expected_bytes, filesize); + expected_bytes, filesize_before_mmap); } else { mdbx_notice("filesize mismatch (expect %" PRIuPTR ", have %" PRIu64 ")", - expected_bytes, filesize); - if (filesize < used_bytes) { + expected_bytes, filesize_before_mmap); + if (filesize_before_mmap < used_bytes) { mdbx_error("last-page beyond end-of-file (last %" PRIaPGNO ", have %" PRIaPGNO ")", - meta.mm_geo.next, bytes2pgno(env, (size_t)filesize)); + meta.mm_geo.next, + bytes2pgno(env, (size_t)filesize_before_mmap)); return MDBX_CORRUPTED; } @@ -4561,6 +4563,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { expected_bytes); return err; } + filesize_before_mmap = expected_bytes; } } } @@ -4638,20 +4641,28 @@ static int __cold 
mdbx_setup_dxb(MDBX_env *env, int lck_rc) { const MDBX_meta *head = mdbx_meta_head(env); if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { /* re-check file size after mmap */ - err = mdbx_filesize(env->me_fd, &filesize); + uint64_t filesize_after_mmap; + err = mdbx_filesize(env->me_fd, &filesize_after_mmap); if (unlikely(err != MDBX_SUCCESS)) return err; - if (filesize != expected_bytes) { - mdbx_info("datafile resized by system to %" PRIu64 " bytes", filesize); - if (filesize % env->me_os_psize || filesize > env->me_dbgeo.upper || - filesize < used_bytes) { - mdbx_info("unacceptable/unexpected datafile size %" PRIu64, filesize); + if (filesize_after_mmap != expected_bytes) { + if (filesize_after_mmap != filesize_before_mmap) + mdbx_info("datafile resized by system to %" PRIu64 " bytes", + filesize_after_mmap); + if (filesize_after_mmap % env->me_os_psize || + filesize_after_mmap > env->me_dbgeo.upper || + filesize_after_mmap < used_bytes) { + mdbx_info("unacceptable/unexpected datafile size %" PRIu64, + filesize_after_mmap); return MDBX_PROBLEM; } - meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now = (size_t)filesize); - mdbx_info("update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO - " pages", - env->me_dbgeo.now, meta.mm_geo.now); + if ((env->me_flags & MDBX_RDONLY) == 0) { + meta.mm_geo.now = + bytes2pgno(env, env->me_dbgeo.now = (size_t)filesize_after_mmap); + mdbx_info("update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO + " pages", + env->me_dbgeo.now, meta.mm_geo.now); + } } if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) { From 242baf761f91f11803d1feccf65c89b101690db4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Jul 2017 10:30:46 +0300 Subject: [PATCH 300/303] mdbx: fix/rework shrinking, add MDBX_SHRINK_ALLOWED. 
Change-Id: I014440850aa4be927843aa2a6a268794a4da9b2a --- src/bits.h | 2 ++ src/mdbx.c | 84 +++++++++++++++++++++++++++++++++--------------------- 2 files changed, 54 insertions(+), 32 deletions(-) diff --git a/src/bits.h b/src/bits.h index d5cb9f73..89ecf4bb 100644 --- a/src/bits.h +++ b/src/bits.h @@ -669,6 +669,8 @@ struct MDBX_env { /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) +/* Additional flag for mdbx_sync_locked() */ +#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) /* Some fields are initialized. */ #define MDBX_ENV_ACTIVE UINT32_C(0x20000000) /* me_txkey is set */ diff --git a/src/mdbx.c b/src/mdbx.c index 5df54716..218bd2bf 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1561,8 +1561,10 @@ static int mdbx_mapresize(MDBX_env *env, const pgno_t size_pgno, mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); if (rc == MDBX_SUCCESS) { - if (env->me_txn0) - env->me_txn0->mt_end_pgno = size_pgno; + if (env->me_txn) { + mdbx_tassert(env->me_txn, size_pgno >= env->me_txn->mt_next_pgno); + env->me_txn->mt_end_pgno = size_pgno; + } env->me_dbgeo.now = size_bytes; env->me_dbgeo.upper = limit_bytes; } else { @@ -1889,14 +1891,20 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, env, mdbx_roundup2(pgno2bytes(env, txn->mt_next_pgno + head->mm_geo.grow), env->me_os_psize)); + while (next >= growth_pgno) + growth_pgno = bytes2pgno( + env, mdbx_roundup2(pgno2bytes(env, growth_pgno + head->mm_geo.grow), + env->me_os_psize)); if (growth_pgno > head->mm_geo.upper) growth_pgno = head->mm_geo.upper; mdbx_info("try growth datafile to %" PRIaPGNO " pages (+%" PRIaPGNO ")", growth_pgno, growth_pgno - txn->mt_end_pgno); rc = mdbx_mapresize(env, growth_pgno, head->mm_geo.upper); - if (rc == MDBX_SUCCESS) + if (rc == MDBX_SUCCESS) { + mdbx_tassert(env->me_txn, txn->mt_end_pgno >= next); continue; + } mdbx_warning("unable growth datafile to %" PRIaPGNO "pages (+%" PRIaPGNO "), 
errcode %d", @@ -2195,7 +2203,7 @@ int mdbx_env_sync(MDBX_env *env, int force) { container_of(head, MDBX_page, mp_data)->mp_pgno, mdbx_durable_str(head), env->me_sync_pending); MDBX_meta meta = *head; - int rc = mdbx_sync_locked(env, flags, &meta); + int rc = mdbx_sync_locked(env, flags | MDBX_SHRINK_ALLOWED, &meta); if (unlikely(rc != MDBX_SUCCESS)) { if (outside_txn) mdbx_txn_unlock(env); @@ -2509,9 +2517,11 @@ int mdbx_txn_renew(MDBX_txn *txn) { rc = mdbx_txn_renew0(txn, MDBX_TXN_RDONLY); if (rc == MDBX_SUCCESS) { - mdbx_debug("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "", + mdbx_debug("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', - (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); + (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); } return rc; } @@ -2632,9 +2642,11 @@ int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, unsigned flags, txn->mt_owner = mdbx_thread_self(); txn->mt_signature = MDBX_MT_SIGNATURE; *ret = txn; - mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "", + mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, txn->mt_txnid, (flags & MDBX_RDONLY) ? 'r' : 'w', (void *)txn, - (void *)env, txn->mt_dbs[MAIN_DBI].md_root); + (void *)env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); } return rc; @@ -2696,10 +2708,12 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { /* Export or close DBI handles opened in this txn */ mdbx_dbis_update(txn, mode & MDBX_END_UPDATE); - mdbx_debug("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO "", + mdbx_debug("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO + "/%" PRIaPGNO, names[mode & MDBX_END_OPMASK], txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, - (void *)env, txn->mt_dbs[MAIN_DBI].md_root); + (void *)env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { if (txn->mt_ro_reader) { @@ -3456,9 +3470,10 @@ int mdbx_txn_commit(MDBX_txn *txn) { !(txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS))) goto done; - mdbx_debug( - "committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO "", - txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); + mdbx_debug("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (void *)txn, (void *)env, + txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); /* Update DB root pointers */ if (txn->mt_numdbs > CORE_DBS) { @@ -3505,7 +3520,8 @@ int mdbx_txn_commit(MDBX_txn *txn) { meta.mm_canary = txn->mt_canary; mdbx_meta_set_txnid(env, &meta, txn->mt_txnid); - rc = mdbx_sync_locked(env, env->me_flags | txn->mt_flags, &meta); + rc = mdbx_sync_locked( + env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta); } if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -3814,11 +3830,12 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); mdbx_assert(env, !META_IS_STEADY(head) || env->me_sync_pending != 0); + mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now); const size_t usedbytes = mdbx_roundup2(pgno2bytes(env, pending->mm_geo.next), env->me_os_psize); if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold) - flags &= MDBX_WRITEMAP; + flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* LY: step#1 - sync previously written/updated data-pages */ int rc = MDBX_RESULT_TRUE; @@ -3850,11 +3867,12 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, #else /* LY: check conditions to shrink datafile */ pgno_t shrink_pgno_delta = 0; - const 
pgno_t shrink_pgno = pending->mm_geo.next /* + pending->mm_geo.grow */; - if (pending->mm_geo.now > shrink_pgno && pending->mm_geo.shrink && - unlikely(pending->mm_geo.now - pending->mm_geo.shrink >= shrink_pgno)) { - if (pending->mm_geo.now > shrink_pgno && - pending->mm_geo.now - pending->mm_geo.shrink >= shrink_pgno) { + if ((flags & MDBX_SHRINK_ALLOWED) && + pending->mm_geo.next < head->mm_geo.next) { + const pgno_t shrink_pgno = + pending->mm_geo.next /* + pending->mm_geo.grow */; + if (pending->mm_geo.now > shrink_pgno && pending->mm_geo.shrink && + unlikely(pending->mm_geo.now - pending->mm_geo.shrink >= shrink_pgno)) { shrink_pgno_delta = pending->mm_geo.now - shrink_pgno; pending->mm_geo.now = shrink_pgno; } @@ -3896,15 +3914,15 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, } /* LY: step#2 - update meta-page. */ - mdbx_debug( - "writing meta%" PRIaPGNO " (%s, was %" PRIaTXN ", %s), root %" PRIaPGNO - "/%" PRIaPGNO ", " - "txn_id %" PRIaTXN ", %s", - container_of(target, MDBX_page, mp_data)->mp_pgno, - (target == head) ? 
"head" : "tail", mdbx_meta_txnid_stable(env, target), - mdbx_durable_str((const MDBX_meta *)target), - pending->mm_dbs[MAIN_DBI].md_root, pending->mm_dbs[FREE_DBI].md_root, - pending->mm_txnid_a, mdbx_durable_str(pending)); + mdbx_debug("writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO + ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + container_of(target, MDBX_page, mp_data)->mp_pgno, + pending->mm_dbs[MAIN_DBI].md_root, + pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, + pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, + pending->mm_geo.grow, pending->mm_geo.shrink, pending->mm_txnid_a, + mdbx_durable_str(pending)); mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, @@ -4680,7 +4698,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { mdbx_ensure(env, mdbx_meta_eq(env, &meta, head)); mdbx_meta_set_txnid(env, &meta, txnid + 1); env->me_sync_pending += env->me_psize; - err = mdbx_sync_locked(env, env->me_flags, &meta); + err = mdbx_sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &meta); if (err) { mdbx_info("error %d, while updating meta.geo: " "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO @@ -8374,8 +8392,10 @@ static int mdbx_rebalance(MDBX_cursor *mc) { } } } - } else - mdbx_debug("root page doesn't need rebalancing"); + } else { + mdbx_debug("root page %" PRIaPGNO " doesn't need rebalancing", + mp->mp_pgno); + } return MDBX_SUCCESS; } From 63af619080b1eb1b60365644976a943fc0fcfa5c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Jul 2017 15:49:05 +0300 Subject: [PATCH 301/303] mdbx: #ifdef-guard for FILE_PROVIDER_EXTERNAL_INFO_V1. 
Change-Id: I4bc4eb012f7d6ea874a10abd4a2bfcfb789c4a22 --- src/osal.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/osal.c b/src/osal.c index d7547370..71e67868 100644 --- a/src/osal.c +++ b/src/osal.c @@ -112,11 +112,13 @@ typedef struct _IO_STATUS_BLOCK { ULONG_PTR Information; } IO_STATUS_BLOCK, *PIO_STATUS_BLOCK; +#ifndef FILE_PROVIDER_CURRENT_VERSION typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 { ULONG Version; ULONG Algorithm; ULONG Flags; } FILE_PROVIDER_EXTERNAL_INFO_V1, *PFILE_PROVIDER_EXTERNAL_INFO_V1; +#endif #ifndef STATUS_OBJECT_NOT_EXTERNALLY_BACKED #define STATUS_OBJECT_NOT_EXTERNALLY_BACKED ((NTSTATUS)0xC000046DL) From c5d1384c64dd1bc3d2f8fc10e45f45d6cd86dd11 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Jul 2017 16:36:18 +0300 Subject: [PATCH 302/303] mdbx-tools: more fix MSVC warnings. Change-Id: Ib5f78a89fea09dc78d4922519eab6eaed4b1a7e6 --- src/tools/mdbx_chk.c | 11 +++++++++-- src/tools/mdbx_copy.c | 9 ++++++++- src/tools/mdbx_dump.c | 9 ++++++++- src/tools/mdbx_load.c | 9 ++++++++- src/tools/mdbx_stat.c | 9 ++++++++- src/tools/wingetopt.c | 6 ++---- 6 files changed, 43 insertions(+), 10 deletions(-) diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 7a93d70d..f1619965 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -18,7 +18,14 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ -#endif /* _MSC_VER */ +#if _MSC_VER == 1900 +/* LY: MSVC 2015 has buggy/inconsistent PRIuPTR/PRIxPTR macros and format-arg + checker for size_t typedef. 
*/ +#pragma warning(disable : 4777) /* format string '%10u' requires an argument \ + of type 'unsigned int', but variadic \ + argument 1 has type 'std::size_t' */ +#endif +#endif /* _MSC_VER (warnings) */ #include "../bits.h" @@ -203,7 +210,7 @@ static void problem_add(const char *object, uint64_t entry_number, } } -static struct problem *problems_push() { +static struct problem *problems_push(void) { struct problem *p = problems_list; problems_list = NULL; return p; diff --git a/src/tools/mdbx_copy.c b/src/tools/mdbx_copy.c index 2e384cac..ec0856ca 100644 --- a/src/tools/mdbx_copy.c +++ b/src/tools/mdbx_copy.c @@ -18,7 +18,14 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ -#endif /* _MSC_VER */ +#if _MSC_VER == 1900 +/* LY: MSVC 2015 has buggy/inconsistent PRIuPTR/PRIxPTR macros and format-arg + checker for size_t typedef. */ +#pragma warning(disable : 4777) /* format string '%10u' requires an argument \ + of type 'unsigned int', but variadic \ + argument 1 has type 'std::size_t' */ +#endif +#endif /* _MSC_VER (warnings) */ #include "../bits.h" diff --git a/src/tools/mdbx_dump.c b/src/tools/mdbx_dump.c index 71c300dd..19010c47 100644 --- a/src/tools/mdbx_dump.c +++ b/src/tools/mdbx_dump.c @@ -18,7 +18,14 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ -#endif /* _MSC_VER */ +#if _MSC_VER == 1900 +/* LY: MSVC 2015 has buggy/inconsistent PRIuPTR/PRIxPTR macros and format-arg + checker for size_t typedef. 
*/ +#pragma warning(disable : 4777) /* format string '%10u' requires an argument \ + of type 'unsigned int', but variadic \ + argument 1 has type 'std::size_t' */ +#endif +#endif /* _MSC_VER (warnings) */ #include "../bits.h" #include diff --git a/src/tools/mdbx_load.c b/src/tools/mdbx_load.c index 16180e86..0c20b1e7 100644 --- a/src/tools/mdbx_load.c +++ b/src/tools/mdbx_load.c @@ -18,7 +18,14 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ -#endif /* _MSC_VER */ +#if _MSC_VER == 1900 +/* LY: MSVC 2015 has buggy/inconsistent PRIuPTR/PRIxPTR macros and format-arg + checker for size_t typedef. */ +#pragma warning(disable : 4777) /* format string '%10u' requires an argument \ + of type 'unsigned int', but variadic \ + argument 1 has type 'std::size_t' */ +#endif +#endif /* _MSC_VER (warnings) */ #include "../bits.h" #include diff --git a/src/tools/mdbx_stat.c b/src/tools/mdbx_stat.c index 7fbe924b..9b65d992 100644 --- a/src/tools/mdbx_stat.c +++ b/src/tools/mdbx_stat.c @@ -18,7 +18,14 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ -#endif /* _MSC_VER */ +#if _MSC_VER == 1900 +/* LY: MSVC 2015 has buggy/inconsistent PRIuPTR/PRIxPTR macros and format-arg + checker for size_t typedef. 
*/ +#pragma warning(disable : 4777) /* format string '%10u' requires an argument \ + of type 'unsigned int', but variadic \ + argument 1 has type 'std::size_t' */ +#endif +#endif /* _MSC_VER (warnings) */ #include "../bits.h" diff --git a/src/tools/wingetopt.c b/src/tools/wingetopt.c index 3762ecae..8059e5d9 100644 --- a/src/tools/wingetopt.c +++ b/src/tools/wingetopt.c @@ -20,9 +20,6 @@ #define ERR(s, c) \ if (opterr) { \ - char errbuf[2]; \ - errbuf[0] = (char)c; \ - errbuf[1] = '\n'; \ fputs(argv[0], stderr); \ fputs(s, stderr); \ fputc(c, stderr); \ @@ -38,13 +35,14 @@ int getopt(int argc, char *const argv[], const char *opts) { int c; char *cp; - if (sp == 1) + if (sp == 1) { if (optind >= argc || argv[optind][0] != '-' || argv[optind][1] == '\0') return EOF; else if (strcmp(argv[optind], "--") == 0) { optind++; return EOF; } + } optopt = c = argv[optind][sp]; if (c == ':' || (cp = strchr(opts, c)) == NULL) { ERR(": illegal option -- ", c); From e2d770c62952756e8e8ce0969158c83e8f106419 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Fri, 21 Jul 2017 17:07:08 +0300 Subject: [PATCH 303/303] mdbx: fix title in the Makefile (minor). Change-Id: I8c77b226f55044c67b3a591da5257bf3886c00b0 --- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 253cc479..f0016129 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,4 @@ -# GNU Makefile for libmdbx (reliable lightning memory-mapped DB library for Linux). -# https://github.com/ReOpen/libmdbx +# GNU Makefile for libmdbx, https://github.com/ReOpen/libmdbx ######################################################################## # Configuration. The compiler options must enable threaded compilation.