mdbx: adds functions for distance/move/range estimation (initial).

Change-Id: If59eccf7311123ab6384c4b93f9b1fed5a0a10d1
This commit is contained in:
Leonid Yuriev 2019-03-05 02:32:51 +03:00
parent 7d383350e8
commit 8ddfd1f34a
2 changed files with 402 additions and 0 deletions

47
mdbx.h
View File

@ -1695,6 +1695,53 @@ LIBMDBX_API int mdbx_cursor_on_first(MDBX_cursor *mc);
/* NOTE(review): presumably reports whether the cursor stands on the last
 * item (inferred from the name only; the implementation is not visible
 * here) -- confirm.
 *
 * [in] mc   A cursor handle.
 *
 * Returns: MDBX_RESULT_TRUE, MDBX_RESULT_FALSE or Error code. */
LIBMDBX_API int mdbx_cursor_on_last(MDBX_cursor *mc);
/* Estimates the distance between cursors as the number of elements.
 * Both cursors must be initialized for the same DBI.
 *
 * [in]  first           The first cursor for estimation.
 * [in]  last            The second cursor for estimation.
 * [out] distance_items  A pointer to store estimated distance value,
 *                       i.e. *distance_items = distance(last - first).
 *                       The value is negative when `last` is positioned
 *                       before `first`.
 *
 * Returns A non-zero error value on failure and 0 on success. */
LIBMDBX_API int mdbx_estimate_distance(const MDBX_cursor *first,
const MDBX_cursor *last,
ptrdiff_t *distance_items);
/* Estimates the move distance, i.e. between the current cursor position and
 * next position after the specified move-operation with given key and data.
 * Current cursor position and state are preserved.
 *
 * [in]     cursor          Cursor for estimation, must be initialized.
 * [in,out] key             The key for a retrieved item. May be NULL for
 *                          operations that do not take a key as input.
 * [in,out] data            The data of a retrieved item. May be NULL for
 *                          operations that do not take data as input.
 * [in]     move_op         A cursor operation MDBX_cursor_op.
 *                          MDBX_GET_CURRENT and MDBX_GET_MULTIPLE are
 *                          rejected with MDBX_EINVAL.
 * [out]    distance_items  A pointer to store estimated move distance
 *                          as the number of elements.
 *
 * Returns A non-zero error value on failure and 0 on success. */
LIBMDBX_API int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key,
MDBX_val *data, MDBX_cursor_op move_op,
ptrdiff_t *distance_items);
/* Estimates the size of a range in the number of elements.
 *
 * [in]  txn         A transaction handle returned by mdbx_txn_begin().
 * [in]  dbi         A database handle returned by mdbx_dbi_open().
 * [in]  begin_key   The key of range beginning or NULL for explicit FIRST.
 * [in]  begin_data  Optional additional data to seeking among sorted
 *                   duplicates. Only for MDBX_DUPSORT, NULL otherwise.
 *                   Must be NULL when begin_key is NULL.
 * [in]  end_key     The key of range ending or NULL for explicit LAST.
 * [in]  end_data    Optional additional data to seeking among sorted
 *                   duplicates. Only for MDBX_DUPSORT, NULL otherwise.
 *                   Must be NULL when end_key is NULL.
 * [out] size_items  A pointer to store range estimation result. The result
 *                   is never negative: for an inverted range (the beginning
 *                   lies after the ending) the total number of items in the
 *                   database is added to the negative distance.
 *
 * Returns A non-zero error value on failure and 0 on success. */
LIBMDBX_API int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi,
MDBX_val *begin_key, MDBX_val *begin_data,
MDBX_val *end_key, MDBX_val *end_data,
ptrdiff_t *size_items);
/* NOTE(review): judging by the parameter names this replaces the data stored
 * for `key` with `new_data` and hands the previous value back through
 * `old_data`; the implementation is not visible here -- confirm before
 * relying on this description. */
LIBMDBX_API int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key,
MDBX_val *new_data, MDBX_val *old_data,
unsigned flags);

View File

@ -13163,11 +13163,366 @@ int mdbx_cursor_eof(MDBX_cursor *mc) {
return MDBX_RESULT_FALSE;
}
//------------------------------------------------------------------------------
/* Result of comparing the page stacks of two cursors, filled in by
 * cursor_diff() and consumed by estimate(). */
struct diff_result {
/* Signed difference (last minus first) measured at `level`. */
ptrdiff_t diff;
/* Index of the cursor-stack level at which `diff` was measured. */
int level;
/* Number of keys on the page at stack level 0 (the root page). */
int root_nkeys;
};
/* Compares the page stacks of two cursors and reports through *dr the first
 * stack level at which their positions differ.
 *
 * On success dr->diff holds the signed difference (last minus first) of the
 * key indexes at dr->level, and dr->root_nkeys the number of keys on the
 * level-0 page. When the difference at the diverging level is exactly +1 or
 * -1 and a deeper level exists in both stacks, the comparison descends and
 * counts the keys separating the two positions on the neighbouring pages.
 * When the stacks are identical down to the shallower of the two depths,
 * dr->diff is derived solely from comparing the cursors' C_EOF flags.
 *
 * Returns MDBX_SUCCESS, or:
 *  - MDBX_EBADSIGN  either cursor has a wrong signature;
 *  - MDBX_EINVAL    the cursors belong to different DBIs;
 *  - MDBX_ENODATA   at least one cursor is not initialized;
 *  - MDBX_PROBLEM   the cursors reference different pages at a level whose
 *                   upper levels all had equal key indexes. */
static int cursor_diff(const MDBX_cursor *const __restrict first,
const MDBX_cursor *const __restrict last,
struct diff_result *const __restrict dr) {
dr->root_nkeys = 0;
dr->level = 0;
dr->diff = 0;
if (unlikely(first->mc_signature != MDBX_MC_SIGNATURE ||
last->mc_signature != MDBX_MC_SIGNATURE))
return MDBX_EBADSIGN;
if (unlikely(first->mc_dbi != last->mc_dbi))
return MDBX_EINVAL;
/* C_INITIALIZED must be set in both cursors. */
if (unlikely(!(first->mc_flags & last->mc_flags & C_INITIALIZED)))
return MDBX_ENODATA;
/* Walk down from level 0 while both stacks have a page at this level. */
while (likely(dr->level < first->mc_snum && dr->level < last->mc_snum)) {
/* All upper levels had equal indexes, so the pages here must be the same. */
if (unlikely(first->mc_pg[dr->level] != last->mc_pg[dr->level]))
return MDBX_PROBLEM;
int nkeys = NUMKEYS(first->mc_pg[dr->level]);
assert(nkeys > 0);
if (dr->level == 0)
dr->root_nkeys = nkeys;
/* Clamp both indexes to the last key of the page before subtracting. */
int max_ki = nkeys - 1;
int last_ki = last->mc_ki[dr->level];
int first_ki = first->mc_ki[dr->level];
dr->diff = ((last_ki < max_ki) ? last_ki : max_ki) -
((first_ki < max_ki) ? first_ki : max_ki);
if (dr->diff == 0) {
/* Same position at this level: look one level deeper. */
dr->level += 1;
continue;
}
/* Adjacent entries with `first` before `last`: refine on the next level by
 * adding the keys from `first` to the end of its page and the keys
 * preceding `last` on its page. */
while (unlikely(dr->diff == 1) && likely(dr->level + 1 < first->mc_snum &&
dr->level + 1 < last->mc_snum)) {
dr->level += 1;
/* DB'PAGEs: 0------------------>MAX
*
* CURSORs: first < last
* STACK[i ]: |
* STACK[+1]: ...f++N|0++l...
*/
nkeys = NUMKEYS(first->mc_pg[dr->level]);
dr->diff = (nkeys - first->mc_ki[dr->level]) + last->mc_ki[dr->level];
assert(dr->diff > 0);
}
/* Mirror case: adjacent entries with `last` before `first`. */
while (unlikely(dr->diff == -1) && likely(dr->level + 1 < first->mc_snum &&
dr->level + 1 < last->mc_snum)) {
dr->level += 1;
/* DB'PAGEs: 0------------------>MAX
*
* CURSORs: last < first
* STACK[i ]: |
* STACK[+1]: ...l--N|0--f...
*/
nkeys = NUMKEYS(last->mc_pg[dr->level]);
dr->diff = -(nkeys - last->mc_ki[dr->level]) - first->mc_ki[dr->level];
assert(dr->diff < 0);
}
return MDBX_SUCCESS;
}
/* No differing level found: order the cursors by their C_EOF flags only. */
dr->diff = mdbx_cmp2int(last->mc_flags & C_EOF, first->mc_flags & C_EOF);
return MDBX_SUCCESS;
}
/* Converts a page-stack difference produced by cursor_diff() into an
 * approximate number of items.
 *
 * The difference was measured at stack level dr->level. Every branch level
 * lying between that level and the leaves multiplies the figure by the
 * average branch fan-out:
 *  - difference measured on a leaf page     => returned as is;
 *  - measured on the lowest branch level    => scaled by the average number
 *                                              of items per leaf page;
 *  - each further branch level below        => additionally multiplied by
 *                                              the average branch fan-out.
 * The fan-out is kept in fixed point with 3 fractional bits, and the final
 * figure is clamped to [-md_entries, +md_entries].
 *
 * db   Statistics of the tree the cursors belong to.
 * dr   Result of cursor_diff(), only read here.
 *
 * Returns the signed estimated number of items. */
static ptrdiff_t estimate(const MDBX_db *db,
                          struct diff_result *const __restrict dr) {
  const ptrdiff_t total_items = (ptrdiff_t)db->md_entries;

  /* How many branch levels are left below the level of divergence. */
  ptrdiff_t levels_left = db->md_depth - 2 - dr->level;
  if (levels_left < 0)
    return dr->diff;

  /* Scale by the average number of items per leaf page. */
  ptrdiff_t result = total_items * dr->diff / (ptrdiff_t)db->md_leaf_pages;
  if (levels_left == 0)
    return result;

  if (db->md_depth < 4) {
    /* Shallow tree diverging at the root: take the share of the root keys. */
    assert(dr->level == 0 && levels_left == 1);
    return total_items * dr->diff / (ptrdiff_t)dr->root_nkeys;
  }

  /* average_branch_fillfactor = total(branch_entries) / branch_pages
   * total(branch_entries) = leaf_pages + branch_pages - 1 (root page) */
  const size_t fraction_bits = 3;
  const size_t rounding = UINT64_C(1) << (fraction_bits - 1);
  const size_t fanout =
      ((db->md_leaf_pages + db->md_branch_pages - 1) << fraction_bits) /
      db->md_branch_pages;

  /* Take four levels per step with a precomputed fanout^4 ... */
  if (levels_left >= 4) {
    const size_t fanout_pow2 = (fanout * fanout + rounding) >> fraction_bits;
    const size_t fanout_pow4 =
        (fanout_pow2 * fanout_pow2 + rounding) >> fraction_bits;
    do {
      /* The shift is deliberately applied to the signed value afterwards. */
      result = result * fanout_pow4 + rounding;
      result >>= fraction_bits;
      levels_left -= 4;
    } while (levels_left >= 4);
  }

  /* ... then the remaining (at most three) levels one by one. */
  for (; levels_left > 0; --levels_left) {
    result = result * fanout + rounding;
    result >>= fraction_bits;
  }

  if (unlikely(result > total_items))
    return total_items;
  if (unlikely(result < -total_items))
    return -total_items;
  return result;
}
/* Estimates the signed distance, in items, from `first` to `last`.
 *
 * The cursors' page stacks are compared with cursor_diff(). When both stand
 * on the same key of an MDBX_DUPSORT database and both nested (duplicates)
 * cursors are positioned, the comparison is repeated for the nested cursors
 * so that the distance among the duplicates of that key is estimated.
 *
 * first, last      Cursors to compare.
 * distance_items   Receives the estimate; set to 0 when the positions are
 *                  indistinguishable or on error after argument validation.
 *
 * Returns MDBX_SUCCESS, MDBX_EINVAL for a NULL argument, or any error
 * reported by cursor_diff(). */
int mdbx_estimate_distance(const MDBX_cursor *first, const MDBX_cursor *last,
                           ptrdiff_t *distance_items) {
  if (unlikely(first == NULL || last == NULL || distance_items == NULL))
    return MDBX_EINVAL;

  *distance_items = 0;
  struct diff_result dr;
  int rc = cursor_diff(first, last, &dr);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  /* MDBX_DUPSORT is a database flag while C_INITIALIZED is a cursor flag, so
   * they have to be tested on different words: the former on both databases,
   * the latter on both nested cursors. Descending only when both nested
   * cursors are positioned keeps cursor_diff() from failing with
   * MDBX_ENODATA for keys that have no duplicates. */
  if (unlikely(dr.diff == 0) &&
      F_ISSET(first->mc_db->md_flags & last->mc_db->md_flags, MDBX_DUPSORT) &&
      first->mc_xcursor != NULL && last->mc_xcursor != NULL &&
      (first->mc_xcursor->mx_cursor.mc_flags &
       last->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
    first = &first->mc_xcursor->mx_cursor;
    last = &last->mc_xcursor->mx_cursor;
    rc = cursor_diff(first, last, &dr);
    if (unlikely(rc != MDBX_SUCCESS))
      return rc;
  }

  /* `first` may now refer to the nested cursor, hence to the nested tree. */
  if (likely(dr.diff != 0))
    *distance_items = estimate(first->mc_db, &dr);

  return MDBX_SUCCESS;
}
/* Estimates how far, in items, the given move operation would carry the
 * cursor, without touching the cursor itself: the move is performed on a
 * temporary copy and the distance between the original and the copy is
 * estimated.
 *
 * cursor           Initialized cursor to start from.
 * key, data        Arguments for the move; either may be NULL when the
 *                  operation does not need it as input.
 * move_op          The move; MDBX_GET_CURRENT and MDBX_GET_MULTIPLE are
 *                  not accepted.
 * distance_items   Receives the estimate.
 *
 * Returns MDBX_SUCCESS, MDBX_EINVAL for bad arguments, MDBX_EBADSIGN for a
 * wrong cursor signature, MDBX_ENODATA for an uninitialized cursor, or an
 * error from the move or from mdbx_estimate_distance(). */
int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data,
                       MDBX_cursor_op move_op, ptrdiff_t *distance_items) {
  if (unlikely(cursor == NULL || distance_items == NULL))
    return MDBX_EINVAL;
  if (unlikely(move_op == MDBX_GET_CURRENT || move_op == MDBX_GET_MULTIPLE))
    return MDBX_EINVAL;

  if (unlikely(cursor->mc_signature != MDBX_MC_SIGNATURE))
    return MDBX_EBADSIGN;

  if ((cursor->mc_flags & C_INITIALIZED) == 0)
    return MDBX_ENODATA;

  /* Work on a private copy so the caller's cursor stays where it is. */
  MDBX_cursor_couple probe;
  mdbx_cursor_copy(cursor, &probe.outer);
  if (cursor->mc_db->md_flags & MDBX_DUPSORT) {
    probe.outer.mc_xcursor = &probe.inner;
    int err = mdbx_xcursor_init0(&probe.outer);
    if (unlikely(err != MDBX_SUCCESS))
      return err;
    /* The source cursor is taken to be the `outer` member of a couple. */
    MDBX_xcursor *nested =
        &container_of(cursor, MDBX_cursor_couple, outer)->inner;
    mdbx_cursor_copy(&nested->mx_cursor, &probe.inner.mx_cursor);
  } else {
    probe.outer.mc_xcursor = NULL;
  }

  /* Placeholder handed to the move in place of omitted key and/or data. */
  MDBX_val stub = {0, 0};

  if (data == NULL) {
    /* These operations cannot work without caller-supplied data. */
    switch (move_op) {
    case MDBX_GET_BOTH:
    case MDBX_GET_BOTH_RANGE:
    case MDBX_SET_KEY:
      return MDBX_EINVAL;
    default:
      break;
    }
    data = &stub;
  }

  if (key == NULL) {
    /* These operations cannot work without a caller-supplied key. */
    switch (move_op) {
    case MDBX_GET_BOTH:
    case MDBX_GET_BOTH_RANGE:
    case MDBX_SET_KEY:
    case MDBX_SET:
    case MDBX_SET_RANGE:
      return MDBX_EINVAL;
    default:
      break;
    }
    key = &stub;
  }

  int rc = mdbx_cursor_get(&probe.outer, key, data, move_op);
  if (unlikely(rc != MDBX_SUCCESS)) {
    /* MDBX_NOTFOUND is tolerated as long as the copy is still positioned. */
    if (rc != MDBX_NOTFOUND || !(probe.outer.mc_flags & C_INITIALIZED))
      return rc;
  }

  return mdbx_estimate_distance(cursor, &probe.outer, distance_items);
}
/* Tells whether two values hold byte-for-byte identical content.
 *
 * a, b   Values to compare; must not be NULL.
 *
 * Returns 1 when the lengths are equal and the bytes match, 0 otherwise. */
static int mdbx_is_samedata(const MDBX_val *a, const MDBX_val *b) {
  if (a->iov_len != b->iov_len)
    return 0;

  /* Empty values may carry a NULL iov_base (e.g. a `{0, 0}` value), and
   * memcmp() requires valid pointers even for a zero length, so two empty
   * values are declared equal without touching their pointers. */
  if (a->iov_len == 0)
    return 1;

  return memcmp(a->iov_base, b->iov_base, a->iov_len) == 0;
}
/* Estimates the number of items lying between two positions of a database.
 *
 * txn          Transaction to work in; must belong to the calling thread.
 * dbi          Database handle.
 * begin_key    Key of the range beginning, or NULL for the first item.
 * begin_data   Optional data to seek among duplicates; requires begin_key.
 * end_key      Key of the range ending, or NULL for the last item.
 * end_data     Optional data to seek among duplicates; requires end_key.
 * size_items   Receives the estimate, always within [0, md_entries] except
 *              for the single-key case, where it is the number of duplicates
 *              of that key (or 1, or 0 when the key is absent).
 *
 * Special cases handled explicitly:
 *  - empty database            => 0;
 *  - both keys NULL            => total number of items;
 *  - same key on both ends and no data bounds => number of values for it;
 *  - inverted range            => the total number of items is added to the
 *                                 negative distance.
 *
 * Returns MDBX_SUCCESS, MDBX_EINVAL, MDBX_EBADSIGN, MDBX_THREAD_MISMATCH,
 * MDBX_BAD_TXN, or an error from cursor initialization/positioning or from
 * mdbx_estimate_distance(). */
int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key,
                        MDBX_val *begin_data, MDBX_val *end_key,
                        MDBX_val *end_data, ptrdiff_t *size_items) {
  if (unlikely(!txn || !size_items))
    return MDBX_EINVAL;

  /* A data bound is meaningless without the corresponding key. */
  if (unlikely(!begin_key && begin_data))
    return MDBX_EINVAL;

  if (unlikely(!end_key && end_data))
    return MDBX_EINVAL;

  if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
    return MDBX_EBADSIGN;

  if (unlikely(txn->mt_owner != mdbx_thread_self()))
    return MDBX_THREAD_MISMATCH;

  if (unlikely(!TXN_DBI_EXIST(txn, dbi, DB_USRVALID)))
    return MDBX_EINVAL;

  if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED))
    return MDBX_BAD_TXN;

  MDBX_cursor_couple begin;
  /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */
  int rc = mdbx_cursor_init(&begin.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(begin.outer.mc_db->md_entries == 0)) {
    *size_items = 0;
    return MDBX_SUCCESS;
  }

  /* The lookups below receive the caller's key/data by pointer and may
   * overwrite them (presumably with the item actually found), so the
   * original bounds are saved for the ordering check at the end. */
  MDBX_val origin_begin_key = {0, 0}, origin_begin_data = {0, 0};
  if (!begin_key) {
    if (unlikely(!end_key)) {
      /* LY: FIRST..LAST case */
      *size_items = (ptrdiff_t)begin.outer.mc_db->md_entries;
      return MDBX_SUCCESS;
    }
    MDBX_val stub = {0, 0};
    rc = mdbx_cursor_first(&begin.outer, &stub, &stub);
  } else {
    if (end_key && !begin_data && !end_data &&
        (begin_key == end_key || mdbx_is_samedata(begin_key, end_key))) {
      /* LY: single key case */
      int exact = 0;
      rc = mdbx_cursor_set(&begin.outer, begin_key, NULL, MDBX_SET, &exact);
      if (unlikely(rc != MDBX_SUCCESS)) {
        *size_items = 0;
        return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc;
      }
      *size_items = 1;
      if (begin.outer.mc_xcursor != NULL) {
        MDBX_node *leaf = NODEPTR(begin.outer.mc_pg[begin.outer.mc_top],
                                  begin.outer.mc_ki[begin.outer.mc_top]);
        if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
          /* LY: return the number of duplicates for given key */
          mdbx_tassert(txn,
                       begin.outer.mc_xcursor == &begin.inner &&
                           (begin.inner.mx_cursor.mc_flags & C_INITIALIZED));
          /* The result is a ptrdiff_t, so saturate at PTRDIFF_MAX: a
           * SIZE_MAX limit would turn into -1 once stored. */
          *size_items =
              (begin.inner.mx_db.md_entries <= (uint64_t)PTRDIFF_MAX)
                  ? (ptrdiff_t)begin.inner.mx_db.md_entries
                  : PTRDIFF_MAX;
        }
      }
      return MDBX_SUCCESS;
    }

    MDBX_cursor_op begin_op = MDBX_SET_RANGE;
    if (begin_data) {
      begin_op = MDBX_GET_BOTH_RANGE;
      origin_begin_data = *begin_data;
    }
    origin_begin_key = *begin_key;
    rc = mdbx_cursor_set(&begin.outer, begin_key, begin_data, begin_op, NULL);
  }

  if (unlikely(rc != MDBX_SUCCESS)) {
    /* MDBX_NOTFOUND is tolerated as long as the cursor got positioned. */
    if (rc != MDBX_NOTFOUND || !(begin.outer.mc_flags & C_INITIALIZED))
      return rc;
  }

  MDBX_cursor_couple end;
  rc = mdbx_cursor_init(&end.outer, txn, dbi);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  MDBX_val origin_end_key = {0, 0}, origin_end_data = {0, 0};
  if (!end_key) {
    MDBX_val stub = {0, 0};
    rc = mdbx_cursor_last(&end.outer, &stub, &stub);
  } else {
    MDBX_cursor_op end_op = MDBX_SET_RANGE;
    if (end_data) {
      end_op = MDBX_GET_BOTH_RANGE;
      origin_end_data = *end_data;
    }
    origin_end_key = *end_key;
    rc = mdbx_cursor_set(&end.outer, end_key, end_data, end_op, NULL);
  }

  if (unlikely(rc != MDBX_SUCCESS)) {
    if (rc != MDBX_NOTFOUND || !(end.outer.mc_flags & C_INITIALIZED))
      return rc;
  }

  rc = mdbx_estimate_distance(&begin.outer, &end.outer, size_items);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;
  assert(*size_items >= -(ptrdiff_t)begin.outer.mc_db->md_entries &&
         *size_items <= (ptrdiff_t)begin.outer.mc_db->md_entries);

  if (*size_items < 0) {
    /* LY: inverted range case */
    *size_items += (ptrdiff_t)begin.outer.mc_db->md_entries;
  } else if (*size_items == 0 && begin_key && end_key) {
    /* Both cursors ended up at the same spot: decide from the original
     * bounds whether the range is empty or inverted. */
    int cmp = begin.outer.mc_dbx->md_cmp(&origin_begin_key, &origin_end_key);
    /* Look at the nested cursor only when data bounds were given and the
     * nested cursor actually exists; otherwise `begin.inner` is not in use
     * and must not be read. */
    if (cmp == 0 && begin_data && end_data &&
        begin.outer.mc_xcursor != NULL &&
        (begin.inner.mx_cursor.mc_flags & C_INITIALIZED))
      cmp = begin.outer.mc_dbx->md_dcmp(&origin_begin_data, &origin_end_data);
    if (cmp > 0) {
      /* LY: inverted range case with empty scope */
      *size_items = (ptrdiff_t)begin.outer.mc_db->md_entries;
    }
  }
  assert(*size_items >= 0 &&
         *size_items <= (ptrdiff_t)begin.outer.mc_db->md_entries);

  return MDBX_SUCCESS;
}
//------------------------------------------------------------------------------
/* Позволяет обновить или удалить существующую запись с получением
* в old_data предыдущего значения данных. При этом если new_data равен
* нулю, то выполняется удаление, иначе обновление/вставка.