mdbx: Merge branch 'devel'.

This commit is contained in:
Leo Yuriev 2016-07-26 16:09:49 +03:00
commit 3aba7faab7
4 changed files with 93 additions and 82 deletions

View File

@ -1,9 +1,11 @@
LMDB 0.9 Change Log
MDBX MDBX
Add MDB_PREV_MULTIPLE Add MDB_PREV_MULTIPLE
Fix MDB_CP_COMPACT (ITS#8209)
Add error MDB_PROBLEM, replace some MDB_CORRUPTED
LMDB 0.9.19 Release Engineering LMDB 0.9.19 Release Engineering
Fix mdb_env_cwalk cursor init (ITS#8424)
Fix robust mutexes on Solaris 10/11 (ITS#8339)
Fix MDB_GET_BOTH on non-dup record (ITS#8393) Fix MDB_GET_BOTH on non-dup record (ITS#8393)
Optimize mdb_drop Optimize mdb_drop
Fix xcursors after mdb_cursor_del (ITS#8406) Fix xcursors after mdb_cursor_del (ITS#8406)
@ -24,7 +26,6 @@ LMDB 0.9.18 Release (2016/02/05)
Add Getting Started page Add Getting Started page
Update WRITEMAP description Update WRITEMAP description
LMDB 0.9.17 Release (2015/11/30) LMDB 0.9.17 Release (2015/11/30)
Fix ITS#7377 catch calloc failure Fix ITS#7377 catch calloc failure
Fix ITS#8237 regression from ITS#7589 Fix ITS#8237 regression from ITS#7589

5
lmdb.h
View File

@ -475,8 +475,10 @@ typedef enum MDB_cursor_op {
#define MDB_BAD_VALSIZE (-30781) #define MDB_BAD_VALSIZE (-30781)
/** The specified DBI was changed unexpectedly */ /** The specified DBI was changed unexpectedly */
#define MDB_BAD_DBI (-30780) #define MDB_BAD_DBI (-30780)
/** Unexpected problem - txn should abort */
#define MDB_PROBLEM (-30779)
/** The last defined error code */ /** The last defined error code */
#define MDB_LAST_ERRCODE MDB_BAD_DBI #define MDB_LAST_ERRCODE MDB_PROBLEM
/** @} */ /** @} */
/** @brief Statistics for a database in the environment */ /** @brief Statistics for a database in the environment */
@ -730,6 +732,7 @@ int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd);
* <li>#MDB_CP_COMPACT - Perform compaction while copying: omit free * <li>#MDB_CP_COMPACT - Perform compaction while copying: omit free
* pages and sequentially renumber all pages in output. This option * pages and sequentially renumber all pages in output. This option
* consumes more CPU and runs more slowly than the default. * consumes more CPU and runs more slowly than the default.
* Currently it fails if the environment has suffered a page leak.
* </ul> * </ul>
* @return A non-zero error value on failure and 0 on success. * @return A non-zero error value on failure and 0 on success.
*/ */

154
mdb.c
View File

@ -109,7 +109,7 @@
#include <string.h> #include <string.h>
#include <time.h> #include <time.h>
#include <unistd.h> #include <unistd.h>
#include <alloca.h> #include <malloc.h>
#include <pthread.h> #include <pthread.h>
#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
@ -1181,6 +1181,7 @@ static char *const mdb_errstr[] = {
"MDB_BAD_TXN: Transaction must abort, has a child, or is invalid", "MDB_BAD_TXN: Transaction must abort, has a child, or is invalid",
"MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size", "MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size",
"MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly", "MDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly",
"MDB_PROBLEM: Unexpected problem - txn should abort",
}; };
char * __cold char * __cold
@ -1640,7 +1641,7 @@ mdb_page_loose(MDB_cursor *mc, MDB_page *mp)
if (unlikely(mp != dl[x].mptr)) { /* bad cursor? */ if (unlikely(mp != dl[x].mptr)) { /* bad cursor? */
mc->mc_flags &= ~(C_INITIALIZED|C_EOF); mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
txn->mt_flags |= MDB_TXN_ERROR; txn->mt_flags |= MDB_TXN_ERROR;
return MDB_CORRUPTED; return MDB_PROBLEM;
} }
/* ok, it's ours */ /* ok, it's ours */
loose = 1; loose = 1;
@ -2490,7 +2491,7 @@ mdb_page_touch(MDB_cursor *mc)
if (unlikely(mp != dl[x].mptr)) { /* bad cursor? */ if (unlikely(mp != dl[x].mptr)) { /* bad cursor? */
mc->mc_flags &= ~(C_INITIALIZED|C_EOF); mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
txn->mt_flags |= MDB_TXN_ERROR; txn->mt_flags |= MDB_TXN_ERROR;
return MDB_CORRUPTED; return MDB_PROBLEM;
} }
return 0; return 0;
} }
@ -5664,7 +5665,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp)
j = ++(dl[0].mid); j = ++(dl[0].mid);
dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */
txn->mt_flags |= MDB_TXN_ERROR; txn->mt_flags |= MDB_TXN_ERROR;
return MDB_CORRUPTED; return MDB_PROBLEM;
} }
} }
txn->mt_dirty_room++; txn->mt_dirty_room++;
@ -7030,7 +7031,7 @@ put_sub:
return rc; return rc;
bad_sub: bad_sub:
if (unlikely(rc == MDB_KEYEXIST)) /* should not happen, we deleted that item */ if (unlikely(rc == MDB_KEYEXIST)) /* should not happen, we deleted that item */
rc = MDB_CORRUPTED; rc = MDB_PROBLEM;
} }
mc->mc_txn->mt_flags |= MDB_TXN_ERROR; mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
return rc; return rc;
@ -9042,11 +9043,12 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi,
#ifndef MDB_WBUF #ifndef MDB_WBUF
# define MDB_WBUF (1024*1024) # define MDB_WBUF (1024*1024)
#endif #endif
#define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */
/** State needed for a compacting copy. */ /** State needed for a double-buffering compacting copy. */
typedef struct mdb_copy { typedef struct mdb_copy {
pthread_mutex_t mc_mutex; pthread_mutex_t mc_mutex;
pthread_cond_t mc_cond; pthread_cond_t mc_cond; /**< Condition variable for #mc_new */
char *mc_wbuf[2]; char *mc_wbuf[2];
char *mc_over[2]; char *mc_over[2];
MDB_env *mc_env; MDB_env *mc_env;
@ -9055,10 +9057,9 @@ typedef struct mdb_copy {
int mc_olen[2]; int mc_olen[2];
pgno_t mc_next_pgno; pgno_t mc_next_pgno;
HANDLE mc_fd; HANDLE mc_fd;
int mc_status; int mc_toggle; /**< Buffer number in provider */
volatile int mc_new; int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */
int mc_toggle; volatile int mc_error; /**< Error code, never cleared if set */
} mdb_copy; } mdb_copy;
/** Dedicated writer thread for compacting copy. */ /** Dedicated writer thread for compacting copy. */
@ -9071,20 +9072,16 @@ mdb_env_copythr(void *arg)
int len; int len;
pthread_mutex_lock(&my->mc_mutex); pthread_mutex_lock(&my->mc_mutex);
my->mc_new = 0;
pthread_cond_signal(&my->mc_cond);
for(;;) { for(;;) {
while (!my->mc_new) while (!my->mc_new)
pthread_cond_wait(&my->mc_cond, &my->mc_mutex); pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
if (my->mc_new < 0) { if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */
my->mc_new = 0;
break; break;
}
my->mc_new = 0;
wsize = my->mc_wlen[toggle]; wsize = my->mc_wlen[toggle];
ptr = my->mc_wbuf[toggle]; ptr = my->mc_wbuf[toggle];
again: again:
while (wsize > 0) { rc = MDB_SUCCESS;
while (wsize > 0 && !my->mc_error) {
len = write(my->mc_fd, ptr, wsize); len = write(my->mc_fd, ptr, wsize);
if (len < 0) { if (len < 0) {
rc = errno; rc = errno;
@ -9100,8 +9097,7 @@ again:
} }
} }
if (rc) { if (rc) {
my->mc_status = rc; my->mc_error = rc;
break;
} }
/* If there's an overflow page tail, write it too */ /* If there's an overflow page tail, write it too */
if (my->mc_olen[toggle]) { if (my->mc_olen[toggle]) {
@ -9112,30 +9108,33 @@ again:
} }
my->mc_wlen[toggle] = 0; my->mc_wlen[toggle] = 0;
toggle ^= 1; toggle ^= 1;
/* Return the empty buffer to provider */
my->mc_new--;
pthread_cond_signal(&my->mc_cond); pthread_cond_signal(&my->mc_cond);
} }
pthread_cond_signal(&my->mc_cond);
pthread_mutex_unlock(&my->mc_mutex); pthread_mutex_unlock(&my->mc_mutex);
return (void*)0; return NULL;
} }
/** Tell the writer thread there's a buffer ready to write */ /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer.
*
* @param[in] my control structure.
* @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending).
*/
static int __cold static int __cold
mdb_env_cthr_toggle(mdb_copy *my, int st) mdb_env_cthr_toggle(mdb_copy *my, int adjust)
{ {
int toggle = my->mc_toggle ^ 1;
pthread_mutex_lock(&my->mc_mutex); pthread_mutex_lock(&my->mc_mutex);
if (my->mc_status) { my->mc_new += adjust;
pthread_mutex_unlock(&my->mc_mutex);
return my->mc_status;
}
while (my->mc_new == 1)
pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
my->mc_new = st;
my->mc_toggle = toggle;
pthread_cond_signal(&my->mc_cond); pthread_cond_signal(&my->mc_cond);
while (my->mc_new & 2) /* both buffers in use */
pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
pthread_mutex_unlock(&my->mc_mutex); pthread_mutex_unlock(&my->mc_mutex);
return 0;
my->mc_toggle ^= (adjust & 1);
/* Both threads reset mc_wlen, to be safe from threading errors */
my->mc_wlen[my->mc_toggle] = 0;
return my->mc_error;
} }
/** Depth-first tree traversal for compacting copy. */ /** Depth-first tree traversal for compacting copy. */
@ -9202,6 +9201,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
} }
memcpy(&pg, NODEDATA(ni), sizeof(pg)); memcpy(&pg, NODEDATA(ni), sizeof(pg));
memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t));
rc = mdb_page_get(txn, pg, &omp, NULL); rc = mdb_page_get(txn, pg, &omp, NULL);
if (rc) if (rc)
goto done; goto done;
@ -9224,7 +9224,6 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags)
goto done; goto done;
toggle = my->mc_toggle; toggle = my->mc_toggle;
} }
memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t));
} else if (ni->mn_flags & F_SUBDATA) { } else if (ni->mn_flags & F_SUBDATA) {
MDB_db db; MDB_db db;
@ -9305,31 +9304,31 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd)
mdb_copy my; mdb_copy my;
MDB_txn *txn = NULL; MDB_txn *txn = NULL;
pthread_t thr; pthread_t thr;
int rc; pgno_t root, new_root;
int rc = MDB_SUCCESS;
pthread_mutex_init(&my.mc_mutex, NULL); memset(&my, 0, sizeof(my));
pthread_cond_init(&my.mc_cond, NULL); if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0)
rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2);
if (rc)
return rc; return rc;
if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0)
goto done2;
my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2);
if (my.mc_wbuf[0] == NULL) {
rc = errno;
goto done;
}
memset(my.mc_wbuf[0], 0, MDB_WBUF*2); memset(my.mc_wbuf[0], 0, MDB_WBUF*2);
my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF;
my.mc_wlen[0] = 0;
my.mc_wlen[1] = 0;
my.mc_olen[0] = 0;
my.mc_olen[1] = 0;
my.mc_next_pgno = NUM_METAS; my.mc_next_pgno = NUM_METAS;
my.mc_status = 0;
my.mc_new = 1;
my.mc_toggle = 0;
my.mc_env = env; my.mc_env = env;
my.mc_fd = fd; my.mc_fd = fd;
rc = pthread_create(&thr, NULL, mdb_env_copythr, &my); rc = pthread_create(&thr, NULL, mdb_env_copythr, &my);
assert(rc == 0); if (rc)
goto done;
rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
if (rc) if (rc)
return rc; goto finish;
mp = (MDB_page *)my.mc_wbuf[0]; mp = (MDB_page *)my.mc_wbuf[0];
memset(mp, 0, NUM_METAS * env->me_psize); memset(mp, 0, NUM_METAS * env->me_psize);
@ -9345,51 +9344,58 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd)
*(MDB_meta *)PAGEDATA(mp) = *mm; *(MDB_meta *)PAGEDATA(mp) = *mm;
mm = (MDB_meta *)PAGEDATA(mp); mm = (MDB_meta *)PAGEDATA(mp);
/* Count the number of free pages, subtract from lastpg to find /* Set metapage 1 with current main DB */
* number of active pages root = new_root = txn->mt_dbs[MAIN_DBI].md_root;
if (root != P_INVALID) {
/* Count free pages + freeDB pages. Subtract from last_pg
* to find the new last_pg, which also becomes the new root.
*/ */
{
MDB_ID freecount = 0; MDB_ID freecount = 0;
MDB_cursor mc; MDB_cursor mc;
MDB_val key, data; MDB_val key, data;
mdb_cursor_init(&mc, txn, FREE_DBI, NULL); mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
freecount += *(MDB_ID *)data.mv_data; freecount += *(MDB_ID *)data.mv_data;
if (rc != MDB_NOTFOUND)
goto finish;
freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + freecount += txn->mt_dbs[FREE_DBI].md_branch_pages +
txn->mt_dbs[FREE_DBI].md_leaf_pages + txn->mt_dbs[FREE_DBI].md_leaf_pages +
txn->mt_dbs[FREE_DBI].md_overflow_pages; txn->mt_dbs[FREE_DBI].md_overflow_pages;
/* Set metapage 1 */ new_root = txn->mt_next_pgno - 1 - freecount;
mm->mm_last_pg = txn->mt_next_pgno - freecount - 1; mm->mm_last_pg = new_root;
mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
if (mm->mm_last_pg > NUM_METAS-1) { mm->mm_dbs[MAIN_DBI].md_root = new_root;
mm->mm_dbs[MAIN_DBI].md_root = mm->mm_last_pg;
mm->mm_txnid = 1;
} else { } else {
mm->mm_dbs[MAIN_DBI].md_root = P_INVALID; /* When the DB is empty, handle it specially to
* fix any breakage like page leaks from ITS#8174.
*/
mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags;
} }
if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) {
mm->mm_txnid = 1; /* use metapage 1 */
} }
my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_wlen[0] = env->me_psize * NUM_METAS;
my.mc_txn = txn; my.mc_txn = txn;
pthread_mutex_lock(&my.mc_mutex); rc = mdb_env_cwalk(&my, &root, 0);
while(my.mc_new) if (rc == MDB_SUCCESS && root != new_root) {
pthread_cond_wait(&my.mc_cond, &my.mc_mutex); rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */
pthread_mutex_unlock(&my.mc_mutex); }
rc = mdb_env_cwalk(&my, &txn->mt_dbs[MAIN_DBI].md_root, 0);
if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle])
rc = mdb_env_cthr_toggle(&my, 1);
mdb_env_cthr_toggle(&my, -1);
pthread_mutex_lock(&my.mc_mutex);
while(my.mc_new)
pthread_cond_wait(&my.mc_cond, &my.mc_mutex);
pthread_mutex_unlock(&my.mc_mutex);
pthread_join(thr, NULL);
finish:
if (rc)
my.mc_error = rc;
mdb_env_cthr_toggle(&my, 1 | MDB_EOF);
rc = pthread_join(thr, NULL);
mdb_txn_abort(txn); mdb_txn_abort(txn);
pthread_cond_destroy(&my.mc_cond);
pthread_mutex_destroy(&my.mc_mutex); done:
free(my.mc_wbuf[0]); free(my.mc_wbuf[0]);
return rc; pthread_cond_destroy(&my.mc_cond);
done2:
pthread_mutex_destroy(&my.mc_mutex);
return rc ? rc : my.mc_error;
} }
/** Copy environment as-is. */ /** Copy environment as-is. */

View File

@ -56,6 +56,7 @@ Write the library version number to the standard output, and exit.
Compact while copying. Only current data pages will be copied; freed Compact while copying. Only current data pages will be copied; freed
or unused pages will be omitted from the copy. This option will or unused pages will be omitted from the copy. This option will
slow down the backup process as it is more CPU-intensive. slow down the backup process as it is more CPU-intensive.
Currently it fails if the environment has suffered a page leak.
.TP .TP
.BR \-n .BR \-n
Open LDMB environment(s) which do not use subdirectories. Open LDMB environment(s) which do not use subdirectories.