From 3befcdab016ace85530baabd3ef6c1e7fceb5542 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 25 Jun 2016 07:55:34 +0200 Subject: [PATCH] mdbx: backport - ITS#8209 fix MDB_CP_COMPACT. Handle errors. Fix cond_wait condition so mc_new is the sole control var. Drop specious cond_waits. Do not look at 'mo' while copythr writes it. Preserve DB flags (use metapage#1) when main DB is empty. Fail if metapage root != actual root in output file. Some _aligned_malloc() doc seems to think arg NULL = user error. Don't know if posix_memalign() pointer is defined after failure. Change-Id: Idfdc118b4848bb96bace0f29db9dcdd710b7a1f4 --- lmdb.h | 1 + mdb.c | 154 +++++++++++++++++++++++++++-------------------------- mdb_copy.1 | 1 + 3 files changed, 81 insertions(+), 75 deletions(-) diff --git a/lmdb.h b/lmdb.h index da3b6758..2fc1e314 100644 --- a/lmdb.h +++ b/lmdb.h @@ -730,6 +730,7 @@ int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd); *
  • #MDB_CP_COMPACT - Perform compaction while copying: omit free * pages and sequentially renumber all pages in output. This option * consumes more CPU and runs more slowly than the default. + * Currently it fails if the environment has suffered a page leak. * * @return A non-zero error value on failure and 0 on success. */ diff --git a/mdb.c b/mdb.c index 06e0fce7..93840cab 100644 --- a/mdb.c +++ b/mdb.c @@ -109,7 +109,7 @@ #include #include #include -#include +#include #include #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) @@ -9040,13 +9040,14 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, } #ifndef MDB_WBUF -#define MDB_WBUF (1024*1024) +# define MDB_WBUF (1024*1024) #endif +#define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ - /** State needed for a compacting copy. */ + /** State needed for a double-buffering compacting copy. */ typedef struct mdb_copy { pthread_mutex_t mc_mutex; - pthread_cond_t mc_cond; + pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ char *mc_wbuf[2]; char *mc_over[2]; MDB_env *mc_env; @@ -9055,10 +9056,9 @@ typedef struct mdb_copy { int mc_olen[2]; pgno_t mc_next_pgno; HANDLE mc_fd; - int mc_status; - volatile int mc_new; - int mc_toggle; - + int mc_toggle; /**< Buffer number in provider */ + int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ + volatile int mc_error; /**< Error code, never cleared if set */ } mdb_copy; /** Dedicated writer thread for compacting copy. */ @@ -9071,20 +9071,16 @@ mdb_env_copythr(void *arg) int len; pthread_mutex_lock(&my->mc_mutex); - my->mc_new = 0; - pthread_cond_signal(&my->mc_cond); for(;;) { while (!my->mc_new) pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - if (my->mc_new < 0) { - my->mc_new = 0; + if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ break; - } - my->mc_new = 0; wsize = my->mc_wlen[toggle]; ptr = my->mc_wbuf[toggle]; again: - while (wsize > 0) { + rc = MDB_SUCCESS; + while (wsize > 0 && !my->mc_error) { len = write(my->mc_fd, ptr, wsize); if (len < 0) { rc = errno; @@ -9100,8 +9096,7 @@ again: } } if (rc) { - my->mc_status = rc; - break; + my->mc_error = rc; } /* If there's an overflow page tail, write it too */ if (my->mc_olen[toggle]) { @@ -9112,30 +9107,33 @@ again: } my->mc_wlen[toggle] = 0; toggle ^= 1; + /* Return the empty buffer to provider */ + my->mc_new--; pthread_cond_signal(&my->mc_cond); } - pthread_cond_signal(&my->mc_cond); pthread_mutex_unlock(&my->mc_mutex); - return (void*)0; + return NULL; } - /** Tell the writer thread there's a buffer ready to write */ + /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. + * + * @param[in] my control structure. + * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). + */ static int __cold -mdb_env_cthr_toggle(mdb_copy *my, int st) +mdb_env_cthr_toggle(mdb_copy *my, int adjust) { - int toggle = my->mc_toggle ^ 1; pthread_mutex_lock(&my->mc_mutex); - if (my->mc_status) { - pthread_mutex_unlock(&my->mc_mutex); - return my->mc_status; - } - while (my->mc_new == 1) - pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - my->mc_new = st; - my->mc_toggle = toggle; + my->mc_new += adjust; pthread_cond_signal(&my->mc_cond); + while (my->mc_new & 2) /* both buffers in use */ + pthread_cond_wait(&my->mc_cond, &my->mc_mutex); pthread_mutex_unlock(&my->mc_mutex); - return 0; + + my->mc_toggle ^= (adjust & 1); + /* Both threads reset mc_wlen, to be safe from threading errors */ + my->mc_wlen[my->mc_toggle] = 0; + return my->mc_error; } /** Depth-first tree traversal for compacting copy. */ @@ -9202,6 +9200,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) } memcpy(&pg, NODEDATA(ni), sizeof(pg)); + memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); rc = mdb_page_get(txn, pg, &omp, NULL); if (rc) goto done; @@ -9224,7 +9223,6 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) goto done; toggle = my->mc_toggle; } - memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t)); } else if (ni->mn_flags & F_SUBDATA) { MDB_db db; @@ -9302,34 +9300,33 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) { MDB_meta *mm; MDB_page *mp; - mdb_copy my; + mdb_copy my = {0}; MDB_txn *txn = NULL; pthread_t thr; - int rc; + pgno_t root, new_root; + int rc = MDB_SUCCESS; - pthread_mutex_init(&my.mc_mutex, NULL); - pthread_cond_init(&my.mc_cond, NULL); - rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2); - if (rc) + if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) return rc; + if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) + goto done2; + my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); + if (my.mc_wbuf[0] == NULL) { + rc = errno; + goto done; + } memset(my.mc_wbuf[0], 0, MDB_WBUF*2); my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; - my.mc_wlen[0] = 0; - my.mc_wlen[1] = 0; - my.mc_olen[0] = 0; - my.mc_olen[1] = 0; my.mc_next_pgno = NUM_METAS; - my.mc_status = 0; - my.mc_new = 1; - my.mc_toggle = 0; my.mc_env = env; my.mc_fd = fd; rc = pthread_create(&thr, NULL, mdb_env_copythr, &my); - assert(rc == 0); + if (rc) + goto done; rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); if (rc) - return rc; + goto finish; mp = (MDB_page *)my.mc_wbuf[0]; memset(mp, 0, NUM_METAS * env->me_psize); @@ -9345,51 +9342,58 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) *(MDB_meta *)PAGEDATA(mp) = *mm; mm = (MDB_meta *)PAGEDATA(mp); - /* Count the number of free pages, subtract from lastpg to find - * number of active pages - */ - { + /* Set metapage 1 with current main DB */ + root = new_root = txn->mt_dbs[MAIN_DBI].md_root; + if (root != P_INVALID) { + /* Count free pages + freeDB pages. Subtract from last_pg + * to find the new last_pg, which also becomes the new root. + */ MDB_ID freecount = 0; MDB_cursor mc; MDB_val key, data; mdb_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) freecount += *(MDB_ID *)data.mv_data; + if (rc != MDB_NOTFOUND) + goto finish; freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + txn->mt_dbs[FREE_DBI].md_leaf_pages + txn->mt_dbs[FREE_DBI].md_overflow_pages; - /* Set metapage 1 */ - mm->mm_last_pg = txn->mt_next_pgno - freecount - 1; + new_root = txn->mt_next_pgno - 1 - freecount; + mm->mm_last_pg = new_root; mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - if (mm->mm_last_pg > NUM_METAS-1) { - mm->mm_dbs[MAIN_DBI].md_root = mm->mm_last_pg; - mm->mm_txnid = 1; - } else { - mm->mm_dbs[MAIN_DBI].md_root = P_INVALID; - } + mm->mm_dbs[MAIN_DBI].md_root = new_root; + } else { + /* When the DB is empty, handle it specially to + * fix any breakage like page leaks from ITS#8174. + */ + mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; } + if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { + mm->mm_txnid = 1; /* use metapage 1 */ + } + my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_txn = txn; - pthread_mutex_lock(&my.mc_mutex); - while(my.mc_new) - pthread_cond_wait(&my.mc_cond, &my.mc_mutex); - pthread_mutex_unlock(&my.mc_mutex); - rc = mdb_env_cwalk(&my, &txn->mt_dbs[MAIN_DBI].md_root, 0); - if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle]) - rc = mdb_env_cthr_toggle(&my, 1); - mdb_env_cthr_toggle(&my, -1); - pthread_mutex_lock(&my.mc_mutex); - while(my.mc_new) - pthread_cond_wait(&my.mc_cond, &my.mc_mutex); - pthread_mutex_unlock(&my.mc_mutex); - pthread_join(thr, NULL); + rc = mdb_env_cwalk(&my, &root, 0); + if (rc == MDB_SUCCESS && root != new_root) { + rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ + } +finish: + if (rc) + my.mc_error = rc; + mdb_env_cthr_toggle(&my, 1 | MDB_EOF); + rc = pthread_join(thr, NULL); mdb_txn_abort(txn); - pthread_cond_destroy(&my.mc_cond); - pthread_mutex_destroy(&my.mc_mutex); + +done: free(my.mc_wbuf[0]); - return rc; + pthread_cond_destroy(&my.mc_cond); +done2: + pthread_mutex_destroy(&my.mc_mutex); + return rc ? rc : my.mc_error; } /** Copy environment as-is. */ diff --git a/mdb_copy.1 b/mdb_copy.1 index 6f566aab..2cadb3ac 100644 --- a/mdb_copy.1 +++ b/mdb_copy.1 @@ -56,6 +56,7 @@ Write the library version number to the standard output, and exit. Compact while copying. Only current data pages will be copied; freed or unused pages will be omitted from the copy. This option will slow down the backup process as it is more CPU-intensive. +Currently it fails if the environment has suffered a page leak. .TP .BR \-n Open LDMB environment(s) which do not use subdirectories.