mdbx: rework lck/body setup.

This commit is contained in:
Leo Yuriev 2017-04-24 15:51:21 +03:00
parent 19d877635c
commit 0d59cd4fe2
2 changed files with 127 additions and 129 deletions

View File

@ -655,7 +655,7 @@ static int mdbx_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, static int mdbx_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
pgno_t newpgno, unsigned nflags); pgno_t newpgno, unsigned nflags);
static int mdbx_env_read_header(MDB_env *env, MDB_meta *meta); static int mdbx_read_header(MDB_env *env, MDB_meta *meta);
static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending); static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending);
static void mdbx_env_close0(MDB_env *env); static void mdbx_env_close0(MDB_env *env);
@ -3264,7 +3264,7 @@ fail:
* @param[in] env the environment handle * @param[in] env the environment handle
* @param[out] meta address of where to store the meta information * @param[out] meta address of where to store the meta information
* @return 0 on success, non-zero on failure. */ * @return 0 on success, non-zero on failure. */
static int __cold mdbx_env_read_header(MDB_env *env, MDB_meta *meta) { static int __cold mdbx_read_header(MDB_env *env, MDB_meta *meta) {
MDB_metabuf pbuf; MDB_metabuf pbuf;
MDB_page *p; MDB_page *p;
MDB_meta *m; MDB_meta *m;
@ -3612,16 +3612,14 @@ static int __cold mdbx_env_map(MDB_env *env, void *addr, size_t usedsize) {
#endif #endif
#ifdef MADV_DONTDUMP #ifdef MADV_DONTDUMP
if (!(flags & MDBX_PAGEPERTURB)) { if (!(flags & MDBX_PAGEPERTURB))
(void)madvise(env->me_map, env->me_mapsize, MADV_DONTDUMP); (void)madvise(env->me_map, env->me_mapsize, MADV_DONTDUMP);
}
#endif #endif
#ifdef MADV_REMOVE #ifdef MADV_REMOVE
if (flags & MDB_WRITEMAP) { if (flags & MDB_WRITEMAP)
(void)madvise(env->me_map + usedsize, env->me_mapsize - usedsize, (void)madvise(env->me_map + usedsize, env->me_mapsize - usedsize,
MADV_REMOVE); MADV_REMOVE);
}
#else #else
(void)usedsize; (void)usedsize;
#endif #endif
@ -3739,14 +3737,17 @@ int __cold mdbx_env_get_maxreaders(MDB_env *env, unsigned *readers) {
} }
/* Further setup required for opening an LMDB environment */ /* Further setup required for opening an LMDB environment */
static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) { static int __cold mdbx_setup_body(MDB_env *env, MDB_meta *meta, int lck_rc) {
int newenv = 0; int rc = MDBX_RESULT_FALSE;
int rc = mdbx_env_read_header(env, meta); int err = mdbx_read_header(env, meta);
if (unlikely(rc != MDB_SUCCESS)) { if (unlikely(err != MDB_SUCCESS)) {
if (rc != ENOENT) if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA ||
return rc; (env->me_flags & MDB_RDONLY))
mdbx_debug("new mdbenv"); return err;
newenv = 1;
mdbx_debug("create new database");
rc = /* new database */ MDBX_RESULT_TRUE;
env->me_psize = env->me_os_psize; env->me_psize = env->me_os_psize;
if (env->me_psize > MAX_PAGESIZE) if (env->me_psize > MAX_PAGESIZE)
env->me_psize = MAX_PAGESIZE; env->me_psize = MAX_PAGESIZE;
@ -3771,84 +3772,77 @@ static int __cold mdbx_env_open2(MDB_env *env, MDB_meta *meta) {
meta->mm_mapsize = env->me_mapsize; meta->mm_mapsize = env->me_mapsize;
} }
if (newenv) { if (rc == MDBX_RESULT_TRUE) {
/* mdbx_env_map() may grow the datafile. Write the metapages /* mdbx_env_map() may grow the datafile. Write the metapages
* first, so the file will be valid if initialization fails. */ * first, so the file will be valid if initialization fails. */
rc = mdbx_env_init_meta(env, meta); err = mdbx_env_init_meta(env, meta);
if (unlikely(rc != MDB_SUCCESS)) if (unlikely(err != MDB_SUCCESS))
return rc; return err;
rc = mdbx_ftruncate(env->me_fd, env->me_mapsize); err = mdbx_ftruncate(env->me_fd, env->me_mapsize);
if (unlikely(rc != MDB_SUCCESS)) if (unlikely(err != MDB_SUCCESS))
return rc; return err;
} }
const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize; const size_t usedsize = (meta->mm_last_pg + 1) * env->me_psize;
rc = mdbx_env_map(env, NULL, usedsize); err = mdbx_env_map(env, NULL, usedsize);
if (rc) if (err)
return rc; return err;
mdbx_env_setup_limits(env, env->me_psize); mdbx_env_setup_limits(env, env->me_psize);
return MDB_SUCCESS; return rc;
} }
/****************************************************************************/ /****************************************************************************/
/* Open and/or initialize the lock region for the environment. */ /* Open and/or initialize the lock region for the environment. */
static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode, static int __cold mdbx_setup_locks(MDB_env *env, char *lck_pathname, int mode) {
int *excl) {
off_t size; off_t size;
assert(env->me_fd != INVALID_HANDLE_VALUE); assert(env->me_fd != INVALID_HANDLE_VALUE);
assert(env->me_lfd == INVALID_HANDLE_VALUE); assert(env->me_lfd == INVALID_HANDLE_VALUE);
int rc = mdbx_openfile(lpath, O_RDWR | O_CREAT, mode, &env->me_lfd); int err = mdbx_openfile(lck_pathname, O_RDWR | O_CREAT, mode, &env->me_lfd);
if (rc != MDB_SUCCESS) { if (err != MDB_SUCCESS) {
if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { if (err != EROFS || (env->me_flags & MDB_RDONLY) == 0)
env->me_lfd = INVALID_HANDLE_VALUE; return err;
rc = MDB_SUCCESS; /* LY: without-lck mode (e.g. on read-only filesystem) */
} else { env->me_lfd = INVALID_HANDLE_VALUE;
return rc;
}
} }
/* Try to get exclusive lock. If we succeed, then /* Try to get exclusive lock. If we succeed, then
* nobody is using the lock region and we should initialize it. */ * nobody is using the lock region and we should initialize it. */
rc = mdbx_lck_seize(env); const int rc = mdbx_lck_seize(env);
if (rc == MDBX_RESULT_TRUE) if (MDBX_IS_ERROR(rc))
*excl = true;
else if (rc == MDBX_RESULT_FALSE)
*excl = false;
else
return rc; return rc;
rc = mdbx_filesize(env->me_lfd, &size); err = mdbx_filesize(env->me_lfd, &size);
if (unlikely(rc != MDB_SUCCESS)) if (unlikely(err != MDB_SUCCESS))
return rc; return err;
if (*excl > 0) { if (rc == MDBX_RESULT_TRUE) {
off_t wanna = roundup2((env->me_maxreaders - 1) * sizeof(MDB_reader) + off_t wanna = roundup2((env->me_maxreaders - 1) * sizeof(MDB_reader) +
sizeof(MDBX_lockinfo), sizeof(MDBX_lockinfo),
env->me_os_psize); env->me_os_psize);
if (size != wanna) { if (size != wanna) {
rc = mdbx_ftruncate(env->me_lfd, wanna); err = mdbx_ftruncate(env->me_lfd, wanna);
if (unlikely(rc != MDB_SUCCESS)) if (unlikely(err != MDB_SUCCESS))
return rc; return err;
size = wanna; size = wanna;
} }
} }
env->me_maxreaders = (size - sizeof(MDBX_lockinfo)) / sizeof(MDB_reader) + 1; env->me_maxreaders = (size - sizeof(MDBX_lockinfo)) / sizeof(MDB_reader) + 1;
void *addr = NULL; void *addr = NULL;
rc = mdbx_mmap(&addr, size, true, env->me_lfd); err = mdbx_mmap(&addr, size, true, env->me_lfd);
if (unlikely(rc != MDB_SUCCESS)) if (unlikely(err != MDB_SUCCESS))
return rc; return err;
env->me_txns = addr; env->me_txns = addr;
if (!(env->me_flags & MDB_NOTLS)) { if (!(env->me_flags & MDB_NOTLS)) {
rc = mdbx_rthc_alloc(&env->me_txkey, &env->me_txns->mti_readers[0], err = mdbx_rthc_alloc(&env->me_txkey, &env->me_txns->mti_readers[0],
&env->me_txns->mti_readers[env->me_maxreaders]); &env->me_txns->mti_readers[env->me_maxreaders]);
if (unlikely(rc != MDB_SUCCESS)) if (unlikely(err != MDB_SUCCESS))
return rc; return err;
env->me_flags |= MDB_ENV_TXKEY; env->me_flags |= MDB_ENV_TXKEY;
} }
@ -3875,11 +3869,12 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode,
return errno; return errno;
#endif #endif
if (*excl > 0) { if (rc == MDBX_RESULT_TRUE) {
/* LY: exclusive mode, init lck */
memset(env->me_txns, 0, sizeof(MDBX_lockinfo)); memset(env->me_txns, 0, sizeof(MDBX_lockinfo));
rc = mdbx_lck_init(env); err = mdbx_lck_init(env);
if (rc) if (err)
return rc; return err;
env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_magic = MDB_MAGIC;
env->me_txns->mti_format = MDB_LOCK_FORMAT; env->me_txns->mti_format = MDB_LOCK_FORMAT;
@ -3896,7 +3891,7 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode,
} }
} }
return MDB_SUCCESS; return rc;
} }
/** The name of the lock file in the DB environment */ /** The name of the lock file in the DB environment */
@ -3922,8 +3917,8 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode,
int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags,
mode_t mode, int *exclusive) { mode_t mode, int *exclusive) {
int oflags, rc, len, excl = -1; int oflags, rc, len;
char *lpath, *dpath; char *lck_pathname, *dxb_pathname;
if (unlikely(!env || !path)) if (unlikely(!env || !path))
return MDBX_EINVAL; return MDBX_EINVAL;
@ -3941,18 +3936,18 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags,
} else { } else {
rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME);
} }
lpath = malloc(rc); lck_pathname = malloc(rc);
if (!lpath) if (!lck_pathname)
return ENOMEM; return MDBX_ENOMEM;
if (flags & MDB_NOSUBDIR) { if (flags & MDB_NOSUBDIR) {
dpath = lpath + len + sizeof(LOCKSUFF); dxb_pathname = lck_pathname + len + sizeof(LOCKSUFF);
sprintf(lpath, "%s" LOCKSUFF, path); sprintf(lck_pathname, "%s" LOCKSUFF, path);
strcpy(dpath, path); strcpy(dxb_pathname, path);
} else { } else {
dpath = lpath + len + sizeof(LOCKNAME); dxb_pathname = lck_pathname + len + sizeof(LOCKNAME);
sprintf(lpath, "%s" LOCKNAME, path); sprintf(lck_pathname, "%s" LOCKNAME, path);
sprintf(dpath, "%s" DATANAME, path); sprintf(dxb_pathname, "%s" DATANAME, path);
} }
rc = MDB_SUCCESS; rc = MDB_SUCCESS;
@ -3986,59 +3981,62 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags,
else else
oflags = O_RDWR | O_CREAT; oflags = O_RDWR | O_CREAT;
rc = mdbx_openfile(dpath, oflags, mode, &env->me_fd); rc = mdbx_openfile(dxb_pathname, oflags, mode, &env->me_fd);
if (rc != MDB_SUCCESS) if (rc != MDB_SUCCESS)
goto bailout; goto bailout;
rc = mdbx_env_setup_locks(env, lpath, mode, &excl); const int lck_rc = mdbx_setup_locks(env, lck_pathname, mode);
if (rc) if (MDBX_IS_ERROR(lck_rc)) {
rc = lck_rc;
goto bailout; goto bailout;
}
MDB_meta meta; MDB_meta meta;
rc = mdbx_env_open2(env, &meta); const int dxb_rc = mdbx_setup_body(env, &meta, lck_rc);
if (rc == MDB_SUCCESS) { if (MDBX_IS_ERROR(dxb_rc)) {
mdbx_debug("opened dbenv %p", (void *)env); rc = dxb_rc;
if (excl > 0) { goto bailout;
env->me_txns->mti_envmode = env->me_flags; }
if (exclusive == NULL || *exclusive < 2) {
/* LY: downgrade lock only if exclusive access not requested. mdbx_debug("opened dbenv %p", (void *)env);
* in case exclusive==1, just leave value as is. */ if (lck_rc == MDBX_RESULT_TRUE) {
rc = mdbx_lck_downgrade(env); env->me_txns->mti_envmode = env->me_flags;
if (rc != MDB_SUCCESS) if (exclusive == NULL || *exclusive < 2) {
goto bailout; /* LY: downgrade lock only if exclusive access not requested.
excl = 0; * in case exclusive==1, just leave value as is. */
} rc = mdbx_lck_downgrade(env);
} else { if (rc != MDB_SUCCESS)
if (exclusive) {
/* LY: just indicate that is not an exclusive access. */
*exclusive = 0;
}
if ((env->me_txns->mti_envmode ^ env->me_flags) &
(MDB_WRITEMAP | MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC)) {
/* LY: Current mode/flags incompatible with requested. */
rc = MDB_INCOMPATIBLE;
goto bailout; goto bailout;
}
} }
if (!(flags & MDB_RDONLY)) { } else {
MDB_txn *txn; if (exclusive) {
int tsize = sizeof(MDB_txn), /* LY: just indicate that is not an exclusive access. */
size = tsize + *exclusive = 0;
env->me_maxdbs * (sizeof(MDB_db) + sizeof(MDB_cursor *) + }
sizeof(unsigned) + 1); if ((env->me_txns->mti_envmode ^ env->me_flags) &
if ((env->me_pbuf = calloc(1, env->me_psize)) && (MDB_WRITEMAP | MDB_NOSYNC | MDB_NOMETASYNC | MDB_MAPASYNC)) {
(txn = calloc(1, size))) { /* LY: Current mode/flags incompatible with requested. */
txn->mt_dbs = (MDB_db *)((char *)txn + tsize); rc = MDB_INCOMPATIBLE;
txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs); goto bailout;
txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); }
txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); }
txn->mt_env = env; if (!(flags & MDB_RDONLY)) {
txn->mt_dbxs = env->me_dbxs; MDB_txn *txn;
txn->mt_flags = MDB_TXN_FINISHED; int tsize = sizeof(MDB_txn),
env->me_txn0 = txn; size = tsize +
} else { env->me_maxdbs * (sizeof(MDB_db) + sizeof(MDB_cursor *) +
rc = ENOMEM; sizeof(unsigned) + 1);
} if ((env->me_pbuf = calloc(1, env->me_psize)) && (txn = calloc(1, size))) {
txn->mt_dbs = (MDB_db *)((char *)txn + tsize);
txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs);
txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
txn->mt_env = env;
txn->mt_dbxs = env->me_dbxs;
txn->mt_flags = MDB_TXN_FINISHED;
env->me_txn0 = txn;
} else {
rc = MDBX_ENOMEM;
} }
} }
@ -4063,7 +4061,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags,
bailout: bailout:
if (rc) if (rc)
mdbx_env_close0(env); mdbx_env_close0(env);
free(lpath); free(lck_pathname);
return rc; return rc;
} }
@ -8464,24 +8462,24 @@ int __cold mdbx_env_copyfd(MDB_env *env, mdbx_filehandle_t fd) {
int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) { int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) {
int rc, len; int rc, len;
char *lpath; char *lck_pathname;
mdbx_filehandle_t newfd = INVALID_HANDLE_VALUE; mdbx_filehandle_t newfd = INVALID_HANDLE_VALUE;
if (env->me_flags & MDB_NOSUBDIR) { if (env->me_flags & MDB_NOSUBDIR) {
lpath = (char *)path; lck_pathname = (char *)path;
} else { } else {
len = strlen(path); len = strlen(path);
len += sizeof(DATANAME); len += sizeof(DATANAME);
lpath = malloc(len); lck_pathname = malloc(len);
if (!lpath) if (!lck_pathname)
return ENOMEM; return MDBX_ENOMEM;
sprintf(lpath, "%s" DATANAME, path); sprintf(lck_pathname, "%s" DATANAME, path);
} }
/* The destination path must exist, but the destination file must not. /* The destination path must exist, but the destination file must not.
* We don't want the OS to cache the writes, since the source data is * We don't want the OS to cache the writes, since the source data is
* already in the OS cache. */ * already in the OS cache. */
rc = mdbx_openfile(lpath, O_WRONLY | O_CREAT | O_EXCL, 0666, &newfd); rc = mdbx_openfile(lck_pathname, O_WRONLY | O_CREAT | O_EXCL, 0666, &newfd);
if (rc == MDB_SUCCESS) { if (rc == MDB_SUCCESS) {
if (env->me_psize >= env->me_os_psize) { if (env->me_psize >= env->me_os_psize) {
#ifdef F_NOCACHE /* __APPLE__ */ #ifdef F_NOCACHE /* __APPLE__ */
@ -8496,7 +8494,7 @@ int __cold mdbx_env_copy2(MDB_env *env, const char *path, unsigned flags) {
} }
if (!(env->me_flags & MDB_NOSUBDIR)) if (!(env->me_flags & MDB_NOSUBDIR))
free(lpath); free(lck_pathname);
if (newfd != INVALID_HANDLE_VALUE) { if (newfd != INVALID_HANDLE_VALUE) {
int err = mdbx_closefile(newfd); int err = mdbx_closefile(newfd);

View File

@ -333,12 +333,12 @@ int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, off_t offset) {
ov.Offset = (DWORD)offset; ov.Offset = (DWORD)offset;
ov.OffsetHigh = HIGH_DWORD(offset); ov.OffsetHigh = HIGH_DWORD(offset);
DWORD read; DWORD read = 0;
if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) { if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) {
int rc = GetLastError(); int rc = GetLastError();
if (rc == ERROR_HANDLE_EOF && read == 0 && offset == 0) if (rc == ERROR_HANDLE_EOF)
return MDBX_ENODATA; return (read == 0 && offset == 0) ? MDBX_ENODATA : ERROR_READ_FAULT;
return rc; return (rc == MDB_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc;
} }
return (read == bytes) ? MDB_SUCCESS : ERROR_READ_FAULT; return (read == bytes) ? MDB_SUCCESS : ERROR_READ_FAULT;
#else #else