Merge remote-tracking branch 'origin/mdb.master'

This commit is contained in:
Howard Chu 2012-10-22 17:05:26 -07:00
commit f51efd4b03
7 changed files with 553 additions and 172 deletions

View file

@ -1,6 +1,7 @@
mtest
mtest[23456]
testdb
mdb_copy
mdb_stat
*.[ao]
*.so

View file

@ -5,7 +5,7 @@ CFLAGS = -pthread $(OPT) $(W) $(XCFLAGS)
LDLIBS =
SOLIBS =
PROGS = mdb_stat mtest mtest2 mtest3 mtest4 mtest5
PROGS = mdb_stat mdb_copy mtest mtest2 mtest3 mtest4 mtest5
all: libmdb.a libmdb.so $(PROGS)
clean:
@ -22,6 +22,7 @@ libmdb.so: mdb.o midl.o
gcc -pthread -shared -o $@ mdb.o midl.o $(SOLIBS)
mdb_stat: mdb_stat.o libmdb.a
mdb_copy: mdb_copy.o libmdb.a
mtest: mtest.o libmdb.a
mtest2: mtest2.o libmdb.a
mtest3: mtest3.o libmdb.a

View file

@ -32,6 +32,7 @@
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#define _GNU_SOURCE 1
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/param.h>
@ -140,10 +141,11 @@
#define pthread_mutex_t HANDLE
#define pthread_key_t DWORD
#define pthread_self() GetCurrentThreadId()
#define pthread_key_create(x,y) (*(x) = TlsAlloc())
#define pthread_key_create(x,y) \
((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0)
#define pthread_key_delete(x) TlsFree(x)
#define pthread_getspecific(x) TlsGetValue(x)
#define pthread_setspecific(x,y) TlsSetValue(x,y)
#define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode())
#define pthread_mutex_unlock(x) ReleaseMutex(x)
#define pthread_mutex_lock(x) WaitForSingleObject(x, INFINITE)
#define LOCK_MUTEX_R(env) pthread_mutex_lock((env)->me_rmutex)
@ -824,6 +826,7 @@ struct MDB_txn {
*/
#define MDB_TXN_RDONLY 0x01 /**< read-only transaction */
#define MDB_TXN_ERROR 0x02 /**< an error has occurred */
#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */
/** @} */
unsigned int mt_flags; /**< @ref mdb_txn */
/** Tracks which of the two meta pages was used at the start
@ -912,6 +915,10 @@ struct MDB_env {
HANDLE me_mfd; /**< just for writing the meta pages */
/** Failed to update the meta page. Probably an I/O error. */
#define MDB_FATAL_ERROR 0x80000000U
/** Read-only Filesystem. Allow read access, no locking. */
#define MDB_ROFS 0x40000000U
/** Some fields are initialized. */
#define MDB_ENV_ACTIVE 0x20000000U
uint32_t me_flags; /**< @ref mdb_env */
unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */
unsigned int me_maxreaders; /**< size of the reader table */
@ -1631,39 +1638,49 @@ mdb_txn_renew0(MDB_txn *txn)
{
MDB_env *env = txn->mt_env;
unsigned int i;
int rc;
/* Setup db info */
txn->mt_numdbs = env->me_numdbs;
txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
if (txn->mt_flags & MDB_TXN_RDONLY) {
MDB_reader *r = pthread_getspecific(env->me_txkey);
if (!r) {
pid_t pid = env->me_pid;
pthread_t tid = pthread_self();
if (env->me_flags & MDB_ROFS) {
i = mdb_env_pick_meta(env);
txn->mt_txnid = env->me_metas[i]->mm_txnid;
txn->mt_u.reader = NULL;
} else {
MDB_reader *r = pthread_getspecific(env->me_txkey);
if (!r) {
pid_t pid = env->me_pid;
pthread_t tid = pthread_self();
LOCK_MUTEX_R(env);
for (i=0; i<env->me_txns->mti_numreaders; i++)
if (env->me_txns->mti_readers[i].mr_pid == 0)
break;
if (i == env->me_maxreaders) {
LOCK_MUTEX_R(env);
for (i=0; i<env->me_txns->mti_numreaders; i++)
if (env->me_txns->mti_readers[i].mr_pid == 0)
break;
if (i == env->me_maxreaders) {
UNLOCK_MUTEX_R(env);
return MDB_READERS_FULL;
}
env->me_txns->mti_readers[i].mr_pid = pid;
env->me_txns->mti_readers[i].mr_tid = tid;
if (i >= env->me_txns->mti_numreaders)
env->me_txns->mti_numreaders = i+1;
/* Save numreaders for un-mutexed mdb_env_close() */
env->me_numreaders = env->me_txns->mti_numreaders;
UNLOCK_MUTEX_R(env);
return MDB_READERS_FULL;
r = &env->me_txns->mti_readers[i];
if ((rc = pthread_setspecific(env->me_txkey, r)) != 0) {
env->me_txns->mti_readers[i].mr_pid = 0;
return rc;
}
}
env->me_txns->mti_readers[i].mr_pid = pid;
env->me_txns->mti_readers[i].mr_tid = tid;
if (i >= env->me_txns->mti_numreaders)
env->me_txns->mti_numreaders = i+1;
/* Save numreaders for un-mutexed mdb_env_close() */
env->me_numreaders = env->me_txns->mti_numreaders;
UNLOCK_MUTEX_R(env);
r = &env->me_txns->mti_readers[i];
pthread_setspecific(env->me_txkey, r);
txn->mt_txnid = r->mr_txnid = env->me_txns->mti_txnid;
txn->mt_u.reader = r;
}
txn->mt_txnid = r->mr_txnid = env->me_txns->mti_txnid;
txn->mt_toggle = txn->mt_txnid & 1;
txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1;
txn->mt_u.reader = r;
} else {
LOCK_MUTEX_W(env);
@ -1803,7 +1820,8 @@ mdb_txn_reset0(MDB_txn *txn)
MDB_env *env = txn->mt_env;
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
txn->mt_u.reader->mr_txnid = (txnid_t)-1;
if (!(env->me_flags & MDB_ROFS))
txn->mt_u.reader->mr_txnid = (txnid_t)-1;
} else {
MDB_oldpages *mop;
MDB_page *dp;
@ -1932,15 +1950,15 @@ mdb_txn_commit(MDB_txn *txn)
return EINVAL;
}
/* Merge (and close) our cursors with parent's */
mdb_cursor_merge(txn);
if (txn->mt_parent) {
MDB_db *ip, *jp;
MDB_dbi i;
unsigned x, y;
MDB_ID2L dst, src;
/* Merge (and close) our cursors with parent's */
mdb_cursor_merge(txn);
/* Update parent's DB table */
ip = &txn->mt_parent->mt_dbs[2];
jp = &txn->mt_dbs[2];
@ -1988,7 +2006,7 @@ mdb_txn_commit(MDB_txn *txn)
return EINVAL;
}
if (!txn->mt_u.dirty_list[0].mid)
if (!txn->mt_u.dirty_list[0].mid && !(txn->mt_flags & MDB_TXN_DIRTY))
goto done;
DPRINTF("committing txn %zu %p on mdbenv %p, root page %zu",
@ -2123,7 +2141,7 @@ again:
while (env->me_pgfree) {
MDB_oldpages *mop = env->me_pgfree;
env->me_pgfree = mop->mo_next;
free(mop);;
free(mop);
}
/* Check for growth of freelist again */
@ -2400,7 +2418,7 @@ static int
mdb_env_write_meta(MDB_txn *txn)
{
MDB_env *env;
MDB_meta meta, metab;
MDB_meta meta, metab, *mp;
off_t off;
int rc, len, toggle;
char *ptr;
@ -2416,9 +2434,12 @@ mdb_env_write_meta(MDB_txn *txn)
toggle, txn->mt_dbs[MAIN_DBI].md_root);
env = txn->mt_env;
mp = env->me_metas[toggle];
if (env->me_flags & MDB_WRITEMAP) {
MDB_meta *mp = env->me_metas[toggle];
/* Persist any increases of mapsize config */
if (env->me_mapsize > mp->mm_mapsize)
mp->mm_mapsize = env->me_mapsize;
mp->mm_dbs[0] = txn->mt_dbs[0];
mp->mm_dbs[1] = txn->mt_dbs[1];
mp->mm_last_pg = txn->mt_next_pgno - 1;
@ -2439,7 +2460,13 @@ mdb_env_write_meta(MDB_txn *txn)
metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;
ptr = (char *)&meta;
off = offsetof(MDB_meta, mm_dbs[0].md_depth);
if (env->me_mapsize > mp->mm_mapsize) {
/* Persist any increases of mapsize config */
meta.mm_mapsize = env->me_mapsize;
off = offsetof(MDB_meta, mm_mapsize);
} else {
off = offsetof(MDB_meta, mm_dbs[0].md_depth);
}
len = sizeof(MDB_meta) - off;
ptr += off;
@ -2573,14 +2600,13 @@ mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
/** Further setup required for opening an MDB environment
*/
static int
mdb_env_open2(MDB_env *env, unsigned int flags)
mdb_env_open2(MDB_env *env)
{
unsigned int flags = env->me_flags;
int i, newenv = 0, prot;
MDB_meta meta;
MDB_page *p;
env->me_flags = flags;
memset(&meta, 0, sizeof(meta));
if ((i = mdb_env_read_header(env, &meta)) != 0) {
@ -2588,11 +2614,11 @@ mdb_env_open2(MDB_env *env, unsigned int flags)
return i;
DPUTS("new mdbenv");
newenv = 1;
meta.mm_mapsize = env->me_mapsize > DEFAULT_MAPSIZE ? env->me_mapsize : DEFAULT_MAPSIZE;
}
if (!env->me_mapsize) {
env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize;
}
if (env->me_mapsize < meta.mm_mapsize)
env->me_mapsize = meta.mm_mapsize;
#ifdef _WIN32
{
@ -2629,7 +2655,8 @@ mdb_env_open2(MDB_env *env, unsigned int flags)
prot = PROT_READ;
if (flags & MDB_WRITEMAP) {
prot |= PROT_WRITE;
ftruncate(env->me_fd, env->me_mapsize);
if (ftruncate(env->me_fd, env->me_mapsize) < 0)
return ErrCode();
}
env->me_map = mmap(meta.mm_address, env->me_mapsize, prot, i,
env->me_fd, 0);
@ -2640,7 +2667,6 @@ mdb_env_open2(MDB_env *env, unsigned int flags)
#endif
if (newenv) {
meta.mm_mapsize = env->me_mapsize;
if (flags & MDB_FIXEDMAP)
meta.mm_address = env->me_map;
i = mdb_env_init_meta(env, &meta);
@ -2759,9 +2785,12 @@ mdb_env_share_locks(MDB_env *env, int *excl)
* then release the existing exclusive lock.
*/
memset(&ov, 0, sizeof(ov));
LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov);
UnlockFile(env->me_lfd, 0, 0, 1, 0);
*excl = 0;
if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
rc = ErrCode();
} else {
UnlockFile(env->me_lfd, 0, 0, 1, 0);
*excl = 0;
}
}
#else
{
@ -2794,7 +2823,9 @@ mdb_env_excl_lock(MDB_env *env, int *excl)
} else {
OVERLAPPED ov;
memset(&ov, 0, sizeof(ov));
if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
*excl = 0;
} else {
rc = ErrCode();
}
}
@ -2805,7 +2836,9 @@ mdb_env_excl_lock(MDB_env *env, int *excl)
lock_info.l_whence = SEEK_SET;
lock_info.l_start = 0;
lock_info.l_len = 1;
if (!fcntl(env->me_lfd, F_SETLK, &lock_info)) {
while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
(rc = ErrCode()) == EINTR) ;
if (!rc) {
*excl = 1;
} else
# ifdef MDB_USE_POSIX_SEM
@ -2917,6 +2950,11 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
if ((env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE,
FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS,
FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) {
rc = ErrCode();
if (rc == ERROR_WRITE_PROTECT && (env->me_flags & MDB_RDONLY)) {
env->me_flags |= MDB_ROFS;
return MDB_SUCCESS;
}
goto fail_errno;
}
/* Try to get exclusive lock. If we succeed, then
@ -2929,15 +2967,27 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
#if !(O_CLOEXEC)
{
int fdflags;
if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1)
if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1) {
rc = ErrCode();
if (rc == EROFS && (env->me_flags & MDB_RDONLY)) {
env->me_flags |= MDB_ROFS;
return MDB_SUCCESS;
}
goto fail_errno;
}
/* Lose record locks when exec*() */
if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0)
fcntl(env->me_lfd, F_SETFD, fdflags);
}
#else /* O_CLOEXEC on Linux: Open file and set FD_CLOEXEC atomically */
if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT|O_CLOEXEC, mode)) == -1)
if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT|O_CLOEXEC, mode)) == -1) {
rc = ErrCode();
if (rc == EROFS && (env->me_flags & MDB_RDONLY)) {
env->me_flags |= MDB_ROFS;
return MDB_SUCCESS;
}
goto fail_errno;
}
#endif
/* Try to get exclusive lock. If we succeed, then
@ -2996,7 +3046,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
mdb_sec_inited = 1;
}
GetFileInformationByHandle(env->me_lfd, &stbuf);
if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno;
idbuf.volume = stbuf.dwVolumeSerialNumber;
idbuf.nhigh = stbuf.nFileIndexHigh;
idbuf.nlow = stbuf.nFileIndexLow;
@ -3124,14 +3174,16 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode)
sprintf(dpath, "%s" DATANAME, path);
}
flags |= env->me_flags;
/* silently ignore WRITEMAP if we're only getting read access */
if (F_ISSET(flags, MDB_RDONLY|MDB_WRITEMAP))
flags ^= MDB_WRITEMAP;
env->me_flags = flags |= MDB_ENV_ACTIVE;
rc = mdb_env_setup_locks(env, lpath, mode, &excl);
if (rc)
goto leave;
/* silently ignore WRITEMAP if we're only getting read access */
if (F_ISSET(flags, MDB_RDONLY) && F_ISSET(flags, MDB_WRITEMAP))
flags ^= MDB_WRITEMAP;
#ifdef _WIN32
if (F_ISSET(flags, MDB_RDONLY)) {
oflags = GENERIC_READ;
@ -3156,7 +3208,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode)
goto leave;
}
if ((rc = mdb_env_open2(env, flags)) == MDB_SUCCESS) {
if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) {
if (flags & (MDB_RDONLY|MDB_NOSYNC|MDB_NOMETASYNC|MDB_WRITEMAP)) {
env->me_mfd = env->me_fd;
} else {
@ -3174,7 +3226,9 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode)
}
}
DPRINTF("opened dbenv %p", (void *) env);
pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
if (rc)
goto leave;
env->me_numdbs = 2; /* this notes that me_txkey was set */
#ifdef _WIN32
/* Windows TLS callbacks need help finding their TLS info. */
@ -3211,7 +3265,7 @@ mdb_env_close0(MDB_env *env, int excl)
{
int i;
if (env->me_lfd == INVALID_HANDLE_VALUE) /* 1st field to get inited */
if (!(env->me_flags & MDB_ENV_ACTIVE))
return;
free(env->me_dbflags);
@ -3272,9 +3326,127 @@ mdb_env_close0(MDB_env *env, int excl)
#endif
munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
}
close(env->me_lfd);
if (env->me_lfd != INVALID_HANDLE_VALUE) {
#ifdef _WIN32
if (excl >= 0) {
/* Unlock the lockfile. Windows would have unlocked it
* after closing anyway, but not necessarily at once.
*/
UnlockFile(env->me_lfd, 0, 0, 1, 0);
}
#endif
close(env->me_lfd);
}
env->me_lfd = INVALID_HANDLE_VALUE; /* Mark env as reset */
env->me_flags &= ~MDB_ENV_ACTIVE;
}
int
mdb_env_copy(MDB_env *env, const char *path)
{
MDB_txn *txn = NULL;
int rc, len;
size_t wsize;
char *lpath, *ptr;
HANDLE newfd = INVALID_HANDLE_VALUE;
if (env->me_flags & MDB_NOSUBDIR) {
lpath = (char *)path;
} else {
len = strlen(path);
len += sizeof(DATANAME);
lpath = malloc(len);
if (!lpath)
return ENOMEM;
sprintf(lpath, "%s" DATANAME, path);
}
/* The destination path must exist, but the destination file must not.
* We don't want the OS to cache the writes, since the source data is
* already in the OS cache.
*/
#ifdef _WIN32
newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
#else
newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL
#ifdef O_DIRECT
|O_DIRECT
#endif
, 0666);
#endif
if (!(env->me_flags & MDB_NOSUBDIR))
free(lpath);
if (newfd == INVALID_HANDLE_VALUE) {
rc = ErrCode();
goto leave;
}
#ifdef F_NOCACHE /* __APPLE__ */
rc = fcntl(newfd, F_NOCACHE, 1);
if (rc) {
rc = ErrCode();
goto leave;
}
#endif
/* Do the lock/unlock of the reader mutex before starting the
* write txn. Otherwise other read txns could block writers.
*/
rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
if (rc)
goto leave;
if (!(env->me_flags & MDB_ROFS)) {
/* We must start the actual read txn after blocking writers */
mdb_txn_reset0(txn);
/* Temporarily block writers until we snapshot the meta pages */
LOCK_MUTEX_W(env);
rc = mdb_txn_renew0(txn);
if (rc) {
UNLOCK_MUTEX_W(env);
goto leave;
}
}
wsize = env->me_psize * 2;
#ifdef _WIN32
{
DWORD len;
rc = WriteFile(newfd, env->me_map, wsize, &len, NULL);
rc = (len == wsize) ? MDB_SUCCESS : ErrCode();
}
#else
rc = write(newfd, env->me_map, wsize);
rc = (rc == (int)wsize) ? MDB_SUCCESS : ErrCode();
#endif
if (! (env->me_flags & MDB_ROFS))
UNLOCK_MUTEX_W(env);
if (rc)
goto leave;
ptr = env->me_map + wsize;
wsize = txn->mt_next_pgno * env->me_psize - wsize;
#ifdef _WIN32
{
DWORD len;
rc = WriteFile(newfd, ptr, wsize, &len, NULL);
rc = (len == wsize) ? MDB_SUCCESS : ErrCode();
}
#else
rc = write(newfd, ptr, wsize);
rc = (rc == (int)wsize) ? MDB_SUCCESS : ErrCode();
#endif
mdb_txn_abort(txn);
leave:
if (newfd != INVALID_HANDLE_VALUE)
close(newfd);
return rc;
}
void
@ -3811,8 +3983,12 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right)
: (mc->mc_ki[mc->mc_top] == 0)) {
DPRINTF("no more keys left, moving to %s sibling",
move_right ? "right" : "left");
if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS)
if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) {
/* undo cursor_pop before returning */
mc->mc_top++;
mc->mc_snum++;
return rc;
}
} else {
if (move_right)
mc->mc_ki[mc->mc_top]++;
@ -3825,7 +4001,7 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right)
indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp)))
return rc;;
return rc;
mdb_cursor_push(mc, mp);
@ -4222,8 +4398,8 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
}
assert(IS_LEAF(mc->mc_pg[mc->mc_top]));
mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
mc->mc_flags |= C_INITIALIZED|C_EOF;
mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
}
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
@ -4354,9 +4530,10 @@ fetchm:
case MDB_PREV_NODUP:
if (!(mc->mc_flags & C_INITIALIZED) || (mc->mc_flags & C_EOF)) {
rc = mdb_cursor_last(mc, key, data);
mc->mc_flags &= ~C_EOF;
} else
rc = mdb_cursor_prev(mc, key, data, op);
mc->mc_flags |= C_INITIALIZED;
mc->mc_ki[mc->mc_top]++;
}
rc = mdb_cursor_prev(mc, key, data, op);
break;
case MDB_FIRST:
rc = mdb_cursor_first(mc, key, data);
@ -4404,7 +4581,9 @@ mdb_cursor_touch(MDB_cursor *mc)
if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
MDB_cursor mc2;
mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
MDB_xcursor mcx;
mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI,
mc->mc_txn->mt_dbs[MAIN_DBI].md_flags & MDB_DUPSORT ? &mcx : NULL);
rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY);
if (rc)
return rc;
@ -6480,6 +6659,24 @@ mdb_env_stat(MDB_env *env, MDB_stat *arg)
return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg);
}
/** Fill \b arg with a snapshot of the environment's settings and the
 * last page / transaction IDs of the meta page chosen by
 * mdb_env_pick_meta().
 * @return EINVAL if either argument is NULL, otherwise MDB_SUCCESS.
 */
int
mdb_env_info(MDB_env *env, MDB_envinfo *arg)
{
	const MDB_meta *meta;

	if (!env || !arg)
		return EINVAL;

	meta = env->me_metas[mdb_env_pick_meta(env)];

	/* The map address is only reported for fixed mappings */
	if (env->me_flags & MDB_FIXEDMAP)
		arg->me_mapaddr = env->me_map;
	else
		arg->me_mapaddr = 0;

	arg->me_mapsize = env->me_mapsize;
	arg->me_maxreaders = env->me_maxreaders;
	arg->me_numreaders = env->me_numreaders;
	arg->me_last_pgno = meta->mm_last_pg;
	arg->me_last_txnid = meta->mm_txnid;

	return MDB_SUCCESS;
}
/** Set the default comparison functions for a database.
* Called immediately after a database is opened to set the defaults.
* The user can then override them with #mdb_set_compare() or
@ -6519,8 +6716,13 @@ int mdb_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
/* main DB? */
if (!name) {
*dbi = MAIN_DBI;
if (flags & (MDB_DUPSORT|MDB_REVERSEKEY|MDB_INTEGERKEY))
txn->mt_dbs[MAIN_DBI].md_flags |= (flags & (MDB_DUPSORT|MDB_REVERSEKEY|MDB_INTEGERKEY));
if (flags & (MDB_DUPSORT|MDB_REVERSEKEY|MDB_INTEGERKEY)) {
/* make sure flag changes get committed */
if ((txn->mt_dbs[MAIN_DBI].md_flags | flags) != txn->mt_dbs[MAIN_DBI].md_flags) {
txn->mt_dbs[MAIN_DBI].md_flags |= (flags & (MDB_DUPSORT|MDB_REVERSEKEY|MDB_INTEGERKEY));
txn->mt_flags |= MDB_TXN_DIRTY;
}
}
mdb_default_cmp(txn, MAIN_DBI);
return MDB_SUCCESS;
}
@ -6545,7 +6747,7 @@ int mdb_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
}
/* If no free slot and max hit, fail */
if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs - 1)
if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs)
return MDB_DBS_FULL;
/* Find the DB info */

View file

@ -38,6 +38,63 @@
* corrupt the database. Of course if your application code is known to
* be bug-free (...) then this is not an issue.
*
* Troubleshooting the lock file, plus semaphores on BSD systems:
*
* - A broken lockfile can cause sync issues.
* Stale reader transactions left behind by an aborted program
* cause further writes to grow the database quickly, and
* stale locks can block further operation.
*
* Fix: Terminate all programs using the database, or make
* them close it. Next database user will reset the lockfile.
*
* - On BSD systems or others configured with MDB_USE_POSIX_SEM,
* startup can fail due to semaphores owned by another userid.
*
* Fix: Open and close the database as the user which owns the
* semaphores (likely last user) or as root, while no other
* process is using the database.
*
* Restrictions/caveats (in addition to those listed for some functions):
*
* - Only the database owner should normally use the database on
* BSD systems or when otherwise configured with MDB_USE_POSIX_SEM.
* Multiple users can cause startup to fail later, as noted above.
*
* - A thread can only use one transaction at a time, plus any child
* transactions. Each transaction belongs to one thread. See below.
*
* - Use an MDB_env* in the process which opened it, without fork()ing.
*
* - Do not have an MDB database open twice in the same process at
* the same time. Not even from a plain open() call - close()ing it
* breaks fcntl() advisory locking.
*
* - Avoid long-lived transactions. Read transactions prevent
* reuse of pages freed by newer write transactions, thus the
* database can grow quickly. Write transactions prevent
* other write transactions, since writes are serialized.
*
* ...when several processes can use a database concurrently:
*
* - Avoid suspending a process with active transactions. These
* would then be "long-lived" as above.
*
* - Avoid aborting a process with an active transaction.
* The transaction becomes "long-lived" as above until the lockfile
* is reset, since the process may not remove it from the lockfile.
*
* - If you do that anyway, close the environment once in a while,
* so the lockfile can get reset.
*
* - Do not use MDB databases on remote filesystems, even between
* processes on the same host. This breaks flock() on some OSes,
* possibly memory map sync, and certainly sync between programs
* on different hosts.
*
* - Opening a database can fail if another process is opening or
* closing it at exactly the same time.
*
* @author Howard Chu, Symas Corporation.
*
* @copyright Copyright 2011-2012 Howard Chu, Symas Corp. All rights reserved.
@ -301,6 +358,16 @@ typedef struct MDB_stat {
size_t ms_entries; /**< Number of data items */
} MDB_stat;
/** @brief Information about the environment, as filled in by #mdb_env_info() */
typedef struct MDB_envinfo {
void *me_mapaddr; /**< Address of the memory map if #MDB_FIXEDMAP is set, otherwise NULL */
size_t me_mapsize; /**< Size of the data memory map */
size_t me_last_pgno; /**< ID of the last used page */
size_t me_last_txnid; /**< ID of the last committed transaction */
unsigned int me_maxreaders; /**< Size of the reader table (maximum reader slots) */
unsigned int me_numreaders; /**< Number of reader table slots used, as last recorded by this environment handle */
} MDB_envinfo;
/** @brief Return the mdb library version information.
*
* @param[out] major if non-NULL, the library major version number is copied here
@ -344,6 +411,7 @@ int mdb_env_create(MDB_env **env);
* @param[in] flags Special options for this environment. This parameter
* must be set to 0 or by bitwise OR'ing together one or more of the
* values described here.
* Flags set by mdb_env_set_flags() are also used.
* <ul>
* <li>#MDB_FIXEDMAP
* use a fixed address for the mmap region. This flag must be specified
@ -393,6 +461,18 @@ int mdb_env_create(MDB_env **env);
*/
int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode);
/** @brief Copy an MDB environment to the specified path.
 *
 * This function may be used to make a backup of an existing environment.
 * @param[in] env An environment handle returned by #mdb_env_create(). It
 * must have already been opened successfully.
 * @param[in] path The directory in which the copy will reside. This
 * directory must already exist and be writable but must otherwise be
 * empty. If the environment was opened with #MDB_NOSUBDIR, this is
 * instead the name of the data file to create. In both cases the
 * copy fails if the destination data file already exists.
 * @return A non-zero error value on failure and 0 on success.
 */
int mdb_env_copy(MDB_env *env, const char *path);
/** @brief Return statistics about the MDB environment.
*
* @param[in] env An environment handle returned by #mdb_env_create()
@ -401,6 +481,14 @@ int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mod
*/
int mdb_env_stat(MDB_env *env, MDB_stat *stat);
/** @brief Return information about the MDB environment.
 *
 * @param[in] env An environment handle returned by #mdb_env_create()
 * @param[out] stat The address of an #MDB_envinfo structure
 * where the information will be copied
 * @return EINVAL if \b env or \b stat is NULL, otherwise 0 on success.
 */
int mdb_env_info(MDB_env *env, MDB_envinfo *stat);
/** @brief Flush the data buffers to disk.
*
* Data is always written to disk when #mdb_txn_commit() is called,
@ -432,7 +520,7 @@ void mdb_env_close(MDB_env *env);
/** @brief Set environment flags.
*
* This may be used to set some flags that weren't already set during
* This may be used to set some flags in addition to those from
* #mdb_env_open(), or to unset these flags.
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] flags The flags to change, bitwise OR'ed together

View file

@ -0,0 +1,43 @@
/* mdb_copy.c - memory-mapped database backup tool */
/*
* Copyright 2012 Howard Chu, Symas Corp.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>.
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "mdb.h"
int main(int argc,char * argv[])
{
int rc;
MDB_env *env;
char *envname = argv[1];
if (argc != 3) {
fprintf(stderr, "usage: %s srcpath dstpath\n", argv[0]);
exit(EXIT_FAILURE);
}
rc = mdb_env_create(&env);
rc = mdb_env_open(env, envname, MDB_RDONLY, 0);
if (rc) {
printf("mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc));
} else {
rc = mdb_env_copy(env, argv[2]);
if (rc)
printf("mdb_env_copy failed, error %d %s\n", rc, mdb_strerror(rc));
}
mdb_env_close(env);
return rc ? EXIT_FAILURE : EXIT_SUCCESS;
}

View file

@ -13,49 +13,179 @@
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <unistd.h>
#include "mdb.h"
int main(int argc,char * argv[])
/** Print the B-tree statistics in \b ms to stdout, one line per field. */
static void prstat(MDB_stat *ms)
{
#if 0
	/* Disabled; the page size is printed with the environment info (-e) */
	printf(" Page size: %u\n", ms->ms_psize);
#endif
	printf(" Tree depth: %u\n", ms->ms_depth);
	printf(" Branch pages: %zu\n", ms->ms_branch_pages);
	printf(" Leaf pages: %zu\n", ms->ms_leaf_pages);
	printf(" Overflow pages: %zu\n", ms->ms_overflow_pages);
	printf(" Entries: %zu\n", ms->ms_entries);
}
/** Print the command-line synopsis to stderr and exit with EXIT_FAILURE.
 *
 * The synopsis lists the options before dbpath because main() requires
 * dbpath to be the single argument left after getopt() has consumed
 * the options.
 * @param[in] prog The program name to show in the message.
 */
static void usage(char *prog)
{
	fprintf(stderr, "usage: %s [-e] [-f] [-n] [-a|-s subdb] dbpath\n", prog);
	exit(EXIT_FAILURE);
}
int main(int argc, char *argv[])
{
int i, rc;
MDB_env *env;
MDB_txn *txn;
MDB_dbi dbi;
MDB_stat mst;
char *envname = argv[1];
MDB_envinfo mei;
char *prog = argv[0];
char *envname;
char *subname = NULL;
int alldbs = 0, envinfo = 0, envflags = 0, freinfo = 0;
rc = mdb_env_create(&env);
if (argc > 2) {
mdb_env_set_maxdbs(env, 4);
subname = argv[2];
if (argc < 2) {
usage(prog);
}
rc = mdb_env_open(env, envname, MDB_RDONLY, 0);
/* -a: print stat of main DB and all subDBs
* -s: print stat of only the named subDB
* -e: print env info
* -f: print freelist info
* -n: use NOSUBDIR flag on env_open
* (default) print stat of only the main DB
*/
while ((i = getopt(argc, argv, "aefns:")) != EOF) {
switch(i) {
case 'a':
alldbs++;
break;
case 'e':
envinfo++;
break;
case 'f':
freinfo++;
break;
case 'n':
envflags |= MDB_NOSUBDIR;
break;
case 's':
subname = optarg;
break;
default:
usage(prog);
}
}
if (optind != argc - 1)
usage(prog);
envname = argv[optind];
rc = mdb_env_create(&env);
if (alldbs || subname) {
mdb_env_set_maxdbs(env, 4);
}
rc = mdb_env_open(env, envname, envflags | MDB_RDONLY, 0664);
if (rc) {
printf("mdb_env_open failed, error %d\n", rc);
printf("mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc));
goto env_close;
}
rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
if (rc) {
printf("mdb_txn_begin failed, error %d\n", rc);
printf("mdb_txn_begin failed, error %d %s\n", rc, mdb_strerror(rc));
goto env_close;
}
if (envinfo) {
rc = mdb_env_stat(env, &mst);
rc = mdb_env_info(env, &mei);
printf("Environment Info\n");
printf(" Map address: %p\n", mei.me_mapaddr);
printf(" Map size: %zu\n", mei.me_mapsize);
printf(" Page size: %u\n", mst.ms_psize);
printf(" Max pages: %zu\n", mei.me_mapsize / mst.ms_psize);
printf(" Number of pages used: %zu\n", mei.me_last_pgno+1);
printf(" Last transaction ID: %zu\n", mei.me_last_txnid);
printf(" Max readers: %u\n", mei.me_maxreaders);
printf(" Number of readers used: %u\n", mei.me_numreaders);
}
if (freinfo) {
MDB_cursor *cursor;
MDB_val data;
size_t pages = 0, *iptr;
printf("Freelist Status\n");
dbi = 0;
rc = mdb_cursor_open(txn, dbi, &cursor);
if (rc) {
printf("mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc));
goto txn_abort;
}
rc = mdb_stat(txn, dbi, &mst);
if (rc) {
printf("mdb_stat failed, error %d %s\n", rc, mdb_strerror(rc));
goto txn_abort;
}
while ((rc = mdb_cursor_get(cursor, NULL, &data, MDB_NEXT)) == 0) {
iptr = data.mv_data;
pages += *iptr;
}
mdb_cursor_close(cursor);
prstat(&mst);
printf(" Free pages: %zu\n", pages);
}
rc = mdb_open(txn, subname, 0, &dbi);
if (rc) {
printf("mdb_open failed, error %d\n", rc);
printf("mdb_open failed, error %d %s\n", rc, mdb_strerror(rc));
goto txn_abort;
}
rc = mdb_stat(txn, dbi, &mst);
printf("Page size: %u\n", mst.ms_psize);
printf("Tree depth: %u\n", mst.ms_depth);
printf("Branch pages: %zu\n", mst.ms_branch_pages);
printf("Leaf pages: %zu\n", mst.ms_leaf_pages);
printf("Overflow pages: %zu\n", mst.ms_overflow_pages);
printf("Entries: %zu\n", mst.ms_entries);
if (rc) {
printf("mdb_stat failed, error %d %s\n", rc, mdb_strerror(rc));
goto txn_abort;
}
printf("Status of %s\n", subname ? subname : "Main DB");
prstat(&mst);
if (alldbs) {
MDB_cursor *cursor;
MDB_val key;
rc = mdb_cursor_open(txn, dbi, &cursor);
if (rc) {
printf("mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc));
goto txn_abort;
}
while ((rc = mdb_cursor_get(cursor, &key, NULL, MDB_NEXT)) == 0) {
char *str = malloc(key.mv_size+1);
MDB_dbi db2;
memcpy(str, key.mv_data, key.mv_size);
str[key.mv_size] = '\0';
rc = mdb_open(txn, str, 0, &db2);
if (rc == MDB_SUCCESS)
printf("Status of %s\n", str);
free(str);
if (rc) continue;
rc = mdb_stat(txn, db2, &mst);
if (rc) {
printf("mdb_stat failed, error %d %s\n", rc, mdb_strerror(rc));
goto txn_abort;
}
prstat(&mst);
mdb_close(env, db2);
}
mdb_cursor_close(cursor);
}
mdb_close(env, dbi);
txn_abort:
mdb_txn_abort(txn);

View file

@ -1,84 +0,0 @@
/* mdb_stat.c - memory-mapped database status tool */
/*
* Copyright 2011 Howard Chu, Symas Corp.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mdb.h"
int main(int argc,char * argv[])
{
int rc;
MDB_env *env;
MDB_txn *txn;
MDB_dbi dbi;
MDB_stat mst;
MDB_cursor *cursor;
MDB_val key;
char *envname = argv[1];
rc = mdb_env_create(&env);
mdb_env_set_maxdbs(env, 4);
rc = mdb_env_open(env, envname, MDB_RDONLY, 0);
if (rc) {
printf("mdb_env_open failed, error %d\n", rc);
goto env_close;
}
rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
if (rc) {
printf("mdb_txn_begin failed, error %d\n", rc);
goto env_close;
}
rc = mdb_open(txn, NULL, 0, &dbi);
if (rc) {
printf("mdb_open failed, error %d\n", rc);
goto txn_abort;
}
rc = mdb_stat(txn, dbi, &mst);
printf("Page size: %u\n", mst.ms_psize);
printf("Tree depth: %u\n", mst.ms_depth);
printf("Branch pages: %zu\n", mst.ms_branch_pages);
printf("Leaf pages: %zu\n", mst.ms_leaf_pages);
printf("Overflow pages: %zu\n", mst.ms_overflow_pages);
printf("Entries: %zu\n", mst.ms_entries);
rc = mdb_cursor_open(txn, dbi, &cursor);
while ((rc = mdb_cursor_get(cursor, &key, NULL, MDB_NEXT)) == 0) {
char *str = malloc(key.mv_size+1);
MDB_dbi db2;
memcpy(str, key.mv_data, key.mv_size);
str[key.mv_size] = '\0';
printf("\n%s\n", str);
rc = mdb_open(txn, str, 0, &db2);
if (rc) break;
free(str);
rc = mdb_stat(txn, db2, &mst);
printf("Tree depth: %u\n", mst.ms_depth);
printf("Branch pages: %zu\n", mst.ms_branch_pages);
printf("Leaf pages: %zu\n", mst.ms_leaf_pages);
printf("Overflow pages: %zu\n", mst.ms_overflow_pages);
printf("Entries: %zu\n", mst.ms_entries);
mdb_close(env, db2);
}
mdb_cursor_close(cursor);
mdb_close(env, dbi);
txn_abort:
mdb_txn_abort(txn);
env_close:
mdb_env_close(env);
return rc ? EXIT_FAILURE : EXIT_SUCCESS;
}