diff --git a/libraries/libmdb/Makefile b/libraries/libmdb/Makefile index 67a2007bb8..5c206f2238 100644 --- a/libraries/libmdb/Makefile +++ b/libraries/libmdb/Makefile @@ -6,7 +6,7 @@ LDLIBS = SOLIBS = PROGS = mdb_stat mdb_copy mtest mtest2 mtest3 mtest4 mtest5 -all: libmdb.a libmdb.so $(PROGS) +all: liblmdb.a liblmdb.so $(PROGS) clean: rm -rf $(PROGS) *.[ao] *.so *~ testdb @@ -15,23 +15,23 @@ test: all mkdir testdb ./mtest && ./mdb_stat testdb -libmdb.a: mdb.o midl.o +liblmdb.a: mdb.o midl.o ar rs $@ mdb.o midl.o -libmdb.so: mdb.o midl.o +liblmdb.so: mdb.o midl.o gcc -pthread -shared -o $@ mdb.o midl.o $(SOLIBS) -mdb_stat: mdb_stat.o libmdb.a -mdb_copy: mdb_copy.o libmdb.a -mtest: mtest.o libmdb.a -mtest2: mtest2.o libmdb.a -mtest3: mtest3.o libmdb.a -mtest4: mtest4.o libmdb.a -mtest5: mtest5.o libmdb.a -mtest6: mtest6.o libmdb.a -mfree: mfree.o libmdb.a +mdb_stat: mdb_stat.o liblmdb.a +mdb_copy: mdb_copy.o liblmdb.a +mtest: mtest.o liblmdb.a +mtest2: mtest2.o liblmdb.a +mtest3: mtest3.o liblmdb.a +mtest4: mtest4.o liblmdb.a +mtest5: mtest5.o liblmdb.a +mtest6: mtest6.o liblmdb.a +mfree: mfree.o liblmdb.a -mdb.o: mdb.c mdb.h midl.h +mdb.o: mdb.c lmdb.h midl.h $(CC) $(CFLAGS) -fPIC $(CPPFLAGS) -c mdb.c midl.o: midl.c midl.h @@ -40,5 +40,5 @@ midl.o: midl.c midl.h %: %.o $(CC) $(CFLAGS) $(LDFLAGS) $^ $(LDLIBS) -o $@ -%.o: %.c mdb.h +%.o: %.c lmdb.h $(CC) $(CFLAGS) $(CPPFLAGS) -c $< diff --git a/libraries/libmdb/mdb.h b/libraries/libmdb/lmdb.h similarity index 97% rename from libraries/libmdb/mdb.h rename to libraries/libmdb/lmdb.h index e3fc129829..a2118b5bac 100644 --- a/libraries/libmdb/mdb.h +++ b/libraries/libmdb/lmdb.h @@ -1,7 +1,9 @@ -/** @file mdb.h - * @brief memory-mapped database library +/** @file lmdb.h + * @brief Lightning memory-mapped database library * - * @mainpage MDB Memory-Mapped Database Manager + * @mainpage MDB Lightning Memory-Mapped Database Manager + * + * @section intro_sec Introduction * MDB is a Btree-based database management library 
modeled loosely on the * BerkeleyDB API, but much simplified. The entire database is exposed * in a memory map, and all data fetches return data directly @@ -38,6 +40,7 @@ * corrupt the database. Of course if your application code is known to * be bug-free (...) then this is not an issue. * + * @section caveats_sec Caveats * Troubleshooting the lock file, plus semaphores on BSD systems: * * - A broken lockfile can cause sync issues. @@ -107,7 +110,7 @@ * top-level directory of the distribution or, alternatively, at * . * - * @par Derived From: + * @par Derived From: * This code is derived from btree.c written by Martin Hedenfalk. * * Copyright (c) 2009, 2010 Martin Hedenfalk @@ -124,8 +127,8 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#ifndef _MDB_H_ -#define _MDB_H_ +#ifndef _LMDB_H_ +#define _LMDB_H_ #include @@ -133,8 +136,9 @@ extern "C" { #endif -/** @defgroup public Public API +/** @defgroup mdb MDB API * @{ + * @brief OpenLDAP Lightning Memory-Mapped Database Manager */ /** @defgroup Version Version Macros * @{ @@ -144,7 +148,7 @@ extern "C" { /** Library minor version */ #define MDB_VERSION_MINOR 9 /** Library patch version */ -#define MDB_VERSION_PATCH 4 +#define MDB_VERSION_PATCH 5 /** Combine args a,b,c into a single integer for easy version comparisons */ #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) @@ -154,7 +158,7 @@ extern "C" { MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) /** The release date of this library version */ -#define MDB_VERSION_DATE "September 14, 2012" +#define MDB_VERSION_DATE "November 30, 2012" /** A stringifier for the version info */ #define MDB_VERSTR(a,b,c,d) "MDB " #a "." #b "." 
#c ": (" d ")" @@ -213,12 +217,14 @@ typedef int (MDB_cmp_func)(const MDB_val *a, const MDB_val *b); typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *relctx); /** @defgroup mdb_env Environment Flags + * + * Values do not overlap Database Flags. * @{ */ /** mmap at a fixed address */ #define MDB_FIXEDMAP 0x01 /** no environment directory */ -#define MDB_NOSUBDIR 0x02 +#define MDB_NOSUBDIR 0x4000 /** don't fsync after commit */ #define MDB_NOSYNC 0x10000 /** read only */ @@ -232,6 +238,8 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel /** @} */ /** @defgroup mdb_open Database Flags + * + * Values do not overlap Environment Flags. * @{ */ /** use reverse string keys */ @@ -496,9 +504,9 @@ int mdb_env_info(MDB_env *env, MDB_envinfo *stat); * the OS buffers upon commit as well, unless the environment was * opened with #MDB_NOSYNC. * @param[in] env An environment handle returned by #mdb_env_create() - * @param[in] force If non-zero, force the flush to occur. Otherwise + * @param[in] force If non-zero, force a synchronous flush. Otherwise * if the environment has the #MDB_NOSYNC flag set the flushes - * will be omitted. + * will be omitted, and with #MDB_MAPASYNC they will be asynchronous. * @return A non-zero error value on failure and 0 on success. Some possible * errors are: *
    @@ -603,11 +611,11 @@ int mdb_env_set_maxreaders(MDB_env *env, unsigned int readers); */ int mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers); - /** @brief Set the maximum number of databases for the environment. + /** @brief Set the maximum number of named databases for the environment. * * This function is only needed if multiple databases will be used in the - * environment. Simpler applications that only use a single database can ignore - * this option. + * environment. Simpler applications that use the environment as a single + * unnamed database can ignore this option. * This function may only be called after #mdb_env_create() and before #mdb_env_open(). * @param[in] env An environment handle returned by #mdb_env_create() * @param[in] dbs The maximum number of databases @@ -714,6 +722,8 @@ int mdb_txn_renew(MDB_txn *txn); * database handle resides in the shared environment, it is not owned * by the given transaction. Only one thread should call this function; * it is not mutex-protected in a read-only transaction. + * To use named databases (with name != NULL), #mdb_env_set_maxdbs() + * must be called before opening the environment. * @param[in] txn A transaction handle returned by #mdb_txn_begin() * @param[in] name The name of the database to open. If only a single * database is needed in the environment, this value may be NULL. @@ -786,12 +796,12 @@ void mdb_close(MDB_env *env, MDB_dbi dbi); /** @brief Delete a database and/or free all its pages. * - * If the \b del parameter is non-zero the DB handle will be closed + * If the \b del parameter is 1, the DB handle will be closed * and the DB will be deleted. * @param[in] txn A transaction handle returned by #mdb_txn_begin() * @param[in] dbi A database handle returned by #mdb_open() - * @param[in] del non-zero to delete the DB from the environment, - * otherwise just free its pages. + * @param[in] del 1 to delete the DB from the environment, + * 0 to just free its pages. 
* @return A non-zero error value on failure and 0 on success. */ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del); @@ -1150,4 +1160,4 @@ int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b); #ifdef __cplusplus } #endif -#endif /* _MDB_H_ */ +#endif /* _LMDB_H_ */ diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c index e1a8689efc..61c948cc9b 100644 --- a/libraries/libmdb/mdb.c +++ b/libraries/libmdb/mdb.c @@ -117,7 +117,7 @@ #define MISALIGNED_OK 1 #endif -#include "mdb.h" +#include "lmdb.h" #include "midl.h" #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN) @@ -1242,6 +1242,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) MDB_page *np; pgno_t pgno = P_INVALID; MDB_ID2 mid; + txnid_t oldest = 0, last; int rc; *mp = NULL; @@ -1254,12 +1255,11 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) if (!txn->mt_env->me_pghead && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) { /* See if there's anything in the free DB */ - int j; MDB_reader *r; MDB_cursor m2; MDB_node *leaf; MDB_val data; - txnid_t *kptr, last; + txnid_t *kptr; mdb_cursor_init(&m2, txn, FREE_DBI, NULL); if (!txn->mt_env->me_pgfirst) { @@ -1282,15 +1282,21 @@ again: last = *(txnid_t *)key.mv_data; } - /* Unusable if referred by a meta page or reader... */ - j = 1; - if (last < txn->mt_txnid-1) { - j = txn->mt_env->me_txns->mti_numreaders; - r = txn->mt_env->me_txns->mti_readers + j; - for (j = -j; j && (lastmt_txnid - 1; + nr = txn->mt_env->me_txns->mti_numreaders; + r = txn->mt_env->me_txns->mti_readers; + for (i=0; i last) { /* It's usable, grab it. */ MDB_oldpages *mop; @@ -1331,29 +1337,108 @@ none: if (txn->mt_env->me_pghead) { MDB_oldpages *mop = txn->mt_env->me_pghead; if (num > 1) { - /* FIXME: For now, always use fresh pages. We - * really ought to search the free list for a - * contiguous range. 
- */ - ; + MDB_cursor m2; + int retry = 2, readit = 0, n2 = num-1; + unsigned int i, j, k; + + /* If current list is too short, must fetch more and coalesce */ + if (mop->mo_pages[0] < (unsigned)num) + readit = 1; + + mdb_cursor_init(&m2, txn, FREE_DBI, NULL); + do { + if (readit) { + MDB_val key, data; + MDB_oldpages *mop2; + pgno_t *idl; + int exact; + + last = mop->mo_txnid + 1; + + /* We haven't hit the readers list yet? */ + if (!oldest) { + MDB_reader *r; + unsigned int nr; + txnid_t mr; + + oldest = txn->mt_txnid - 1; + nr = txn->mt_env->me_txns->mti_numreaders; + r = txn->mt_env->me_txns->mti_readers; + for (i=0; imo_pages)); + if (!mop2) + return ENOMEM; + /* merge in sorted order */ + i = idl[0]; j = mop->mo_pages[0]; mop2->mo_pages[0] = k = i+j; + mop->mo_pages[0] = P_INVALID; + while (i>0 || j>0) { + if (i && idl[i] < mop->mo_pages[j]) + mop2->mo_pages[k--] = idl[i--]; + else + mop2->mo_pages[k--] = mop->mo_pages[j--]; + } + txn->mt_env->me_pglast = last; + mop2->mo_txnid = last; + mop2->mo_next = mop->mo_next; + txn->mt_env->me_pghead = mop2; + free(mop); + mop = mop2; + /* Keep trying to read until we have enough */ + if (mop->mo_pages[0] < (unsigned)num) { + continue; + } + } + + /* current list has enough pages, but are they contiguous? 
*/ + for (i=mop->mo_pages[0]; i>=(unsigned)num; i--) { + if (mop->mo_pages[i-n2] == mop->mo_pages[i] + n2) { + pgno = mop->mo_pages[i]; + i -= n2; + /* move any stragglers down */ + for (j=i+num; j<=mop->mo_pages[0]; j++) + mop->mo_pages[i++] = mop->mo_pages[j]; + mop->mo_pages[0] -= num; + break; + } + } + + /* Stop if we succeeded, or no more retries */ + if (!retry || pgno != P_INVALID) + break; + readit = 1; + retry--; + + } while (1); } else { /* peel pages off tail, so we only have to truncate the list */ pgno = MDB_IDL_LAST(mop->mo_pages); - if (MDB_IDL_IS_RANGE(mop->mo_pages)) { - mop->mo_pages[2]++; - if (mop->mo_pages[2] > mop->mo_pages[1]) - mop->mo_pages[0] = 0; + mop->mo_pages[0]--; + } + if (MDB_IDL_IS_ZERO(mop->mo_pages)) { + txn->mt_env->me_pghead = mop->mo_next; + if (mc->mc_dbi == FREE_DBI) { + mop->mo_next = txn->mt_env->me_pgfree; + txn->mt_env->me_pgfree = mop; } else { - mop->mo_pages[0]--; - } - if (MDB_IDL_IS_ZERO(mop->mo_pages)) { - txn->mt_env->me_pghead = mop->mo_next; - if (mc->mc_dbi == FREE_DBI) { - mop->mo_next = txn->mt_env->me_pgfree; - txn->mt_env->me_pgfree = mop; - } else { - free(mop); - } + free(mop); } } } @@ -1523,7 +1608,8 @@ mdb_env_sync(MDB_env *env, int force) int rc = 0; if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { if (env->me_flags & MDB_WRITEMAP) { - int flags = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; + int flags = ((env->me_flags & MDB_MAPASYNC) && !force) + ? MS_ASYNC : MS_SYNC; if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) rc = ErrCode(); #ifdef _WIN32 @@ -2423,6 +2509,7 @@ mdb_env_write_meta(MDB_txn *txn) off_t off; int rc, len, toggle; char *ptr; + HANDLE mfd; #ifdef _WIN32 OVERLAPPED ov; #endif @@ -2481,14 +2568,16 @@ mdb_env_write_meta(MDB_txn *txn) off += PAGEHDRSZ; /* Write to the SYNC fd */ + mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ? 
+ env->me_fd : env->me_mfd; #ifdef _WIN32 { memset(&ov, 0, sizeof(ov)); ov.Offset = off; - WriteFile(env->me_mfd, ptr, len, (DWORD *)&rc, &ov); + WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov); } #else - rc = pwrite(env->me_mfd, ptr, len, off); + rc = pwrite(mfd, ptr, len, off); #endif if (rc != len) { int r2; @@ -2576,7 +2665,7 @@ mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs) { if (env->me_map) return EINVAL; - env->me_maxdbs = dbs; + env->me_maxdbs = dbs + 2; /* Named databases + main and free DB */ return MDB_SUCCESS; } @@ -2651,8 +2740,6 @@ mdb_env_open2(MDB_env *env) } #else i = MAP_SHARED; - if (meta.mm_address && (flags & MDB_FIXEDMAP)) - i |= MAP_FIXED; prot = PROT_READ; if (flags & MDB_WRITEMAP) { prot |= PROT_WRITE; @@ -2674,6 +2761,13 @@ mdb_env_open2(MDB_env *env) if (i != MDB_SUCCESS) { return i; } + } else if (meta.mm_address && env->me_map != meta.mm_address) { + /* Can happen because the address argument to mmap() is just a + * hint. mmap() can pick another, e.g. if the range is in use. + * The MAP_FIXED flag would prevent that, but then mmap could + * instead unmap existing pages to make room for the new map. + */ + return EBUSY; /* TODO: Make a new MDB_* error code? */ } env->me_psize = meta.mm_psize; @@ -3146,6 +3240,12 @@ fail: #define DATANAME "/data.mdb" /** The suffix of the lock file when no subdir is used */ #define LOCKSUFF "-lock" + /** Only a subset of the @ref mdb_env flags can be changed + * at runtime. Changing other flags requires closing the + * environment and re-opening it with the new flags. 
+ */ +#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC) +#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP) int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) @@ -3153,7 +3253,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) int oflags, rc, len, excl; char *lpath, *dpath; - if (env->me_fd != INVALID_HANDLE_VALUE) + if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) return EINVAL; len = strlen(path); @@ -3210,10 +3310,12 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) } if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { - if (flags & (MDB_RDONLY|MDB_NOSYNC|MDB_NOMETASYNC|MDB_WRITEMAP)) { + if (flags & (MDB_RDONLY|MDB_WRITEMAP)) { env->me_mfd = env->me_fd; } else { - /* synchronous fd for meta writes */ + /* Synchronous fd for meta writes. Needed even with + * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. + */ #ifdef _WIN32 env->me_mfd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, @@ -5570,7 +5672,7 @@ mdb_cursor_txn(MDB_cursor *mc) MDB_dbi mdb_cursor_dbi(MDB_cursor *mc) { - if (!mc) return 0; + assert(mc != NULL); return mc->mc_dbi; } @@ -6599,11 +6701,6 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, return mdb_cursor_put(&mc, key, data, flags); } -/** Only a subset of the @ref mdb_env flags can be changed - * at runtime. Changing other flags requires closing the environment - * and re-opening it with the new flags. 
- */ -#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC) int mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) { @@ -6893,7 +6990,7 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) MDB_cursor *mc; int rc; - if (!txn || !dbi || dbi >= txn->mt_numdbs) + if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1) return EINVAL; if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) diff --git a/libraries/libmdb/mdb_copy.c b/libraries/libmdb/mdb_copy.c index c5eb6b500b..bd0b859110 100644 --- a/libraries/libmdb/mdb_copy.c +++ b/libraries/libmdb/mdb_copy.c @@ -13,8 +13,7 @@ */ #include #include -#include -#include "mdb.h" +#include "lmdb.h" int main(int argc,char * argv[]) { diff --git a/libraries/libmdb/mdb_stat.c b/libraries/libmdb/mdb_stat.c index 88ac801807..4dfcf49645 100644 --- a/libraries/libmdb/mdb_stat.c +++ b/libraries/libmdb/mdb_stat.c @@ -15,7 +15,7 @@ #include #include #include -#include "mdb.h" +#include "lmdb.h" static void prstat(MDB_stat *ms) { diff --git a/libraries/libmdb/mfree.c b/libraries/libmdb/mfree.c index b0e6980026..79cce66fa6 100644 --- a/libraries/libmdb/mfree.c +++ b/libraries/libmdb/mfree.c @@ -15,7 +15,7 @@ #include #include #include -#include "mdb.h" +#include "lmdb.h" #include "midl.h" int main(int argc,char * argv[]) diff --git a/libraries/libmdb/mtest.c b/libraries/libmdb/mtest.c index bb5ec816b6..8c8dd57835 100644 --- a/libraries/libmdb/mtest.c +++ b/libraries/libmdb/mtest.c @@ -15,7 +15,7 @@ #include #include #include -#include "mdb.h" +#include "lmdb.h" int main(int argc,char * argv[]) { diff --git a/libraries/libmdb/mtest2.c b/libraries/libmdb/mtest2.c index a0e9914417..44d1de7ccd 100644 --- a/libraries/libmdb/mtest2.c +++ b/libraries/libmdb/mtest2.c @@ -18,7 +18,7 @@ #include #include #include -#include "mdb.h" +#include "lmdb.h" int main(int argc,char * argv[]) { diff --git a/libraries/libmdb/mtest3.c b/libraries/libmdb/mtest3.c index eb9723ba2e..c189eaa952 100644 --- a/libraries/libmdb/mtest3.c +++ 
b/libraries/libmdb/mtest3.c @@ -18,7 +18,7 @@ #include #include #include -#include "mdb.h" +#include "lmdb.h" int main(int argc,char * argv[]) { diff --git a/libraries/libmdb/mtest4.c b/libraries/libmdb/mtest4.c index b2f5d931c6..e0ba7e20b6 100644 --- a/libraries/libmdb/mtest4.c +++ b/libraries/libmdb/mtest4.c @@ -18,7 +18,7 @@ #include #include #include -#include "mdb.h" +#include "lmdb.h" int main(int argc,char * argv[]) { diff --git a/libraries/libmdb/mtest5.c b/libraries/libmdb/mtest5.c index c63402ed6b..bc472fa093 100644 --- a/libraries/libmdb/mtest5.c +++ b/libraries/libmdb/mtest5.c @@ -18,7 +18,7 @@ #include #include #include -#include "mdb.h" +#include "lmdb.h" int main(int argc,char * argv[]) { diff --git a/libraries/libmdb/mtest6.c b/libraries/libmdb/mtest6.c index a5bf3a2119..0bf26ccc45 100644 --- a/libraries/libmdb/mtest6.c +++ b/libraries/libmdb/mtest6.c @@ -18,7 +18,7 @@ #include #include #include -#include "mdb.h" +#include "lmdb.h" char dkbuf[1024];