From 248fd1c7b86c21f1f02f5bdd8b3018882159be63 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 12 Jul 2012 16:50:27 -0700 Subject: [PATCH 1/5] Windows thread callback support --- libraries/libmdb/mdb.c | 77 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 3 deletions(-) diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c index 983c75c8de..fc889ce7ad 100644 --- a/libraries/libmdb/mdb.c +++ b/libraries/libmdb/mdb.c @@ -2568,11 +2568,8 @@ mdb_env_open2(MDB_env *env, unsigned int flags) } -#ifndef _WIN32 /** Release a reader thread's slot in the reader lock table. * This function is called automatically when a thread exits. - * Windows doesn't support destructor callbacks for thread-specific storage, - * so this function is not compiled there. * @param[in] ptr This points to the slot in the reader lock table. */ static void @@ -2584,6 +2581,60 @@ mdb_env_reader_dest(void *ptr) reader->mr_pid = 0; reader->mr_tid = 0; } + +#ifdef _WIN32 +/** Junk for arranging thread-specific callbacks on Windows. This is + * necessarily platform and compiler-specific. Windows supports up + * to 1088 keys. Let's assume nobody opens more than 64 environments + * in a single process, for now. They can override this if needed. + */ +#ifndef MAX_TLS_KEYS +#define MAX_TLS_KEYS 64 +#endif +static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS]; +static int mdb_tls_nkeys; + +static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr) +{ + int i; + switch(reason) { + case DLL_PROCESS_ATTACH: break; + case DLL_THREAD_ATTACH: break; + case DLL_THREAD_DETACH: + for (i=0; ime_path = strdup(path); DPRINTF("opened dbenv %p", (void *) env); pthread_key_create(&env->me_txkey, mdb_env_reader_dest); +#ifdef _WIN32 + /* Windows TLS callbacks need help finding their TLS info. */ + if (mdb_tls_nkeys < MAX_TLS_KEYS) + mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey; + else { + rc = ENOMEM; + goto leave; + } +#endif LAZY_RWLOCK_INIT(&env->me_dblock, NULL); if (excl) mdb_env_share_locks(env); @@ -3087,6 +3147,17 @@ mdb_env_close(MDB_env *env) LAZY_RWLOCK_DESTROY(&env->me_dblock); pthread_key_delete(env->me_txkey); +#ifdef _WIN32 + /* Delete our key from the global list */ + { int i; + for (i=0; ime_txkey) { + mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1]; + mdb_tls_nkeys--; + break; + } + } +#endif if (env->me_map) { munmap(env->me_map, env->me_mapsize); From 1a9775dc5a0a494659b2669f0732cd09b152571d Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Fri, 13 Jul 2012 11:56:29 -0700 Subject: [PATCH 2/5] Fix ID -> MDB_ID due to 20baad4a207db5d0e84fc3a7409f216aefa59385 --- libraries/libmdb/mfree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libraries/libmdb/mfree.c b/libraries/libmdb/mfree.c index 0833b22f47..b0e6980026 100644 --- a/libraries/libmdb/mfree.c +++ b/libraries/libmdb/mfree.c @@ -27,7 +27,7 @@ int main(int argc,char * argv[]) MDB_txn *txn; MDB_stat mst; MDB_cursor *cursor; - ID i, j, *iptr; + MDB_ID i, j, *iptr; if (argc != 2) { fprintf(stderr, "usage: %s \n", argv[0]); @@ -41,7 +41,7 @@ int main(int argc,char * argv[]) rc = mdb_cursor_open(txn, dbi, &cursor); while ((rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT)) == 0) { printf("key: %p %zu, data: %p\n", - key.mv_data, *(ID *) key.mv_data, + key.mv_data, *(MDB_ID *) key.mv_data, data.mv_data); iptr = data.mv_data; j = *iptr++; From 0ea56294f179d7e93e06a575af73837eb0a8607c Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Fri, 13 Jul 2012 11:57:11 -0700 Subject: [PATCH 3/5] Fix darwin sem_open() names Must begin with '/' --- libraries/libmdb/mdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c index fc889ce7ad..44365337c7 100644 --- a/libraries/libmdb/mdb.c +++ b/libraries/libmdb/mdb.c @@ -2922,7 +2922,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) val.mv_data = &idbuf; val.mv_size = sizeof(idbuf); mdb_hash_hex(&val, hexbuf); - sprintf(env->me_txns->mti_rmname, "MDBr%s", hexbuf); + sprintf(env->me_txns->mti_rmname, "/MDBr%s", hexbuf); if (sem_unlink(env->me_txns->mti_rmname)) { rc = ErrCode(); if (rc != ENOENT && rc != EINVAL) @@ -2933,7 +2933,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) rc = ErrCode(); goto fail; } - sprintf(env->me_txns->mti_wmname, "MDBw%s", hexbuf); + sprintf(env->me_txns->mti_wmname, "/MDBw%s", hexbuf); if (sem_unlink(env->me_txns->mti_wmname)) { rc = ErrCode(); if (rc != ENOENT && rc != EINVAL) From a0993354a603a970889ad5c160c289ecca316f81 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 12 Jul 2012 17:04:05 -0700 Subject: [PATCH 4/5] Don't use env-private copy of DB root nodes. Just lookup the DB roots as needed. When many DBs are in use, most of the copies won't be referenced in a given txn, and there's a bad race condition in the copy routine. --- libraries/libmdb/mdb.c | 137 ++++++++--------------------------------- 1 file changed, 27 insertions(+), 110 deletions(-) diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c index 44365337c7..4de36c67b8 100644 --- a/libraries/libmdb/mdb.c +++ b/libraries/libmdb/mdb.c @@ -335,45 +335,6 @@ static txnid_t mdb_debug_start; #define DKEY(x) 0 #endif -/** @defgroup lazylock Lazy Locking - * Macros for locks that aren't actually needed. - * The DB view is always consistent because all writes are wrapped in - * the wmutex. Finer-grained locks aren't necessary. - * @{ - */ -#ifndef LAZY_LOCKS - /** Use lazy locking. I.e., don't lock these accesses at all. */ -#define LAZY_LOCKS 1 -#endif -#if LAZY_LOCKS - /** Grab the reader lock */ -#define LAZY_MUTEX_LOCK(x) - /** Release the reader lock */ -#define LAZY_MUTEX_UNLOCK(x) - /** Release the DB table reader/writer lock */ -#define LAZY_RWLOCK_UNLOCK(x) - /** Grab the DB table write lock */ -#define LAZY_RWLOCK_WRLOCK(x) - /** Grab the DB table read lock */ -#define LAZY_RWLOCK_RDLOCK(x) - /** Declare the DB table rwlock. Should not be followed by ';'. */ -#define LAZY_RWLOCK_DEF(x) - /** Initialize the DB table rwlock */ -#define LAZY_RWLOCK_INIT(x,y) - /** Destroy the DB table rwlock */ -#define LAZY_RWLOCK_DESTROY(x) -#else -#define LAZY_MUTEX_LOCK(x) pthread_mutex_lock(x) -#define LAZY_MUTEX_UNLOCK(x) pthread_mutex_unlock(x) -#define LAZY_RWLOCK_UNLOCK(x) pthread_rwlock_unlock(x) -#define LAZY_RWLOCK_WRLOCK(x) pthread_rwlock_wrlock(x) -#define LAZY_RWLOCK_RDLOCK(x) pthread_rwlock_rdlock(x) -#define LAZY_RWLOCK_DEF(x) pthread_rwlock_t x; -#define LAZY_RWLOCK_INIT(x,y) pthread_rwlock_init(x,y) -#define LAZY_RWLOCK_DESTROY(x) pthread_rwlock_destroy(x) -#endif -/** @} */ - /** An invalid page number. * Mainly used to denote an empty tree. */ @@ -924,7 +885,7 @@ struct MDB_env { /** Failed to update the meta page. Probably an I/O error. */ #define MDB_FATAL_ERROR 0x80000000U uint32_t me_flags; /**< @ref mdb_env */ - uint32_t me_extrapad; /**< unused for now */ + unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */ unsigned int me_maxreaders; /**< size of the reader table */ MDB_dbi me_numdbs; /**< number of DBs opened */ MDB_dbi me_maxdbs; /**< size of the DB table */ @@ -936,13 +897,10 @@ struct MDB_env { size_t me_mapsize; /**< size of the data memory map */ off_t me_size; /**< current file size */ pgno_t me_maxpg; /**< me_mapsize / me_psize */ - unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */ - unsigned int me_db_toggle; /**< which DB table is current */ - txnid_t me_wtxnid; /**< ID of last txn we committed */ txnid_t me_pgfirst; /**< ID of first old page record we used */ txnid_t me_pglast; /**< ID of last old page record we used */ MDB_dbx *me_dbxs; /**< array of static DB info */ - MDB_db *me_dbs[2]; /**< two arrays of MDB_db info */ + uint16_t *me_dbflags; /**< array of DB flags */ MDB_oldpages *me_pghead; /**< list of old page records */ MDB_oldpages *me_pgfree; /**< list of page records to free */ pthread_key_t me_txkey; /**< thread-key for readers */ @@ -951,8 +909,6 @@ struct MDB_env { MDB_IDL me_free_pgs; /** ID2L of pages that were written during a write txn */ MDB_ID2 me_dirty_list[MDB_IDL_UM_SIZE]; - /** rwlock for the DB tables, if #LAZY_LOCKS is false */ - LAZY_RWLOCK_DEF(me_dblock) #ifdef _WIN32 HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */ HANDLE me_wmutex; @@ -1544,12 +1500,15 @@ static int mdb_txn_renew0(MDB_txn *txn) { MDB_env *env = txn->mt_env; - char mt_dbflag = 0; + unsigned int i; + + /* Setup db info */ + txn->mt_numdbs = env->me_numdbs; + txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ if (txn->mt_flags & MDB_TXN_RDONLY) { MDB_reader *r = pthread_getspecific(env->me_txkey); if (!r) { - unsigned int i; pid_t pid = getpid(); pthread_t tid = pthread_self(); @@ -1569,15 +1528,9 @@ mdb_txn_renew0(MDB_txn *txn) r = &env->me_txns->mti_readers[i]; pthread_setspecific(env->me_txkey, r); } - txn->mt_txnid = env->me_txns->mti_txnid; + txn->mt_txnid = r->mr_txnid = env->me_txns->mti_txnid; txn->mt_toggle = txn->mt_txnid & 1; txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1; - /* This happens if a different process was the - * last writer to the DB. - */ - if (env->me_wtxnid < txn->mt_txnid) - mt_dbflag = DB_STALE; - r->mr_txnid = txn->mt_txnid; txn->mt_u.reader = r; } else { LOCK_MUTEX_W(env); @@ -1585,8 +1538,6 @@ mdb_txn_renew0(MDB_txn *txn) txn->mt_txnid = env->me_txns->mti_txnid; txn->mt_toggle = txn->mt_txnid & 1; txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1; - if (env->me_wtxnid < txn->mt_txnid) - mt_dbflag = DB_STALE; txn->mt_txnid++; #if MDB_DEBUG if (txn->mt_txnid == mdb_debug_start) @@ -1599,17 +1550,11 @@ mdb_txn_renew0(MDB_txn *txn) env->me_txn = txn; } - /* Copy the DB arrays */ - LAZY_RWLOCK_RDLOCK(&env->me_dblock); - txn->mt_numdbs = env->me_numdbs; - txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ + /* Copy the DB info and flags */ memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db)); - if (txn->mt_numdbs > 2) - memcpy(txn->mt_dbs+2, env->me_dbs[env->me_db_toggle]+2, - (txn->mt_numdbs - 2) * sizeof(MDB_db)); - LAZY_RWLOCK_UNLOCK(&env->me_dblock); - - memset(txn->mt_dbflags, mt_dbflag, env->me_numdbs); + for (i=2; imt_numdbs; i++) + txn->mt_dbs[i].md_flags = env->me_dbflags[i]; + memset(txn->mt_dbflags, DB_STALE, env->me_numdbs); return MDB_SUCCESS; } @@ -1828,21 +1773,11 @@ mdb_txn_commit(MDB_txn *txn) if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { if (txn->mt_numdbs > env->me_numdbs) { - /* update the DB tables */ - int toggle = !env->me_db_toggle; - MDB_db *ip, *jp; + /* update the DB flags */ MDB_dbi i; - - ip = &env->me_dbs[toggle][env->me_numdbs]; - jp = &txn->mt_dbs[env->me_numdbs]; - LAZY_RWLOCK_WRLOCK(&env->me_dblock); - for (i = env->me_numdbs; i < txn->mt_numdbs; i++) { - *ip++ = *jp++; - } - - env->me_db_toggle = toggle; - env->me_numdbs = txn->mt_numdbs; - LAZY_RWLOCK_UNLOCK(&env->me_dblock); + for (i = env->me_numdbs; imt_numdbs; i++) + env->me_dbflags[i] = txn->mt_dbs[i].md_flags; + env->me_numdbs = i; } mdb_txn_abort(txn); return MDB_SUCCESS; @@ -2171,28 +2106,15 @@ again: mdb_txn_abort(txn); return n; } - env->me_wtxnid = txn->mt_txnid; done: env->me_txn = NULL; - /* update the DB tables */ - { - int toggle = !env->me_db_toggle; - MDB_db *ip, *jp; + if (txn->mt_numdbs > env->me_numdbs) { + /* update the DB flags */ MDB_dbi i; - - ip = &env->me_dbs[toggle][2]; - jp = &txn->mt_dbs[2]; - LAZY_RWLOCK_WRLOCK(&env->me_dblock); - for (i = 2; i < txn->mt_numdbs; i++) { - if (ip->md_root != jp->md_root) - *ip = *jp; - ip++; jp++; - } - - env->me_db_toggle = toggle; - env->me_numdbs = txn->mt_numdbs; - LAZY_RWLOCK_UNLOCK(&env->me_dblock); + for (i = env->me_numdbs; imt_numdbs; i++) + env->me_dbflags[i] = txn->mt_dbs[i].md_flags; + env->me_numdbs = i; } UNLOCK_MUTEX_W(env); @@ -2388,9 +2310,7 @@ mdb_env_write_meta(MDB_txn *txn) * readers will get consistent data regardless of how fresh or * how stale their view of these values is. */ - LAZY_MUTEX_LOCK(&env->me_txns->mti_mutex); txn->mt_env->me_txns->mti_txnid = txn->mt_txnid; - LAZY_MUTEX_UNLOCK(&env->me_txns->mti_mutex); return MDB_SUCCESS; } @@ -3100,13 +3020,13 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) goto leave; } #endif - LAZY_RWLOCK_INIT(&env->me_dblock, NULL); if (excl) mdb_env_share_locks(env); - env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); - env->me_dbs[0] = calloc(env->me_maxdbs, sizeof(MDB_db)); - env->me_dbs[1] = calloc(env->me_maxdbs, sizeof(MDB_db)); env->me_numdbs = 2; + env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx)); + env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t)); + if (!env->me_dbxs || !env->me_dbflags) + rc = ENOMEM; } leave: @@ -3140,12 +3060,10 @@ mdb_env_close(MDB_env *env) free(dp); } - free(env->me_dbs[1]); - free(env->me_dbs[0]); + free(env->me_dbflags); free(env->me_dbxs); free(env->me_path); - LAZY_RWLOCK_DESTROY(&env->me_dblock); pthread_key_delete(env->me_txkey); #ifdef _WIN32 /* Delete our key from the global list */ @@ -6316,8 +6234,7 @@ int mdb_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi) txn->mt_dbflags[slot] = dbflag; memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); *dbi = slot; - txn->mt_env->me_dbs[0][slot] = txn->mt_dbs[slot]; - txn->mt_env->me_dbs[1][slot] = txn->mt_dbs[slot]; + txn->mt_env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags; mdb_default_cmp(txn, slot); if (!unused) { txn->mt_numdbs++; From 7e9a6134fde1e65648aa18c935c1af7554fd5b97 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 17 Jul 2012 04:02:48 -0700 Subject: [PATCH 5/5] Make sure cursor's DB is init'd if STALE. --- libraries/libmdb/mdb.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c index 4de36c67b8..4588e75bca 100644 --- a/libraries/libmdb/mdb.c +++ b/libraries/libmdb/mdb.c @@ -932,8 +932,10 @@ static int mdb_page_touch(MDB_cursor *mc); static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp); static int mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); +#define MDB_PS_MODIFY 1 +#define MDB_PS_ROOTONLY 2 static int mdb_page_search(MDB_cursor *mc, - MDB_val *key, int modify); + MDB_val *key, int flags); static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, unsigned int nflags); @@ -1554,7 +1556,8 @@ mdb_txn_renew0(MDB_txn *txn) memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db)); for (i=2; imt_numdbs; i++) txn->mt_dbs[i].md_flags = env->me_dbflags[i]; - memset(txn->mt_dbflags, DB_STALE, env->me_numdbs); + txn->mt_dbflags[0] = txn->mt_dbflags[1] = 0; + memset(txn->mt_dbflags+2, DB_STALE, env->me_numdbs-2); return MDB_SUCCESS; } @@ -1858,7 +1861,7 @@ mdb_txn_commit(MDB_txn *txn) /* should only be one record now */ if (env->me_pghead) { /* make sure first page of freeDB is touched and on freelist */ - mdb_page_search(&mc, NULL, 1); + mdb_page_search(&mc, NULL, MDB_PS_MODIFY); } /* Delete IDLs we used from the free list */ @@ -1887,7 +1890,7 @@ free2: /* make sure last page of freeDB is touched and on freelist */ key.mv_size = MAXKEYSIZE+1; key.mv_data = NULL; - mdb_page_search(&mc, &key, 1); + mdb_page_search(&mc, &key, MDB_PS_MODIFY); mdb_midl_sort(txn->mt_free_pgs); #if MDB_DEBUG > 1 @@ -3371,7 +3374,8 @@ mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret) * @param[in,out] mc the cursor for this operation. * @param[in] key the key to search for. If NULL, search for the lowest * page. (This is used by #mdb_cursor_first().) - * @param[in] modify If true, visited pages are updated with new page numbers. + * @param[in] flags If MDB_PS_MODIFY set, visited pages are updated with new page numbers. + * If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. * @return 0 on success, non-zero on failure. */ static int @@ -3453,7 +3457,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) * @return 0 on success, non-zero on failure. */ static int -mdb_page_search(MDB_cursor *mc, MDB_val *key, int modify) +mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) { int rc; pgno_t root; @@ -3468,11 +3472,11 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int modify) /* Make sure we're using an up-to-date root */ if (mc->mc_dbi > MAIN_DBI) { if ((*mc->mc_dbflag & DB_STALE) || - (modify && !(*mc->mc_dbflag & DB_DIRTY))) { + ((flags & MDB_PS_MODIFY) && !(*mc->mc_dbflag & DB_DIRTY))) { MDB_cursor mc2; unsigned char dbflag = 0; mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); - rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, modify); + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, flags & MDB_PS_MODIFY); if (rc) return rc; if (*mc->mc_dbflag & DB_STALE) { @@ -3485,7 +3489,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int modify) mdb_node_read(mc->mc_txn, leaf, &data); memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); } - if (modify) + if (flags & MDB_PS_MODIFY) dbflag = DB_DIRTY; *mc->mc_dbflag = dbflag; } @@ -3508,12 +3512,15 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int modify) DPRINTF("db %u root page %zu has flags 0x%X", mc->mc_dbi, root, mc->mc_pg[0]->mp_flags); - if (modify) { + if (flags & MDB_PS_MODIFY) { if ((rc = mdb_page_touch(mc))) return rc; } - return mdb_page_search_root(mc, key, modify); + if (flags & MDB_PS_ROOTONLY) + return MDB_SUCCESS; + + return mdb_page_search_root(mc, key, flags); } /** Return the data associated with a given node. @@ -4149,7 +4156,7 @@ mdb_cursor_touch(MDB_cursor *mc) if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { MDB_cursor mc2; mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); - rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 1); + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY); if (rc) return rc; *mc->mc_dbflag = DB_DIRTY; @@ -4995,6 +5002,9 @@ mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) } else { mc->mc_xcursor = NULL; } + if (*mc->mc_dbflag & DB_STALE) { + mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); + } } int