From c22a7d98ecac671b69faeef7d406cec4140b2abf Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 20 Aug 2013 17:37:53 -0700 Subject: [PATCH 01/12] Update caveats now that mdb_reader_check exists --- libraries/liblmdb/lmdb.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 83f083d034..ef284d9c7e 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -48,8 +48,10 @@ * cause further writes to grow the database quickly, and * stale locks can block further operation. * - * Fix: Terminate all programs using the database, or make - * them close it. Next database user will reset the lockfile. + * Fix: Check for stale readers periodically, using the + * #mdb_reader_check function or the mdb_stat tool. Or just + * make all programs using the database close it; the lockfile + * is always reset on first open of the environment. * * - On BSD systems or others configured with MDB_USE_POSIX_SEM, * startup can fail due to semaphores owned by another userid. @@ -86,11 +88,12 @@ * ...when several processes can use a database concurrently: * * - Avoid aborting a process with an active transaction. - * The transaction becomes "long-lived" as above until the lockfile - * is reset, since the process may not remove it from the lockfile. + * The transaction becomes "long-lived" as above until a check + * for stale readers is performed or the lockfile is reset, + * since the process may not remove it from the lockfile. * - * - If you do that anyway, close the environment once in a while, - * so the lockfile can get reset. + * - If you do that anyway, do a periodic check for stale readers. Or + * close the environment once in a while, so the lockfile can get reset. * * - Do not use MDB databases on remote filesystems, even between * processes on the same host. 
This breaks flock() on some OSes, From 7b1db13050b86d864cdc17d9e0e7b18ee0f77071 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 22 Aug 2013 18:51:48 -0700 Subject: [PATCH 02/12] Less aggressive page_spill spilling all possible pages is a waste because the majority will be needed again. Just do 1/8th of the dirty list instead. --- libraries/liblmdb/mdb.c | 53 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 74cca10c01..ddb7df6039 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1429,7 +1429,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) MDB_txn *txn = m0->mc_txn; MDB_page *dp; MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned int i, j; + unsigned int i, j, k, need; int rc, level; if (m0->mc_flags & C_SUB) @@ -1444,6 +1444,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) if (key) i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; i += i; /* double it for good measure */ + need = i; if (txn->mt_dirty_room > i) return MDB_SUCCESS; @@ -1470,8 +1471,21 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) /* Preserve pages used by cursors */ mdb_cursorpages_mark(m0, P_DIRTY); + /* Less aggressive spill - we originally spilled the entire dirty list, + * with a few exceptions for cursor pages and DB root pages. But this + * turns out to be a lot of wasted effort because in a large txn many + * of those pages will need to be used again. So now we spill only 1/8th + * of the dirty pages. Testing revealed this to be a good tradeoff, + * better than 1/2, 1/4, or 1/10. + */ + k = 0; + need *= 100; + if (need < MDB_IDL_UM_MAX / 8) + need = MDB_IDL_UM_MAX / 8; + /* Save the page IDs of all the pages we're flushing */ - for (i=1; i<=dl[0].mid; i++) { + /* flush from the tail forward, this saves a lot of shifting later on. 
*/ + for (i=dl[0].mid; i>0; i--) { dp = dl[i].mptr; if (dp->mp_flags & P_KEEP) continue; @@ -1494,13 +1508,46 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) } if ((rc = mdb_midl_append(&txn->mt_spill_pgs, dl[i].mid))) goto done; + k++; + if (k > need) + break; } mdb_midl_sort(txn->mt_spill_pgs); - rc = mdb_page_flush(txn); + /* Since we're only doing the tail 1/8th of the dirty list, + * fake a dirty list to reflect this. + */ + { + MDB_ID2 old; + k = dl[0].mid - i + 1; + old = dl[i-1]; + dl[i-1].mid = k; + txn->mt_u.dirty_list = &dl[i-1]; + + rc = mdb_page_flush(txn); + + /* reset back to the real list */ + dl[0].mid -= k; + dl[0].mid += dl[i-1].mid; + dl[i-1] = old; + txn->mt_u.dirty_list = dl; + } mdb_cursorpages_mark(m0, P_DIRTY|P_KEEP); + /* Reset any dirty root pages we kept that page_flush didn't see */ + for (i=0; i<txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + pgno_t pgno = txn->mt_dbs[i].md_root; + if (pgno == P_INVALID) + continue; + if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) + goto done; + if (dp->mp_flags & P_KEEP) + dp->mp_flags ^= P_KEEP; + } + } + done: if (rc == 0) { if (txn->mt_parent) { From 45c4ed18050297871cb5f4f55a5b6560cb5ca1bc Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 22 Aug 2013 21:29:53 -0700 Subject: [PATCH 03/12] Tweak prev commit If somehow "need" is larger than the list size, don't try to fake out the dirty list. --- libraries/liblmdb/mdb.c | 45 +++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index ddb7df6039..708964b67b 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1479,7 +1479,6 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) * better than 1/2, 1/4, or 1/10.
*/ k = 0; - need *= 100; if (need < MDB_IDL_UM_MAX / 8) need = MDB_IDL_UM_MAX / 8; @@ -1519,32 +1518,38 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) */ { MDB_ID2 old; - k = dl[0].mid - i + 1; - old = dl[i-1]; - dl[i-1].mid = k; - txn->mt_u.dirty_list = &dl[i-1]; + if (i) { + k = dl[0].mid - i + 1; + old = dl[i-1]; + dl[i-1].mid = k; + txn->mt_u.dirty_list = &dl[i-1]; + } rc = mdb_page_flush(txn); - /* reset back to the real list */ - dl[0].mid -= k; - dl[0].mid += dl[i-1].mid; - dl[i-1] = old; - txn->mt_u.dirty_list = dl; + if (i) { + /* reset back to the real list */ + dl[0].mid -= k; + dl[0].mid += dl[i-1].mid; + dl[i-1] = old; + txn->mt_u.dirty_list = dl; + } } mdb_cursorpages_mark(m0, P_DIRTY|P_KEEP); - /* Reset any dirty root pages we kept that page_flush didn't see */ - for (i=0; i<txn->mt_numdbs; i++) { - if (txn->mt_dbflags[i] & DB_DIRTY) { - pgno_t pgno = txn->mt_dbs[i].md_root; - if (pgno == P_INVALID) - continue; - if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) - goto done; - if (dp->mp_flags & P_KEEP) - dp->mp_flags ^= P_KEEP; + if (i) { + /* Reset any dirty root pages we kept that page_flush didn't see */ + for (i=0; i<txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + pgno_t pgno = txn->mt_dbs[i].md_root; + if (pgno == P_INVALID) + continue; + if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) + goto done; + if (dp->mp_flags & P_KEEP) + dp->mp_flags ^= P_KEEP; + } } } From 392be3a7c65837728426007c09f17c28a9397910 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Fri, 23 Aug 2013 07:37:17 +0200 Subject: [PATCH 04/12] Simplify last commits --- libraries/liblmdb/mdb.c | 123 ++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 73 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 708964b67b..640419c6e7 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1354,19 +1354,24 @@ mdb_dlist_free(MDB_txn *txn) dl[0].mid = 0; } -/* Set or
clear P_KEEP in non-overflow, non-sub pages in this txn's cursors. +/* Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. * @param[in] mc A cursor handle for the current operation. * @param[in] pflags Flags of the pages to update: * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. + * @param[in] all No shortcuts. Needed except after a full #mdb_page_flush(). + * @return 0 on success, non-zero on failure. */ -static void -mdb_cursorpages_mark(MDB_cursor *mc, unsigned pflags) +static int +mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { MDB_txn *txn = mc->mc_txn; MDB_cursor *m3; MDB_xcursor *mx; + MDB_page *dp; unsigned i, j; + int rc = MDB_SUCCESS, level; + /* Mark pages seen by cursors */ if (mc->mc_flags & C_UNTRACK) mc = NULL; /* will find mc in mt_cursors */ for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { @@ -1384,9 +1389,26 @@ mdb_cursorpages_mark(MDB_cursor *mc, unsigned pflags) if (i == 0) break; } + + if (all) { + /* Mark dirty root pages */ + for (i=0; i<txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + pgno_t pgno = txn->mt_dbs[i].md_root; + if (pgno == P_INVALID) + continue; + if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) + break; + if ((dp->mp_flags & (P_DIRTY|P_KEEP)) == pflags && level <= 1) + dp->mp_flags ^= P_KEEP; + } + } + } + + return rc; } -static int mdb_page_flush(MDB_txn *txn); +static int mdb_page_flush(MDB_txn *txn, int keep); /** Spill pages from the dirty list back to disk.
* This is intended to prevent running into #MDB_TXN_FULL situations, @@ -1429,8 +1451,8 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) MDB_txn *txn = m0->mc_txn; MDB_page *dp; MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned int i, j, k, need; - int rc, level; + unsigned int i, j, need; + int rc; if (m0->mc_flags & C_SUB) return MDB_SUCCESS; @@ -1455,21 +1477,9 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) return ENOMEM; } - /* Mark all the dirty root pages we want to preserve */ - for (i=0; i<txn->mt_numdbs; i++) { - if (txn->mt_dbflags[i] & DB_DIRTY) { - pgno_t pgno = txn->mt_dbs[i].md_root; - if (pgno == P_INVALID) - continue; - if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) - goto done; - if ((dp->mp_flags & P_DIRTY) && level <= 1) - dp->mp_flags |= P_KEEP; - } - } - - /* Preserve pages used by cursors */ - mdb_cursorpages_mark(m0, P_DIRTY); + /* Preserve pages which may soon be dirtied again */ + if ((rc = mdb_pages_xkeep(m0, P_DIRTY, 1)) != MDB_SUCCESS) + goto done; /* Less aggressive spill - we originally spilled the entire dirty list, * with a few exceptions for cursor pages and DB root pages. But this @@ -1478,13 +1488,12 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) * of the dirty pages. Testing revealed this to be a good tradeoff, * better than 1/2, 1/4, or 1/10. */ - k = 0; if (need < MDB_IDL_UM_MAX / 8) need = MDB_IDL_UM_MAX / 8; /* Save the page IDs of all the pages we're flushing */ /* flush from the tail forward, this saves a lot of shifting later on.
*/ - for (i=dl[0].mid; i>0; i--) { + for (i=dl[0].mid; i && need; i--) { dp = dl[i].mptr; if (dp->mp_flags & P_KEEP) continue; @@ -1507,51 +1516,16 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) } if ((rc = mdb_midl_append(&txn->mt_spill_pgs, dl[i].mid))) goto done; - k++; - if (k > need) - break; + need--; } mdb_midl_sort(txn->mt_spill_pgs); - /* Since we're only doing the tail 1/8th of the dirty list, - * fake a dirty list to reflect this. - */ - { - MDB_ID2 old; - if (i) { - k = dl[0].mid - i + 1; - old = dl[i-1]; - dl[i-1].mid = k; - txn->mt_u.dirty_list = &dl[i-1]; - } + /* Flush the spilled part of dirty list */ + if ((rc = mdb_page_flush(txn, i)) != MDB_SUCCESS) + goto done; - rc = mdb_page_flush(txn); - - if (i) { - /* reset back to the real list */ - dl[0].mid -= k; - dl[0].mid += dl[i-1].mid; - dl[i-1] = old; - txn->mt_u.dirty_list = dl; - } - } - - mdb_cursorpages_mark(m0, P_DIRTY|P_KEEP); - - if (i) { - /* Reset any dirty root pages we kept that page_flush didn't see */ - for (i=0; i<txn->mt_numdbs; i++) { - if (txn->mt_dbflags[i] & DB_DIRTY) { - pgno_t pgno = txn->mt_dbs[i].md_root; - if (pgno == P_INVALID) - continue; - if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) - goto done; - if (dp->mp_flags & P_KEEP) - dp->mp_flags ^= P_KEEP; - } - } - } + /* Reset any dirty pages we kept that page_flush didn't see */ + rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); done: if (rc == 0) { @@ -2626,10 +2600,13 @@ mdb_freelist_save(MDB_txn *txn) return rc; } -/** Flush dirty pages to the map, after clearing their dirty flag. +/** Flush (some) dirty pages to the map, after clearing their dirty flag. + * @param[in] txn the transaction that's being committed + * @param[in] keep number of initial pages in dirty_list to keep dirty. + * @return 0 on success, non-zero on failure.
*/ static int -mdb_page_flush(MDB_txn *txn) +mdb_page_flush(MDB_txn *txn, int keep) { MDB_env *env = txn->mt_env; MDB_ID2L dl = txn->mt_u.dirty_list; @@ -2647,10 +2624,11 @@ mdb_page_flush(MDB_txn *txn) int n = 0; #endif - j = 0; + j = i = keep; + if (env->me_flags & MDB_WRITEMAP) { /* Clear dirty flags */ - for (i=1; i<=pagecount; i++) { + while (++i <= pagecount) { dp = dl[i].mptr; /* Don't flush this page yet */ if (dp->mp_flags & P_KEEP) { @@ -2665,8 +2643,8 @@ mdb_page_flush(MDB_txn *txn) } /* Write the pages */ - for (i = 1;; i++) { - if (i <= pagecount) { + for (;;) { + if (++i <= pagecount) { dp = dl[i].mptr; /* Don't flush this page yet */ if (dp->mp_flags & P_KEEP) { @@ -2745,8 +2723,7 @@ mdb_page_flush(MDB_txn *txn) #endif /* _WIN32 */ } - j = 0; - for (i=1; i<=pagecount; i++) { + for (i = keep; ++i <= pagecount; ) { dp = dl[i].mptr; /* This is a page we skipped above */ if (!dl[i].mid) { @@ -2949,7 +2926,7 @@ mdb_txn_commit(MDB_txn *txn) mdb_audit(txn); #endif - if ((rc = mdb_page_flush(txn)) || + if ((rc = mdb_page_flush(txn, 0)) || (rc = mdb_env_sync(env, 0)) || (rc = mdb_env_write_meta(txn))) goto fail; From 6616034eb9a4ea5e84d620ec398a8aa0a9e336fb Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Fri, 23 Aug 2013 07:39:04 +0200 Subject: [PATCH 05/12] mdb_page_spill(): Fix nested txn mt_dirty_room --- libraries/liblmdb/mdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 640419c6e7..c6a7359a04 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1530,13 +1530,13 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) done: if (rc == 0) { if (txn->mt_parent) { - MDB_txn *tx2; - pgno_t pgno = dl[i].mid; txn->mt_dirty_room = txn->mt_parent->mt_dirty_room - dl[0].mid; /* dirty pages that are dirty in an ancestor don't * count against this txn's dirty_room. 
*/ for (i=1; i<=dl[0].mid; i++) { + pgno_t pgno = dl[i].mid; + MDB_txn *tx2; for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { j = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno); if (j <= tx2->mt_u.dirty_list[0].mid && From e15a90c7624797afaec703475b6f6be93ca62918 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 22 Aug 2013 21:43:29 -0700 Subject: [PATCH 06/12] ITS#7670 Tweak cursor_del0 Always leave cursor pointing at "next" node, if any. Find next sibling if we're already at end of current page. --- libraries/liblmdb/mdb.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index c6a7359a04..43f91c9000 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -7192,6 +7192,7 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) int rc; MDB_page *mp; indx_t ki; + unsigned int nkeys; mp = mc->mc_pg[mc->mc_top]; ki = mc->mc_ki[mc->mc_top]; @@ -7211,18 +7212,18 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) rc = mdb_rebalance(mc); if (rc != MDB_SUCCESS) mc->mc_txn->mt_flags |= MDB_TXN_ERROR; - /* if mc points past last node in page, invalidate */ - else if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) - mc->mc_flags &= ~(C_INITIALIZED|C_EOF); - - { - /* Adjust other cursors pointing to mp */ + else { MDB_cursor *m2; - unsigned int nkeys; MDB_dbi dbi = mc->mc_dbi; mp = mc->mc_pg[mc->mc_top]; nkeys = NUMKEYS(mp); + + /* if mc points past last node in page, find next sibling */ + if (mc->mc_ki[mc->mc_top] >= nkeys) + mdb_cursor_sibling(mc, 1); + + /* Adjust other cursors pointing to mp */ for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (m2 == mc) continue; @@ -7232,7 +7233,7 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) if (m2->mc_ki[mc->mc_top] > ki) m2->mc_ki[mc->mc_top]--; if (m2->mc_ki[mc->mc_top] >= nkeys) - m2->mc_flags &= ~(C_INITIALIZED|C_EOF); + mdb_cursor_sibling(m2, 1); } } } From 6423c8774904789fe4d3f0b7bc887201ef0ec41a Mon 
Sep 17 00:00:00 2001 From: Howard Chu Date: Fri, 23 Aug 2013 07:36:07 -0700 Subject: [PATCH 07/12] ITS#7671 always return the key for a dup val --- libraries/liblmdb/mdb.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 43f91c9000..15d2e6bdb7 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4948,8 +4948,11 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { if (op == MDB_NEXT || op == MDB_NEXT_DUP) { rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); - if (op != MDB_NEXT || rc != MDB_NOTFOUND) + if (op != MDB_NEXT || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) + MDB_GET_KEY(leaf, key); return rc; + } } } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); @@ -5015,11 +5018,14 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) if (mc->mc_db->md_flags & MDB_DUPSORT) { leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (op == MDB_PREV || op == MDB_PREV_DUP) { - if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { + if (op == MDB_PREV || op == MDB_PREV_DUP) { rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); - if (op != MDB_PREV || rc != MDB_NOTFOUND) + if (op != MDB_PREV || rc != MDB_NOTFOUND) { + if (rc == MDB_SUCCESS) + MDB_GET_KEY(leaf, key); return rc; + } } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if (op == MDB_PREV_DUP) From 41bf5c906aa745310b328a3b1a3700be339e3a9e Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Sat, 24 Aug 2013 19:48:37 +0100 Subject: [PATCH 08/12] Speedup deletion from spill_pgs --- libraries/liblmdb/mdb.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 15d2e6bdb7..842008578c 100644 --- a/libraries/liblmdb/mdb.c +++ 
b/libraries/liblmdb/mdb.c @@ -1475,6 +1475,17 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); if (!txn->mt_spill_pgs) return ENOMEM; + } else { + /* strip any dups */ + MDB_IDL sl = txn->mt_spill_pgs; + unsigned int num = sl[0]; + j=1; + for (i=1; i<=num; i++) { + if (sl[i] == sl[j]) + continue; + sl[++j] = sl[i]; + } + sl[0] = j; } /* Preserve pages which may soon be dirtied again */ @@ -1813,10 +1824,15 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret) mdb_page_copy(np, mp, env->me_psize); } if (txn == tx0) { - /* If in current txn, this page is no longer spilled */ - for (; x < txn->mt_spill_pgs[0]; x++) + /* If in current txn, this page is no longer spilled. + * If it happens to be the last page, truncate the spill list. + * Otherwise temporarily dup its neighbor over it. Dups will + * be stripped out later by the next mdb_page_spill run. + */ + if (x == txn->mt_spill_pgs[0]) + txn->mt_spill_pgs[0]--; + else txn->mt_spill_pgs[x] = txn->mt_spill_pgs[x+1]; - txn->mt_spill_pgs[0]--; } /* otherwise, if belonging to a parent txn, the * page remains spilled until child commits */ From b917266ca10c09092cb000c34740289c698ca694 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Sat, 24 Aug 2013 11:54:05 -0700 Subject: [PATCH 09/12] Cleanup prev commit --- libraries/liblmdb/mdb.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 842008578c..59a0d51f16 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1481,9 +1481,8 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) unsigned int num = sl[0]; j=1; for (i=1; i<=num; i++) { - if (sl[i] == sl[j]) - continue; - sl[++j] = sl[i]; + if (sl[j] != sl[i]) + sl[++j] = sl[i]; } sl[0] = j; } From 56c2a8d04aeb1aeb7001d54bfa9b195da7eb140a Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Sat, 24 Aug 2013 20:37:45 +0100 Subject: [PATCH 10/12] Another take 
on spill_pgs and deletes --- libraries/liblmdb/mdb.c | 47 +++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 59a0d51f16..c56d09266c 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -847,7 +847,8 @@ struct MDB_txn { */ MDB_IDL mt_free_pgs; /** The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ MDB_IDL mt_spill_pgs; union { @@ -1476,12 +1477,12 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) if (!txn->mt_spill_pgs) return ENOMEM; } else { - /* strip any dups */ + /* purge deleted slots */ MDB_IDL sl = txn->mt_spill_pgs; unsigned int num = sl[0]; - j=1; + j=0; for (i=1; i<=num; i++) { - if (sl[j] != sl[i]) + if (!(sl[i] & 1)) sl[++j] = sl[i]; } sl[0] = j; @@ -1504,6 +1505,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) /* Save the page IDs of all the pages we're flushing */ /* flush from the tail forward, this saves a lot of shifting later on. 
*/ for (i=dl[0].mid; i && need; i--) { + MDB_ID pn = dl[i].mid << 1; dp = dl[i].mptr; if (dp->mp_flags & P_KEEP) continue; @@ -1514,8 +1516,8 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) MDB_txn *tx2; for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { if (tx2->mt_spill_pgs) { - j = mdb_midl_search(tx2->mt_spill_pgs, dl[i].mid); - if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == dl[i].mid) { + j = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) { dp->mp_flags |= P_KEEP; break; } @@ -1524,7 +1526,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) if (tx2) continue; } - if ((rc = mdb_midl_append(&txn->mt_spill_pgs, dl[i].mid))) + if ((rc = mdb_midl_append(&txn->mt_spill_pgs, pn))) goto done; need--; } @@ -1798,13 +1800,13 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret) MDB_env *env = tx0->mt_env; MDB_txn *txn; unsigned x; - pgno_t pgno = mp->mp_pgno; + pgno_t pgno = mp->mp_pgno, pn = pgno << 1; for (txn = tx0; txn; txn=txn->mt_parent) { if (!txn->mt_spill_pgs) continue; - x = mdb_midl_search(txn->mt_spill_pgs, pgno); - if (x <= txn->mt_spill_pgs[0] && txn->mt_spill_pgs[x] == pgno) { + x = mdb_midl_search(txn->mt_spill_pgs, pn); + if (x <= txn->mt_spill_pgs[0] && txn->mt_spill_pgs[x] == pn) { MDB_page *np; int num; if (IS_OVERFLOW(mp)) @@ -1825,13 +1827,12 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret) if (txn == tx0) { /* If in current txn, this page is no longer spilled. * If it happens to be the last page, truncate the spill list. - * Otherwise temporarily dup its neighbor over it. Dups will - * be stripped out later by the next mdb_page_spill run. + * Otherwise mark it as deleted by setting the LSB. 
*/ if (x == txn->mt_spill_pgs[0]) txn->mt_spill_pgs[0]--; else - txn->mt_spill_pgs[x] = txn->mt_spill_pgs[x+1]; + txn->mt_spill_pgs[x] |= 1; } /* otherwise, if belonging to a parent txn, the * page remains spilled until child commits */ @@ -2823,9 +2824,10 @@ mdb_txn_commit(MDB_txn *txn) len = x; /* zero out our dirty pages in parent spill list */ for (i=1; i<=src[0].mid; i++) { - if (src[i].mid < parent->mt_spill_pgs[x]) + MDB_ID pn = src[i].mid << 1; + if (pn < parent->mt_spill_pgs[x]) continue; - if (src[i].mid > parent->mt_spill_pgs[x]) { + if (pn > parent->mt_spill_pgs[x]) { if (x <= 1) break; x--; @@ -4533,8 +4535,9 @@ mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) * leave that unless page_touch happens again). */ if (tx2->mt_spill_pgs) { - x = mdb_midl_search(tx2->mt_spill_pgs, pgno); - if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pgno) { + MDB_ID pn = pgno << 1; + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { p = (MDB_page *)(env->me_map + env->me_psize * pgno); goto done; } @@ -4764,6 +4767,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) unsigned x = 0, ovpages = mp->mp_pages; MDB_env *env = txn->mt_env; MDB_IDL sl = txn->mt_spill_pgs; + MDB_ID pn = pg << 1; int rc; DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); @@ -4778,7 +4782,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) if (env->me_pghead && !txn->mt_parent && ((mp->mp_flags & P_DIRTY) || - (sl && (x = mdb_midl_search(sl, pg)) <= sl[0] && sl[x] == pg))) + (sl && (x = mdb_midl_search(sl, pn)) <= sl[0] && sl[x] == pn))) { unsigned i, j; pgno_t *mop; @@ -4788,9 +4792,10 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) return rc; if (!(mp->mp_flags & P_DIRTY)) { /* This page is no longer spilled */ - for (; x < sl[0]; x++) - sl[x] = sl[x+1]; - sl[0]--; + if (x == sl[0]) + sl[0]--; + else + sl[x] |= 1; goto release; } /* Remove from dirty list */ From 4dd10d70a5f62cb68046efaf4dcb5edd806aa851 Mon Sep 17 
00:00:00 2001 From: Hallvard Furuseth Date: Sun, 25 Aug 2013 23:59:58 +0200 Subject: [PATCH 11/12] mdb_page_unspill(): Fix dirty_room in nested txn --- libraries/liblmdb/mdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index c56d09266c..e34f7951b3 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1847,7 +1847,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret) x = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno); if (x <= tx2->mt_u.dirty_list[0].mid && tx2->mt_u.dirty_list[x].mid == pgno) { - txn->mt_dirty_room++; + tx0->mt_dirty_room++; break; } } From 2a28686e4f56cf539e817661728e6c05de339eba Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 27 Aug 2013 06:28:30 -0700 Subject: [PATCH 12/12] ITS#7672 fix mdb_dbi_flags Use a txn instead of env argument. Only return persistent flags. --- libraries/liblmdb/lmdb.h | 4 ++-- libraries/liblmdb/mdb.c | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index ef284d9c7e..b2c3861ac9 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -920,12 +920,12 @@ int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat); /** @brief Retrieve the DB flags for a database handle. * - * @param[in] env An environment handle returned by #mdb_env_create() + * @param[in] txn A transaction handle returned by #mdb_txn_begin() * @param[in] dbi A database handle returned by #mdb_dbi_open() * @param[out] flags Address where the flags will be returned. * @return A non-zero error value on failure and 0 on success. */ -int mdb_dbi_flags(MDB_env *env, MDB_dbi dbi, unsigned int *flags); +int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags); /** @brief Close a database handle. 
* diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index e34f7951b3..835a8a8603 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -7968,7 +7968,6 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db txn->mt_dbflags[slot] = dbflag; memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); *dbi = slot; - txn->mt_env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags; mdb_default_cmp(txn, slot); if (!unused) { txn->mt_numdbs++; @@ -8004,12 +8003,12 @@ void mdb_dbi_close(MDB_env *env, MDB_dbi dbi) free(ptr); } -int mdb_dbi_flags(MDB_env *env, MDB_dbi dbi, unsigned int *flags) +int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags) { /* We could return the flags for the FREE_DBI too but what's the point? */ - if (dbi < MAIN_DBI || dbi >= env->me_numdbs) + if (txn == NULL || dbi < MAIN_DBI || dbi >= txn->mt_numdbs) return EINVAL; - *flags = env->me_dbflags[dbi]; + *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS; return MDB_SUCCESS; }