From 0144da6eab00cf5f321d5c6794175485ba3efefd Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Sat, 21 Sep 2013 14:37:11 -0700 Subject: [PATCH 01/25] ITS#7701 fix mdb_rebalance Must copy tmp cursor back to real cursor when merging into tmp cursor. --- libraries/liblmdb/mdb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 9c5e7389de..e5313b0c85 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -7248,8 +7248,11 @@ mdb_rebalance(MDB_cursor *mc) else { if (mc->mc_ki[ptop] == 0) rc = mdb_page_merge(&mn, mc); - else + else { + mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; rc = mdb_page_merge(mc, &mn); + mdb_cursor_copy(&mn, mc); + } mc->mc_flags &= ~(C_INITIALIZED|C_EOF); } return rc; From 912e09fd04e0512e6f764176a4a0a2ab009c87d1 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Sun, 15 Sep 2013 13:08:29 -0700 Subject: [PATCH 02/25] ITS#7681 fix 18a07eb7c2dc33372455a6040984cd6b699b41a5 Set rc=0 when taking the SET_RANGE branch --- libraries/liblmdb/mdb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index e5313b0c85..e30d85c476 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -5231,9 +5231,10 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (!mc->mc_top) { /* There are no other pages */ mc->mc_ki[mc->mc_top] = 0; - if (op == MDB_SET_RANGE) + if (op == MDB_SET_RANGE) { + rc = 0; goto set1; - else + } else return MDB_NOTFOUND; } } @@ -5298,6 +5299,7 @@ set1: if (rc) { if (op == MDB_GET_BOTH || rc > 0) return MDB_NOTFOUND; + rc = 0; } } else { From 3335b2583490f10c3f273391b48e1e2987022f4c Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 23 Sep 2013 20:07:29 +0200 Subject: [PATCH 03/25] ITS#7682 mdb_env_copy(): Avoid Linux O_DIRECT bug. Use fcntl() to set the flag. Linux open(,O_DIRECT...) 
can create the file even on failure, if the filesystem lacks O_DIRECT support. --- libraries/liblmdb/mdb.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index e30d85c476..46dea53c87 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4246,14 +4246,6 @@ mdb_env_copy(MDB_env *env, const char *path) newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); #else -#ifdef O_DIRECT - /* The OS supports O_DIRECT, try with it */ - newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL|O_DIRECT, 0666); - /* But open can fail if O_DIRECT isn't supported by the file system - * so retry without the flag - */ - if (newfd == INVALID_HANDLE_VALUE && ErrCode() == EINVAL) -#endif newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); #endif if (newfd == INVALID_HANDLE_VALUE) { @@ -4261,6 +4253,11 @@ mdb_env_copy(MDB_env *env, const char *path) goto leave; } +#ifdef O_DIRECT + /* Set O_DIRECT if the file system supports it */ + if ((rc = fcntl(newfd, F_GETFL)) != -1) + (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); +#endif #ifdef F_NOCACHE /* __APPLE__ */ rc = fcntl(newfd, F_NOCACHE, 1); if (rc) { From 5b21307f29ccda563e5f0d82294083c841b0e927 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 23 Sep 2013 20:09:47 +0200 Subject: [PATCH 04/25] Update MDB documentation and comments. --- libraries/liblmdb/lmdb.h | 18 +++++++++++------- libraries/liblmdb/mdb.c | 41 +++++++++++++++++++++++----------------- libraries/liblmdb/midl.h | 2 +- 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 8a46c1d0d1..f637229283 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -216,13 +216,13 @@ typedef struct MDB_cursor MDB_cursor; /** @brief Generic structure used for passing keys and data in and out * of the database. 
* - * Key sizes must be between 1 and the liblmdb build-time constant - * #MDB_MAXKEYSIZE inclusive. This currently defaults to 511. The - * same applies to data sizes in databases with the #MDB_DUPSORT flag. - * Other data items can in theory be from 0 to 0xffffffff bytes long. - * * Values returned from the database are valid only until a subsequent - * update operation, or the end of the transaction. + * update operation, or the end of the transaction. Do not modify or + * free them, they commonly point into the database itself. + * + * Key sizes must be between 1 and #mdb_env_get_maxkeysize() inclusive. + * The same applies to data sizes in databases with the #MDB_DUPSORT flag. + * Other data items can in theory be from 0 to 0xffffffff bytes long. */ typedef struct MDB_val { size_t mv_size; /**< size of the data item */ @@ -486,6 +486,8 @@ int mdb_env_create(MDB_env **env); * and uses fewer mallocs, but loses protection from application bugs * like wild pointer writes and other bad updates into the database. * Incompatible with nested transactions. + * Processes with and without MDB_WRITEMAP on the same environment do + * not cooperate well. *
  • #MDB_NOMETASYNC * Flush system buffers to disk only once per transaction, omit the * metadata flush. Defer that until the system flushes files to disk, @@ -733,8 +735,10 @@ int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); /** @brief Get the maximum size of a key for the environment. * + * This is the compile-time constant #MDB_MAXKEYSIZE, default 511. + * See @ref MDB_val. * @param[in] env An environment handle returned by #mdb_env_create() - * @return The maximum size of a key. (#MDB_MAXKEYSIZE) + * @return The maximum size of a key */ int mdb_env_get_maxkeysize(MDB_env *env); diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 46dea53c87..0bc9db798f 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -171,7 +171,7 @@ #define Z "I" #else -#define Z "z" +#define Z "z" /**< printf format modifier for size_t */ /** For MDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */ #define MDB_PIDLOCK 1 @@ -600,7 +600,7 @@ typedef struct MDB_page { #define P_LEAF 0x02 /**< leaf page */ #define P_OVERFLOW 0x04 /**< overflow page */ #define P_META 0x08 /**< meta page */ -#define P_DIRTY 0x10 /**< dirty page */ +#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ #define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */ #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ #define P_KEEP 0x8000 /**< leave this page alone during spill */ @@ -786,7 +786,10 @@ typedef struct MDB_db { /** Handle for the default DB. */ #define MAIN_DBI 1 - /** Meta page content. */ + /** Meta page content. + * A meta page is the start point for accessing a database snapshot. + * Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). + */ typedef struct MDB_meta { /** Stamp identifying this as an MDB file. It must be set * to #MDB_MAGIC. */ @@ -905,7 +908,14 @@ struct MDB_txn { struct MDB_xcursor; - /** Cursors are used for all DB operations */ + /** Cursors are used for all DB operations. 
+ * A cursor holds a path of (page pointer, key index) from the DB + * root to a position in the DB, plus other state. #MDB_DUPSORT + * cursors include an xcursor to the current data item. Write txns + * track their cursors and keep them up to date when data moves. + * Exception: An xcursor's pointer to a #P_SUBP page can be stale. + * (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). + */ struct MDB_cursor { /** Next cursor on this DB in this txn */ MDB_cursor *mc_next; @@ -1019,8 +1029,8 @@ struct MDB_env { /** Nested transaction */ typedef struct MDB_ntxn { - MDB_txn mnt_txn; /* the transaction */ - MDB_pgstate mnt_pgstate; /* parent transaction's saved freestate */ + MDB_txn mnt_txn; /**< the transaction */ + MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ } MDB_ntxn; /** max number of pages to commit in one writev() call */ @@ -1329,7 +1339,7 @@ mdb_page_free(MDB_env *env, MDB_page *mp) env->me_dpages = mp; } -/* Free a dirty page */ +/** Free a dirty page */ static void mdb_dpage_free(MDB_env *env, MDB_page *dp) { @@ -1356,7 +1366,7 @@ mdb_dlist_free(MDB_txn *txn) dl[0].mid = 0; } -/* Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. +/** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn. * @param[in] mc A cursor handle for the current operation. * @param[in] pflags Flags of the pages to update: * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. @@ -1415,15 +1425,12 @@ static int mdb_page_flush(MDB_txn *txn, int keep); /** Spill pages from the dirty list back to disk. * This is intended to prevent running into #MDB_TXN_FULL situations, * but note that they may still occur in a few cases: - * 1) pages in #MDB_DUPSORT sub-DBs are never spilled, so if there - * are too many of these dirtied in one txn, the txn may still get - * too full. + * 1) our estimate of the txn size could be too small. Currently this + * seems unlikely, except with a large number of #MDB_MULTIPLE items. 
* 2) child txns may run out of space if their parents dirtied a * lot of pages and never spilled them. TODO: we probably should do * a preemptive spill during #mdb_txn_begin() of a child txn, if * the parent's dirty_room is below a given threshold. - * 3) our estimate of the txn size could be too small. At the - * moment this seems unlikely. * * Otherwise, if not using nested txns, it is expected that apps will * not run into #MDB_TXN_FULL any more. The pages are flushed to disk @@ -2585,7 +2592,7 @@ mdb_freelist_save(MDB_txn *txn) total_room += head_room; } - /* Fill in the reserved, touched me_pghead records */ + /* Fill in the reserved me_pghead records */ rc = MDB_SUCCESS; if (mop_len) { MDB_val key, data; @@ -4305,7 +4312,7 @@ mdb_cmp_long(const MDB_val *a, const MDB_val *b) *(size_t *)a->mv_data > *(size_t *)b->mv_data; } -/** Compare two items pointing at aligned int's */ +/** Compare two items pointing at aligned unsigned int's */ static int mdb_cmp_int(const MDB_val *a, const MDB_val *b) { @@ -4313,7 +4320,7 @@ mdb_cmp_int(const MDB_val *a, const MDB_val *b) *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data; } -/** Compare two items pointing at ints of unknown alignment. +/** Compare two items pointing at unsigned ints of unknown alignment. * Nodes and keys are guaranteed to be 2-byte aligned. */ static int @@ -8270,7 +8277,7 @@ int mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) return 0; } -/* insert pid into list if not already present. +/** Insert pid into list if not already present. * return -1 if already present. */ static int mdb_pid_insert(pid_t *ids, pid_t pid) diff --git a/libraries/liblmdb/midl.h b/libraries/liblmdb/midl.h index b0bdff3f49..635cd29e0e 100644 --- a/libraries/liblmdb/midl.h +++ b/libraries/liblmdb/midl.h @@ -39,7 +39,7 @@ extern "C" { /** @defgroup idls ID List Management * @{ */ - /** A generic ID number. These were entryIDs in back-bdb. + /** A generic unsigned ID number. These were entryIDs in back-bdb. 
* Preferably it should have the same size as a pointer. */ typedef size_t MDB_ID; From 5b96d68fafd1732790ac5f887e5050033a48da42 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 23 Sep 2013 20:13:27 +0200 Subject: [PATCH 05/25] mdb_pages_xkeep(): Reformat, use common flag mask --- libraries/liblmdb/mdb.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 0bc9db798f..b5a3d8242f 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1376,10 +1376,11 @@ mdb_dlist_free(MDB_txn *txn) static int mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { + enum { Mask = P_SUBP|P_DIRTY|P_KEEP }; MDB_txn *txn = mc->mc_txn; MDB_cursor *m3; MDB_xcursor *mx; - MDB_page *dp; + MDB_page *dp, *mp; unsigned i, j; int rc = MDB_SUCCESS, level; @@ -1389,13 +1390,14 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { for (; mc; mc=mc->mc_next) { for (m3 = mc; m3->mc_flags & C_INITIALIZED; m3 = &mx->mx_cursor) { - for (j=0; j<m3->mc_snum; j++) - if ((m3->mc_pg[j]->mp_flags & (P_SUBP|P_DIRTY|P_KEEP)) - == pflags) - m3->mc_pg[j]->mp_flags ^= P_KEEP; - mx = m3->mc_xcursor; - if (mx == NULL) - break; + for (j=0; j<m3->mc_snum; j++) { + mp = m3->mc_pg[j]; + if ((mp->mp_flags & Mask) == pflags) + mp->mp_flags ^= P_KEEP; + } + mx = m3->mc_xcursor; + if (mx == NULL) + break; } } if (i == 0) @@ -1411,7 +1413,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) continue; if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) break; - if ((dp->mp_flags & (P_DIRTY|P_KEEP)) == pflags && level <= 1) + if ((dp->mp_flags & Mask) == pflags && level <= 1) dp->mp_flags ^= P_KEEP; } } From cfe262dce9a0be4791db4720aa308d5a6ecb8888 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 23 Sep 2013 20:13:27 +0200 Subject: [PATCH 06/25] ITS#7515 mdb_pages_xkeep(): Careful about xcursors. 
Don't prod sub-pages or pages referring to uninitialized xcursors. --- libraries/liblmdb/mdb.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index b5a3d8242f..093cb53227 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1381,6 +1381,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) MDB_cursor *m3; MDB_xcursor *mx; MDB_page *dp, *mp; + MDB_node *leaf; unsigned i, j; int rc = MDB_SUCCESS, level; @@ -1389,14 +1390,23 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) mc = NULL; /* will find mc in mt_cursors */ for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { for (; mc; mc=mc->mc_next) { - for (m3 = mc; m3->mc_flags & C_INITIALIZED; m3 = &mx->mx_cursor) { + if (!(mc->mc_flags & C_INITIALIZED)) + continue; + for (m3 = mc;; m3 = &mx->mx_cursor) { + mp = NULL; for (j=0; j<m3->mc_snum; j++) { mp = m3->mc_pg[j]; if ((mp->mp_flags & Mask) == pflags) mp->mp_flags ^= P_KEEP; } mx = m3->mc_xcursor; - if (mx == NULL) + /* Proceed to mx if it is at a sub-database */ + if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED))) + break; + if (! (mp && (mp->mp_flags & P_LEAF))) + break; + leaf = NODEPTR(mp, m3->mc_ki[j-1]); + if (!(leaf->mn_flags & F_SUBDATA)) break; } } From bc48a40621ee26f15503dbd16321c972648b4769 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 23 Sep 2013 20:13:27 +0200 Subject: [PATCH 07/25] ITS#7515 Fix mt_dirty_room in nested txns. Fix description & code: Also ignore dirty pages hidden by spilled pages, as they won't merge into our dirty_list. Update it in mdb_page_flush() instead of mdb_page_spill(). 
--- libraries/liblmdb/mdb.c | 57 ++++++++++------------------------------- 1 file changed, 13 insertions(+), 44 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 093cb53227..b49977df8c 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -892,7 +892,11 @@ struct MDB_txn { #define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ /** @} */ unsigned int mt_flags; /**< @ref mdb_txn */ - /** dirty_list maxsize - # of allocated pages allowed, including in parent txns */ + /** dirty_list room: Array size - #dirty pages visible to this txn. + * Includes ancestor txns' dirty pages not hidden by other txns' + * dirty/spilled pages. Thus commit(nested txn) has room to merge + * dirty_list into mt_parent after freeing hidden mt_parent pages. + */ unsigned int mt_dirty_room; /** Tracks which of the two meta pages was used at the start * of this transaction. @@ -1560,31 +1564,7 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) rc = mdb_pages_xkeep(m0, P_DIRTY|P_KEEP, i); done: - if (rc == 0) { - if (txn->mt_parent) { - txn->mt_dirty_room = txn->mt_parent->mt_dirty_room - dl[0].mid; - /* dirty pages that are dirty in an ancestor don't - * count against this txn's dirty_room. - */ - for (i=1; i<=dl[0].mid; i++) { - pgno_t pgno = dl[i].mid; - MDB_txn *tx2; - for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { - j = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno); - if (j <= tx2->mt_u.dirty_list[0].mid && - tx2->mt_u.dirty_list[j].mid == pgno) { - txn->mt_dirty_room++; - break; - } - } - } - } else { - txn->mt_dirty_room = MDB_IDL_UM_MAX - dl[0].mid; - } - txn->mt_flags |= MDB_TXN_SPILLS; - } else { - txn->mt_flags |= MDB_TXN_ERROR; - } + txn->mt_flags |= rc ? 
MDB_TXN_ERROR : MDB_TXN_SPILLS; return rc; } @@ -1829,6 +1809,8 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret) if (x <= txn->mt_spill_pgs[0] && txn->mt_spill_pgs[x] == pn) { MDB_page *np; int num; + if (txn->mt_dirty_room == 0) + return MDB_TXN_FULL; if (IS_OVERFLOW(mp)) num = mp->mp_pages; else @@ -1857,21 +1839,6 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret) * page remains spilled until child commits */ - if (txn->mt_parent) { - MDB_txn *tx2; - /* If this page is also in a parent's dirty list, then - * it's already accounted in dirty_room, and we need to - * cancel out the decrement that mdb_page_dirty does. - */ - for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { - x = mdb_mid2l_search(tx2->mt_u.dirty_list, pgno); - if (x <= tx2->mt_u.dirty_list[0].mid && - tx2->mt_u.dirty_list[x].mid == pgno) { - tx0->mt_dirty_room++; - break; - } - } - } mdb_page_dirty(tx0, np); np->mp_flags |= P_DIRTY; *ret = np; @@ -2674,8 +2641,7 @@ mdb_page_flush(MDB_txn *txn, int keep) } dp->mp_flags &= ~P_DIRTY; } - dl[0].mid = j; - return MDB_SUCCESS; + goto done; } /* Write the pages */ @@ -2769,8 +2735,11 @@ mdb_page_flush(MDB_txn *txn, int keep) } mdb_dpage_free(env, dp); } - dl[0].mid = j; +done: + i--; + txn->mt_dirty_room += i - j; + dl[0].mid = j; return MDB_SUCCESS; } From a3b3482854698f520135a49de930df4b788694dd Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 23 Sep 2013 20:13:27 +0200 Subject: [PATCH 08/25] ITS#7515 Fix mdb_txn_commit(nested txn with spills). Catch malloc error. Fix hunt for dirty vs spilled pages: Don't leave x at a deleted pageno. Cleanup: Factor out variables, squash pages already marked for deletion. 
--- libraries/liblmdb/mdb.c | 54 +++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index b49977df8c..84429ee3d6 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2779,14 +2779,18 @@ mdb_txn_commit(MDB_txn *txn) if (txn->mt_parent) { MDB_txn *parent = txn->mt_parent; - unsigned x, y, len; MDB_ID2L dst, src; + MDB_IDL pspill; + unsigned x, y, len, ps_len; /* Append our free list to parent's */ rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); if (rc) goto fail; mdb_midl_free(txn->mt_free_pgs); + /* Failures after this must either undo the changes + * to the parent or set MDB_TXN_ERROR in the parent. + */ parent->mt_next_pgno = txn->mt_next_pgno; parent->mt_flags = txn->mt_flags; @@ -2808,37 +2812,26 @@ mdb_txn_commit(MDB_txn *txn) dst = parent->mt_u.dirty_list; src = txn->mt_u.dirty_list; /* Remove anything in our dirty list from parent's spill list */ - if (parent->mt_spill_pgs) { - x = parent->mt_spill_pgs[0]; - len = x; - /* zero out our dirty pages in parent spill list */ - for (i=1; i<=src[0].mid; i++) { + if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) { + x = y = ps_len; + pspill[0] = (pgno_t)-1; + /* Mark our dirty pages as deleted in parent spill list */ + for (i=0, len=src[0].mid; ++i <= len; ) { MDB_ID pn = src[i].mid << 1; - if (pn < parent->mt_spill_pgs[x]) - continue; - if (pn > parent->mt_spill_pgs[x]) { - if (x <= 1) - break; + while (pn > pspill[x]) x--; - continue; + if (pn == pspill[x]) { + pspill[x] = 1; + y = --x; } - parent->mt_spill_pgs[x] = 0; - len--; - } - /* OK, we had a few hits, squash zeros from the spill list */ - if (len < parent->mt_spill_pgs[0]) { - x=1; - for (y=1; y<=parent->mt_spill_pgs[0]; y++) { - if (parent->mt_spill_pgs[y]) { - if (y != x) { - parent->mt_spill_pgs[x] = parent->mt_spill_pgs[y]; - } - x++; - } - } - parent->mt_spill_pgs[0] = len; } + /* Squash deleted 
pagenums if we deleted any */ + for (x=y; ++x <= ps_len; ) + if (!(pspill[x] & 1)) + pspill[++y] = pspill[x]; + pspill[0] = y; } + /* Find len = length of merging our dirty list with parent's */ x = dst[0].mid; dst[0].mid = 0; /* simplify loops */ @@ -2872,7 +2865,10 @@ mdb_txn_commit(MDB_txn *txn) parent->mt_dirty_room = txn->mt_dirty_room; if (txn->mt_spill_pgs) { if (parent->mt_spill_pgs) { - mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); + /* TODO: Prevent failure here, so parent does not fail */ + rc = mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); + if (rc) + parent->mt_flags |= MDB_TXN_ERROR; mdb_midl_free(txn->mt_spill_pgs); mdb_midl_sort(parent->mt_spill_pgs); } else { @@ -2883,7 +2879,7 @@ mdb_txn_commit(MDB_txn *txn) parent->mt_child = NULL; mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); free(txn); - return MDB_SUCCESS; + return rc; } if (txn != env->me_txn) { From 8e1d10e82887a0b8fdea6151c73377f9cabeb517 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 23 Sep 2013 20:13:27 +0200 Subject: [PATCH 09/25] ITS#7515 Fix mdb_page_unspill() in nested txn. Malloc a page in this txn, not in a parent. --- libraries/liblmdb/mdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 84429ee3d6..21d11b98de 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1818,7 +1818,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret) if (env->me_flags & MDB_WRITEMAP) { np = mp; } else { - np = mdb_page_malloc(txn, num); + np = mdb_page_malloc(tx0, num); if (!np) return ENOMEM; if (num > 1) From 52cb8b3417386e2cacee98401a797d8dd1a81020 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 23 Sep 2013 20:13:27 +0200 Subject: [PATCH 10/25] mdb_page_unspill(): Rename local vars. The names have caused bugs, "txn" was treated as the current transaction. 
--- libraries/liblmdb/mdb.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 21d11b98de..f5db761e9c 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1789,24 +1789,24 @@ mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) /** Pull a page off the txn's spill list, if present. * If a page being referenced was spilled to disk in this txn, bring * it back and make it dirty/writable again. - * @param[in] tx0 the transaction handle. + * @param[in] txn the transaction handle. * @param[in] mp the page being referenced. * @param[out] ret the writable page, if any. ret is unchanged if * mp wasn't spilled. */ static int -mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret) +mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) { - MDB_env *env = tx0->mt_env; - MDB_txn *txn; + MDB_env *env = txn->mt_env; + const MDB_txn *tx2; unsigned x; pgno_t pgno = mp->mp_pgno, pn = pgno << 1; - for (txn = tx0; txn; txn=txn->mt_parent) { - if (!txn->mt_spill_pgs) + for (tx2 = txn; tx2; tx2=tx2->mt_parent) { + if (!tx2->mt_spill_pgs) continue; - x = mdb_midl_search(txn->mt_spill_pgs, pn); - if (x <= txn->mt_spill_pgs[0] && txn->mt_spill_pgs[x] == pn) { + x = mdb_midl_search(tx2->mt_spill_pgs, pn); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { MDB_page *np; int num; if (txn->mt_dirty_room == 0) @@ -1818,7 +1818,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret) if (env->me_flags & MDB_WRITEMAP) { np = mp; } else { - np = mdb_page_malloc(tx0, num); + np = mdb_page_malloc(txn, num); if (!np) return ENOMEM; if (num > 1) @@ -1826,7 +1826,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret) else mdb_page_copy(np, mp, env->me_psize); } - if (txn == tx0) { + if (tx2 == txn) { /* If in current txn, this page is no longer spilled. * If it happens to be the last page, truncate the spill list. 
* Otherwise mark it as deleted by setting the LSB. @@ -1839,7 +1839,7 @@ mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret) * page remains spilled until child commits */ - mdb_page_dirty(tx0, np); + mdb_page_dirty(txn, np); np->mp_flags |= P_DIRTY; *ret = np; break; From 7bdb5be0b572a4ff952456a2faca9ecb7a6e8786 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 23 Sep 2013 20:20:05 +0200 Subject: [PATCH 11/25] Rename SWAP() to avoid conflict with sqlightning. Happened since sqlightning #includes mdb.c, midl.c. --- libraries/liblmdb/midl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libraries/liblmdb/midl.c b/libraries/liblmdb/midl.c index 86e4592d2d..0225af9584 100644 --- a/libraries/liblmdb/midl.c +++ b/libraries/liblmdb/midl.c @@ -203,7 +203,7 @@ int mdb_midl_append_range( MDB_IDL *idp, MDB_ID id, unsigned n ) /* Quicksort + Insertion sort for small arrays */ #define SMALL 8 -#define SWAP(a,b) { itmp=(a); (a)=(b); (b)=itmp; } +#define MIDL_SWAP(a,b) { itmp=(a); (a)=(b); (b)=itmp; } void mdb_midl_sort( MDB_IDL ids ) @@ -231,15 +231,15 @@ mdb_midl_sort( MDB_IDL ids ) l = istack[jstack--]; } else { k = (l + ir) >> 1; /* Choose median of left, center, right */ - SWAP(ids[k], ids[l+1]); + MIDL_SWAP(ids[k], ids[l+1]); if (ids[l] < ids[ir]) { - SWAP(ids[l], ids[ir]); + MIDL_SWAP(ids[l], ids[ir]); } if (ids[l+1] < ids[ir]) { - SWAP(ids[l+1], ids[ir]); + MIDL_SWAP(ids[l+1], ids[ir]); } if (ids[l] < ids[l+1]) { - SWAP(ids[l], ids[l+1]); + MIDL_SWAP(ids[l], ids[l+1]); } i = l+1; j = ir; @@ -248,7 +248,7 @@ mdb_midl_sort( MDB_IDL ids ) do i++; while(ids[i] > a); do j--; while(ids[j] < a); if (j < i) break; - SWAP(ids[i],ids[j]); + MIDL_SWAP(ids[i],ids[j]); } ids[l+1] = ids[j]; ids[j] = a; From 31cfca9316da2b2b901d384811d80a328eefbcdb Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 23 Sep 2013 20:20:42 +0200 Subject: [PATCH 12/25] Tweak MDB_DEBUG output --- libraries/liblmdb/mdb.c | 6 ++++-- 1 file changed, 4 
insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index f5db761e9c..e991a17207 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4649,7 +4649,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) } DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno, - key ? DKEY(key) : NULL)); + key ? DKEY(key) : "null")); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; @@ -6245,7 +6245,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, data ? data->mv_size : 0, - key ? key->mv_size : 0, key ? DKEY(key) : NULL)); + key ? key->mv_size : 0, key ? DKEY(key) : "null")); if (IS_LEAF2(mp)) { /* Move higher keys up one slot. */ @@ -8323,6 +8323,8 @@ int mdb_reader_check(MDB_env *env, int *dead) if (!mdb_reader_pid(env, Pidcheck, pid)) { for (j=i; j Date: Mon, 23 Sep 2013 20:21:11 +0200 Subject: [PATCH 13/25] Clean up and simplify mdb_page_search(). Only named DBs can have DB_STALE, and they do not use MDB_PS_MODIFY. Replace magic key values with flags. Drop duplicated comments at mdb_page_search_root() vs. mdb_page_search(), and rephrase. 
--- libraries/liblmdb/mdb.c | 79 +++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 50 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index e991a17207..8d2a43c5db 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -869,8 +869,8 @@ struct MDB_txn { * @{ */ #define DB_DIRTY 0x01 /**< DB was written in this txn */ -#define DB_STALE 0x02 /**< DB record is older than txnID */ -#define DB_NEW 0x04 /**< DB handle opened in this txn */ +#define DB_STALE 0x02 /**< Named-DB record is older than txnID */ +#define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ /** @} */ /** In write txns, array of cursors for each DB */ @@ -1056,6 +1056,8 @@ static int mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); #define MDB_PS_MODIFY 1 #define MDB_PS_ROOTONLY 2 +#define MDB_PS_FIRST 4 +#define MDB_PS_LAST 8 static int mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags); static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); @@ -1269,7 +1271,7 @@ static void mdb_audit(MDB_txn *txn) txn->mt_dbs[i].md_leaf_pages + txn->mt_dbs[i].md_overflow_pages; if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { - mdb_page_search(&mc, NULL, 0); + mdb_page_search(&mc, NULL, MDB_PS_FIRST); do { unsigned j; MDB_page *mp; @@ -2474,7 +2476,7 @@ mdb_freelist_save(MDB_txn *txn) if (env->me_pghead) { /* Make sure first page of freeDB is touched and on freelist */ - rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY); + rc = mdb_page_search(&mc, NULL, MDB_PS_FIRST|MDB_PS_MODIFY); if (rc && rc != MDB_NOTFOUND) return rc; } @@ -2502,9 +2504,7 @@ mdb_freelist_save(MDB_txn *txn) if (freecnt < txn->mt_free_pgs[0]) { if (!freecnt) { /* Make sure last page of freeDB is touched and on freelist */ - key.mv_size = MDB_MAXKEYSIZE+1; - key.mv_data = NULL; - rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY); + rc = mdb_page_search(&mc, NULL, 
MDB_PS_LAST|MDB_PS_MODIFY); if (rc && rc != MDB_NOTFOUND) return rc; } @@ -4579,18 +4579,11 @@ done: return MDB_SUCCESS; } -/** Search for the page a given key should be in. - * Pushes parent pages on the cursor stack. This function continues a - * search on a cursor that has already been initialized. (Usually by - * #mdb_page_search() but also by #mdb_node_move().) - * @param[in,out] mc the cursor for this operation. - * @param[in] key the key to search for. If NULL, search for the lowest - * page. (This is used by #mdb_cursor_first().) - * @param[in] modify If true, visited pages are updated with new page numbers. - * @return 0 on success, non-zero on failure. +/** Finish #mdb_page_search() / #mdb_page_search_lowest(). + * The cursor is at the root page, set up the rest of it. */ static int -mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) +mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) { MDB_page *mp = mc->mc_pg[mc->mc_top]; int rc; @@ -4604,11 +4597,10 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) assert(NUMKEYS(mp) > 1); DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); - if (key == NULL) /* Initialize cursor to first page. 
*/ + if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { i = 0; - else if (key->mv_size > MDB_MAXKEYSIZE && key->mv_data == NULL) { - /* cursor to last page */ - i = NUMKEYS(mp)-1; + if (flags & MDB_PS_LAST) + i = NUMKEYS(mp) - 1; } else { int exact; node = mdb_node_search(mc, key, &exact); @@ -4621,10 +4613,9 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) i--; } } + DPRINTF(("following index %u for key [%s]", i, DKEY(key))); } - if (key) - DPRINTF(("following index %u for key [%s]", i, DKEY(key))); assert(i < NUMKEYS(mp)); node = NODEPTR(mp, i); @@ -4635,7 +4626,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) if ((rc = mdb_cursor_push(mc, mp))) return rc; - if (modify) { + if (flags & MDB_PS_MODIFY) { if ((rc = mdb_page_touch(mc)) != 0) return rc; mp = mc->mc_pg[mc->mc_top]; @@ -4675,18 +4666,17 @@ mdb_page_search_lowest(MDB_cursor *mc) mc->mc_ki[mc->mc_top] = 0; if ((rc = mdb_cursor_push(mc, mp))) return rc; - return mdb_page_search_root(mc, NULL, 0); + return mdb_page_search_root(mc, NULL, MDB_PS_FIRST); } /** Search for the page a given key should be in. - * Pushes parent pages on the cursor stack. This function just sets up - * the search; it finds the root page for \b mc's database and sets this - * as the root of the cursor's stack. Then #mdb_page_search_root() is - * called to complete the search. + * Push it and its parent pages on the cursor stack. * @param[in,out] mc the cursor for this operation. - * @param[in] key the key to search for. If NULL, search for the lowest - * page. (This is used by #mdb_cursor_first().) - * @param[in] flags If MDB_PS_MODIFY set, visited pages are updated with new page numbers. + * @param[in] key the key to search for, or NULL for first/last page. + * @param[in] flags If MDB_PS_MODIFY is set, visited pages in the DB + * are touched (updated with new page numbers). + * If MDB_PS_FIRST or MDB_PS_LAST is set, find first or last leaf. + * This is used by #mdb_cursor_first() and #mdb_cursor_last(). 
* If MDB_PS_ROOTONLY set, just fetch root node, no further lookups. * @return 0 on success, non-zero on failure. */ @@ -4697,23 +4687,20 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) pgno_t root; /* Make sure the txn is still viable, then find the root from - * the txn's db table. + * the txn's db table and set it as the root of the cursor's stack. */ if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) { DPUTS("transaction has failed, must abort"); return MDB_BAD_TXN; } else { /* Make sure we're using an up-to-date root */ - if (mc->mc_dbi > MAIN_DBI) { - if ((*mc->mc_dbflag & DB_STALE) || - ((flags & MDB_PS_MODIFY) && !(*mc->mc_dbflag & DB_DIRTY))) { + if (*mc->mc_dbflag & DB_STALE) { MDB_cursor mc2; - unsigned char dbflag = 0; mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL); - rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, flags & MDB_PS_MODIFY); + rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 0); if (rc) return rc; - if (*mc->mc_dbflag & DB_STALE) { + { MDB_val data; int exact = 0; uint16_t flags; @@ -4733,11 +4720,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) return MDB_INCOMPATIBLE; memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); } - if (flags & MDB_PS_MODIFY) - dbflag = DB_DIRTY; *mc->mc_dbflag &= ~DB_STALE; - *mc->mc_dbflag |= dbflag; - } } root = mc->mc_db->md_root; @@ -5310,7 +5293,7 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, 0); + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); if (rc != MDB_SUCCESS) return rc; } @@ -5356,11 +5339,7 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (!(mc->mc_flags & C_EOF)) { if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - MDB_val lkey; - - lkey.mv_size = MDB_MAXKEYSIZE+1; - lkey.mv_data = NULL; - rc = mdb_page_search(mc, &lkey, 0); + rc = mdb_page_search(mc, NULL, MDB_PS_LAST); if (rc != 
MDB_SUCCESS) return rc; } @@ -8056,7 +8035,7 @@ mdb_drop0(MDB_cursor *mc, int subs) { int rc; - rc = mdb_page_search(mc, NULL, 0); + rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); if (rc == MDB_SUCCESS) { MDB_txn *txn = mc->mc_txn; MDB_node *ni; From e3f6c152c5ab1a9810a1d3ed7de9a3252ff9e35a Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 23 Sep 2013 20:21:11 +0200 Subject: [PATCH 14/25] Drop unneeded code. MDB_txn.mt_toggle: Use (mt_txnid & 1) instead. Drop error checks which will be repeated. mdb_cursor_set(): Turn assert into if/return to match the above. mdb_cursor_del(): 'flags' are now used as bitflags. --- libraries/liblmdb/mdb.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 8d2a43c5db..c4db089b32 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -898,10 +898,6 @@ struct MDB_txn { * dirty_list into mt_parent after freeing hidden mt_parent pages. */ unsigned int mt_dirty_room; - /** Tracks which of the two meta pages was used at the start - * of this transaction. - */ - unsigned int mt_toggle; }; /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. @@ -2118,6 +2114,7 @@ static int mdb_txn_renew0(MDB_txn *txn) { MDB_env *env = txn->mt_env; + MDB_meta *meta; unsigned int i; uint16_t x; int rc, new_notls = 0; @@ -2128,8 +2125,8 @@ mdb_txn_renew0(MDB_txn *txn) if (txn->mt_flags & MDB_TXN_RDONLY) { if (!env->me_txns) { - i = mdb_env_pick_meta(env); - txn->mt_txnid = env->me_metas[i]->mm_txnid; + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + txn->mt_txnid = meta->mm_txnid; txn->mt_u.reader = NULL; } else { MDB_reader *r = (env->me_flags & MDB_NOTLS) ? 
txn->mt_u.reader : @@ -2174,13 +2171,13 @@ mdb_txn_renew0(MDB_txn *txn) } txn->mt_txnid = r->mr_txnid = env->me_txns->mti_txnid; txn->mt_u.reader = r; + meta = env->me_metas[txn->mt_txnid & 1]; } - txn->mt_toggle = txn->mt_txnid & 1; } else { LOCK_MUTEX_W(env); txn->mt_txnid = env->me_txns->mti_txnid; - txn->mt_toggle = txn->mt_txnid & 1; + meta = env->me_metas[txn->mt_txnid & 1]; txn->mt_txnid++; #if MDB_DEBUG if (txn->mt_txnid == mdb_debug_start) @@ -2196,10 +2193,10 @@ mdb_txn_renew0(MDB_txn *txn) } /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db)); + memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db)); /* Moved to here to avoid a data race in read TXNs */ - txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1; + txn->mt_next_pgno = meta->mm_last_pg+1; for (i=2; i<txn->mt_numdbs; i++) { x = env->me_dbflags[i]; @@ -2295,7 +2292,6 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) return ENOMEM; } txn->mt_txnid = parent->mt_txnid; - txn->mt_toggle = parent->mt_toggle; txn->mt_dirty_room = parent->mt_dirty_room; txn->mt_u.dirty_list[0].mid = 0; txn->mt_spill_pgs = NULL; @@ -3093,7 +3089,7 @@ mdb_env_write_meta(MDB_txn *txn) assert(txn != NULL); assert(txn->mt_env != NULL); - toggle = !txn->mt_toggle; + toggle = txn->mt_txnid & 1; DPRINTF(("writing meta page %d for root page %"Z"u", toggle, txn->mt_dbs[MAIN_DBI].md_root)); @@ -4878,7 +4874,7 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, if (txn->mt_flags & MDB_TXN_ERROR) return MDB_BAD_TXN; - if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { + if (key->mv_size > MDB_MAXKEYSIZE) { return MDB_BAD_VALSIZE; } @@ -5107,7 +5103,8 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, assert(mc); assert(key); - assert(key->mv_size > 0); + if (key->mv_size == 0) + return MDB_BAD_VALSIZE; if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); @@ -5431,7 +5428,7 @@ mdb_cursor_get(MDB_cursor
*mc, MDB_val *key, MDB_val *data, case MDB_SET_RANGE: if (key == NULL) { rc = EINVAL; - } else if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { + } else if (key->mv_size > MDB_MAXKEYSIZE) { rc = MDB_BAD_VALSIZE; } else if (op == MDB_SET_RANGE) rc = mdb_cursor_set(mc, key, data, op, NULL); @@ -6051,7 +6048,6 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) return rc; - flags &= ~MDB_NOSPILL; /* TODO: Or change (flags != MDB_NODUPDATA) to ~(flags & MDB_NODUPDATA), not looking at the logic of that code just now */ rc = mdb_cursor_touch(mc); if (rc) @@ -7301,7 +7297,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi, if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { + if (key->mv_size > MDB_MAXKEYSIZE) { return MDB_BAD_VALSIZE; } @@ -7765,13 +7761,6 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID)) return EINVAL; - if (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR)) - return (txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN; - - if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) { - return MDB_BAD_VALSIZE; - } - if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags) return EINVAL; From f73994054779be3e177bfd27f44fcd3147e7d08d Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 1 Oct 2013 13:16:38 -0700 Subject: [PATCH 15/25] Add MDB_NORDLOCK to omit all reader table usage Calling app wants to manage its own locking. 
--- libraries/liblmdb/lmdb.h | 2 ++ libraries/liblmdb/mdb.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index f637229283..848ba635dd 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -269,6 +269,8 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_MAPASYNC 0x100000 /** tie reader locktable slots to #MDB_txn objects instead of to threads */ #define MDB_NOTLS 0x200000 + /** don't use reader locktable at all, caller must manage read/write concurrency */ +#define MDB_NORDLOCK 0x400000 /** @} */ /** @defgroup mdb_dbi_open Database Flags diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index c4db089b32..e82335d716 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -3983,7 +3983,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode goto leave; } - if (F_ISSET(flags, MDB_RDONLY)) { + if ((flags & (MDB_RDONLY|MDB_NORDLOCK)) == MDB_RDONLY) { rc = mdb_env_setup_locks(env, lpath, mode, &excl); if (rc) goto leave; From a1685c3ef70bc70fcb7b8fde4a2a2e9f6d8e11f0 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 1 Oct 2013 23:36:57 -0700 Subject: [PATCH 16/25] More doc for MDB_NORDLOCK --- libraries/liblmdb/lmdb.h | 9 +++++++-- libraries/liblmdb/mdb.c | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 848ba635dd..968862f987 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -265,11 +265,11 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_NOMETASYNC 0x40000 /** use writable mmap */ #define MDB_WRITEMAP 0x80000 - /** use asynchronous msync when MDB_WRITEMAP is used */ + /** use asynchronous msync when #MDB_WRITEMAP is used */ #define MDB_MAPASYNC 0x100000 /** tie reader locktable slots to #MDB_txn objects instead of to threads */ 
#define MDB_NOTLS 0x200000 - /** don't use reader locktable at all, caller must manage read/write concurrency */ + /** for #MDB_RDONLY env, don't use reader locktable, caller must manage read/write concurrency */ #define MDB_NORDLOCK 0x400000 /** @} */ @@ -527,6 +527,11 @@ int mdb_env_create(MDB_env **env); * user threads over individual OS threads need this option. Such an * application must also serialize the write transactions in an OS * thread, since MDB's write locking is unaware of the user threads. + *
  • #MDB_NORDLOCK + * Don't use the reader locktable at all. This flag is only valid + * with #MDB_RDONLY. MDB will use no read locks. If other processes + * may be opening the environment with write access, the callers + * must manage read/write locks themselves. * * @param[in] mode The UNIX permissions to set on created files. This parameter * is ignored on Windows. diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index e82335d716..637cd08809 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -3900,7 +3900,7 @@ fail: * environment and re-opening it with the new flags. */ #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC) -#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS) +#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS|MDB_NORDLOCK) int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) From 79eac42c0f1fdd6c807f0b601c03ab35bf7d7d8e Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 3 Oct 2013 10:26:44 -0700 Subject: [PATCH 17/25] s/MDB_NORDLOCK/MDB_NOLOCK/ Leave all lock management to the caller. 
--- libraries/liblmdb/lmdb.h | 16 ++++++++------- libraries/liblmdb/mdb.c | 43 +++++++++++++++++++++++++--------------- 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 968862f987..8cdac26b06 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -269,8 +269,8 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_MAPASYNC 0x100000 /** tie reader locktable slots to #MDB_txn objects instead of to threads */ #define MDB_NOTLS 0x200000 - /** for #MDB_RDONLY env, don't use reader locktable, caller must manage read/write concurrency */ -#define MDB_NORDLOCK 0x400000 + /** don't do any locking, caller must manage their own locks */ +#define MDB_NOLOCK 0x400000 /** @} */ /** @defgroup mdb_dbi_open Database Flags @@ -527,11 +527,13 @@ int mdb_env_create(MDB_env **env); * user threads over individual OS threads need this option. Such an * application must also serialize the write transactions in an OS * thread, since MDB's write locking is unaware of the user threads. - *
  • #MDB_NORDLOCK - * Don't use the reader locktable at all. This flag is only valid - * with #MDB_RDONLY. MDB will use no read locks. If other processes - * may be opening the environment with write access, the callers - * must manage read/write locks themselves. + *
  • #MDB_NOLOCK + * Don't do any locking. If concurrent access is anticipated, the + * caller must manage all concurrency itself. For proper operation + * the caller must enforce single-writer semantics, and must ensure + * that no readers are using old transactions while a writer is + * active. The simplest approach is to use an exclusive lock so that + * no readers may be active at all when a writer begins. * * @param[in] mode The UNIX permissions to set on created files. This parameter * is ignored on Windows. diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 637cd08809..30b562c43e 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -425,7 +425,8 @@ typedef uint16_t indx_t; * * If #MDB_NOTLS is set, the slot address is not saved in thread-specific data. * - * No reader table is used if the database is on a read-only filesystem. + * No reader table is used if the database is on a read-only filesystem, or + * if #MDB_NOLOCK is set. * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. 
This table is used to keep track of which @@ -1572,12 +1573,14 @@ mdb_find_oldest(MDB_txn *txn) { int i; txnid_t mr, oldest = txn->mt_txnid - 1; - MDB_reader *r = txn->mt_env->me_txns->mti_readers; - for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { - if (r[i].mr_pid) { - mr = r[i].mr_txnid; - if (oldest > mr) - oldest = mr; + if (txn->mt_env->me_txns) { + MDB_reader *r = txn->mt_env->me_txns->mti_readers; + for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { + if (r[i].mr_pid) { + mr = r[i].mr_txnid; + if (oldest > mr) + oldest = mr; + } } } return oldest; @@ -2174,10 +2177,15 @@ mdb_txn_renew0(MDB_txn *txn) meta = env->me_metas[txn->mt_txnid & 1]; } } else { - LOCK_MUTEX_W(env); + if (env->me_txns) { + LOCK_MUTEX_W(env); - txn->mt_txnid = env->me_txns->mti_txnid; - meta = env->me_metas[txn->mt_txnid & 1]; + txn->mt_txnid = env->me_txns->mti_txnid; + meta = env->me_metas[txn->mt_txnid & 1]; + } else { + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + txn->mt_txnid = meta->mm_txnid; + } txn->mt_txnid++; #if MDB_DEBUG if (txn->mt_txnid == mdb_debug_start) @@ -2417,7 +2425,8 @@ mdb_txn_reset0(MDB_txn *txn, const char *act) env->me_txn = NULL; /* The writer mutex was locked in mdb_txn_begin. */ - UNLOCK_MUTEX_W(env); + if (env->me_txns) + UNLOCK_MUTEX_W(env); } } @@ -2934,7 +2943,8 @@ done: env->me_txn = NULL; mdb_dbis_update(txn, 1); - UNLOCK_MUTEX_W(env); + if (env->me_txns) + UNLOCK_MUTEX_W(env); free(txn); return MDB_SUCCESS; @@ -3180,7 +3190,8 @@ done: * readers will get consistent data regardless of how fresh or * how stale their view of these values is. */ - env->me_txns->mti_txnid = txn->mt_txnid; + if (env->me_txns) + env->me_txns->mti_txnid = txn->mt_txnid; return MDB_SUCCESS; } @@ -3900,7 +3911,7 @@ fail: * environment and re-opening it with the new flags. 
*/ #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC) -#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS|MDB_NORDLOCK) +#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK) int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) @@ -3953,7 +3964,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode } /* For RDONLY, get lockfile after we know datafile exists */ - if (!F_ISSET(flags, MDB_RDONLY)) { + if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { rc = mdb_env_setup_locks(env, lpath, mode, &excl); if (rc) goto leave; @@ -3983,7 +3994,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode goto leave; } - if ((flags & (MDB_RDONLY|MDB_NORDLOCK)) == MDB_RDONLY) { + if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { rc = mdb_env_setup_locks(env, lpath, mode, &excl); if (rc) goto leave; From 07dc79a7c33121bc1be55bbd81ff643094df6851 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 3 Oct 2013 23:59:24 +0200 Subject: [PATCH 18/25] Set subDB DBI=parent DBI, fix MDB_DUPSORT delete. xcursor DBIs were parent DBI+1 for debugging. Instead output -(parent DBI). Fixes a crash in mdb_cursor_del0()'s xcursor tracking, it forgot to subtract 1 for C_SUB cursors. --- libraries/liblmdb/mdb.c | 52 ++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 34 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 30b562c43e..cca5fc624c 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -317,6 +317,9 @@ static txnid_t mdb_debug_start; * The string is printed literally, with no format processing. */ #define DPUTS(arg) DPRINTF(("%s", arg)) + /** Debuging output value of a cursor DBI: Negative in a sub-cursor. */ +#define DDBI(mc) \ + (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) /** @} */ /** A default memory page size. 
@@ -1859,7 +1862,6 @@ mdb_page_touch(MDB_cursor *mc) MDB_page *mp = mc->mc_pg[mc->mc_top], *np; MDB_txn *txn = mc->mc_txn; MDB_cursor *m2, *m3; - MDB_dbi dbi; pgno_t pgno; int rc; @@ -1876,7 +1878,8 @@ mdb_page_touch(MDB_cursor *mc) (rc = mdb_page_alloc(mc, 1, &np))) return rc; pgno = np->mp_pgno; - DPRINTF(("touched db %u page %"Z"u -> %"Z"u", mc->mc_dbi,mp->mp_pgno,pgno)); + DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), + mp->mp_pgno, pgno)); assert(mp->mp_pgno != pgno); mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); /* Update the parent page, if any, to point to the new page */ @@ -1922,17 +1925,16 @@ mdb_page_touch(MDB_cursor *mc) done: /* Adjust cursors pointing to mp */ mc->mc_pg[mc->mc_top] = np; - dbi = mc->mc_dbi; + m2 = txn->mt_cursors[mc->mc_dbi]; if (mc->mc_flags & C_SUB) { - dbi--; - for (m2 = txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + for (; m2; m2=m2->mc_next) { m3 = &m2->mc_xcursor->mx_cursor; if (m3->mc_snum < mc->mc_snum) continue; if (m3->mc_pg[mc->mc_top] == mp) m3->mc_pg[mc->mc_top] = np; } } else { - for (m2 = txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + for (; m2; m2=m2->mc_next) { if (m2->mc_snum < mc->mc_snum) continue; if (m2->mc_pg[mc->mc_top] == mp) { m2->mc_pg[mc->mc_top] = np; @@ -4502,8 +4504,8 @@ mdb_cursor_pop(MDB_cursor *mc) if (mc->mc_snum) mc->mc_top--; - DPRINTF(("popped page %"Z"u off db %u cursor %p", top->mp_pgno, - mc->mc_dbi, (void *) mc)); + DPRINTF(("popped page %"Z"u off db %d cursor %p", top->mp_pgno, + DDBI(mc), (void *) mc)); } } @@ -4511,8 +4513,8 @@ mdb_cursor_pop(MDB_cursor *mc) static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) { - DPRINTF(("pushing page %"Z"u on db %u cursor %p", mp->mp_pgno, - mc->mc_dbi, (void *) mc)); + DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, + DDBI(mc), (void *) mc)); if (mc->mc_snum >= CURSOR_STACK) { assert(mc->mc_snum < CURSOR_STACK); @@ -4745,8 +4747,8 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) mc->mc_snum = 1; mc->mc_top = 0; 
- DPRINTF(("db %u root page %"Z"u has flags 0x%X", - mc->mc_dbi, root, mc->mc_pg[0]->mp_flags)); + DPRINTF(("db %d root page %"Z"u has flags 0x%X", + DDBI(mc), root, mc->mc_pg[0]->mp_flags)); if (flags & MDB_PS_MODIFY) { if ((rc = mdb_page_touch(mc))) @@ -5620,8 +5622,8 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return MDB_BAD_VALSIZE; #endif - DPRINTF(("==> put db %u key [%s], size %"Z"u, data size %"Z"u", - mc->mc_dbi, DKEY(key), key ? key->mv_size:0, data->mv_size)); + DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u", + DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size)); dkey.mv_size = 0; @@ -5950,9 +5952,6 @@ new_sub: unsigned i = mc->mc_top; MDB_page *mp = mc->mc_pg[i]; - if (mc->mc_flags & C_SUB) - dbi--; - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (mc->mc_flags & C_SUB) m3 = &m2->mc_xcursor->mx_cursor; @@ -6462,7 +6461,7 @@ mdb_xcursor_init0(MDB_cursor *mc) mx->mx_cursor.mc_txn = mc->mc_txn; mx->mx_cursor.mc_db = &mx->mx_db; mx->mx_cursor.mc_dbx = &mx->mx_dbx; - mx->mx_cursor.mc_dbi = mc->mc_dbi+1; + mx->mx_cursor.mc_dbi = mc->mc_dbi; mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; @@ -6510,7 +6509,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) mx->mx_db.md_flags |= MDB_INTEGERKEY; } } - DPRINTF(("Sub-db %u for db %u root page %"Z"u", mx->mx_cursor.mc_dbi, mc->mc_dbi, + DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, mx->mx_db.md_root)); mx->mx_dbflag = DB_VALID | (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY) ? 
DB_DIRTY : 0); @@ -6831,9 +6830,6 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) MDB_dbi dbi = csrc->mc_dbi; MDB_page *mp = csrc->mc_pg[csrc->mc_top]; - if (csrc->mc_flags & C_SUB) - dbi--; - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (csrc->mc_flags & C_SUB) m3 = &m2->mc_xcursor->mx_cursor; @@ -7008,9 +7004,6 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) MDB_dbi dbi = csrc->mc_dbi; MDB_page *mp = cdst->mc_pg[cdst->mc_top]; - if (csrc->mc_flags & C_SUB) - dbi--; - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (csrc->mc_flags & C_SUB) m3 = &m2->mc_xcursor->mx_cursor; @@ -7109,9 +7102,6 @@ mdb_rebalance(MDB_cursor *mc) MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; - if (mc->mc_flags & C_SUB) - dbi--; - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (mc->mc_flags & C_SUB) m3 = &m2->mc_xcursor->mx_cursor; @@ -7141,9 +7131,6 @@ mdb_rebalance(MDB_cursor *mc) MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; - if (mc->mc_flags & C_SUB) - dbi--; - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (mc->mc_flags & C_SUB) m3 = &m2->mc_xcursor->mx_cursor; @@ -7712,9 +7699,6 @@ done: MDB_dbi dbi = mc->mc_dbi; int fixup = NUMKEYS(mp); - if (mc->mc_flags & C_SUB) - dbi--; - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (mc->mc_flags & C_SUB) m3 = &m2->mc_xcursor->mx_cursor; From 3d67838a59b30b5732c96612cf8ce5bbf59d7441 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 3 Oct 2013 23:59:24 +0200 Subject: [PATCH 19/25] Set MDB_xcursor DB_DIRTY, clear md_name. Both were unused and md_name was unmaintained -- except mdb_cursor_touch(xcursor) would abuse md_name as a key to touch MAIN_DBI if it could somehow get passed ! DB_DIRTY. 
--- libraries/liblmdb/mdb.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index cca5fc624c..cc7471ac4e 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -872,7 +872,7 @@ struct MDB_txn { * @ingroup internal * @{ */ -#define DB_DIRTY 0x01 /**< DB was written in this txn */ +#define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */ #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ @@ -6466,6 +6466,8 @@ mdb_xcursor_init0(MDB_cursor *mc) mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags = C_SUB; + mx->mx_dbx.md_name.mv_size = 0; + mx->mx_dbx.md_name.mv_data = NULL; mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; mx->mx_dbx.md_dcmp = NULL; mx->mx_dbx.md_rel = mc->mc_dbx->md_rel; @@ -6511,10 +6513,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) } DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, mx->mx_db.md_root)); - mx->mx_dbflag = DB_VALID | (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY) ? 
- DB_DIRTY : 0); - mx->mx_dbx.md_name.mv_data = NODEKEY(node); - mx->mx_dbx.md_name.mv_size = node->mn_ksize; + mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ #if UINT_MAX < SIZE_MAX if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) #ifdef MISALIGNED_OK From 3a1d73dafff3a4da3a1ec3841831edadcb7e76be Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 3 Oct 2013 23:59:24 +0200 Subject: [PATCH 20/25] Optimize code holding reader mutex --- libraries/liblmdb/mdb.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index cc7471ac4e..0ee9f50b36 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2119,8 +2119,9 @@ static int mdb_txn_renew0(MDB_txn *txn) { MDB_env *env = txn->mt_env; + MDB_txninfo *ti = env->me_txns; MDB_meta *meta; - unsigned int i; + unsigned int i, nr; uint16_t x; int rc, new_notls = 0; @@ -2129,7 +2130,7 @@ mdb_txn_renew0(MDB_txn *txn) txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ if (txn->mt_flags & MDB_TXN_RDONLY) { - if (!env->me_txns) { + if (!ti) { meta = env->me_metas[ mdb_env_pick_meta(env) ]; txn->mt_txnid = meta->mm_txnid; txn->mt_u.reader = NULL; @@ -2153,36 +2154,38 @@ mdb_txn_renew0(MDB_txn *txn) } LOCK_MUTEX_R(env); - for (i=0; i<env->me_txns->mti_numreaders; i++) - if (env->me_txns->mti_readers[i].mr_pid == 0) + nr = ti->mti_numreaders; + for (i=0; i<nr; i++) + if (ti->mti_readers[i].mr_pid == 0) break; if (i == env->me_maxreaders) { UNLOCK_MUTEX_R(env); return MDB_READERS_FULL; } - env->me_txns->mti_readers[i].mr_pid = pid; - env->me_txns->mti_readers[i].mr_tid = tid; - if (i >= env->me_txns->mti_numreaders) - env->me_txns->mti_numreaders = i+1; + ti->mti_readers[i].mr_pid = pid; + ti->mti_readers[i].mr_tid = tid; + if (i == nr) + ti->mti_numreaders = ++nr; /* Save numreaders for un-mutexed mdb_env_close() */ - env->me_numreaders = env->me_txns->mti_numreaders; +
env->me_numreaders = nr; UNLOCK_MUTEX_R(env); - r = &env->me_txns->mti_readers[i]; + + r = &ti->mti_readers[i]; new_notls = (env->me_flags & MDB_NOTLS); if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { r->mr_pid = 0; return rc; } } - txn->mt_txnid = r->mr_txnid = env->me_txns->mti_txnid; + txn->mt_txnid = r->mr_txnid = ti->mti_txnid; txn->mt_u.reader = r; meta = env->me_metas[txn->mt_txnid & 1]; } } else { - if (env->me_txns) { + if (ti) { LOCK_MUTEX_W(env); - txn->mt_txnid = env->me_txns->mti_txnid; + txn->mt_txnid = ti->mti_txnid; meta = env->me_metas[txn->mt_txnid & 1]; } else { meta = env->me_metas[ mdb_env_pick_meta(env) ]; From 8f075595a14076fa78d1943f7bbb893031e9a6b2 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 3 Oct 2013 23:59:25 +0200 Subject: [PATCH 21/25] mdb_node_add(): Plug page leak when MDB_PAGE_FULL. Do not fail after mdb_page_new() succeeds. --- libraries/liblmdb/mdb.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 0ee9f50b36..f3e3180e00 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -6221,6 +6221,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, { unsigned int i; size_t node_size = NODESIZE; + ssize_t room; indx_t ofs; MDB_node *node; MDB_page *mp = mc->mc_pg[mc->mc_top]; @@ -6251,9 +6252,9 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, return MDB_SUCCESS; } + room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t); if (key != NULL) node_size += key->mv_size; - if (IS_LEAF(mp)) { assert(data); if (F_ISSET(flags, F_BIGDATA)) { @@ -6265,26 +6266,23 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, /* Put data on overflow page. 
*/ DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page", data->mv_size, node_size+data->mv_size)); - node_size += sizeof(pgno_t); + node_size += sizeof(pgno_t) + (node_size & 1); + if ((ssize_t)node_size > room) + goto full; if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) return rc; DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); flags |= F_BIGDATA; + goto update; } else { node_size += data->mv_size; } } node_size += node_size & 1; + if ((ssize_t)node_size > room) + goto full; - if (node_size + sizeof(indx_t) > SIZELEFT(mp)) { - DPRINTF(("not enough room in page %"Z"u, got %u ptrs", - mp->mp_pgno, NUMKEYS(mp))); - DPRINTF(("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower, - mp->mp_upper - mp->mp_lower)); - DPRINTF(("node size = %"Z"u", node_size)); - return MDB_PAGE_FULL; - } - +update: /* Move higher pointers up one slot. */ for (i = NUMKEYS(mp); i > indx; i--) mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; @@ -6330,6 +6328,13 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, } return MDB_SUCCESS; + +full: + DPRINTF(("not enough room in page %"Z"u, got %u ptrs", + mp->mp_pgno, NUMKEYS(mp))); + DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); + DPRINTF(("node size = %"Z"u", node_size)); + return MDB_PAGE_FULL; } /** Delete the specified node from a page. 
From 99ea7669a35a79878394b29cd3448b58daf031fb Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 3 Oct 2013 23:59:25 +0200 Subject: [PATCH 22/25] mdb_cursor_sibling(): Fix error result --- libraries/liblmdb/mdb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index f3e3180e00..e7564be8a0 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4942,8 +4942,11 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) assert(IS_BRANCH(mc->mc_pg[mc->mc_top])); indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL) != 0)) + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) { + /* mc will be inconsistent if caller does mc_snum++ as above */ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); return rc; + } mdb_cursor_push(mc, mp); if (!move_right) From 0f9b79e12cb9b1b716a0268f6dc70adc41067a09 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 3 Oct 2013 23:59:25 +0200 Subject: [PATCH 23/25] Maintain MDB_cursor.mc_top --- libraries/liblmdb/mdb.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index e7564be8a0..2f671ed0a1 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -5553,14 +5553,14 @@ fetchm: return rc; } -/** Touch all the pages in the cursor stack. +/** Touch all the pages in the cursor stack. Set mc_top. * Makes sure all the pages are writable, before attempting a write operation. * @param[in] mc The cursor to operate on. 
*/ static int mdb_cursor_touch(MDB_cursor *mc) { - int rc; + int rc = MDB_SUCCESS; if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) { MDB_cursor mc2; @@ -5571,13 +5571,14 @@ mdb_cursor_touch(MDB_cursor *mc) return rc; *mc->mc_dbflag |= DB_DIRTY; } - for (mc->mc_top = 0; mc->mc_top < mc->mc_snum; mc->mc_top++) { - rc = mdb_page_touch(mc); - if (rc) - return rc; + mc->mc_top = 0; + if (mc->mc_snum) { + do { + rc = mdb_page_touch(mc); + } while (!rc && ++(mc->mc_top) < mc->mc_snum); + mc->mc_top = mc->mc_snum-1; } - mc->mc_top = mc->mc_snum-1; - return MDB_SUCCESS; + return rc; } /** Do not spill pages to disk if txn is getting full, may fail instead */ @@ -5640,6 +5641,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, } else if (mc->mc_db->md_root == P_INVALID) { /* new database, cursor has nothing to point to */ mc->mc_snum = 0; + mc->mc_top = 0; mc->mc_flags &= ~C_INITIALIZED; rc = MDB_NO_ROOT; } else { @@ -6499,6 +6501,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); mx->mx_cursor.mc_pg[0] = 0; mx->mx_cursor.mc_snum = 0; + mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags = C_SUB; } else { MDB_page *fp = NODEDATA(node); @@ -6511,8 +6514,8 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) mx->mx_db.md_entries = NUMKEYS(fp); COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); mx->mx_cursor.mc_snum = 1; - mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; mx->mx_cursor.mc_top = 0; + mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; mx->mx_cursor.mc_pg[0] = fp; mx->mx_cursor.mc_ki[0] = 0; if (mc->mc_db->md_flags & MDB_DUPFIXED) { From 9f7ae8925fb896d7de9345ea441c5da3fce6670c Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Fri, 4 Oct 2013 00:48:19 +0200 Subject: [PATCH 24/25] Add Caveat: Readers need write access. Whitespace. 
--- libraries/liblmdb/lmdb.h | 6 +++++- libraries/liblmdb/mdb.c | 2 +- libraries/liblmdb/mdb_stat.c | 2 +- libraries/liblmdb/midl.c | 4 ++-- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 8cdac26b06..f6dfa60c38 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -66,6 +66,10 @@ * BSD systems or when otherwise configured with MDB_USE_POSIX_SEM. * Multiple users can cause startup to fail later, as noted above. * + * - There is normally no pure read-only mode, since readers need write + * access to locks and lock file. Exceptions: On read-only filesystems + * or with the #MDB_NOLOCK flag described under #mdb_env_open(). + * * - A thread can only use one transaction at a time, plus any child * transactions. Each transaction belongs to one thread. See below. * The #MDB_NOTLS flag changes this for read-only transactions. @@ -489,7 +493,7 @@ int mdb_env_create(MDB_env **env); * like wild pointer writes and other bad updates into the database. * Incompatible with nested transactions. * Processes with and without MDB_WRITEMAP on the same environment do - * not cooperate well. + * not cooperate well. *
<li>#MDB_NOMETASYNC * Flush system buffers to disk only once per transaction, omit the * metadata flush. Defer that until the system flushes files to disk, diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 2f671ed0a1..d1cd8643fc 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -8258,7 +8258,7 @@ static int mdb_pid_insert(pid_t *ids, pid_t pid) return -1; } } - + if( val > 0 ) { ++cursor; } diff --git a/libraries/liblmdb/mdb_stat.c b/libraries/liblmdb/mdb_stat.c index aeb573a495..40bd4ccf1d 100644 --- a/libraries/liblmdb/mdb_stat.c +++ b/libraries/liblmdb/mdb_stat.c @@ -199,7 +199,7 @@ int main(int argc, char *argv[]) printf("mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); goto txn_abort; } - + rc = mdb_stat(txn, dbi, &mst); if (rc) { printf("mdb_stat failed, error %d %s\n", rc, mdb_strerror(rc)); diff --git a/libraries/liblmdb/midl.c b/libraries/liblmdb/midl.c index 0225af9584..5ee2129046 100644 --- a/libraries/liblmdb/midl.c +++ b/libraries/liblmdb/midl.c @@ -59,7 +59,7 @@ unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ) return cursor; } } - + if( val > 0 ) { ++cursor; } @@ -89,7 +89,7 @@ int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) /* no room */ --ids[0]; return -2; - + } else { /* insert id */ for (i=ids[0]; i>x; i--) From 2520247ba156ae2f881c800ccc47792c622725ae Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Fri, 4 Oct 2013 02:46:38 -0700 Subject: [PATCH 25/25] ITS#7715 always set filesize for WRITEMAP --- libraries/liblmdb/mdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index d1cd8643fc..0aae740614 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -3272,7 +3272,7 @@ mdb_env_map(MDB_env *env, void *addr, int newsize) int prot = PROT_READ; if (flags & MDB_WRITEMAP) { prot |= PROT_WRITE; - if (newsize && ftruncate(env->me_fd, env->me_mapsize) < 0) + if (ftruncate(env->me_fd, env->me_mapsize) < 0) return ErrCode(); }
env->me_map = mmap(addr, env->me_mapsize, prot, MAP_SHARED,