From 589370d880fb7272ca6e8ff8fc6efec4cb51d8e2 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 2 Jul 2013 02:19:17 -0700 Subject: [PATCH 01/18] Tweaks for MDB_MULTIPLE Terminate loop on intermediate failures, return count of written items, document usage. --- libraries/liblmdb/lmdb.h | 12 +++++++++++- libraries/liblmdb/mdb.c | 30 ++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 2076eb35fa..9f00a04202 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -309,7 +309,7 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_APPEND 0x20000 /** Duplicate data is being appended, don't split full pages. */ #define MDB_APPENDDUP 0x40000 -/** Store multiple data items in one call. */ +/** Store multiple data items in one call. Only for #MDB_DUPFIXED. */ #define MDB_MULTIPLE 0x80000 /* @} */ @@ -1210,6 +1210,16 @@ int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * correct order. Loading unsorted keys with this flag will cause * data corruption. *
<li>#MDB_APPENDDUP - as above, but for sorted dup data. + *
<li>#MDB_MULTIPLE - store multiple contiguous data elements in a + * single request. This flag may only be specified if the database + * was opened with #MDB_DUPFIXED. The \b data argument must be an + * array of two MDB_vals. The mv_size of the first MDB_val must be + * the size of a single data element. The mv_data of the first MDB_val + * must point to the beginning of the array of contiguous data elements. + * The mv_size of the second MDB_val must be the count of the number + * of data elements to store. On return this field will be set to + * the count of the number of elements actually written. The mv_data + * of the second MDB_val is unused. * </ul> * @return A non-zero error value on failure and 0 on success. Some possible * errors are: diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 0badbbbf69..149a893208 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4969,7 +4969,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_page *fp; MDB_db dummy; int do_sub = 0, insert = 0; - unsigned int mcount = 0; + unsigned int mcount = 0, dcount; size_t nsize; int rc, rc2; MDB_pagebuf pbuf; @@ -4977,6 +4977,16 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, unsigned int nflags; DKBUF; + /* Check this first so counter will always be zero on any + * early failures. 
+ */ + if (flags & MDB_MULTIPLE) { + dcount = data[1].mv_size; + data[1].mv_size = 0; + if (!F_ISSET(mc->mc_db->md_flags, MDB_DUPFIXED)) + return EINVAL; + } + if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY)) return EACCES; @@ -5340,8 +5350,8 @@ put_sub: } } } - /* we've done our job */ - dkey.mv_size = 0; + /* we've done our job */ + dkey.mv_size = 0; } if (flags & MDB_APPENDDUP) xflags |= MDB_APPEND; @@ -5357,12 +5367,16 @@ put_sub: if (!rc && !(flags & MDB_CURRENT)) mc->mc_db->md_entries++; if (flags & MDB_MULTIPLE) { - mcount++; - if (mcount < data[1].mv_size) { - data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - goto more; + if (!rc) { + mcount++; + if (mcount < dcount) { + data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size; + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + goto more; + } } + /* let caller know how many succeeded, if any */ + data[1].mv_size = mcount; } } done: From 06a3ad08cd07146ecc223c1a3940bd4906a6f7a0 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 2 Jul 2013 02:23:49 -0700 Subject: [PATCH 02/18] Silence uninit warning in prev commit --- libraries/liblmdb/mdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 149a893208..bc727cf2a7 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4969,7 +4969,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_page *fp; MDB_db dummy; int do_sub = 0, insert = 0; - unsigned int mcount = 0, dcount; + unsigned int mcount = 0, dcount = 0; size_t nsize; int rc, rc2; MDB_pagebuf pbuf; From 3d1e70950455725a81fe9e3dc0370d994498d392 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 6 Jul 2013 21:42:45 +0200 Subject: [PATCH 03/18] Silence more uninit warnings --- libraries/liblmdb/mdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/mdb.c 
b/libraries/liblmdb/mdb.c index bc727cf2a7..d4f73282de 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2209,13 +2209,13 @@ mdb_page_flush(MDB_txn *txn) unsigned psize = env->me_psize; int i, pagecount = dl[0].mid, rc; size_t size = 0, pos = 0; - pgno_t pgno; + pgno_t pgno = 0; MDB_page *dp = NULL; #ifdef _WIN32 OVERLAPPED ov; #else struct iovec iov[MDB_COMMIT_PAGES]; - ssize_t wpos, wsize = 0, wres; + ssize_t wpos = 0, wsize = 0, wres; size_t next_pos = 1; /* impossible pos, so pos != next_pos */ int n = 0; #endif From 9be6af0dcb0469965477022541e95e74d48dad9e Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sun, 7 Jul 2013 17:13:27 +0200 Subject: [PATCH 04/18] Simplify MDB_cursor: Drop flags C_ALLOCD,C_SHADOW. --- libraries/liblmdb/mdb.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index d4f73282de..08690fbd83 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -903,8 +903,6 @@ struct MDB_cursor { #define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ #define C_EOF 0x02 /**< No more data */ #define C_SUB 0x04 /**< Cursor is a sub-cursor */ -#define C_SHADOW 0x08 /**< Cursor is a dup from a parent txn */ -#define C_ALLOCD 0x10 /**< Cursor was malloc'd */ #define C_SPLITTING 0x20 /**< Cursor is in page_split */ #define C_UNTRACK 0x40 /**< Un-track cursor when closing */ /** @} */ @@ -1659,7 +1657,7 @@ mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) mc->mc_dbflag = &dst->mt_dbflags[i]; mc->mc_snum = m2->mc_snum; mc->mc_top = m2->mc_top; - mc->mc_flags = m2->mc_flags | (C_SHADOW|C_ALLOCD); + mc->mc_flags = m2->mc_flags; for (j=0; jmc_snum; j++) { mc->mc_pg[j] = m2->mc_pg[j]; mc->mc_ki[j] = m2->mc_ki[j]; @@ -1679,7 +1677,7 @@ mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; mx->mx_cursor.mc_snum = mx2->mx_cursor.mc_snum; mx->mx_cursor.mc_top = mx2->mx_cursor.mc_top; - 
mx->mx_cursor.mc_flags = mx2->mx_cursor.mc_flags | C_SHADOW; + mx->mx_cursor.mc_flags = mx2->mx_cursor.mc_flags; for (j=0; jmx_cursor.mc_snum; j++) { mx->mx_cursor.mc_pg[j] = mx2->mx_cursor.mc_pg[j]; mx->mx_cursor.mc_ki[j] = mx2->mx_cursor.mc_ki[j]; @@ -1698,7 +1696,7 @@ mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) /** Close this write txn's cursors, after optionally merging its shadow * cursors back into parent's. * @param[in] txn the transaction handle. - * @param[in] merge 0 to not merge cursors, C_SHADOW to merge. + * @param[in] merge zero to not merge cursors, non-zero to merge. * @return 0 on success, non-zero on failure. */ static void @@ -1710,7 +1708,7 @@ mdb_cursors_close(MDB_txn *txn, unsigned merge) for (i = txn->mt_numdbs; --i >= 0; ) { for (mc = cursors[i]; mc; mc = next) { next = mc->mc_next; - if (mc->mc_flags & merge) { + if (merge && mc->mc_orig) { MDB_cursor *m2 = mc->mc_orig; m2->mc_snum = mc->mc_snum; m2->mc_top = mc->mc_top; @@ -1719,8 +1717,8 @@ mdb_cursors_close(MDB_txn *txn, unsigned merge) m2->mc_ki[j] = mc->mc_ki[j]; } } - if (mc->mc_flags & C_ALLOCD) - free(mc); + /* Only malloced cursors are permanently tracked. */ + free(mc); } cursors[i] = NULL; } @@ -2359,7 +2357,7 @@ mdb_txn_commit(MDB_txn *txn) parent->mt_flags = txn->mt_flags; /* Merge our cursors into parent's and close them */ - mdb_cursors_close(txn, C_SHADOW); + mdb_cursors_close(txn, 1); /* Update parent's DB table. 
*/ memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); @@ -5914,7 +5912,6 @@ mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) txn->mt_cursors[dbi] = mc; mc->mc_flags |= C_UNTRACK; } - mc->mc_flags |= C_ALLOCD; } else { return ENOMEM; } @@ -5927,19 +5924,13 @@ mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret) int mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) { - unsigned flags; - if (txn == NULL || mc == NULL || mc->mc_dbi >= txn->mt_numdbs) return EINVAL; if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors) return EINVAL; - flags = mc->mc_flags; - mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor); - - mc->mc_flags |= (flags & C_ALLOCD); return MDB_SUCCESS; } @@ -5978,8 +5969,7 @@ mdb_cursor_close(MDB_cursor *mc) if (*prev == mc) *prev = mc->mc_next; } - if (mc->mc_flags & C_ALLOCD) - free(mc); + free(mc); } } From be47ca766713f55e5b3abd18120514fdad7d90f2 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sun, 7 Jul 2013 17:13:27 +0200 Subject: [PATCH 05/18] ITS#7515 Fix tracking of parent txn's cursors. Restore mc_flags and xcursors, they were tracked but not merged. Simplify: Track parent txn's original cursors after backing them up, instead of tracking copies and merging them back at commit. 
--- libraries/liblmdb/mdb.c | 112 ++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 63 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 08690fbd83..7647ea3e34 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -879,8 +879,8 @@ struct MDB_xcursor; struct MDB_cursor { /** Next cursor on this DB in this txn */ MDB_cursor *mc_next; - /** Original cursor if this is a shadow */ - MDB_cursor *mc_orig; + /** Backup of the original cursor if this cursor is a shadow */ + MDB_cursor *mc_backup; /** Context used for databases with #MDB_DUPSORT, otherwise NULL */ struct MDB_xcursor *mc_xcursor; /** The transaction that owns this cursor */ @@ -1633,57 +1633,35 @@ mdb_env_sync(MDB_env *env, int force) return rc; } -/** Make shadow copies of all of parent txn's cursors */ +/** Back up parent txn's cursors, then grab the originals for tracking */ static int mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) { - MDB_cursor *mc, *m2; - unsigned int i, j, size; + MDB_cursor *mc, *bk; + MDB_xcursor *mx; + size_t size; + int i; - for (i=0;imt_numdbs; i++) { - if (src->mt_cursors[i]) { + for (i = src->mt_numdbs; --i >= 0; ) { + if ((mc = src->mt_cursors[i]) != NULL) { size = sizeof(MDB_cursor); - if (src->mt_cursors[i]->mc_xcursor) + if (mc->mc_xcursor) size += sizeof(MDB_xcursor); - for (m2 = src->mt_cursors[i]; m2; m2=m2->mc_next) { - mc = malloc(size); - if (!mc) + for (; mc; mc = bk->mc_next) { + bk = malloc(size); + if (!bk) return ENOMEM; - mc->mc_orig = m2; - mc->mc_txn = dst; - mc->mc_dbi = i; + *bk = *mc; + mc->mc_backup = bk; mc->mc_db = &dst->mt_dbs[i]; - mc->mc_dbx = m2->mc_dbx; - mc->mc_dbflag = &dst->mt_dbflags[i]; - mc->mc_snum = m2->mc_snum; - mc->mc_top = m2->mc_top; - mc->mc_flags = m2->mc_flags; - for (j=0; jmc_snum; j++) { - mc->mc_pg[j] = m2->mc_pg[j]; - mc->mc_ki[j] = m2->mc_ki[j]; - } - if (m2->mc_xcursor) { - MDB_xcursor *mx, *mx2; - mx = (MDB_xcursor *)(mc+1); - mc->mc_xcursor = mx; - mx2 = 
m2->mc_xcursor; - mx->mx_db = mx2->mx_db; - mx->mx_dbx = mx2->mx_dbx; - mx->mx_dbflag = mx2->mx_dbflag; - mx->mx_cursor.mc_txn = dst; - mx->mx_cursor.mc_dbi = mx2->mx_cursor.mc_dbi; - mx->mx_cursor.mc_db = &mx->mx_db; - mx->mx_cursor.mc_dbx = &mx->mx_dbx; - mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; - mx->mx_cursor.mc_snum = mx2->mx_cursor.mc_snum; - mx->mx_cursor.mc_top = mx2->mx_cursor.mc_top; - mx->mx_cursor.mc_flags = mx2->mx_cursor.mc_flags; - for (j=0; jmx_cursor.mc_snum; j++) { - mx->mx_cursor.mc_pg[j] = mx2->mx_cursor.mc_pg[j]; - mx->mx_cursor.mc_ki[j] = mx2->mx_cursor.mc_ki[j]; - } - } else { - mc->mc_xcursor = NULL; + /* Kill pointers into src - and dst to reduce abuse: The + * user may not use mc until dst ends. Otherwise we'd... + */ + mc->mc_txn = NULL; /* ...set this to dst */ + mc->mc_dbflag = NULL; /* ...and &dst->mt_dbflags[i] */ + if ((mx = mc->mc_xcursor) != NULL) { + *(MDB_xcursor *)(bk+1) = *mx; + mx->mx_cursor.mc_txn = NULL; /* ...and dst. */ } mc->mc_next = dst->mt_cursors[i]; dst->mt_cursors[i] = mc; @@ -1693,32 +1671,40 @@ mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) return MDB_SUCCESS; } -/** Close this write txn's cursors, after optionally merging its shadow - * cursors back into parent's. +/** Close this write txn's cursors, give parent txn's cursors back to parent. * @param[in] txn the transaction handle. - * @param[in] merge zero to not merge cursors, non-zero to merge. + * @param[in] merge true to keep changes to parent cursors, false to revert. * @return 0 on success, non-zero on failure. 
*/ static void mdb_cursors_close(MDB_txn *txn, unsigned merge) { - MDB_cursor **cursors = txn->mt_cursors, *mc, *next; - int i, j; + MDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk; + MDB_xcursor *mx; + int i; for (i = txn->mt_numdbs; --i >= 0; ) { for (mc = cursors[i]; mc; mc = next) { - next = mc->mc_next; - if (merge && mc->mc_orig) { - MDB_cursor *m2 = mc->mc_orig; - m2->mc_snum = mc->mc_snum; - m2->mc_top = mc->mc_top; - for (j = mc->mc_snum; --j >= 0; ) { - m2->mc_pg[j] = mc->mc_pg[j]; - m2->mc_ki[j] = mc->mc_ki[j]; - } + next = mc->mc_next; + if ((bk = mc->mc_backup) != NULL) { + if (merge) { + /* Commit changes to parent txn */ + mc->mc_next = bk->mc_next; + mc->mc_backup = bk->mc_backup; + mc->mc_txn = bk->mc_txn; + mc->mc_db = bk->mc_db; + mc->mc_dbflag = bk->mc_dbflag; + if ((mx = mc->mc_xcursor) != NULL) + mx->mx_cursor.mc_txn = bk->mc_txn; + } else { + /* Abort nested txn */ + *mc = *bk; + if ((mx = mc->mc_xcursor) != NULL) + *mx = *(MDB_xcursor *)(bk+1); } - /* Only malloced cursors are permanently tracked. */ - free(mc); + mc = bk; + } + free(mc); } cursors[i] = NULL; } @@ -5867,7 +5853,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) { - mc->mc_orig = NULL; + mc->mc_backup = NULL; mc->mc_dbi = dbi; mc->mc_txn = txn; mc->mc_db = &txn->mt_dbs[dbi]; @@ -5961,7 +5947,7 @@ mdb_cursor_count(MDB_cursor *mc, size_t *countp) void mdb_cursor_close(MDB_cursor *mc) { - if (mc != NULL) { + if (mc && !mc->mc_backup) { /* remove from txn, if tracked */ if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; From a77767013a7cc60bd22ef598ebf5c6e7021bba88 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sun, 7 Jul 2013 17:14:38 +0200 Subject: [PATCH 06/18] ITS#7515 Reject conflicting page versions. If mdb_page_touch() sees a page in txn's dirty_list, that is the page version txn's cursors should have. 
Fail if the user may be seeing and depending on another version. --- libraries/liblmdb/mdb.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 7647ea3e34..96ac9521f2 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1560,9 +1560,10 @@ mdb_page_touch(MDB_cursor *mc) if (dl[0].mid) { unsigned x = mdb_mid2l_search(dl, pgno); if (x <= dl[0].mid && dl[x].mid == pgno) { - np = dl[x].mptr; - if (mp != np) - mc->mc_pg[mc->mc_top] = np; + if (mp != dl[x].mptr) { /* bad cursor? */ + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); + return MDB_CORRUPTED; + } return 0; } } From 64676da8d9c46d5a40eaca713bf0735e241ea4cf Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 9 Jul 2013 14:21:35 -0700 Subject: [PATCH 07/18] Fixup other cursors after delete op --- libraries/liblmdb/mdb.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 96ac9521f2..17ff40cd89 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -6544,9 +6544,14 @@ static int mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) { int rc; + MDB_page *mp; + indx_t ki; + + mp = mc->mc_pg[mc->mc_top]; + ki = mc->mc_ki[mc->mc_top]; /* add overflow pages to free list */ - if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_BIGDATA)) { + if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_BIGDATA)) { MDB_page *omp; pgno_t pg; @@ -6555,7 +6560,7 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) (rc = mdb_ovpage_free(mc, omp))) return rc; } - mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], mc->mc_db->md_pad); + mdb_node_del(mp, ki, mc->mc_db->md_pad); mc->mc_db->md_entries--; rc = mdb_rebalance(mc); if (rc != MDB_SUCCESS) @@ -6564,6 +6569,28 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) else if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) mc->mc_flags &= ~C_INITIALIZED; + { + /* 
Adjust other cursors pointing to mp */ + MDB_cursor *m2; + unsigned int nkeys; + MDB_dbi dbi = mc->mc_dbi; + + mp = mc->mc_pg[mc->mc_top]; + nkeys = NUMKEYS(mp); + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (m2 == mc) + continue; + if (!(m2->mc_flags & C_INITIALIZED)) + continue; + if (m2->mc_pg[mc->mc_top] == mp) { + if (m2->mc_ki[mc->mc_top] > ki) + m2->mc_ki[mc->mc_top]--; + if (m2->mc_ki[mc->mc_top] >= nkeys) + m2->mc_flags &= ~C_INITIALIZED; + } + } + } + return rc; } From f81eb631ffec43b53d051664cf7e8a62852f4693 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 10 Jul 2013 08:49:29 -0700 Subject: [PATCH 08/18] Cursors: Clear C_EOF when clearing C_INITIALIZED --- libraries/liblmdb/mdb.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 17ff40cd89..a82b26380d 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4379,7 +4379,7 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return rc; } } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if (op == MDB_NEXT_DUP) return MDB_NOTFOUND; } @@ -4391,7 +4391,6 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) DPUTS("=====> move to next sibling page"); if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { mc->mc_flags |= C_EOF; - mc->mc_flags &= ~C_INITIALIZED; return rc; } mp = mc->mc_pg[mc->mc_top]; @@ -4449,7 +4448,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) if (op != MDB_PREV || rc != MDB_NOTFOUND) return rc; } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if (op == MDB_PREV_DUP) return MDB_NOTFOUND; } @@ -4461,7 +4460,6 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) if 
(mc->mc_ki[mc->mc_top] == 0) { DPUTS("=====> move to prev sibling page"); if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { - mc->mc_flags &= ~C_INITIALIZED; return rc; } mp = mc->mc_pg[mc->mc_top]; @@ -4667,7 +4665,7 @@ set1: } else { if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) return rc; } @@ -4715,7 +4713,7 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) return rc; } else { if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) return rc; } @@ -4763,7 +4761,7 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) return rc; } else { if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) return rc; } @@ -6534,7 +6532,7 @@ mdb_rebalance(MDB_cursor *mc) rc = mdb_page_merge(&mn, mc); else rc = mdb_page_merge(mc, &mn); - mc->mc_flags &= ~C_INITIALIZED; + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); } return rc; } @@ -6567,7 +6565,7 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) mc->mc_txn->mt_flags |= MDB_TXN_ERROR; /* if mc points past last node in page, invalidate */ else if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) - mc->mc_flags &= ~C_INITIALIZED; + mc->mc_flags &= ~(C_INITIALIZED|C_EOF); { /* Adjust other cursors pointing to mp */ @@ -6586,7 +6584,7 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) if (m2->mc_ki[mc->mc_top] > ki) m2->mc_ki[mc->mc_top]--; if (m2->mc_ki[mc->mc_top] >= nkeys) - m2->mc_flags &= ~C_INITIALIZED; + m2->mc_flags &= ~(C_INITIALIZED|C_EOF); } } } @@ -7421,7 +7419,7 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) rc = mdb_drop0(mc, 
mc->mc_db->md_flags & MDB_DUPSORT); /* Invalidate the dropped DB's cursors */ for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) - m2->mc_flags &= ~C_INITIALIZED; + m2->mc_flags &= ~(C_INITIALIZED|C_EOF); if (rc) goto leave; From b711c07f3432d104612fd3186a22dab6f2369570 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 10 Jul 2013 11:03:51 -0700 Subject: [PATCH 09/18] Fix rebalance/cursor adjust When collapsing root, must also move cursor index down, not just the page pointer. Also in mtest, break from NEXT loops on error, otherwise it just prints the previous key/data again, which looks confusing. --- libraries/liblmdb/mdb.c | 2 ++ libraries/liblmdb/mtest.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index a82b26380d..620e5b51ff 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -6451,6 +6451,7 @@ mdb_rebalance(MDB_cursor *mc) return rc; mc->mc_db->md_depth--; mc->mc_db->md_branch_pages--; + mc->mc_ki[0] = mc->mc_ki[1]; { /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; @@ -6469,6 +6470,7 @@ mdb_rebalance(MDB_cursor *mc) m3->mc_pg[0] = mc->mc_pg[0]; m3->mc_snum = 1; m3->mc_top = 0; + m3->mc_ki[0] = m3->mc_ki[1]; } } } diff --git a/libraries/liblmdb/mtest.c b/libraries/liblmdb/mtest.c index 55cdd43c38..dbc69b8d4c 100644 --- a/libraries/liblmdb/mtest.c +++ b/libraries/liblmdb/mtest.c @@ -129,6 +129,8 @@ int main(int argc,char * argv[]) rc = mdb_cursor_open(txn, dbi, &cur2); for (i=0; i<50; i++) { rc = mdb_cursor_get(cur2, &key, &data, MDB_NEXT); + if (rc) + break; printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int) key.mv_size, (char *) key.mv_data, data.mv_data, (int) data.mv_size, (char *) data.mv_data); @@ -142,6 +144,7 @@ int main(int argc,char * argv[]) data.mv_data, (int) data.mv_size, (char *) data.mv_data); for (i=0; i<32; i++) { rc = mdb_cursor_get(cur2, &key, &data, MDB_NEXT); + if (rc) break; printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int) 
key.mv_size, (char *) key.mv_data, data.mv_data, (int) data.mv_size, (char *) data.mv_data); @@ -158,6 +161,7 @@ int main(int argc,char * argv[]) data.mv_data, (int) data.mv_size, (char *) data.mv_data); for (i=0; i<32; i++) { rc = mdb_cursor_get(cursor, &key, &data, MDB_NEXT); + if (rc) break; printf("key: %p %.*s, data: %p %.*s\n", key.mv_data, (int) key.mv_size, (char *) key.mv_data, data.mv_data, (int) data.mv_size, (char *) data.mv_data); From 3d46d5502aa4065b9a08ddb3711aa4dfd14ddca2 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 10 Jul 2013 22:11:44 +0200 Subject: [PATCH 10/18] Do not follow uninited cursors' page pointers. Nor uninited cursors' subcursors' page pointers. --- libraries/liblmdb/mdb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 620e5b51ff..0ba5e2dbd0 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -5328,6 +5328,7 @@ put_sub: for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (!(m2->mc_flags & C_INITIALIZED)) continue; if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) { mdb_xcursor_init1(m2, leaf); } @@ -7025,7 +7026,7 @@ done: m3 = m2; if (m3 == mc) continue; - if (!(m3->mc_flags & C_INITIALIZED)) + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) continue; if (m3->mc_flags & C_SPLITTING) continue; From ba6dfe0bbb454a41c1c1ad8a29238925b1927980 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Fri, 12 Jul 2013 12:53:35 -0700 Subject: [PATCH 11/18] Fix env_read_header() on Windows Commit d6d2638acc245116b8f091ac425b6700d06c4713 broke read on zero-length files. 
--- libraries/liblmdb/mdb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 0ba5e2dbd0..b186ba7411 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2487,6 +2487,8 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) memset(&ov, 0, sizeof(ov)); ov.Offset = off; rc = ReadFile(env->me_fd,&pbuf,MDB_PAGESIZE,&len,&ov) ? (int)len : -1; + if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF) + rc = 0; #else rc = pread(env->me_fd, &pbuf, MDB_PAGESIZE, off); #endif From 87a7f06feb795ee50325c74ed02844ecfde5270c Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 11 Jul 2013 22:09:46 +0200 Subject: [PATCH 12/18] Factor out parent --- libraries/liblmdb/mdb.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index b186ba7411..bf1be8559b 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2348,16 +2348,16 @@ mdb_txn_commit(MDB_txn *txn) /* Update parent's DB table. 
*/ memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); - txn->mt_parent->mt_numdbs = txn->mt_numdbs; - txn->mt_parent->mt_dbflags[0] = txn->mt_dbflags[0]; - txn->mt_parent->mt_dbflags[1] = txn->mt_dbflags[1]; + parent->mt_numdbs = txn->mt_numdbs; + parent->mt_dbflags[0] = txn->mt_dbflags[0]; + parent->mt_dbflags[1] = txn->mt_dbflags[1]; for (i=2; imt_numdbs; i++) { /* preserve parent's DB_NEW status */ - x = txn->mt_parent->mt_dbflags[i] & DB_NEW; - txn->mt_parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; + x = parent->mt_dbflags[i] & DB_NEW; + parent->mt_dbflags[i] = txn->mt_dbflags[i] | x; } - dst = txn->mt_parent->mt_u.dirty_list; + dst = parent->mt_u.dirty_list; src = txn->mt_u.dirty_list; /* Find len = length of merging our dirty list with parent's */ x = dst[0].mid; @@ -2391,7 +2391,7 @@ mdb_txn_commit(MDB_txn *txn) free(txn->mt_u.dirty_list); parent->mt_dirty_room = txn->mt_dirty_room; - txn->mt_parent->mt_child = NULL; + parent->mt_child = NULL; mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); free(txn); return MDB_SUCCESS; From 08373439a604b6ad852d7148aa0ec8f33774bbe4 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 11 Jul 2013 22:09:46 +0200 Subject: [PATCH 13/18] Move code out to mdb_page_dirty() --- libraries/liblmdb/mdb.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index bf1be8559b..57353fc0de 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1337,6 +1337,24 @@ mdb_find_oldest(MDB_txn *txn) return oldest; } +/** Add a page to the txn's dirty list */ +static void +mdb_page_dirty(MDB_txn *txn, MDB_page *mp) +{ + MDB_ID2 mid; + int (*insert)(MDB_ID2L, MDB_ID2 *); + + if (txn->mt_env->me_flags & MDB_WRITEMAP) { + insert = mdb_mid2l_append; + } else { + insert = mdb_mid2l_insert; + } + mid.mid = mp->mp_pgno; + mid.mptr = mp; + insert(txn->mt_u.dirty_list, &mid); + txn->mt_dirty_room--; +} + /** Allocate 
pages for writing. * If there are free pages available from older transactions, they * will be re-used first. Otherwise a new page will be allocated. @@ -1367,11 +1385,9 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) pgno_t pgno, *mop = env->me_pghead; unsigned i, j, k, mop_len = mop ? mop[0] : 0; MDB_page *np; - MDB_ID2 mid; txnid_t oldest = 0, last; MDB_cursor_op op; MDB_cursor m2; - int (*insert)(MDB_ID2L, MDB_ID2 *); *mp = NULL; @@ -1474,11 +1490,9 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) search_done: if (env->me_flags & MDB_WRITEMAP) { np = (MDB_page *)(env->me_map + env->me_psize * pgno); - insert = mdb_mid2l_append; } else { if (!(np = mdb_page_malloc(txn, num))) return ENOMEM; - insert = mdb_mid2l_insert; } if (i) { mop[0] = mop_len -= num; @@ -1488,10 +1502,8 @@ search_done: } else { txn->mt_next_pgno = pgno + num; } - mid.mid = np->mp_pgno = pgno; - mid.mptr = np; - insert(txn->mt_u.dirty_list, &mid); - txn->mt_dirty_room--; + np->mp_pgno = pgno; + mdb_page_dirty(txn, np); *mp = np; return MDB_SUCCESS; From 6741f9c0ef7973aeebacc3e547db9fc07702a8c2 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 11 Jul 2013 22:09:46 +0200 Subject: [PATCH 14/18] Save freelist using proper mdb_cursor_put(). (Restructuring for upcoming mdb_page_spill work.) mdb_freelist_save() can't just Get() the destination, since mdb_page_spill() may have put the destination in the read-only map. TODO: Can this new put() modify the freelist, which would break it? The final iteration's put() can shorten the node, the rest uses MDB_CURRENT. We could set P_KEEP on dirty freeDB leaves and ovpages, since they are all about to be modified. But the code in this commit must stay anyway, if mdb should support dropping a 256G DB. I.e. too big for dirty_list. 
--- libraries/liblmdb/mdb.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 57353fc0de..50887d6789 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2171,25 +2171,32 @@ mdb_freelist_save(MDB_txn *txn) total_room += head_room; } - /* Fill in the reserved, touched me_pghead records. Avoid write ops - * so they cannot rearrange anything, just read the destinations. - */ + /* Fill in the reserved, touched me_pghead records */ rc = MDB_SUCCESS; if (mop_len) { MDB_val key, data; - mop += mop_len + 1; + mop += mop_len; rc = mdb_cursor_first(&mc, &key, &data); for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { - MDB_IDL dest = data.mv_data; + unsigned flags = MDB_CURRENT; + txnid_t id = *(txnid_t *)key.mv_data; ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; + MDB_ID save; - assert(len >= 0 && *(txnid_t*)key.mv_data <= env->me_pglast); - if (len > mop_len) + assert(len >= 0 && id <= env->me_pglast); + key.mv_data = &id; + if (len > mop_len) { len = mop_len; - *dest++ = len; - memcpy(dest, mop -= len, len * sizeof(MDB_ID)); - if (! (mop_len -= len)) + data.mv_size = (len + 1) * sizeof(MDB_ID); + flags = 0; + } + data.mv_data = mop -= len; + save = mop[0]; + mop[0] = len; + rc = mdb_cursor_put(&mc, &key, &data, flags); + mop[0] = save; + if (rc || !(mop_len -= len)) break; } } From c09db5757d4cac831bc2731476cca4a1d85d3368 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 11 Jul 2013 22:09:46 +0200 Subject: [PATCH 15/18] Simplify: Always set C_UNTRACK for tracked cursors. TODO: Rename C_UNTRACK to C_TRACKED. Omitted now for readability. The current name is because it's lazy: not always set when tracked. 
--- libraries/liblmdb/mdb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 50887d6789..336db31ca3 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -6663,6 +6663,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi, * run out of space, triggering a split. We need this * cursor to be consistent until the end of the rebalance. */ + mc.mc_flags |= C_UNTRACK; mc.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &mc; rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA); From d7bc4baf637dc1dd2afae1f67384fb6ff9b31bb4 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 11 Jul 2013 22:09:47 +0200 Subject: [PATCH 16/18] Delay touching pages until cursor is positioned. This avoids unnecessary rewrites of pages that do not change. (Restructuring for upcoming mdb_page_spill work.) --- libraries/liblmdb/mdb.c | 50 +++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 336db31ca3..a533ed4d89 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4968,6 +4968,7 @@ int mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, unsigned int flags) { + enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */ MDB_node *leaf = NULL; MDB_val xdata, *rdata, dkey; MDB_page *fp; @@ -5015,23 +5016,10 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return EINVAL; rc = MDB_SUCCESS; } else if (mc->mc_db->md_root == P_INVALID) { - MDB_page *np; - /* new database, write a root leaf page */ - DPUTS("allocating new root leaf page"); - if ((rc = mdb_page_new(mc, P_LEAF, 1, &np))) { - return rc; - } + /* new database, cursor has nothing to point to */ mc->mc_snum = 0; - mdb_cursor_push(mc, np); - mc->mc_db->md_root = np->mp_pgno; - mc->mc_db->md_depth++; - *mc->mc_dbflag |= DB_DIRTY; - if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) - == MDB_DUPFIXED) - np->mp_flags |= P_LEAF2; - 
mc->mc_flags |= C_INITIALIZED; - rc = MDB_NOTFOUND; - goto top; + mc->mc_flags &= ~C_INITIALIZED; + rc = MDB_NO_ROOT; } else { int exact = 0; MDB_val d2; @@ -5049,7 +5037,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } } else { - rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); + rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact); } if ((flags & MDB_NOOVERWRITE) && rc == 0) { DPRINTF("duplicate key [%s]", DKEY(key)); @@ -5060,12 +5048,30 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return rc; } - /* Cursor is positioned, now make sure all pages are writable */ - rc2 = mdb_cursor_touch(mc); - if (rc2) - return rc2; + /* Cursor is positioned */ + + if (rc == MDB_NO_ROOT) { + MDB_page *np; + /* new database, write a root leaf page */ + DPUTS("allocating new root leaf page"); + if ((rc2 = mdb_page_new(mc, P_LEAF, 1, &np))) { + return rc2; + } + mdb_cursor_push(mc, np); + mc->mc_db->md_root = np->mp_pgno; + mc->mc_db->md_depth++; + *mc->mc_dbflag |= DB_DIRTY; + if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED)) + == MDB_DUPFIXED) + np->mp_flags |= P_LEAF2; + mc->mc_flags |= C_INITIALIZED; + } else { + /* make sure all cursor pages are writable */ + rc2 = mdb_cursor_touch(mc); + if (rc2) + return rc2; + } -top: /* The key already exists */ if (rc == MDB_SUCCESS) { /* there's only a key anyway, so this is a no-op */ From e9ed4d75f786e1b592f000cb4a0afb9b16f73cf1 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 11 Jul 2013 22:09:47 +0200 Subject: [PATCH 17/18] Spill pages, take 3 --- libraries/liblmdb/mdb.c | 368 +++++++++++++++++++++++++++++++++++++-- libraries/liblmdb/midl.c | 4 +- libraries/liblmdb/midl.h | 12 +- 3 files changed, 359 insertions(+), 25 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index a533ed4d89..cd99d67add 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -585,6 +585,7 @@ typedef struct MDB_page { #define P_DIRTY 0x10 /**< dirty page */ #define P_LEAF2 
0x20 /**< for #MDB_DUPFIXED records */ #define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */ +#define P_KEEP 0x8000 /**< leave this page alone during spill */ /** @} */ uint16_t mp_flags; /**< @ref mdb_page */ #define mp_lower mp_pb.pb.pb_lower @@ -824,6 +825,10 @@ struct MDB_txn { /** The list of pages that became unused during this transaction. */ MDB_IDL mt_free_pgs; + /** The list of dirty pages we temporarily wrote to disk + * because the dirty list was full. + */ + MDB_IDL mt_spill_pgs; union { MDB_ID2L dirty_list; /**< for write txns: modified pages */ MDB_reader *reader; /**< this thread's reader table slot or NULL */ @@ -857,6 +862,7 @@ struct MDB_txn { #define MDB_TXN_RDONLY 0x01 /**< read-only transaction */ #define MDB_TXN_ERROR 0x02 /**< an error has occurred */ #define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ +#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ /** @} */ unsigned int mt_flags; /**< @ref mdb_txn */ /** dirty_list maxsize - # of allocated pages allowed, including in parent txns */ @@ -1306,7 +1312,7 @@ mdb_dpage_free(MDB_env *env, MDB_page *dp) } } -/* Return all dirty pages to dpage list */ +/** Return all dirty pages to dpage list */ static void mdb_dlist_free(MDB_txn *txn) { @@ -1320,6 +1326,148 @@ mdb_dlist_free(MDB_txn *txn) dl[0].mid = 0; } +static int mdb_page_flush(MDB_txn *txn); + +/** Spill pages from the dirty list back to disk. + * This is intended to prevent running into #MDB_TXN_FULL situations, + * but note that they may still occur in a few cases: + * 1) pages in #MDB_DUPSORT sub-DBs are never spilled, so if there + * are too many of these dirtied in one txn, the txn may still get + * too full. + * 2) child txns may run out of space if their parents dirtied a + * lot of pages and never spilled them. TODO: we probably should do + * a preemptive spill during #mdb_txn_begin() of a child txn, if + * the parent's dirty_room is below a given threshold. 
+ * 3) our estimate of the txn size could be too small. At the + * moment this seems unlikely. + * + * Otherwise, if not using nested txns, it is expected that apps will + * not run into #MDB_TXN_FULL any more. The pages are flushed to disk + * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared. + * If the txn never references them again, they can be left alone. + * If the txn only reads them, they can be used without any fuss. + * If the txn writes them again, they can be dirtied immediately without + * going thru all of the work of #mdb_page_touch(). Such references are + * handled by #mdb_page_unspill(). + * + * Also note, we never spill DB root pages, nor pages of active cursors, + * because we'll need these back again soon anyway. And in nested txns, + * we can't spill a page in a child txn if it was already spilled in a + * parent txn. That would alter the parent txns' data even though + * the child hasn't committed yet, and we'd have no way to undo it if + * the child aborted. + * + * @param[in] m0 cursor A cursor handle identifying the transaction and + * database for which we are checking space. + * @param[in] key For a put operation, the key being stored. + * @param[in] data For a put operation, the data being stored. + * @return 0 on success, non-zero on failure. 
+ */ +static int +mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) +{ + MDB_txn *txn = m0->mc_txn; + MDB_page *dp; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned int i, j; + int rc; + + if (m0->mc_flags & C_SUB) + return MDB_SUCCESS; + + /* Estimate how much space this op will take */ + i = m0->mc_db->md_depth; + /* Named DBs also dirty the main DB */ + if (m0->mc_dbi > MAIN_DBI) + i += txn->mt_dbs[MAIN_DBI].md_depth; + /* For puts, roughly factor in the key+data size */ + if (key) + i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize; + i += i; /* double it for good measure */ + + if (txn->mt_dirty_room > i) + return MDB_SUCCESS; + + if (!txn->mt_spill_pgs) { + txn->mt_spill_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX); + if (!txn->mt_spill_pgs) + return ENOMEM; + } + + /* Mark all the dirty root pages we want to preserve */ + for (i=0; i<txn->mt_numdbs; i++) { + if (txn->mt_dbflags[i] & DB_DIRTY) { + j = mdb_mid2l_search(dl, txn->mt_dbs[i].md_root); + if (j <= dl[0].mid) { + dp = dl[j].mptr; + dp->mp_flags |= P_KEEP; + } + } + } + /* Mark all the pages of active cursors we want to preserve */ + for (i=0; i<txn->mt_numdbs; i++) { + MDB_cursor *mc = txn->mt_cursors[i]; + /* See if m0 is tracked or not */ + if (i == m0->mc_dbi && !(m0->mc_flags & C_UNTRACK)) { + /* nope. 
tack it on in front */ + m0->mc_next = mc; + mc = m0; + } + for (; mc; mc=mc->mc_next) { + if (mc->mc_flags & C_INITIALIZED) { + for (j=0; j<mc->mc_snum; j++) { + if (mc->mc_pg[j]->mp_flags & P_DIRTY) + mc->mc_pg[j]->mp_flags |= P_KEEP; + } + if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { + MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; + if (mx->mc_flags & C_INITIALIZED) { + for (j=0; j<mx->mc_snum; j++) { + if ((mx->mc_pg[j]->mp_flags & (P_SUBP|P_DIRTY)) + == P_DIRTY) + mx->mc_pg[j]->mp_flags |= P_KEEP; + } + } + } + } + } + } + + /* Save the page IDs of all the pages we're flushing */ + for (i=1; i<=dl[0].mid; i++) { + dp = dl[i].mptr; + if (dp->mp_flags & P_KEEP) + continue; + /* Can't spill twice, make sure it's not already in a parent's + * spill list. + */ + if (txn->mt_parent) { + MDB_txn *tx2; + for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) { + if (tx2->mt_spill_pgs) { + j = mdb_midl_search(tx2->mt_spill_pgs, dl[i].mid); + if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == dl[i].mid) { + dp->mp_flags |= P_KEEP; + break; + } + } + } + if (tx2) + continue; + } + if ((rc = mdb_midl_append(&txn->mt_spill_pgs, dl[i].mid))) + return rc; + } + mdb_midl_sort(txn->mt_spill_pgs); + + rc = mdb_page_flush(txn); + if (rc == 0) { + txn->mt_dirty_room = MDB_IDL_UM_MAX - dl[0].mid; + txn->mt_flags |= MDB_TXN_SPILLS; + } + return rc; +} + /** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */ static txnid_t mdb_find_oldest(MDB_txn *txn) @@ -1533,6 +1681,61 @@ mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) } } +/** Pull a page off the txn's spill list, if present. + * If a page being referenced was spilled to disk in this txn, bring + * it back and make it dirty/writable again. + * @param[in] tx0 the transaction handle. + * @param[in] mp the page being referenced. + * @param[out] ret the writable page, if any. ret is unchanged if + * mp wasn't spilled. 
+ */ +static int +mdb_page_unspill(MDB_txn *tx0, MDB_page *mp, MDB_page **ret) +{ + MDB_env *env = tx0->mt_env; + MDB_txn *txn; + unsigned x; + pgno_t pgno = mp->mp_pgno; + + for (txn = tx0; txn; txn=txn->mt_parent) { + if (!txn->mt_spill_pgs) + continue; + x = mdb_midl_search(txn->mt_spill_pgs, pgno); + if (x <= txn->mt_spill_pgs[0] && txn->mt_spill_pgs[x] == pgno) { + MDB_page *np; + int num; + if (IS_OVERFLOW(mp)) + num = mp->mp_pages; + else + num = 1; + if (env->me_flags & MDB_WRITEMAP) { + np = mp; + } else { + np = mdb_page_malloc(txn, num); + if (!np) + return ENOMEM; + if (num > 1) + memcpy(np, mp, num * env->me_psize); + else + mdb_page_copy(np, mp, env->me_psize); + } + if (txn == tx0) { + /* If in current txn, this page is no longer spilled */ + for (; x < txn->mt_spill_pgs[0]; x++) + txn->mt_spill_pgs[x] = txn->mt_spill_pgs[x+1]; + txn->mt_spill_pgs[0]--; + } /* otherwise, if belonging to a parent txn, the + * page remains spilled until child commits + */ + mdb_page_dirty(tx0, np); + np->mp_flags |= P_DIRTY; + *ret = np; + break; + } + } + return MDB_SUCCESS; +} + /** Touch a page: make it dirty and re-insert into tree with updated pgno. * @param[in] mc cursor pointing to the page to be touched * @return 0 on success, non-zero on failure. 
@@ -1548,6 +1751,14 @@ mdb_page_touch(MDB_cursor *mc) int rc; if (!F_ISSET(mp->mp_flags, P_DIRTY)) { + if (txn->mt_flags & MDB_TXN_SPILLS) { + np = NULL; + rc = mdb_page_unspill(txn, mp, &np); + if (rc) + return rc; + if (np) + goto done; + } if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || (rc = mdb_page_alloc(mc, 1, &np))) return rc; @@ -1595,6 +1806,7 @@ mdb_page_touch(MDB_cursor *mc) np->mp_pgno = pgno; np->mp_flags |= P_DIRTY; +done: /* Adjust cursors pointing to mp */ mc->mc_pg[mc->mc_top] = np; dbi = mc->mc_dbi; @@ -1801,6 +2013,7 @@ mdb_txn_renew0(MDB_txn *txn) txn->mt_u.dirty_list[0].mid = 0; txn->mt_free_pgs = env->me_free_pgs; txn->mt_free_pgs[0] = 0; + txn->mt_spill_pgs = NULL; env->me_txn = txn; } @@ -1906,6 +2119,7 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) txn->mt_toggle = parent->mt_toggle; txn->mt_dirty_room = parent->mt_dirty_room; txn->mt_u.dirty_list[0].mid = 0; + txn->mt_spill_pgs = NULL; txn->mt_next_pgno = parent->mt_next_pgno; parent->mt_child = txn; txn->mt_parent = parent; @@ -2008,6 +2222,7 @@ mdb_txn_reset0(MDB_txn *txn, const char *act) txn->mt_parent->mt_child = NULL; env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; mdb_midl_free(txn->mt_free_pgs); + mdb_midl_free(txn->mt_spill_pgs); free(txn->mt_u.dirty_list); return; } @@ -2210,7 +2425,7 @@ mdb_page_flush(MDB_txn *txn) { MDB_env *env = txn->mt_env; MDB_ID2L dl = txn->mt_u.dirty_list; - unsigned psize = env->me_psize; + unsigned psize = env->me_psize, j; int i, pagecount = dl[0].mid, rc; size_t size = 0, pos = 0; pgno_t pgno = 0; @@ -2224,13 +2439,20 @@ mdb_page_flush(MDB_txn *txn) int n = 0; #endif + j = 0; if (env->me_flags & MDB_WRITEMAP) { /* Clear dirty flags */ for (i = pagecount; i; i--) { dp = dl[i].mptr; + /* Don't flush this page yet */ + if (dp->mp_flags & P_KEEP) { + dp->mp_flags ^= P_KEEP; + dl[++j] = dl[i]; + continue; + } dp->mp_flags &= ~P_DIRTY; } - dl[0].mid = 0; + dl[0].mid = j; return MDB_SUCCESS; } @@ -2238,6 +2460,12 @@ 
mdb_page_flush(MDB_txn *txn) for (i = 1;; i++) { if (i <= pagecount) { dp = dl[i].mptr; + /* Don't flush this page yet */ + if (dp->mp_flags & P_KEEP) { + dp->mp_flags ^= P_KEEP; + dl[i].mid = 0; + continue; + } pgno = dl[i].mid; /* clear dirty flag */ dp->mp_flags &= ~P_DIRTY; @@ -2309,7 +2537,18 @@ mdb_page_flush(MDB_txn *txn) #endif /* _WIN32 */ } - mdb_dlist_free(txn); + j = 0; + for (i=1; i<=pagecount; i++) { + dp = dl[i].mptr; + /* This is a page we skipped above */ + if (!dl[i].mid) { + dl[++j] = dl[i]; + dl[j].mid = dp->mp_pgno; + continue; + } + mdb_dpage_free(env, dp); + } + dl[0].mid = j; return MDB_SUCCESS; } @@ -2378,6 +2617,37 @@ mdb_txn_commit(MDB_txn *txn) dst = parent->mt_u.dirty_list; src = txn->mt_u.dirty_list; + /* Remove anything in our dirty list from parent's spill list */ + if (parent->mt_spill_pgs) { + x = parent->mt_spill_pgs[0]; + len = x; + /* zero out our dirty pages in parent spill list */ + for (i=1; i<=src[0].mid; i++) { + if (src[i].mid < parent->mt_spill_pgs[x]) + continue; + if (src[i].mid > parent->mt_spill_pgs[x]) { + if (x <= 1) + break; + x--; + continue; + } + parent->mt_spill_pgs[x] = 0; + len--; + } + /* OK, we had a few hits, squash zeros from the spill list */ + if (len < parent->mt_spill_pgs[0]) { + x=1; + for (y=1; y<=parent->mt_spill_pgs[0]; y++) { + if (parent->mt_spill_pgs[y]) { + if (y != x) { + parent->mt_spill_pgs[x] = parent->mt_spill_pgs[y]; + } + x++; + } + } + parent->mt_spill_pgs[0] = len; + } + } /* Find len = length of merging our dirty list with parent's */ x = dst[0].mid; dst[0].mid = 0; /* simplify loops */ @@ -2409,6 +2679,15 @@ mdb_txn_commit(MDB_txn *txn) dst[0].mid = len; free(txn->mt_u.dirty_list); parent->mt_dirty_room = txn->mt_dirty_room; + if (txn->mt_spill_pgs) { + if (parent->mt_spill_pgs) { + mdb_midl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs); + mdb_midl_free(txn->mt_spill_pgs); + mdb_midl_sort(parent->mt_spill_pgs); + } else { + parent->mt_spill_pgs = txn->mt_spill_pgs; + } + } 
parent->mt_child = NULL; mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); @@ -3991,6 +4270,19 @@ mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) level = 1; do { MDB_ID2L dl = tx2->mt_u.dirty_list; + unsigned x; + /* Spilled pages were dirtied in this txn and flushed + * because the dirty list got full. Bring this page + * back in from the map (but don't unspill it here, + * leave that unless page_touch happens again). + */ + if (tx2->mt_spill_pgs) { + x = mdb_midl_search(tx2->mt_spill_pgs, pgno); + if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pgno) { + p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); + goto done; + } + } if (dl[0].mid) { unsigned x = mdb_mid2l_search(dl, pgno); if (x <= dl[0].mid && dl[x].mid == pgno) { @@ -4091,6 +4383,8 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) DPRINTF("found leaf page %zu for key [%s]", mp->mp_pgno, key ? DKEY(key) : NULL); + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; return MDB_SUCCESS; } @@ -4218,11 +4512,21 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) int rc; DPRINTF("free ov page %zu (%d)", pg, ovpages); - /* If the page is dirty we just acquired it, so we should - * give it back to our current free list, if any. + /* If the page is dirty or on the spill list we just acquired it, + * so we should give it back to our current free list, if any. * Not currently supported in nested txns. * Otherwise put it onto the list of pages we freed in this txn. 
*/ + if (!(mp->mp_flags & P_DIRTY) && txn->mt_spill_pgs) { + unsigned x = mdb_midl_search(txn->mt_spill_pgs, pg); + if (x <= txn->mt_spill_pgs[0] && txn->mt_spill_pgs[x] == pg) { + /* This page is no longer spilled */ + for (; x < txn->mt_spill_pgs[0]; x++) + txn->mt_spill_pgs[x] = txn->mt_spill_pgs[x+1]; + txn->mt_spill_pgs[0]--; + goto release; + } + } if ((mp->mp_flags & P_DIRTY) && !txn->mt_parent && env->me_pghead) { unsigned j, x; pgno_t *mop; @@ -4248,6 +4552,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) } if (!(env->me_flags & MDB_WRITEMAP)) mdb_dpage_free(env, mp); +release: /* Insert in me_pghead */ mop = env->me_pghead; j = mop[0] + ovpages; @@ -4964,6 +5269,9 @@ mdb_cursor_touch(MDB_cursor *mc) return MDB_SUCCESS; } +/** Do not spill pages to disk if txn is getting full, may fail instead */ +#define MDB_NOSPILL 0x8000 + int mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, unsigned int flags) @@ -4974,7 +5282,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_page *fp; MDB_db dummy; int do_sub = 0, insert = 0; - unsigned int mcount = 0, dcount = 0; + unsigned int mcount = 0, dcount = 0, nospill; size_t nsize; int rc, rc2; MDB_pagebuf pbuf; @@ -4992,6 +5300,9 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return EINVAL; } + nospill = flags & MDB_NOSPILL; + flags &= ~MDB_NOSPILL; + if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY)) return EACCES; @@ -5048,7 +5359,17 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, return rc; } - /* Cursor is positioned */ + /* Cursor is positioned, check for room in the dirty list */ + if (!nospill) { + if (flags & MDB_MULTIPLE) { + rdata = &xdata; + xdata.mv_size = data->mv_size * dcount; + } else { + rdata = data; + } + if ((rc2 = mdb_page_spill(mc, key, rdata))) + return rc2; + } if (rc == MDB_NO_ROOT) { MDB_page *np; @@ -5227,8 +5548,18 @@ current: return rc2; ovpages = omp->mp_pages; - /* Is the ov page writable and large enough? 
*/ - if ((omp->mp_flags & P_DIRTY) && ovpages >= dpages) { + /* Is the ov page large enough? */ + if (ovpages >= dpages) { + if (!(omp->mp_flags & P_DIRTY) && + (level || (mc->mc_txn->mt_env->me_flags & MDB_WRITEMAP))) + { + rc = mdb_page_unspill(mc->mc_txn, omp, &omp); + if (rc) + return rc; + level = 0; /* dirty in this txn or clean */ + } + /* Is it dirty? */ + if (omp->mp_flags & P_DIRTY) { /* yes, overwrite it. Note in this case we don't * bother to try shrinking the page if the new data * is smaller than the overflow threshold. @@ -5261,10 +5592,10 @@ current: else memcpy(METADATA(omp), data->mv_data, data->mv_size); goto done; - } else { - if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) - return rc2; + } } + if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) + return rc2; } else if (NODEDSZ(leaf) == data->mv_size) { /* same size, just replace it. Note that we could * also reuse this node if the new data is smaller, @@ -5337,10 +5668,11 @@ put_sub: xdata.mv_data = ""; leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (flags & MDB_CURRENT) { - xflags = MDB_CURRENT; + xflags = MDB_CURRENT|MDB_NOSPILL; } else { mdb_xcursor_init1(mc, leaf); - xflags = (flags & MDB_NODUPDATA) ? MDB_NOOVERWRITE : 0; + xflags = (flags & MDB_NODUPDATA) ? 
+ MDB_NOOVERWRITE|MDB_NOSPILL : MDB_NOSPILL; } /* converted, write the original data first */ if (dkey.mv_size) { @@ -5411,6 +5743,10 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (!(mc->mc_flags & C_INITIALIZED)) return EINVAL; + if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL))) + return rc; + flags &= ~MDB_NOSPILL; /* TODO: Or change (flags != MDB_NODUPDATA) to ~(flags & MDB_NODUPDATA), not looking at the logic of that code just now */ + rc = mdb_cursor_touch(mc); if (rc) return rc; @@ -5422,7 +5758,7 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) { mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); } - rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, 0); + rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, MDB_NOSPILL); /* If sub-DB still has entries, we're done */ if (mc->mc_xcursor->mx_db.md_entries) { if (leaf->mn_flags & F_SUBDATA) { diff --git a/libraries/liblmdb/midl.c b/libraries/liblmdb/midl.c index e7bd680cb0..86e4592d2d 100644 --- a/libraries/liblmdb/midl.c +++ b/libraries/liblmdb/midl.c @@ -31,8 +31,7 @@ */ #define CMP(x,y) ( (x) < (y) ? -1 : (x) > (y) ) -#if 0 /* superseded by append/sort */ -static unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ) +unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ) { /* * binary search of id in ids @@ -67,6 +66,7 @@ static unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ) return cursor; } +#if 0 /* superseded by append/sort */ int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) { unsigned x, i; diff --git a/libraries/liblmdb/midl.h b/libraries/liblmdb/midl.h index 9ce7133c6e..b0bdff3f49 100644 --- a/libraries/liblmdb/midl.h +++ b/libraries/liblmdb/midl.h @@ -74,14 +74,12 @@ typedef MDB_ID *MDB_IDL; xidl[xlen] = (id); \ } while (0) -#if 0 /* superseded by append/sort */ - /** Insert an ID into an IDL. - * @param[in,out] ids The IDL to insert into. - * @param[in] id The ID to insert. 
- * @return 0 on success, -1 if ID was already present, -2 on error. + /** Search for an ID in an IDL. + * @param[in] ids The IDL to search. + * @param[in] id The ID to search for. + * @return The index of the first ID greater than or equal to \b id. */ -int mdb_midl_insert( MDB_IDL ids, MDB_ID id ); -#endif +unsigned mdb_midl_search( MDB_IDL ids, MDB_ID id ); /** Allocate an IDL. * Allocates memory for an IDL of the given size. From f04dc0ebd21f7a61ae31d9bff648907fce0e80e7 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Fri, 12 Jul 2013 11:30:33 +0200 Subject: [PATCH 18/18] Also set/clear P_KEEP in parent txn's cursors --- libraries/liblmdb/mdb.c | 73 +++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index cd99d67add..df88adcd27 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1326,6 +1326,44 @@ mdb_dlist_free(MDB_txn *txn) dl[0].mid = 0; } +/* Set or clear P_KEEP in non-overflow, non-sub pages in known cursors. + * When clearing, only consider backup cursors (from parent txns) since + * other P_KEEP flags have already been cleared. + * @param[in] mc A cursor handle for the current operation. + * @param[in] pflags Flags of the pages to update: + * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it. + */ +static void +mdb_cursorpages_mark(MDB_cursor *mc, unsigned pflags) +{ + MDB_txn *txn = mc->mc_txn; + MDB_cursor *m2, *m3; + MDB_xcursor *mx; + unsigned i, j; + + if (mc->mc_flags & C_UNTRACK) + mc = NULL; /* will find mc in mt_cursors */ + for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { + for (; mc; mc=mc->mc_next) { + m2 = pflags == P_DIRTY ? 
mc : mc->mc_backup; + for (; m2; m2 = m2->mc_backup) { + for (m3=m2; m3->mc_flags & C_INITIALIZED; m3=&mx->mx_cursor) { + for (j=0; j<m3->mc_snum; j++) + if ((m3->mc_pg[j]->mp_flags & (P_SUBP|P_DIRTY|P_KEEP)) + == pflags) + m3->mc_pg[j]->mp_flags ^= P_KEEP; + if (!(m3->mc_db->md_flags & MDB_DUPSORT)) + break; + /* Cursor backups have mx malloced at the end of m2 */ + mx = (m3 == mc ? m3->mc_xcursor : (MDB_xcursor *)(m3+1)); + } + } + } + if (i == 0) + break; + } +} + static int mdb_page_flush(MDB_txn *txn); /** Spill pages from the dirty list back to disk. @@ -1404,34 +1442,9 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) } } } - /* Mark all the pages of active cursors we want to preserve */ - for (i=0; i<txn->mt_numdbs; i++) { - MDB_cursor *mc = txn->mt_cursors[i]; - /* See if m0 is tracked or not */ - if (i == m0->mc_dbi && !(m0->mc_flags & C_UNTRACK)) { - /* nope. tack it on in front */ - m0->mc_next = mc; - mc = m0; - } - for (; mc; mc=mc->mc_next) { - if (mc->mc_flags & C_INITIALIZED) { - for (j=0; j<mc->mc_snum; j++) { - if (mc->mc_pg[j]->mp_flags & P_DIRTY) - mc->mc_pg[j]->mp_flags |= P_KEEP; - } - if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) { - MDB_cursor *mx = &mc->mc_xcursor->mx_cursor; - if (mx->mc_flags & C_INITIALIZED) { - for (j=0; j<mx->mc_snum; j++) { - if ((mx->mc_pg[j]->mp_flags & (P_SUBP|P_DIRTY)) - == P_DIRTY) - mx->mc_pg[j]->mp_flags |= P_KEEP; - } - } - } - } - } - } + + /* Preserve pages used by cursors */ + mdb_cursorpages_mark(m0, P_DIRTY); /* Save the page IDs of all the pages we're flushing */ for (i=1; i<=dl[0].mid; i++) { @@ -1461,6 +1474,9 @@ mdb_page_spill(MDB_cursor *m0, MDB_val *key, MDB_val *data) mdb_midl_sort(txn->mt_spill_pgs); rc = mdb_page_flush(txn); + + mdb_cursorpages_mark(m0, P_DIRTY|P_KEEP); + if (rc == 0) { txn->mt_dirty_room = MDB_IDL_UM_MAX - dl[0].mid; txn->mt_flags |= MDB_TXN_SPILLS; @@ -6216,6 +6232,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi 
dbi, MDB_xcursor *mx) { + mc->mc_next = NULL; mc->mc_backup = NULL; mc->mc_dbi = dbi; mc->mc_txn = txn;