From 0cdd9dffddf66c730a35f48db2bb02d8bb3e5731 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Fri, 17 May 2013 19:31:17 +0200 Subject: [PATCH 01/45] ITS#7589 mdb_cursor_put(): Update ovpage nodesize. Update the nodesize when overwriting an overflow page. As before, do not attempt to shrink the page. --- libraries/liblmdb/mdb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 8415ccb26c..2c715ec74f 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -5199,9 +5199,10 @@ current: /* Is the ov page writable and large enough? */ if ((omp->mp_flags & P_DIRTY) && ovpages >= dpages) { /* yes, overwrite it. Note in this case we don't - * bother to try shrinking the node if the new data + * bother to try shrinking the page if the new data * is smaller than the overflow threshold. */ + SETDSZ(leaf, data->mv_size); if (F_ISSET(flags, MDB_RESERVE)) data->mv_data = METADATA(omp); else From e4ce404992f440adf0a5b026932b8cfd392bfa35 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sun, 19 May 2013 18:38:13 +0200 Subject: [PATCH 02/45] More ITS#7589: Fix prev commit. mp_pages gives #pages in the ovpage. OVPAGES() no longer does. --- libraries/liblmdb/mdb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 2c715ec74f..fced10c202 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -5192,10 +5192,11 @@ current: pgno_t pg; int ovpages, dpages; - ovpages = OVPAGES(NODEDSZ(leaf), mc->mc_txn->mt_env->me_psize); dpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); memcpy(&pg, NODEDATA(leaf), sizeof(pg)); mdb_page_get(mc->mc_txn, pg, &omp); + ovpages = omp->mp_pages; + /* Is the ov page writable and large enough? */ if ((omp->mp_flags & P_DIRTY) && ovpages >= dpages) { /* yes, overwrite it. 
Note in this case we don't From 2eb50b1d2ed61dedf33a9d89ef25e0ba853f5f81 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sun, 19 May 2013 19:08:07 +0200 Subject: [PATCH 03/45] More ITS#7589 followup: OVPAGES() -> mp_pages. --- libraries/liblmdb/mdb.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index fced10c202..79635fd014 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -6514,10 +6514,14 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) /* add overflow pages to free list */ if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_BIGDATA)) { int i, ovpages; + MDB_page *omp; pgno_t pg; memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - ovpages = OVPAGES(NODEDSZ(leaf), mc->mc_txn->mt_env->me_psize); + if ((rc = mdb_page_get(mc->mc_txn, pg, &omp)) != 0) + return rc; + assert(IS_OVERFLOW(omp)); + ovpages = omp->mp_pages; mc->mc_db->md_overflow_pages -= ovpages; for (i=0; imc_pg[mc->mc_top]); i++) { ni = NODEPTR(mc->mc_pg[mc->mc_top], i); if (ni->mn_flags & F_BIGDATA) { - int j, ovpages = OVPAGES(NODEDSZ(ni), mc->mc_txn->mt_env->me_psize); + int j, ovpages; + MDB_page *omp; pgno_t pg; memcpy(&pg, NODEDATA(ni), sizeof(pg)); + if ((rc = mdb_page_get(mc->mc_txn, pg, &omp)) != 0) + return rc; + assert(IS_OVERFLOW(omp)); + ovpages = omp->mp_pages; for (j=0; jmc_txn->mt_free_pgs, pg); pg++; From 5bdf2aae6ee0c8bdcd6a7759bd098d6531c668a0 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 20 May 2013 23:12:26 +0200 Subject: [PATCH 04/45] Tweak mdb_page_malloc(),mdb_page_get() semantics. mdb_page_malloc(): Add "number of pages" parameter. mdb_page_get(): Add output param for how page was found. Do not set return params on error. mdb_cursor_put(): Catch mdb_page_get() error. Prepares for next commit, no change in caller behavior other than on mdb_page_get error. 
--- libraries/liblmdb/mdb.c | 89 +++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 79635fd014..6c7d28c6b9 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -999,7 +999,7 @@ static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); static int mdb_page_touch(MDB_cursor *mc); -static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp); +static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl); static int mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); #define MDB_PS_MODIFY 1 @@ -1252,19 +1252,27 @@ mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) return txn->mt_dbxs[dbi].md_dcmp(a, b); } -/** Allocate a single page. - * Re-use old malloc'd pages first, otherwise just malloc. +/** Allocate a page. + * Re-use old malloc'd pages first for singletons, otherwise just malloc. 
*/ static MDB_page * -mdb_page_malloc(MDB_cursor *mc) { - MDB_page *ret; - size_t sz = mc->mc_txn->mt_env->me_psize; - if ((ret = mc->mc_txn->mt_env->me_dpages) != NULL) { - VGMEMP_ALLOC(mc->mc_txn->mt_env, ret, sz); - VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); - mc->mc_txn->mt_env->me_dpages = ret->mp_next; - } else if ((ret = malloc(sz)) != NULL) { - VGMEMP_ALLOC(mc->mc_txn->mt_env, ret, sz); +mdb_page_malloc(MDB_cursor *mc, unsigned num) +{ + MDB_env *env = mc->mc_txn->mt_env; + MDB_page *ret = env->me_dpages; + size_t sz = env->me_psize; + if (num == 1) { + if (ret) { + VGMEMP_ALLOC(env, ret, sz); + VGMEMP_DEFINED(ret, sizeof(ret->mp_next)); + env->me_dpages = ret->mp_next; + return ret; + } + } else { + sz *= num; + } + if ((ret = malloc(sz)) != NULL) { + VGMEMP_ALLOC(env, ret, sz); } return ret; } @@ -1520,17 +1528,8 @@ none: np = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); np->mp_pgno = pgno; } else { - if (txn->mt_env->me_dpages && num == 1) { - np = txn->mt_env->me_dpages; - VGMEMP_ALLOC(txn->mt_env, np, txn->mt_env->me_psize); - VGMEMP_DEFINED(np, sizeof(np->mp_next)); - txn->mt_env->me_dpages = np->mp_next; - } else { - size_t sz = txn->mt_env->me_psize * num; - if ((np = malloc(sz)) == NULL) - return ENOMEM; - VGMEMP_ALLOC(txn->mt_env, np, sz); - } + if (!(np = mdb_page_malloc(mc, num))) + return ENOMEM; if (pgno == P_INVALID) { np->mp_pgno = txn->mt_next_pgno; txn->mt_next_pgno += num; @@ -1653,7 +1652,7 @@ finish: } assert(mc->mc_txn->mt_u.dirty_list[0].mid < MDB_IDL_UM_MAX); /* No - copy it */ - np = mdb_page_malloc(mc); + np = mdb_page_malloc(mc, 1); if (!np) return ENOMEM; memcpy(np, mp, mc->mc_txn->mt_env->me_psize); @@ -4032,17 +4031,20 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) * @param[in] txn the transaction for this access. * @param[in] pgno the page number for the page to retrieve. * @param[out] ret address of a pointer where the page's address will be stored. 
+ * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. * @return 0 on success, non-zero on failure. */ static int -mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret) +mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) { MDB_page *p = NULL; + int level; if (!((txn->mt_flags & MDB_TXN_RDONLY) | (txn->mt_env->me_flags & MDB_WRITEMAP))) { MDB_txn *tx2 = txn; + level = 1; do { MDB_ID2L dl = tx2->mt_u.dirty_list; if (dl[0].mid) { @@ -4052,19 +4054,24 @@ mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret) goto done; } } + level++; } while ((tx2 = tx2->mt_parent) != NULL); } if (pgno < txn->mt_next_pgno) { + level = 0; p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); } else { DPRINTF("page %zu not found", pgno); assert(p != NULL); + return MDB_PAGE_NOTFOUND; } done: *ret = p; - return (p != NULL) ? MDB_SUCCESS : MDB_PAGE_NOTFOUND; + if (lvl) + *lvl = level; + return MDB_SUCCESS; } /** Search for the page a given key should be in. 
@@ -4118,7 +4125,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify) assert(i < NUMKEYS(mp)); node = NODEPTR(mp, i); - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp))) + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) return rc; mc->mc_ki[mc->mc_top] = i; @@ -4157,7 +4164,7 @@ mdb_page_search_lowest(MDB_cursor *mc) MDB_node *node = NODEPTR(mp, 0); int rc; - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp))) + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0) return rc; mc->mc_ki[mc->mc_top] = 0; @@ -4235,7 +4242,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) assert(root > 1); if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) - if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0]))) + if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0) return rc; mc->mc_snum = 1; @@ -4278,7 +4285,7 @@ mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data) */ data->mv_size = NODEDSZ(leaf); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); - if ((rc = mdb_page_get(txn, pgno, &omp))) { + if ((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0) { DPRINTF("read overflow page %zu failed", pgno); return rc; } @@ -4355,7 +4362,7 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) assert(IS_BRANCH(mc->mc_pg[mc->mc_top])); indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp))) + if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0) return rc; mdb_cursor_push(mc, mp); @@ -5194,7 +5201,8 @@ current: dpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - mdb_page_get(mc->mc_txn, pg, &omp); + if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) != 0) + return rc2; ovpages = omp->mp_pages; /* Is the ov page writable and large enough?
*/ @@ -6415,8 +6423,8 @@ mdb_rebalance(MDB_cursor *mc) DPUTS("collapsing root page!"); mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); - if ((rc = mdb_page_get(mc->mc_txn, mc->mc_db->md_root, - &mc->mc_pg[0]))) + rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL); + if (rc) return rc; mc->mc_db->md_depth--; mc->mc_db->md_branch_pages--; @@ -6469,7 +6477,8 @@ mdb_rebalance(MDB_cursor *mc) DPUTS("reading right neighbor"); mn.mc_ki[ptop]++; node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mn.mc_pg[mn.mc_top]))) + rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + if (rc) return rc; mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); @@ -6479,7 +6488,8 @@ mdb_rebalance(MDB_cursor *mc) DPUTS("reading left neighbor"); mn.mc_ki[ptop]--; node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mn.mc_pg[mn.mc_top]))) + rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + if (rc) return rc; mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; mc->mc_ki[mc->mc_top] = 0; @@ -6518,7 +6528,7 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) pgno_t pg; memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if ((rc = mdb_page_get(mc->mc_txn, pg, &omp)) != 0) + if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) != 0) return rc; assert(IS_OVERFLOW(omp)); ovpages = omp->mp_pages; @@ -6863,7 +6873,7 @@ newsep: /* Move half of the keys to the right sibling. 
*/ /* grab a page to hold a temporary copy */ - copy = mdb_page_malloc(mc); + copy = mdb_page_malloc(mc, 1); if (copy == NULL) return ENOMEM; @@ -7287,7 +7297,8 @@ mdb_drop0(MDB_cursor *mc, int subs) MDB_page *omp; pgno_t pg; memcpy(&pg, NODEDATA(ni), sizeof(pg)); - if ((rc = mdb_page_get(mc->mc_txn, pg, &omp)) != 0) + rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL); + if (rc != 0) return rc; assert(IS_OVERFLOW(omp)); ovpages = omp->mp_pages; From a2ce25482a60e516d1d9b417ed666dac6fdebb93 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Mon, 20 May 2013 23:12:52 +0200 Subject: [PATCH 05/45] ITS#7515 Fix nested txn touch of subpage/ovpage. mdb_page_touch(): Don't touch a subpage, replacing with non-subpage. mdb_cursor_put(): Don't overwrite ancestor txn's dirty overflow page. --- libraries/liblmdb/mdb.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 6c7d28c6b9..acf8338ab3 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1633,7 +1633,7 @@ finish: SETPGNO(NODEPTR(mc->mc_pg[mc->mc_top-1], mc->mc_ki[mc->mc_top-1]), mp->mp_pgno); else mc->mc_db->md_root = mp->mp_pgno; - } else if (mc->mc_txn->mt_parent) { + } else if (mc->mc_txn->mt_parent && !(mp->mp_flags & P_SUBP)) { MDB_page *np; MDB_ID2 mid; /* If txn has a parent, make sure the page is in our @@ -5197,11 +5197,11 @@ current: if (F_ISSET(leaf->mn_flags, F_BIGDATA)) { MDB_page *omp; pgno_t pg; - int ovpages, dpages; + unsigned psize = mc->mc_txn->mt_env->me_psize; + int level, ovpages, dpages = OVPAGES(data->mv_size, psize); - dpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) != 0) + if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0) return rc2; ovpages = omp->mp_pages; @@ -5211,6 +5211,28 @@ current: * bother to try shrinking the page if the new data * is smaller than 
the overflow threshold. */ + if (level > 1) { + /* It is writable only in a parent txn */ + size_t sz = (size_t) psize * ovpages, off; + MDB_page *np = mdb_page_malloc(mc, ovpages); + MDB_ID2 id2; + if (!np) + return ENOMEM; + id2.mid = pg; + id2.mptr = np; + mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); + if (!(flags & MDB_RESERVE)) { + /* Copy end of page, adjusting alignment so + * compiler may copy words instead of bytes. + */ + off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); + memcpy((size_t *)((char *)np + off), + (size_t *)((char *)omp + off), sz - off); + sz = PAGEHDRSZ; + } + memcpy(np, omp, sz); /* Copy beginning of page */ + omp = np; + } SETDSZ(leaf, data->mv_size); if (F_ISSET(flags, MDB_RESERVE)) data->mv_data = METADATA(omp); From 1b6d7ee7e1aae85bfce2794e60038c2b264ad93c Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 21 May 2013 19:04:52 +0200 Subject: [PATCH 06/45] ITS#7598 mdb_dbi_open(named DB): Check mainDB flags. Reject attempts to open named databases if the main database has flag MDB_DUPSORT or MDB_INTEGERKEY. DUPSORT would require an xcursor for the DB, INTEGERKEY would expect the DB name to be a binary integer. --- libraries/liblmdb/mdb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index acf8338ab3..4c85d67ea6 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -7226,6 +7226,10 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs) return MDB_DBS_FULL; + /* Cannot mix named databases with some mainDB flags */ + if (txn->mt_dbs[MAIN_DBI].md_flags & (MDB_DUPSORT|MDB_INTEGERKEY)) + return (flags & MDB_CREATE) ? 
MDB_INCOMPATIBLE : MDB_NOTFOUND; + /* Find the DB info */ dbflag = DB_NEW|DB_VALID; exact = 0; From a1b16ce5f0f661b0dd5a30706a1d6178cf0fa561 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 21 May 2013 22:44:51 +0200 Subject: [PATCH 07/45] ITS#7598 Tweak MDB_{NEXT,PREV}_NODUP,fix mdb_stat. MDB_NEXT_NODUP, MDB_PREV_NODUP: Allow for non-MDB_DUPSORT databases. No mdb.c code changes needed. mdb_stat.c: Use MDB_NEXT_NODUP, to avoid a crash with a DUPSORT mainDB. --- libraries/liblmdb/lmdb.h | 6 ++---- libraries/liblmdb/mdb_stat.c | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 954ffde1cb..53dab20461 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -325,13 +325,11 @@ typedef enum MDB_cursor_op { Only for #MDB_DUPSORT */ MDB_NEXT_MULTIPLE, /**< Return all duplicate data items at the next cursor position. Only for #MDB_DUPFIXED */ - MDB_NEXT_NODUP, /**< Position at first data item of next key. - Only for #MDB_DUPSORT */ + MDB_NEXT_NODUP, /**< Position at first data item of next key */ MDB_PREV, /**< Position at previous data item */ MDB_PREV_DUP, /**< Position at previous data item of current key. Only for #MDB_DUPSORT */ - MDB_PREV_NODUP, /**< Position at last data item of previous key. - Only for #MDB_DUPSORT */ + MDB_PREV_NODUP, /**< Position at last data item of previous key */ MDB_SET, /**< Position at specified key */ MDB_SET_KEY, /**< Position at specified key, return key + data */ MDB_SET_RANGE /**< Position at first key greater than or equal to specified key.
*/ diff --git a/libraries/liblmdb/mdb_stat.c b/libraries/liblmdb/mdb_stat.c index dd0735f242..ca6ad63955 100644 --- a/libraries/liblmdb/mdb_stat.c +++ b/libraries/liblmdb/mdb_stat.c @@ -193,7 +193,7 @@ int main(int argc, char *argv[]) printf("mdb_cursor_open failed, error %d %s\n", rc, mdb_strerror(rc)); goto txn_abort; } - while ((rc = mdb_cursor_get(cursor, &key, NULL, MDB_NEXT)) == 0) { + while ((rc = mdb_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { char *str = malloc(key.mv_size+1); MDB_dbi db2; memcpy(str, key.mv_data, key.mv_size); From 5ea1cd8f6d083095cf4f20b802cea90441cea03a Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 21 May 2013 23:48:27 +0200 Subject: [PATCH 08/45] mdb_stat cleanup. Exit with success when there was no failure. Do not use data containing NUL as a DB name (which is a C string). --- libraries/liblmdb/mdb_stat.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb_stat.c b/libraries/liblmdb/mdb_stat.c index ca6ad63955..3e6be21597 100644 --- a/libraries/liblmdb/mdb_stat.c +++ b/libraries/liblmdb/mdb_stat.c @@ -194,8 +194,11 @@ int main(int argc, char *argv[]) goto txn_abort; } while ((rc = mdb_cursor_get(cursor, &key, NULL, MDB_NEXT_NODUP)) == 0) { - char *str = malloc(key.mv_size+1); + char *str; MDB_dbi db2; + if (memchr(key.mv_data, '\0', key.mv_size)) + continue; + str = malloc(key.mv_size+1); memcpy(str, key.mv_data, key.mv_size); str[key.mv_size] = '\0'; rc = mdb_open(txn, str, 0, &db2); @@ -214,6 +217,9 @@ int main(int argc, char *argv[]) mdb_cursor_close(cursor); } + if (rc == MDB_NOTFOUND) + rc = MDB_SUCCESS; + mdb_close(env, dbi); txn_abort: mdb_txn_abort(txn); From feaeab0c0488ba60f4b2b1a901cb685df0c5cf01 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 21 May 2013 23:55:13 +0200 Subject: [PATCH 09/45] Factor out mdb_find_oldest,mdb_dlist_free,dirty_list. Do not rescan reader table (mdb_find_oldest) after "goto again". 
Skip clearing dirty_list[nonzero].mid in mdb_dlist_free(); it was not done in mdb_reset0() anyway. --- libraries/liblmdb/mdb.c | 119 ++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 67 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 4c85d67ea6..5558875872 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1289,6 +1289,44 @@ mdb_page_free(MDB_env *env, MDB_page *mp) env->me_dpages = mp; } +/* Return all dirty pages to dpage list */ +static void +mdb_dlist_free(MDB_txn *txn) +{ + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned i, n = dl[0].mid; + + for (i = 1; i <= n; i++) { + MDB_page *dp = dl[i].mptr; + if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { + mdb_page_free(env, dp); + } else { + /* large pages just get freed directly */ + VGMEMP_FREE(env, dp); + free(dp); + } + } + dl[0].mid = 0; +} + +/** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */ +static txnid_t +mdb_find_oldest(MDB_txn *txn) +{ + int i; + txnid_t mr, oldest = txn->mt_txnid - 1; + MDB_reader *r = txn->mt_env->me_txns->mti_readers; + for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { + if (r[i].mr_pid) { + mr = r[i].mr_txnid; + if (oldest > mr) + oldest = mr; + } + } + return oldest; +} + /** Allocate pages for writing. * If there are free pages available from older transactions, they * will be re-used first. Otherwise a new page will be allocated. @@ -1323,7 +1361,6 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) if (!txn->mt_env->me_pghead && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) { /* See if there's anything in the free DB */ - MDB_reader *r; MDB_cursor m2; MDB_node *leaf; MDB_val data; @@ -1348,19 +1385,8 @@ again: last = *(txnid_t *)key.mv_data; } - { - unsigned int i, nr; - txnid_t mr; - oldest = txn->mt_txnid - 1; - nr = txn->mt_env->me_txns->mti_numreaders; - r = txn->mt_env->me_txns->mti_readers; - for (i=0; i last) { /* It's usable, grab it. 
@@ -1429,19 +1455,7 @@ none: /* We haven't hit the readers list yet? */ if (!oldest) { - MDB_reader *r; - unsigned int nr; - txnid_t mr; - - oldest = txn->mt_txnid - 1; - nr = txn->mt_env->me_txns->mti_numreaders; - r = txn->mt_env->me_txns->mti_readers; - for (i=0; imc_db->md_root = mp->mp_pgno; } else if (mc->mc_txn->mt_parent && !(mp->mp_flags & P_SUBP)) { MDB_page *np; - MDB_ID2 mid; + MDB_ID2 mid, *dl = mc->mc_txn->mt_u.dirty_list; /* If txn has a parent, make sure the page is in our * dirty list. */ - if (mc->mc_txn->mt_u.dirty_list[0].mid) { - unsigned x = mdb_mid2l_search(mc->mc_txn->mt_u.dirty_list, mp->mp_pgno); - if (x <= mc->mc_txn->mt_u.dirty_list[0].mid && - mc->mc_txn->mt_u.dirty_list[x].mid == mp->mp_pgno) { - if (mc->mc_txn->mt_u.dirty_list[x].mptr != mp) { - mp = mc->mc_txn->mt_u.dirty_list[x].mptr; - mc->mc_pg[mc->mc_top] = mp; - } + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, mp->mp_pgno); + if (x <= dl[0].mid && dl[x].mid == mp->mp_pgno) { + np = dl[x].mptr; + if (mp != np) + mc->mc_pg[mc->mc_top] = np; return 0; } } - assert(mc->mc_txn->mt_u.dirty_list[0].mid < MDB_IDL_UM_MAX); + assert(dl[0].mid < MDB_IDL_UM_MAX); /* No - copy it */ np = mdb_page_malloc(mc, 1); if (!np) @@ -1658,7 +1670,7 @@ finish: memcpy(np, mp, mc->mc_txn->mt_env->me_psize); mid.mid = np->mp_pgno; mid.mptr = np; - mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &mid); + mdb_mid2l_insert(dl, &mid); mp = np; goto finish; } @@ -2031,7 +2043,6 @@ static void mdb_txn_reset0(MDB_txn *txn) { MDB_env *env = txn->mt_env; - unsigned int i; /* Close any DBI handles opened in this txn */ mdb_dbis_update(txn, 0); @@ -2045,24 +2056,11 @@ mdb_txn_reset0(MDB_txn *txn) txn->mt_numdbs = 0; /* close nothing if called again */ txn->mt_dbxs = NULL; /* mark txn as reset */ } else { - MDB_page *dp; - mdb_cursors_close(txn, 0); if (!(env->me_flags & MDB_WRITEMAP)) { - /* return all dirty pages to dpage list */ - for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) { - dp = 
txn->mt_u.dirty_list[i].mptr; - if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { - mdb_page_free(txn->mt_env, dp); - } else { - /* large pages just get freed directly */ - VGMEMP_FREE(txn->mt_env, dp); - free(dp); - } - } + mdb_dlist_free(txn); } - free(env->me_pgfree); if (txn->mt_parent) { @@ -2415,7 +2413,6 @@ free2: dp = txn->mt_u.dirty_list[i].mptr; /* clear dirty flag */ dp->mp_flags &= ~P_DIRTY; - txn->mt_u.dirty_list[i].mid = 0; } txn->mt_u.dirty_list[0].mid = 0; goto sync; @@ -2513,19 +2510,7 @@ free2: #endif } while (!done); - /* Drop the dirty pages. - */ - for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) { - dp = txn->mt_u.dirty_list[i].mptr; - if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { - mdb_page_free(txn->mt_env, dp); - } else { - VGMEMP_FREE(txn->mt_env, dp); - free(dp); - } - txn->mt_u.dirty_list[i].mid = 0; - } - txn->mt_u.dirty_list[0].mid = 0; + mdb_dlist_free(txn); sync: if ((n = mdb_env_sync(env, 0)) != 0 || From 92fe958805be33a06d8ae55727e9c61c81c8b751 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 21 May 2013 23:58:57 +0200 Subject: [PATCH 10/45] Drop unused liblmdb MIDL-range support. 
--- libraries/liblmdb/midl.c | 23 +++------------------ libraries/liblmdb/midl.h | 43 +++------------------------------------- 2 files changed, 6 insertions(+), 60 deletions(-) diff --git a/libraries/liblmdb/midl.c b/libraries/liblmdb/midl.c index 57f1e049a2..d2617c61e4 100644 --- a/libraries/liblmdb/midl.c +++ b/libraries/liblmdb/midl.c @@ -71,17 +71,6 @@ int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) { unsigned x, i; - if (MDB_IDL_IS_RANGE( ids )) { - /* if already in range, treat as a dup */ - if (id >= MDB_IDL_RANGE_FIRST(ids) && id <= MDB_IDL_RANGE_LAST(ids)) - return -1; - if (id < MDB_IDL_RANGE_FIRST(ids)) - ids[1] = id; - else if (id > MDB_IDL_RANGE_LAST(ids)) - ids[2] = id; - return 0; - } - x = mdb_midl_search( ids, id ); assert( x > 0 ); @@ -97,15 +86,9 @@ int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) } if ( ++ids[0] >= MDB_IDL_DB_MAX ) { - if( id < ids[1] ) { - ids[1] = id; - ids[2] = ids[ids[0]-1]; - } else if ( ids[ids[0]-1] < id ) { - ids[2] = id; - } else { - ids[2] = ids[ids[0]-1]; - } - ids[0] = MDB_NOID; + /* no room */ + --ids[0]; + return -2; } else { /* insert id */ diff --git a/libraries/liblmdb/midl.h b/libraries/liblmdb/midl.h index 792e6ab938..019d92849e 100644 --- a/libraries/liblmdb/midl.h +++ b/libraries/liblmdb/midl.h @@ -52,64 +52,27 @@ typedef size_t MDB_ID; */ typedef MDB_ID *MDB_IDL; -#define MDB_NOID (~(MDB_ID)0) - /* IDL sizes - likely should be even bigger * limiting factors: sizeof(ID), thread stack size */ #define MDB_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ #define MDB_IDL_DB_SIZE (1<bi_lastid) ) -#define MDB_IDL_ALL( bdb, ids ) MDB_IDL_RANGE( ids, 1, ((bdb)->bi_lastid) ) - #define MDB_IDL_FIRST( ids ) ( (ids)[1] ) -#define MDB_IDL_LAST( ids ) ( MDB_IDL_IS_RANGE(ids) \ - ? (ids)[2] : (ids)[(ids)[0]] ) - -#define MDB_IDL_N( ids ) ( MDB_IDL_IS_RANGE(ids) \ - ? ((ids)[2]-(ids)[1])+1 : (ids)[0] ) +#define MDB_IDL_LAST( ids ) ( (ids)[(ids)[0]] ) #if 0 /* superseded by append/sort */ /** Insert an ID into an IDL. 
* @param[in,out] ids The IDL to insert into. * @param[in] id The ID to insert. - * @return 0 on success, -1 if the ID was already present in the IDL. + * @return 0 on success, -1 if ID was already present, -2 on error. */ int mdb_midl_insert( MDB_IDL ids, MDB_ID id ); #endif From e31c7d3b31d8d5073195e31b283e8ccb46bd13cf Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 23 May 2013 08:13:08 -0700 Subject: [PATCH 11/45] ITS#7594 De-init other subcursors in page_touch --- libraries/liblmdb/mdb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 5558875872..22c5a5c526 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1636,6 +1636,8 @@ finish: if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top]) { m2->mc_pg[mc->mc_top] = mp; + if (mc->mc_db->md_flags & MDB_DUPSORT) + m2->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } } } From 4b49291653c485841886e5a52c0aabd9ce50a9d9 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Sat, 25 May 2013 10:16:55 -0700 Subject: [PATCH 12/45] Add _M_IX86 macro for MSVC --- libraries/liblmdb/mdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 22c5a5c526..bc25bb39f0 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -114,7 +114,7 @@ #define BIG_ENDIAN __BIG_ENDIAN #endif -#if defined(__i386) || defined(__x86_64) +#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) #define MISALIGNED_OK 1 #endif From c68e5ae9be63d9464efcf37ace117881e63b7d9a Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 30 May 2013 13:06:12 -0700 Subject: [PATCH 13/45] Add mdb_env_copyfd() Allow writing backup to an already opened file handle, for piping to tar/gzip/ssh/whatever. 
--- libraries/liblmdb/lmdb.h | 19 +++++++ libraries/liblmdb/mdb.c | 107 +++++++++++++++++++---------------- libraries/liblmdb/mdb_copy.1 | 8 ++- libraries/liblmdb/mdb_copy.c | 9 ++- 4 files changed, 89 insertions(+), 54 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 53dab20461..9776366c7c 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -144,6 +144,14 @@ typedef int mdb_mode_t; typedef mode_t mdb_mode_t; #endif +#ifndef _WIN32 + /** An abstraction for a file handle. + * On POSIX systems file handles are small integers. On Windows + * they're opaque pointers. + */ +#define HANDLE int +#endif + /** @defgroup mdb MDB API * @{ * @brief OpenLDAP Lightning Memory-Mapped Database Manager @@ -533,6 +541,17 @@ int mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t */ int mdb_env_copy(MDB_env *env, const char *path); + /** @brief Copy an MDB environment to the specified file descriptor. + * + * This function may be used to make a backup of an existing environment. + * @param[in] env An environment handle returned by #mdb_env_create(). It + * must have already been opened successfully. + * @param[in] fd The filedescriptor to write the copy to. It must + * have already been opened for Write access. + * @return A non-zero error value on failure and 0 on success. + */ +int mdb_env_copyfd(MDB_env *env, HANDLE fd); + /** @brief Return statistics about the MDB environment. * * @param[in] env An environment handle returned by #mdb_env_create() diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index bc25bb39f0..eeb0540894 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -199,12 +199,6 @@ mdb_sem_wait(sem_t *sem) */ #define ErrCode() errno - /** An abstraction for a file handle. - * On POSIX systems file handles are small integers. On Windows - * they're opaque pointers. - */ -#define HANDLE int - /** A value for an invalid file handle. 
Mainly used to initialize file variables and signify that they are * unused. @@ -3631,60 +3625,20 @@ mdb_env_close0(MDB_env *env, int excl) } int -mdb_env_copy(MDB_env *env, const char *path) +mdb_env_copyfd(MDB_env *env, HANDLE fd) { MDB_txn *txn = NULL; - int rc, len; + int rc; size_t wsize; - char *lpath, *ptr; + char *ptr; HANDLE newfd = INVALID_HANDLE_VALUE; - if (env->me_flags & MDB_NOSUBDIR) { - lpath = (char *)path; - } else { - len = strlen(path); - len += sizeof(DATANAME); - lpath = malloc(len); - if (!lpath) - return ENOMEM; - sprintf(lpath, "%s" DATANAME, path); - } - - /* The destination path must exist, but the destination file must not. - * We don't want the OS to cache the writes, since the source data is - * already in the OS cache. - */ -#ifdef _WIN32 - newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, - FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); -#else - newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL -#ifdef O_DIRECT - |O_DIRECT -#endif - , 0666); -#endif - if (!(env->me_flags & MDB_NOSUBDIR)) - free(lpath); - if (newfd == INVALID_HANDLE_VALUE) { - rc = ErrCode(); - goto leave; - } - -#ifdef F_NOCACHE /* __APPLE__ */ - rc = fcntl(newfd, F_NOCACHE, 1); - if (rc) { - rc = ErrCode(); - goto leave; - } -#endif - /* Do the lock/unlock of the reader mutex before starting the * write txn. Otherwise other read txns could block writers.
*/ rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); if (rc) - goto leave; + return rc; if (env->me_txns) { /* We must start the actual read txn after blocking writers */ @@ -3751,6 +3705,59 @@ mdb_env_copy(MDB_env *env, const char *path) leave: mdb_txn_abort(txn); + return rc; +} + +int +mdb_env_copy(MDB_env *env, const char *path) +{ + int rc, len; + char *lpath; + HANDLE newfd = INVALID_HANDLE_VALUE; + + if (env->me_flags & MDB_NOSUBDIR) { + lpath = (char *)path; + } else { + len = strlen(path); + len += sizeof(DATANAME); + lpath = malloc(len); + if (!lpath) + return ENOMEM; + sprintf(lpath, "%s" DATANAME, path); + } + + /* The destination path must exist, but the destination file must not. + * We don't want the OS to cache the writes, since the source data is + * already in the OS cache. + */ +#ifdef _WIN32 + newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, + FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); +#else + newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL +#ifdef O_DIRECT + |O_DIRECT +#endif + , 0666); +#endif + if (!(env->me_flags & MDB_NOSUBDIR)) + free(lpath); + if (newfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } + +#ifdef F_NOCACHE /* __APPLE__ */ + rc = fcntl(newfd, F_NOCACHE, 1); + if (rc) { + rc = ErrCode(); + goto leave; + } +#endif + + rc = mdb_env_copyfd(env, newfd); + +leave: if (newfd != INVALID_HANDLE_VALUE) close(newfd); diff --git a/libraries/liblmdb/mdb_copy.1 b/libraries/liblmdb/mdb_copy.1 index 2b3d421e78..b759f68786 100644 --- a/libraries/liblmdb/mdb_copy.1 +++ b/libraries/liblmdb/mdb_copy.1 @@ -5,12 +5,18 @@ mdb_copy \- LMDB environment copy tool .SH SYNOPSIS .B mdb_copy -.I srcpath\ dstpath +.I srcpath\ [dstpath] .SH DESCRIPTION The .B mdb_copy utility copies an LMDB environment. The environment can be copied regardless of whether it is currently in use. + +If +.I dstpath +is specified it must be the path of an empty directory +for storing the backup. 
Otherwise, the backup will be +written to stdout. .SH DIAGNOSTICS Exit status is zero if no errors occur. Errors result in a non-zero exit status and diff --git a/libraries/liblmdb/mdb_copy.c b/libraries/liblmdb/mdb_copy.c index bd0b859110..a2ac4cc7c1 100644 --- a/libraries/liblmdb/mdb_copy.c +++ b/libraries/liblmdb/mdb_copy.c @@ -21,8 +21,8 @@ int main(int argc,char * argv[]) MDB_env *env; char *envname = argv[1]; - if (argc != 3) { - fprintf(stderr, "usage: %s srcpath dstpath\n", argv[0]); + if (argc<2 || argc>3) { + fprintf(stderr, "usage: %s srcpath [dstpath]\n", argv[0]); exit(EXIT_FAILURE); } @@ -32,7 +32,10 @@ int main(int argc,char * argv[]) if (rc) { printf("mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); } else { - rc = mdb_env_copy(env, argv[2]); + if (argc == 2) + rc = mdb_env_copyfd(env, 1); + else + rc = mdb_env_copy(env, argv[2]); if (rc) printf("mdb_env_copy failed, error %d %s\n", rc, mdb_strerror(rc)); } From ad573fe1251f1a1c1f29b4c921f11209ae8e9ffb Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 30 May 2013 13:09:28 -0700 Subject: [PATCH 14/45] Fix prev commit --- libraries/liblmdb/mdb.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index eeb0540894..4109e71c34 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -3631,7 +3631,6 @@ mdb_env_copyfd(MDB_env *env, int fd) int rc; size_t wsize; char *ptr; - HANDLE newfd = INVALID_HANDLE_VALUE; /* Do the lock/unlock of the reader mutex before starting the * write txn. Otherwise other read txns could block writers. @@ -3658,11 +3657,11 @@ mdb_env_copyfd(MDB_env *env, int fd) #ifdef _WIN32 { DWORD len; - rc = WriteFile(newfd, env->me_map, wsize, &len, NULL); + rc = WriteFile(fd, env->me_map, wsize, &len, NULL); rc = (len == wsize) ? MDB_SUCCESS : ErrCode(); } #else - rc = write(newfd, env->me_map, wsize); + rc = write(fd, env->me_map, wsize); rc = (rc == (int)wsize) ? 
MDB_SUCCESS : ErrCode(); #endif if (env->me_txns) @@ -3681,7 +3680,7 @@ mdb_env_copyfd(MDB_env *env, int fd) w2 = MAX_WRITE; else w2 = wsize; - rc = WriteFile(newfd, ptr, w2, &len, NULL); + rc = WriteFile(fd, ptr, w2, &len, NULL); rc = (len == w2) ? MDB_SUCCESS : ErrCode(); if (rc) break; wsize -= w2; @@ -3695,7 +3694,7 @@ mdb_env_copyfd(MDB_env *env, int fd) w2 = MAX_WRITE; else w2 = wsize; - wres = write(newfd, ptr, w2); + wres = write(fd, ptr, w2); rc = (wres > 0) ? MDB_SUCCESS : ErrCode(); if (rc) break; wsize -= wres; From f207c50b15e3727ac3f7ca7bdf54022e34a84e2f Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 30 May 2013 13:13:33 -0700 Subject: [PATCH 15/45] Add warning about interrupting copy --- libraries/liblmdb/mdb_copy.1 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libraries/liblmdb/mdb_copy.1 b/libraries/liblmdb/mdb_copy.1 index b759f68786..11a0042df4 100644 --- a/libraries/liblmdb/mdb_copy.1 +++ b/libraries/liblmdb/mdb_copy.1 @@ -17,6 +17,12 @@ If is specified it must be the path of an empty directory for storing the backup. Otherwise, the backup will be written to stdout. + +Note: currently, if the copy is interrupted a stale lock +will be left in the LMDB environment. This may be fixed +in a future release, but until then you must not +interrupt the copy process. + .SH DIAGNOSTICS Exit status is zero if no errors occur. 
Errors result in a non-zero exit status and From d29b9600e6b893ebea8ed34c5c9ec933727ecb5b Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 30 May 2013 15:33:59 -0700 Subject: [PATCH 16/45] Windows portability fixes for prev commit --- libraries/liblmdb/lmdb.h | 16 +++++++++------- libraries/liblmdb/mdb.c | 8 +++++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 9776366c7c..2076eb35fa 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -144,12 +144,14 @@ typedef int mdb_mode_t; typedef mode_t mdb_mode_t; #endif -#ifndef _WIN32 - /** An abstraction for a file handle. - * On POSIX systems file handles are small integers. On Windows - * they're opaque pointers. - */ -#define HANDLE int +/** An abstraction for a file handle. + * On POSIX systems file handles are small integers. On Windows + * they're opaque pointers. + */ +#ifdef _WIN32 +typedef void *mdb_filehandle_t; +#else +typedef int mdb_filehandle_t; #endif /** @defgroup mdb MDB API @@ -550,7 +552,7 @@ int mdb_env_copy(MDB_env *env, const char *path); * have already been opened for Write access. * @return A non-zero error value on failure and 0 on success. */ -int mdb_env_copyfd(MDB_env *env, HANDLE fd); +int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd); /** @brief Return statistics about the MDB environment. * diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 4109e71c34..33b9d47668 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -199,6 +199,12 @@ mdb_sem_wait(sem_t *sem) */ #define ErrCode() errno + /** An abstraction for a file handle. + * On POSIX systems file handles are small integers. On Windows + * they're opaque pointers. + */ +#define HANDLE int + /** A value for an invalid file handle. * Mainly used to initialize file variables and signify that they are * unused. 
@@ -3625,7 +3631,7 @@ mdb_env_close0(MDB_env *env, int excl) } int -mdb_env_copyfd(MDB_env *env, int fd) +mdb_env_copyfd(MDB_env *env, HANDLE fd) { MDB_txn *txn = NULL; int rc; From 65faa5ed7ebba7136999f52514c71d8670402627 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 30 May 2013 15:56:30 -0700 Subject: [PATCH 17/45] tweak mdb_copy, trap signals --- libraries/liblmdb/mdb_copy.1 | 5 ----- libraries/liblmdb/mdb_copy.c | 23 ++++++++++++++++++++++- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/libraries/liblmdb/mdb_copy.1 b/libraries/liblmdb/mdb_copy.1 index 11a0042df4..7837de5f6b 100644 --- a/libraries/liblmdb/mdb_copy.1 +++ b/libraries/liblmdb/mdb_copy.1 @@ -18,11 +18,6 @@ is specified it must be the path of an empty directory for storing the backup. Otherwise, the backup will be written to stdout. -Note: currently, if the copy is interrupted a stale lock -will be left in the LMDB environment. This may be fixed -in a future release, but until then you must not -interrupt the copy process. - .SH DIAGNOSTICS Exit status is zero if no errors occur. Errors result in a non-zero exit status and diff --git a/libraries/liblmdb/mdb_copy.c b/libraries/liblmdb/mdb_copy.c index a2ac4cc7c1..ca92009cff 100644 --- a/libraries/liblmdb/mdb_copy.c +++ b/libraries/liblmdb/mdb_copy.c @@ -11,10 +11,22 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ +#ifdef _WIN32 +#include <windows.h> +#define MDB_STDOUT GetStdHandle(STD_OUTPUT_HANDLE) +#else +#define MDB_STDOUT 1 +#endif #include <stdio.h> #include <stdlib.h> +#include <signal.h> #include "lmdb.h" +static void +sighandle(int sig) +{ +} + int main(int argc,char * argv[]) { int rc; @@ -26,6 +38,15 @@ int main(int argc,char * argv[]) exit(EXIT_FAILURE); } +#ifdef SIGPIPE + signal(SIGPIPE, sighandle); +#endif +#ifdef SIGHUP + signal(SIGHUP, sighandle); +#endif + signal(SIGINT, sighandle); + signal(SIGTERM, sighandle); + rc = mdb_env_create(&env); rc = mdb_env_open(env, envname, MDB_RDONLY, 0); @@ -33,7 +54,7 @@ int main(int argc,char * argv[]) printf("mdb_env_open failed, error %d %s\n", rc, mdb_strerror(rc)); } else { if (argc == 2) - rc = mdb_env_copyfd(env, 1); + rc = mdb_env_copyfd(env, MDB_STDOUT); else rc = mdb_env_copy(env, argv[2]); if (rc) From 3b623d66e1e0d1fd78d85e5e657b1a167dba8a51 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 5 Jun 2013 15:23:54 -0700 Subject: [PATCH 18/45] ITS#7594 better fix Update the subDB cursor, don't invalidate it --- libraries/liblmdb/mdb.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 33b9d47668..0e14320214 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1636,8 +1636,13 @@ finish: if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top]) { m2->mc_pg[mc->mc_top] = mp; - if (mc->mc_db->md_flags & MDB_DUPSORT) - m2->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) { + MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); + if (!(leaf->mn_flags & F_SUBDATA)) { + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + } } } } From aff123ba113728a254339c07560b7d98c88f30a6 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 5 Jun 2013 16:13:43 -0700 Subject: [PATCH 19/45] ITS#7594 more for subDB cursor fix ---
libraries/liblmdb/mdb.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 0e14320214..0be43fecc2 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -5413,8 +5413,18 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) void *db = NODEDATA(leaf); memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db)); } else { + MDB_cursor *m2; /* shrink fake page */ mdb_node_shrink(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + /* fix other sub-DB cursors pointed at this fake page */ + for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { + if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; + if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top] && + m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } } mc->mc_db->md_entries--; return rc; From e7f6767ea815fe0ada1f95037dfdec176ec4d5bb Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Sun, 5 May 2013 01:28:12 -0700 Subject: [PATCH 20/45] Return fresh overflow pages to current pghead And remove them from the current dirty list. 
--- libraries/liblmdb/mdb.c | 79 +++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 0be43fecc2..9e01fb5e07 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1404,7 +1404,7 @@ again: txn->mt_env->me_pglast = last; goto again; } - mop = malloc(MDB_IDL_SIZEOF(idl)); + mop = mdb_midl_alloc(idl[0]); if (!mop) return ENOMEM; txn->mt_env->me_pglast = last; @@ -1474,7 +1474,7 @@ none: if (oldest <= last) break; idl = (MDB_ID *) data.mv_data; - mop2 = malloc(MDB_IDL_SIZEOF(idl) + MDB_IDL_SIZEOF(mop)); + mop2 = mdb_midl_alloc(idl[0] + mop[0]); if (!mop2) return ENOMEM; /* merge in sorted order */ @@ -1487,7 +1487,7 @@ none: mop2[k--] = mop[j--]; } txn->mt_env->me_pglast = last; - free(txn->mt_env->me_pgfree); + mdb_midl_free(txn->mt_env->me_pgfree); txn->mt_env->me_pghead = txn->mt_env->me_pgfree = mop2; mop = mop2; /* Keep trying to read until we have enough */ @@ -1521,7 +1521,7 @@ none: mop[0]--; } if (MDB_IDL_IS_ZERO(mop)) { - free(txn->mt_env->me_pgfree); + mdb_midl_free(txn->mt_env->me_pgfree); txn->mt_env->me_pghead = txn->mt_env->me_pgfree = NULL; } } @@ -1990,7 +1990,7 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ if (env->me_pghead) { size = MDB_IDL_SIZEOF(env->me_pghead); - env->me_pghead = malloc(size); + env->me_pghead = mdb_midl_alloc(env->me_pghead[0]); if (env->me_pghead) memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); else @@ -2068,7 +2068,7 @@ mdb_txn_reset0(MDB_txn *txn) if (!(env->me_flags & MDB_WRITEMAP)) { mdb_dlist_free(txn); } - free(env->me_pgfree); + mdb_midl_free(env->me_pgfree); if (txn->mt_parent) { txn->mt_parent->mt_child = NULL; @@ -2403,7 +2403,7 @@ free2: if (freecnt != txn->mt_free_pgs[0]) goto free2; - free(env->me_pgfree); + mdb_midl_free(env->me_pgfree); env->me_pghead = env->me_pgfree = NULL; 
if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { @@ -4266,6 +4266,53 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) return mdb_page_search_root(mc, key, flags); } +static int +mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) +{ + pgno_t pg = mp->mp_pgno; + unsigned i, ovpages = mp->mp_pages; + int rc; + + DPRINTF("free ov page %zu (%d)", pg, ovpages); + mc->mc_db->md_overflow_pages -= ovpages; + /* If the page is dirty we just acquired it, so we should + * give it back to our current free list, if any. + * Otherwise put it onto the list of pages we freed in this txn. + */ + if ((mp->mp_flags & P_DIRTY) && mc->mc_txn->mt_env->me_pghead) { + unsigned j, x; + pgno_t *mop = mc->mc_txn->mt_env->me_pghead; + /* Remove from dirty list */ + x = mdb_mid2l_search(mc->mc_txn->mt_u.dirty_list, pg); + for (; x < mc->mc_txn->mt_u.dirty_list[0].mid; x++) + mc->mc_txn->mt_u.dirty_list[x] = mc->mc_txn->mt_u.dirty_list[x+1]; + mc->mc_txn->mt_u.dirty_list[0].mid--; + /* Make room to insert pg */ + j = mop[0] + ovpages; + if (j > mop[-1]) { + rc = mdb_midl_grow(&mop, ovpages); + if (rc) + return rc; + mc->mc_txn->mt_env->me_pghead = mc->mc_txn->mt_env->me_pgfree = mop; + } + for (i = mop[0]; i>0; i--) { + if (mop[i] < pg) + mop[j--] = mop[i]; + else + break; + } + while (j>i) + mop[j--] = pg++; + mop[0] += ovpages; + } else { + for (i=0; imc_txn->mt_free_pgs, pg); + pg++; + } + } + return 0; +} + /** Return the data associated with a given node. * @param[in] txn The transaction for this operation. * @param[in] leaf The node being read. @@ -5244,14 +5291,7 @@ current: memcpy(METADATA(omp), data->mv_data, data->mv_size); goto done; } else { - /* no, free ovpages */ - int i; - mc->mc_db->md_overflow_pages -= ovpages; - for (i=0; imc_txn->mt_free_pgs, pg); - pg++; - } + mdb_ovpage_free(mc, omp); } } else if (NODEDSZ(leaf) == data->mv_size) { /* same size, just replace it. 
Note that we could @@ -6559,7 +6599,6 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) /* add overflow pages to free list */ if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_BIGDATA)) { - int i, ovpages; MDB_page *omp; pgno_t pg; @@ -6567,13 +6606,7 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) != 0) return rc; assert(IS_OVERFLOW(omp)); - ovpages = omp->mp_pages; - mc->mc_db->md_overflow_pages -= ovpages; - for (i=0; imc_txn->mt_free_pgs, pg); - pg++; - } + mdb_ovpage_free(mc, omp); } mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], mc->mc_db->md_pad); mc->mc_db->md_entries--; From bcb67dd22f06a7f7c85350bc548c50ccfc27dbaa Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Sat, 8 Jun 2013 14:10:08 -0700 Subject: [PATCH 21/45] Make sure mdb_stat() gets valid data --- libraries/liblmdb/mdb.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 9e01fb5e07..a90d7d5510 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -7325,6 +7325,12 @@ int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg) if (txn == NULL || arg == NULL || dbi >= txn->mt_numdbs) return EINVAL; + if (txn->mt_dbflags[dbi] & DB_STALE) { + MDB_cursor mc; + MDB_xcursor mx; + /* Stale, must read the DB's root. cursor_init does it for us. 
*/ + mdb_cursor_init(&mc, txn, dbi, &mx); + } return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg); } From c2cac4588a40480c020d320b544bc5f8e72adb11 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 11 Jun 2013 17:13:08 -0700 Subject: [PATCH 22/45] Fix CURSOR_NEXT/PREV on emptied DB --- libraries/liblmdb/mdb.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index a90d7d5510..f52dda7291 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4935,9 +4935,11 @@ fetchm: case MDB_NEXT: case MDB_NEXT_DUP: case MDB_NEXT_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) + if (!(mc->mc_flags & C_INITIALIZED)) { rc = mdb_cursor_first(mc, key, data); - else + if (rc) + break; + } else rc = mdb_cursor_next(mc, key, data, op); break; case MDB_PREV: @@ -4945,6 +4947,8 @@ fetchm: case MDB_PREV_NODUP: if (!(mc->mc_flags & C_INITIALIZED)) { rc = mdb_cursor_last(mc, key, data); + if (rc) + break; mc->mc_flags |= C_INITIALIZED; mc->mc_ki[mc->mc_top]++; } From 99427aa7ded1f2f467fc2b32dde1169a28eb6b64 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 12 Jun 2013 17:20:42 +0200 Subject: [PATCH 23/45] Drop me_pgfree, add mdb_freelist_save(). Split up saving me_pghead, to make me_pgfree unneeded. Also mf_pghead is now a midl. Needed after e7f6767ea815fe0ada1f95037dfdec176ec4d5bb ("Return fresh overflow pages to current pghead"). Tweak MDB_DEBUG freelist output, make it ascending. 
--- libraries/liblmdb/mdb.c | 320 +++++++++++++++++++++------------------- 1 file changed, 166 insertions(+), 154 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index f52dda7291..3f314f4cf2 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -928,7 +928,6 @@ typedef struct MDB_xcursor { typedef struct MDB_pgstate { txnid_t mf_pglast; /**< ID of last old page record we used */ pgno_t *mf_pghead; /**< old pages reclaimed from freelist */ - pgno_t *mf_pgfree; /**< memory to free when dropping me_pghead */ } MDB_pgstate; /** The database environment. */ @@ -963,14 +962,13 @@ struct MDB_env { MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ # define me_pglast me_pgstate.mf_pglast # define me_pghead me_pgstate.mf_pghead -# define me_pgfree me_pgstate.mf_pgfree MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ /** IDL of pages that became unused in a write txn */ MDB_IDL me_free_pgs; /** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. 
*/ MDB_ID2L me_dirty_list; /** Max number of freelist items that can fit in a single overflow page */ - unsigned int me_maxfree_1pg; + int me_maxfree_1pg; /** Max size of a node on a page */ unsigned int me_nodemax; #ifdef _WIN32 @@ -1408,7 +1406,7 @@ again: if (!mop) return ENOMEM; txn->mt_env->me_pglast = last; - txn->mt_env->me_pghead = txn->mt_env->me_pgfree = mop; + txn->mt_env->me_pghead = mop; memcpy(mop, idl, MDB_IDL_SIZEOF(idl)); #if MDB_DEBUG > 1 @@ -1416,9 +1414,8 @@ again: unsigned int i; DPRINTF("IDL read txn %zu root %zu num %zu", last, txn->mt_dbs[FREE_DBI].md_root, idl[0]); - for (i=0; imt_env->me_pglast = last; - mdb_midl_free(txn->mt_env->me_pgfree); - txn->mt_env->me_pghead = txn->mt_env->me_pgfree = mop2; + mdb_midl_free(txn->mt_env->me_pghead); + txn->mt_env->me_pghead = mop2; mop = mop2; /* Keep trying to read until we have enough */ if (mop[0] < (unsigned)num) { @@ -1521,8 +1518,8 @@ none: mop[0]--; } if (MDB_IDL_IS_ZERO(mop)) { - mdb_midl_free(txn->mt_env->me_pgfree); - txn->mt_env->me_pghead = txn->mt_env->me_pgfree = NULL; + mdb_midl_free(txn->mt_env->me_pghead); + txn->mt_env->me_pghead = NULL; } } } @@ -1996,7 +1993,6 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) else rc = ENOMEM; } - env->me_pgfree = env->me_pghead; if (!rc) rc = mdb_cursor_shadow(parent, txn); if (rc) @@ -2068,7 +2064,7 @@ mdb_txn_reset0(MDB_txn *txn) if (!(env->me_flags & MDB_WRITEMAP)) { mdb_dlist_free(txn); } - mdb_midl_free(env->me_pgfree); + mdb_midl_free(env->me_pghead); if (txn->mt_parent) { txn->mt_parent->mt_child = NULL; @@ -2081,7 +2077,7 @@ mdb_txn_reset0(MDB_txn *txn) env->me_free_pgs = txn->mt_free_pgs; } - txn->mt_env->me_pghead = txn->mt_env->me_pgfree = NULL; + txn->mt_env->me_pghead = NULL; txn->mt_env->me_pglast = 0; env->me_txn = NULL; @@ -2128,6 +2124,149 @@ mdb_txn_abort(MDB_txn *txn) free(txn); } +/** Save the freelist as of this transaction to the freeDB. + * This changes the freelist. 
Keep trying until it stabilizes. + */ +static int +mdb_freelist_save(MDB_txn *txn) +{ + /* env->me_pghead[] can grow and shrink during this call. + * env->me_pglast and txn->mt_free_pgs[] can only grow. + * Page numbers cannot disappear from txn->mt_free_pgs[]. + */ + MDB_cursor mc; + MDB_env *env = txn->mt_env; + int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1; + txnid_t pglast = 0, head_id = 0; + pgno_t freecnt = 0, *free_pgs, *mop; + ssize_t head_room = 0, total_room = 0, mop_len; + + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + + if (env->me_pghead || env->me_pglast) { + /* Make sure first page of freeDB is touched and on freelist */ + rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } + + for (;;) { + /* Come back here after each Put() in case freelist changed */ + MDB_val key, data; + + /* If using records from freeDB which we have not yet + * deleted, delete them and any we reserved for me_pghead. + */ + while (pglast < env->me_pglast) { + rc = mdb_cursor_first(&mc, &key, NULL); + if (rc) + return rc; + pglast = head_id = *(txnid_t *)key.mv_data; + total_room = head_room = 0; + assert(pglast <= env->me_pglast); + rc = mdb_cursor_del(&mc, 0); + if (rc) + return rc; + } + + /* Save the IDL of pages freed by this txn, to a single record */ + if (freecnt < txn->mt_free_pgs[0]) { + if (!freecnt) { + /* Make sure last page of freeDB is touched and on freelist */ + key.mv_size = MDB_MAXKEYSIZE+1; + key.mv_data = NULL; + rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY); + if (rc && rc != MDB_NOTFOUND) + return rc; + } + free_pgs = txn->mt_free_pgs; + /* Write to last page of freeDB */ + key.mv_size = sizeof(txn->mt_txnid); + key.mv_data = &txn->mt_txnid; + do { + freecnt = free_pgs[0]; + data.mv_size = MDB_IDL_SIZEOF(free_pgs); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + return rc; + /* Retry if mt_free_pgs[] grew during the Put() */ + free_pgs = txn->mt_free_pgs; + } while (freecnt < 
free_pgs[0]); + mdb_midl_sort(free_pgs); + memcpy(data.mv_data, free_pgs, data.mv_size); +#if MDB_DEBUG > 1 + { + unsigned int i = free_pgs[0]; + DPRINTF("IDL write txn %zu root %zu num %u", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); + for (; i; i--) + DPRINTF("IDL %zu", free_pgs[i]); + } +#endif + continue; + } + + mop = env->me_pghead; + mop_len = mop ? mop[0] : 0; + + /* Reserve records for me_pghead[]. Split it if multi-page, + * to avoid searching freeDB for a page range. Use keys in + * range [1,me_pglast]: Smaller than txnid of oldest reader. + */ + if (total_room >= mop_len) { + if (total_room == mop_len || --more < 0) + break; + } else if (head_room >= maxfree_1pg && head_id > 1) { + /* Keep current record (overflow page), add a new one */ + head_id--; + head_room = 0; + } + /* (Re)write {key = head_id, IDL length = head_room} */ + total_room -= head_room; + head_room = mop_len - total_room; + if (head_room > maxfree_1pg && head_id > 1) { + /* Overflow multi-page for part of me_pghead */ + head_room /= head_id; /* amortize page sizes */ + head_room += maxfree_1pg - head_room % (maxfree_1pg + 1); + } else if (head_room < 0) { + /* Rare case, not bothering to delete this record */ + head_room = 0; + } + key.mv_size = sizeof(head_id); + key.mv_data = &head_id; + data.mv_size = (head_room + 1) * sizeof(pgno_t); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); + if (rc) + return rc; + *(MDB_ID *)data.mv_data = 0; /* IDL is initially empty */ + total_room += head_room; + } + + /* Fill in the reserved, touched me_pghead records. Avoid write ops + * so they cannot rearrange anything, just read the destinations. 
+ */ + rc = MDB_SUCCESS; + if (mop_len) { + MDB_val key, data; + + mop += mop_len + 1; + rc = mdb_cursor_first(&mc, &key, &data); + for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) { + MDB_IDL dest = data.mv_data; + ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1; + + assert(len >= 0 && *(txnid_t*)key.mv_data <= env->me_pglast); + if (len > mop_len) + len = mop_len; + *dest++ = len; + memcpy(dest, mop -= len, len * sizeof(MDB_ID)); + if (! (mop_len -= len)) + break; + } + } + return rc; +} + int mdb_txn_commit(MDB_txn *txn) { @@ -2137,9 +2276,7 @@ mdb_txn_commit(MDB_txn *txn) off_t size; MDB_page *dp; MDB_env *env; - pgno_t next, freecnt; - txnid_t oldpg_txnid, id; - MDB_cursor mc; + pgno_t next; assert(txn != NULL); assert(txn->mt_env != NULL); @@ -2234,7 +2371,7 @@ mdb_txn_commit(MDB_txn *txn) parent->mt_dirty_room = txn->mt_dirty_room; txn->mt_parent->mt_child = NULL; - free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pgfree); + mdb_midl_free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pghead); free(txn); return MDB_SUCCESS; } @@ -2255,6 +2392,7 @@ mdb_txn_commit(MDB_txn *txn) /* Update DB root pointers */ if (txn->mt_numdbs > 2) { + MDB_cursor mc; MDB_dbi i; MDB_val data; data.mv_size = sizeof(MDB_db); @@ -2270,142 +2408,12 @@ mdb_txn_commit(MDB_txn *txn) } } - /* Save the freelist as of this transaction to the freeDB. This - * can change the freelist, so keep trying until it stabilizes. - * - * env->me_pglast and the length of txn->mt_free_pgs cannot decrease, - * except the code below can decrease env->me_pglast to split pghead. - * Page numbers cannot disappear from txn->mt_free_pgs. New pages - * can only appear in env->me_pghead when env->me_pglast increases. - * Until then, the me_pghead pointer won't move but can become NULL. 
- */ - - mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - oldpg_txnid = id = 0; - freecnt = 0; - - /* should only be one record now */ - if (env->me_pghead || env->me_pglast) { - /* make sure first page of freeDB is touched and on freelist */ - rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY); - if (rc && rc != MDB_NOTFOUND) { -fail: - mdb_txn_abort(txn); - return rc; - } - } - - /* Delete IDLs we used from the free list */ - if (env->me_pglast) { - MDB_val key; - - do { -free_pgfirst: - rc = mdb_cursor_first(&mc, &key, NULL); - if (rc) - goto fail; - oldpg_txnid = *(txnid_t *)key.mv_data; -again: - assert(oldpg_txnid <= env->me_pglast); - id = 0; - rc = mdb_cursor_del(&mc, 0); - if (rc) - goto fail; - } while (oldpg_txnid < env->me_pglast); - } - - /* Save IDL of pages freed by this txn, to freeDB */ -free2: - if (freecnt != txn->mt_free_pgs[0]) { - MDB_val key, data; - - /* make sure last page of freeDB is touched and on freelist */ - key.mv_size = MDB_MAXKEYSIZE+1; - key.mv_data = NULL; - rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY); - if (rc && rc != MDB_NOTFOUND) - goto fail; - -#if MDB_DEBUG > 1 - { - unsigned int i; - MDB_IDL idl = txn->mt_free_pgs; - mdb_midl_sort(txn->mt_free_pgs); - DPRINTF("IDL write txn %zu root %zu num %zu", - txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]); - for (i=1; i<=idl[0]; i++) { - DPRINTF("IDL %zu", idl[i]); - } - } -#endif - /* write to last page of freeDB */ - key.mv_size = sizeof(pgno_t); - key.mv_data = &txn->mt_txnid; - /* The free list can still grow during this call, - * despite the pre-emptive touches above. So retry - * until the reserved space remains big enough. 
- */ - do { - assert(freecnt < txn->mt_free_pgs[0]); - freecnt = txn->mt_free_pgs[0]; - data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs); - rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); - if (rc) - goto fail; - } while (freecnt != txn->mt_free_pgs[0]); - mdb_midl_sort(txn->mt_free_pgs); - memcpy(data.mv_data, txn->mt_free_pgs, data.mv_size); - if (oldpg_txnid < env->me_pglast || (!env->me_pghead && id)) - goto free_pgfirst; /* used up freeDB[oldpg_txnid] */ - } - - /* Put back page numbers we took from freeDB but did not use */ - if (env->me_pghead) { - for (;;) { - MDB_val key, data; - pgno_t orig, *mop; - - mop = env->me_pghead; - id = env->me_pglast; - key.mv_size = sizeof(id); - key.mv_data = &id; - /* These steps may grow the freelist again - * due to freed overflow pages... - */ - i = 2; - do { - orig = mop[0]; - if (orig > env->me_maxfree_1pg && id > 4) - orig = env->me_maxfree_1pg; /* Do not use more than 1 page */ - data.mv_size = (orig + 1) * sizeof(pgno_t); - rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); - if (rc) - goto fail; - assert(!env->me_pghead || env->me_pglast); - /* mop could have been used again here */ - if (id != env->me_pglast || env->me_pghead == NULL) - goto again; /* was completely used up */ - assert(mop == env->me_pghead); - } while (mop[0] < orig && --i); - memcpy(data.mv_data, mop, data.mv_size); - if (mop[0] <= orig) - break; - *(pgno_t *)data.mv_data = orig; - mop[orig] = mop[0] - orig; - env->me_pghead = mop += orig; - /* Save more oldpages at the previous txnid. 
*/ - assert(env->me_pglast == id && id == oldpg_txnid); - env->me_pglast = --oldpg_txnid; - } - } - - /* Check for growth of freelist again */ - if (freecnt != txn->mt_free_pgs[0]) - goto free2; - - mdb_midl_free(env->me_pgfree); - env->me_pghead = env->me_pgfree = NULL; + rc = mdb_freelist_save(txn); + if (rc) + goto fail; + mdb_midl_free(env->me_pghead); + env->me_pghead = NULL; if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { if (mdb_midl_shrink(&txn->mt_free_pgs)) env->me_free_pgs = txn->mt_free_pgs; @@ -2535,6 +2543,10 @@ done: free(txn); return MDB_SUCCESS; + +fail: + mdb_txn_abort(txn); + return rc; } /** Read the environment parameters of a DB environment before @@ -4293,7 +4305,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) rc = mdb_midl_grow(&mop, ovpages); if (rc) return rc; - mc->mc_txn->mt_env->me_pghead = mc->mc_txn->mt_env->me_pgfree = mop; + mc->mc_txn->mt_env->me_pghead = mop; } for (i = mop[0]; i>0; i--) { if (mop[i] < pg) From 2d0b362b6f2b1247eb693008a39efaf341f15e82 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 12 Jun 2013 08:41:32 -0700 Subject: [PATCH 24/45] Partial revert c2cac4588a40480c020d320b544bc5f8e72adb11 MDB_NEXT was fine before, duh. --- libraries/liblmdb/mdb.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 3f314f4cf2..2f6a02735f 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4947,11 +4947,9 @@ fetchm: case MDB_NEXT: case MDB_NEXT_DUP: case MDB_NEXT_NODUP: - if (!(mc->mc_flags & C_INITIALIZED)) { + if (!(mc->mc_flags & C_INITIALIZED)) rc = mdb_cursor_first(mc, key, data); - if (rc) - break; - } else + else rc = mdb_cursor_next(mc, key, data, op); break; case MDB_PREV: From c37a11a424b79f1601b2426c5802ea309d7799e2 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 13 Jun 2013 08:25:25 +0200 Subject: [PATCH 25/45] More for ITS#7620 Fix mdb_ovpage_free(). Do not binary-search dirty_list, it is unsorted when MDB_WRITEMAP. 
Catch errors. In nested txns, put the page in mt_free_pgs after all since pages dirty in a parent txn would add complexities. --- libraries/liblmdb/mdb.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 2f6a02735f..536aaaaff5 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4281,47 +4281,55 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) static int mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) { + MDB_txn *txn = mc->mc_txn; pgno_t pg = mp->mp_pgno; unsigned i, ovpages = mp->mp_pages; int rc; DPRINTF("free ov page %zu (%d)", pg, ovpages); - mc->mc_db->md_overflow_pages -= ovpages; /* If the page is dirty we just acquired it, so we should * give it back to our current free list, if any. + * Not currently supported in nested txns. * Otherwise put it onto the list of pages we freed in this txn. */ - if ((mp->mp_flags & P_DIRTY) && mc->mc_txn->mt_env->me_pghead) { + if ((mp->mp_flags & P_DIRTY) && !txn->mt_parent && txn->mt_env->me_pghead) { unsigned j, x; - pgno_t *mop = mc->mc_txn->mt_env->me_pghead; - /* Remove from dirty list */ - x = mdb_mid2l_search(mc->mc_txn->mt_u.dirty_list, pg); - for (; x < mc->mc_txn->mt_u.dirty_list[0].mid; x++) - mc->mc_txn->mt_u.dirty_list[x] = mc->mc_txn->mt_u.dirty_list[x+1]; - mc->mc_txn->mt_u.dirty_list[0].mid--; - /* Make room to insert pg */ + pgno_t *mop = txn->mt_env->me_pghead; + MDB_ID2 *dl, ix, iy; + /* Prepare to insert pg */ j = mop[0] + ovpages; if (j > mop[-1]) { rc = mdb_midl_grow(&mop, ovpages); if (rc) return rc; - mc->mc_txn->mt_env->me_pghead = mop; + txn->mt_env->me_pghead = mop; } - for (i = mop[0]; i>0; i--) { - if (mop[i] < pg) - mop[j--] = mop[i]; - else - break; + /* Remove from dirty list */ + dl = txn->mt_u.dirty_list; + x = dl[0].mid--; + for (ix = dl[x]; ix.mid != pg; ix = iy) { + if (x > 1) { + x--; + iy = dl[x]; + dl[x] = ix; + } else { + assert(x > 1); + 
return MDB_CORRUPTED; + } } + /* Insert in me_pghead */ + for (i = mop[0]; i && mop[i] < pg; i--) + mop[j--] = mop[i]; while (j>i) mop[j--] = pg++; mop[0] += ovpages; } else { for (i=0; i<ovpages; i++) { - mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg); + mdb_midl_append(&txn->mt_free_pgs, pg); pg++; } } + mc->mc_db->md_overflow_pages -= ovpages; return 0; } From 62e4c4f9d0675647ad0d9ca4573060a13df20f34 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 13 Jun 2013 08:58:24 +0200 Subject: [PATCH 26/45] Don't #define _GNU_SOURCE if already defined. --- libraries/liblmdb/mdb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 536aaaaff5..940335730b 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -32,7 +32,9 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE 1 +#endif #include #include #include From 2bacf6e59ed4510d2540d758c3b079406853b18a Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 13 Jun 2013 08:58:24 +0200 Subject: [PATCH 27/45] ITS#7594 Invalidate a dropped MDB DB's cursors. 
--- libraries/liblmdb/mdb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 940335730b..2791e798e7 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -7450,7 +7450,7 @@ mdb_drop0(MDB_cursor *mc, int subs) int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) { - MDB_cursor *mc; + MDB_cursor *mc, *m2; int rc; if (!txn || !dbi || dbi >= txn->mt_numdbs || (unsigned)del > 1 || !(txn->mt_dbflags[dbi] & DB_VALID)) @@ -7464,6 +7464,9 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) return rc; rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); + /* Invalidate the dropped DB's cursors */ + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + m2->mc_flags &= ~C_INITIALIZED; if (rc) goto leave; From 79844bd44687dd5ecdf8ab36144584db06939dc7 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 13 Jun 2013 08:58:24 +0200 Subject: [PATCH 28/45] ITS#7594 Fix MDB cursor tracking with subDBs. The tracking code should not change the current cursor. It did when that was a C_SUB cursor, which should not be checked against the tracked cursors but their xcursors. However, do not bother to skip the tracking code for the current cursor when it would not change that cursor anyway. 
--- libraries/liblmdb/mdb.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 2791e798e7..e1093f6467 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1621,8 +1621,8 @@ finish: MDB_dbi dbi = mc->mc_dbi-1; for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (m2 == mc) continue; m3 = &m2->mc_xcursor->mx_cursor; + if (m3 == mc) continue; if (m3->mc_snum < mc->mc_snum) continue; if (m3->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top]) { m3->mc_pg[mc->mc_top] = mp; @@ -6236,11 +6236,11 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) dbi--; for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (m2 == csrc) continue; if (csrc->mc_flags & C_SUB) m3 = &m2->mc_xcursor->mx_cursor; else m3 = m2; + if (m3 == csrc) continue; if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; @@ -6496,10 +6496,10 @@ mdb_rebalance(MDB_cursor *mc) mc->mc_db->md_depth = 0; mc->mc_db->md_leaf_pages = 0; mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + /* Adjust cursors pointing to mp */ mc->mc_snum = 0; mc->mc_top = 0; { - /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; @@ -6507,7 +6507,6 @@ mdb_rebalance(MDB_cursor *mc) dbi--; for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (m2 == mc) continue; if (mc->mc_flags & C_SUB) m3 = &m2->mc_xcursor->mx_cursor; else @@ -6537,12 +6536,11 @@ mdb_rebalance(MDB_cursor *mc) dbi--; for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (m2 == mc) continue; if (mc->mc_flags & C_SUB) m3 = &m2->mc_xcursor->mx_cursor; else m3 = m2; - if (m3->mc_snum < mc->mc_snum) continue; + if (m3 == mc || m3->mc_snum < mc->mc_snum) continue; if (m3->mc_pg[0] == mp) { m3->mc_pg[0] = mc->mc_pg[0]; m3->mc_snum = 1; @@ -7069,11 +7067,12 @@ done: dbi--; for (m2 = 
mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - if (m2 == mc) continue; if (mc->mc_flags & C_SUB) m3 = &m2->mc_xcursor->mx_cursor; else m3 = m2; + if (m3 == mc) + continue; if (!(m3->mc_flags & C_INITIALIZED)) continue; if (m3->mc_flags & C_SPLITTING) From 7030ad16e2b792d5782c328461d09a31c96b86e6 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 13 Jun 2013 08:58:24 +0200 Subject: [PATCH 29/45] Clean up mdb_page_touch(), mdb_page_copy(). When copying, round up/down to aligned sizes. Skip the unused portion, this was not done when touching a page dirty in the parent txn. No other change in behavior. Simplify mdb_page_touch(), including: Drop test m3==mc, the condition is caught below. Don't "modify" the parent's pgno into the same pgno, when a nested txn copies a parent's page into its freelist. --- libraries/liblmdb/mdb.c | 130 +++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 69 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index e1093f6467..97926d9e14 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1563,7 +1563,7 @@ none: return MDB_SUCCESS; } -/** Copy a page: avoid copying unused portions of the page. +/** Copy the used portions of a non-overflow page. * @param[in] dst page to copy into * @param[in] src page to copy from * @param[in] psize size of a page @@ -1571,17 +1571,19 @@ none: static void mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) { - dst->mp_flags = src->mp_flags | P_DIRTY; - dst->mp_pages = src->mp_pages; + enum { Align = sizeof(pgno_t) }; + indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower; - if (IS_LEAF2(src)) { - memcpy(dst->mp_ptrs, src->mp_ptrs, psize - PAGEHDRSZ - SIZELEFT(src)); + /* If page isn't full, just copy the used portion. Adjust + * alignment so memcpy may copy words instead of bytes. 
+ */ + if ((unused &= -Align) && !IS_LEAF2(src)) { + upper &= -Align; + memcpy(dst, src, (lower + (Align-1)) & -Align); + memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper), + psize - upper); } else { - unsigned int i, nkeys = NUMKEYS(src); - for (i=0; i<nkeys; i++) - dst->mp_ptrs[i] = src->mp_ptrs[i]; - memcpy((char *)dst+src->mp_upper, (char *)src+src->mp_upper, - psize - src->mp_upper); + memcpy(dst, src, psize - unused); } } @@ -1592,76 +1594,36 @@ mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize) static int mdb_page_touch(MDB_cursor *mc) { - MDB_page *mp = mc->mc_pg[mc->mc_top]; + MDB_page *mp = mc->mc_pg[mc->mc_top], *np; + MDB_cursor *m2, *m3; + MDB_dbi dbi; pgno_t pgno; int rc; if (!F_ISSET(mp->mp_flags, P_DIRTY)) { - MDB_page *np; if ((rc = mdb_page_alloc(mc, 1, &np))) return rc; - DPRINTF("touched db %u page %zu -> %zu", mc->mc_dbi, mp->mp_pgno, np->mp_pgno); - assert(mp->mp_pgno != np->mp_pgno); + pgno = np->mp_pgno; + DPRINTF("touched db %u page %zu -> %zu", mc->mc_dbi,mp->mp_pgno,pgno); + assert(mp->mp_pgno != pgno); mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); - if (SIZELEFT(mp)) { - /* If page isn't full, just copy the used portion */ - mdb_page_copy(np, mp, mc->mc_txn->mt_env->me_psize); + /* Update the parent page, if any, to point to the new page */ + if (mc->mc_top) { + MDB_page *parent = mc->mc_pg[mc->mc_top-1]; + MDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]); + SETPGNO(node, pgno); } else { - pgno = np->mp_pgno; - memcpy(np, mp, mc->mc_txn->mt_env->me_psize); - np->mp_pgno = pgno; - np->mp_flags |= P_DIRTY; + mc->mc_db->md_root = pgno; } - mp = np; - -finish: - /* Adjust other cursors pointing to mp */ - if (mc->mc_flags & C_SUB) { - MDB_cursor *m2, *m3; - MDB_dbi dbi = mc->mc_dbi-1; - - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { - m3 = &m2->mc_xcursor->mx_cursor; - if (m3 == mc) continue; - if (m3->mc_snum < mc->mc_snum) continue; - if (m3->mc_pg[mc->mc_top] == 
mc->mc_pg[mc->mc_top]) { - 
m3->mc_pg[mc->mc_top] = mp; - } - } - } else { - MDB_cursor *m2; - - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; - if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top]) { - m2->mc_pg[mc->mc_top] = mp; - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) { - MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!(leaf->mn_flags & F_SUBDATA)) { - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } - } - } - } - } - mc->mc_pg[mc->mc_top] = mp; - /** If this page has a parent, update the parent to point to - * this new page. - */ - if (mc->mc_top) - SETPGNO(NODEPTR(mc->mc_pg[mc->mc_top-1], mc->mc_ki[mc->mc_top-1]), mp->mp_pgno); - else - mc->mc_db->md_root = mp->mp_pgno; } else if (mc->mc_txn->mt_parent && !(mp->mp_flags & P_SUBP)) { - MDB_page *np; MDB_ID2 mid, *dl = mc->mc_txn->mt_u.dirty_list; + pgno = mp->mp_pgno; /* If txn has a parent, make sure the page is in our * dirty list. 
*/ if (dl[0].mid) { - unsigned x = mdb_mid2l_search(dl, mp->mp_pgno); - if (x <= dl[0].mid && dl[x].mid == mp->mp_pgno) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { np = dl[x].mptr; if (mp != np) mc->mc_pg[mc->mc_top] = np; @@ -1673,12 +1635,42 @@ finish: np = mdb_page_malloc(mc, 1); if (!np) return ENOMEM; - memcpy(np, mp, mc->mc_txn->mt_env->me_psize); - mid.mid = np->mp_pgno; + mid.mid = pgno; mid.mptr = np; mdb_mid2l_insert(dl, &mid); - mp = np; - goto finish; + } else { + return 0; + } + + mdb_page_copy(np, mp, mc->mc_txn->mt_env->me_psize); + np->mp_pgno = pgno; + np->mp_flags |= P_DIRTY; + + /* Adjust cursors pointing to mp */ + mc->mc_pg[mc->mc_top] = np; + dbi = mc->mc_dbi; + if (mc->mc_flags & C_SUB) { + dbi--; + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + m3 = &m2->mc_xcursor->mx_cursor; + if (m3->mc_snum < mc->mc_snum) continue; + if (m3->mc_pg[mc->mc_top] == mp) + m3->mc_pg[mc->mc_top] = np; + } + } else { + for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + if (m2->mc_snum < mc->mc_snum) continue; + if (m2->mc_pg[mc->mc_top] == mp) { + m2->mc_pg[mc->mc_top] = np; + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) + { + MDB_node *leaf = NODEPTR(np, mc->mc_ki[mc->mc_top]); + if (!(leaf->mn_flags & F_SUBDATA)) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } + } + } } return 0; } From 60086c1bc77fafecef274a165b308b28902dfebb Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 13 Jun 2013 08:58:25 +0200 Subject: [PATCH 30/45] ITS#7515 Nested MDB txns: Inherit txn flags. Committing a nested txn lost the MDB_TXN_DIRTY flag in the parent, unless the child had set it too. 
--- libraries/liblmdb/mdb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 97926d9e14..df2e888c61 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1971,6 +1971,7 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) parent->mt_child = txn; txn->mt_parent = parent; txn->mt_numdbs = parent->mt_numdbs; + txn->mt_flags = parent->mt_flags; txn->mt_dbxs = parent->mt_dbxs; memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); /* Copy parent's mt_dbflags, but clear DB_NEW */ From 1d94ea5b55e5bce58502b3c0065cf40983eee9a8 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Mon, 17 Jun 2013 22:26:11 +0200 Subject: [PATCH 31/45] ITS#7623 Clear P_SUBP on conversion from fake page --- libraries/liblmdb/mdb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index df2e888c61..3c8db7eefc 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -5189,6 +5189,7 @@ more: MDB_page *mp; unsigned int offset; unsigned int i; + uint16_t fp_flags; fp = NODEDATA(leaf); if (flags == MDB_CURRENT) { @@ -5208,6 +5209,7 @@ reuse: offset = NODESIZE + sizeof(indx_t) + data->mv_size; } offset += offset & 1; + fp_flags = fp->mp_flags; if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) + offset >= mc->mc_txn->mt_env->me_nodemax) { /* yes, convert it */ @@ -5231,6 +5233,7 @@ reuse: offset = mc->mc_txn->mt_env->me_psize - NODEDSZ(leaf); flags |= F_DUPDATA|F_SUBDATA; dummy.md_root = mp->mp_pgno; + fp_flags &= ~P_SUBP; } else { /* no, just grow it */ rdata = &xdata; @@ -5240,7 +5243,7 @@ reuse: mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; flags |= F_DUPDATA; } - mp->mp_flags = fp->mp_flags | P_DIRTY; + mp->mp_flags = fp_flags | P_DIRTY; mp->mp_pad = fp->mp_pad; mp->mp_lower = fp->mp_lower; mp->mp_upper = fp->mp_upper + offset; From 4a9ee2cb726fe8f5c4f256c1a567cdf6721c10a7 Mon Sep 17 00:00:00 
2001 From: Hallvard Furuseth Date: Thu, 20 Jun 2013 07:41:35 +0200 Subject: [PATCH 32/45] ITS#7620: Keep empty IDLs. Tweak mdb_page_alloc(). MDB_env.me_pghead: Don't free it when empty. mdb_ovpage_free() needs it, but cannot allocate it. mdb_midl_alloc(): Fill in length=0. mdb_page_alloc(): Also Skip freeDB if txnid<3, instead of <4, and consistently DPRINTF consumed IDLs. --- libraries/liblmdb/mdb.c | 92 +++++++++++++++++++++------------------- libraries/liblmdb/midl.c | 4 +- 2 files changed, 52 insertions(+), 44 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 3c8db7eefc..3a1d39ed84 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -928,8 +928,8 @@ typedef struct MDB_xcursor { /** State of FreeDB old pages, stored in the MDB_env */ typedef struct MDB_pgstate { - txnid_t mf_pglast; /**< ID of last old page record we used */ - pgno_t *mf_pghead; /**< old pages reclaimed from freelist */ + pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ + txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ } MDB_pgstate; /** The database environment. */ @@ -1341,8 +1341,10 @@ static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) { MDB_txn *txn = mc->mc_txn; + MDB_env *env = txn->mt_env; + pgno_t pgno = P_INVALID, *mop = env->me_pghead; + unsigned mop_len = mop ? mop[0] : 0; MDB_page *np; - pgno_t pgno = P_INVALID; MDB_ID2 mid; txnid_t oldest = 0, last; int rc; @@ -1353,13 +1355,11 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) if (txn->mt_dirty_room == 0) return MDB_TXN_FULL; - /* The free list won't have any content at all until txn 2 has - * committed. The pages freed by txn 2 will be unreferenced - * after txn 3 commits, and so will be safe to re-use in txn 4. + /* Pages freed by txn#1 (after allocating but discarding them) + * are used when txn#1 is unreferenced, i.e. txn#3. 
*/ - if (txn->mt_txnid > 3) { - if (!txn->mt_env->me_pghead && - txn->mt_dbs[FREE_DBI].md_root != P_INVALID) { + if (txn->mt_txnid >= 3) { + if (!mop_len && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) { /* See if there's anything in the free DB */ MDB_cursor m2; MDB_node *leaf; @@ -1391,24 +1391,22 @@ again: if (oldest > last) { /* It's usable, grab it. */ - pgno_t *idl, *mop; + pgno_t *idl; if (!txn->mt_env->me_pglast) { mdb_node_read(txn, leaf, &data); } idl = (MDB_ID *) data.mv_data; - /* We might have a zero-length IDL due to freelist growth - * during a prior commit - */ - if (!idl[0]) { - txn->mt_env->me_pglast = last; - goto again; + mop_len = idl[0]; + if (!mop) { + if (!(env->me_pghead = mop = mdb_midl_alloc(mop_len))) + return ENOMEM; + } else if (mop_len > mop[-1]) { + if ((rc = mdb_midl_grow(&env->me_pghead, mop_len)) != 0) + return rc; + mop = env->me_pghead; } - mop = mdb_midl_alloc(idl[0]); - if (!mop) - return ENOMEM; txn->mt_env->me_pglast = last; - txn->mt_env->me_pghead = mop; memcpy(mop, idl, MDB_IDL_SIZEOF(idl)); #if MDB_DEBUG > 1 @@ -1420,11 +1418,15 @@ again: DPRINTF("IDL %zu", idl[i]); } #endif + /* We might have a zero-length IDL due to freelist growth + * during a prior commit + */ + if (!mop_len) + goto again; } } none: - if (txn->mt_env->me_pghead) { - pgno_t *mop = txn->mt_env->me_pghead; + if (mop_len) { if (num > 1) { MDB_cursor m2; int retry = 1, readit = 0, n2 = num-1; @@ -1448,7 +1450,7 @@ none: #endif if (readit) { MDB_val key, data; - pgno_t *idl, *mop2; + pgno_t *idl, old_id, new_id; last = txn->mt_env->me_pglast + 1; @@ -1473,22 +1475,31 @@ none: if (oldest <= last) break; idl = (MDB_ID *) data.mv_data; - mop2 = mdb_midl_alloc(idl[0] + mop[0]); - if (!mop2) - return ENOMEM; - /* merge in sorted order */ - i = idl[0]; j = mop[0]; mop2[0] = k = i+j; - mop[0] = P_INVALID; - while (i>0 || j>0) { - if (i && idl[i] < mop[j]) - mop2[k--] = idl[i--]; - else - mop2[k--] = mop[j--]; + i = idl[0]; + if (mop_len+i > mop[-1]) { + if ((rc = 
mdb_midl_grow(&env->me_pghead, i)) != 0) + return rc; + mop = env->me_pghead; } +#if MDB_DEBUG > 1 + DPRINTF("IDL read txn %zu root %zu num %u", + last, txn->mt_dbs[FREE_DBI].md_root, i); + for (k = i; k; k--) + DPRINTF("IDL %zu", idl[k]); +#endif + /* merge in sorted order */ + j = mop_len; + k = mop_len += i; + mop[0] = P_INVALID; + old_id = mop[j]; + while (i) { + new_id = idl[i--]; + for (; old_id < new_id; old_id = mop[--j]) + mop[k--] = old_id; + mop[k--] = new_id; + } + mop[0] = mop_len; txn->mt_env->me_pglast = last; - mdb_midl_free(txn->mt_env->me_pghead); - txn->mt_env->me_pghead = mop2; - mop = mop2; /* Keep trying to read until we have enough */ if (mop[0] < (unsigned)num) { continue; @@ -1519,10 +1530,6 @@ none: pgno = MDB_IDL_LAST(mop); mop[0]--; } - if (MDB_IDL_IS_ZERO(mop)) { - mdb_midl_free(txn->mt_env->me_pghead); - txn->mt_env->me_pghead = NULL; - } } } @@ -1966,7 +1973,6 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) txn->mt_toggle = parent->mt_toggle; txn->mt_dirty_room = parent->mt_dirty_room; txn->mt_u.dirty_list[0].mid = 0; - txn->mt_free_pgs[0] = 0; txn->mt_next_pgno = parent->mt_next_pgno; parent->mt_child = txn; txn->mt_parent = parent; @@ -2138,7 +2144,7 @@ mdb_freelist_save(MDB_txn *txn) mdb_cursor_init(&mc, txn, FREE_DBI, NULL); - if (env->me_pghead || env->me_pglast) { + if (env->me_pghead) { /* Make sure first page of freeDB is touched and on freelist */ rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY); if (rc && rc != MDB_NOTFOUND) diff --git a/libraries/liblmdb/midl.c b/libraries/liblmdb/midl.c index d2617c61e4..00df385cdc 100644 --- a/libraries/liblmdb/midl.c +++ b/libraries/liblmdb/midl.c @@ -104,8 +104,10 @@ int mdb_midl_insert( MDB_IDL ids, MDB_ID id ) MDB_IDL mdb_midl_alloc(int num) { MDB_IDL ids = malloc((num+2) * sizeof(MDB_ID)); - if (ids) + if (ids) { *ids++ = num; + *ids = 0; + } return ids; } From c6f9323b3dc0ae8259af12a1b04edb4bbae7f05b Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth 
Date: Thu, 20 Jun 2013 07:41:35 +0200 Subject: [PATCH 33/45] Simplify mdb_page_alloc(). Merge if() branches. Restore retry=500 when MDB_PARANOID, for clarity. --- libraries/liblmdb/mdb.c | 289 +++++++++++++++------------------------- 1 file changed, 106 insertions(+), 183 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 3a1d39ed84..5937f0905a 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1340,14 +1340,27 @@ mdb_find_oldest(MDB_txn *txn) static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) { +#ifdef MDB_PARANOID /* Seems like we can ignore this now */ + /* Get at most <Max_retries> more freeDB records once me_pghead + * has enough pages. If not enough, use new pages from the map. + * If <Paranoid> and mc is updating the freeDB, only get new + * records if me_pghead is empty. Then the freelist cannot play + * catch-up with itself by growing while trying to save it. + */ + enum { Paranoid = 1, Max_retries = 500 }; +#else + enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ }; +#endif + int rc, n2 = num-1, retry = Max_retries; MDB_txn *txn = mc->mc_txn; MDB_env *env = txn->mt_env; - pgno_t pgno = P_INVALID, *mop = env->me_pghead; + pgno_t pgno, *mop = env->me_pghead; unsigned mop_len = mop ? mop[0] : 0; MDB_page *np; MDB_ID2 mid; txnid_t oldest = 0, last; - int rc; + MDB_cursor_op op; + MDB_cursor m2; *mp = NULL; @@ -1355,211 +1368,121 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) if (txn->mt_dirty_room == 0) return MDB_TXN_FULL; - /* Pages freed by txn#1 (after allocating but discarding them) - * are used when txn#1 is unreferenced, i.e. txn#3. 
- */ - if (txn->mt_txnid >= 3) { - if (!mop_len && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) { - /* See if there's anything in the free DB */ - MDB_cursor m2; - MDB_node *leaf; - MDB_val data; - txnid_t *kptr; + for (op = MDB_FIRST;; op = MDB_NEXT) { + unsigned int i, j, k; + MDB_val key, data; + MDB_node *leaf; + pgno_t *idl, old_id, new_id; + /* Seek a big enough contiguous page range. Prefer + * pages at the tail, just truncating the list. + */ + if (mop_len >= (unsigned)num) { + i = mop_len; + do { + pgno = mop[i]; + if (mop[i-n2] == pgno+n2) { + mop[0] = mop_len -= num; + /* Move any stragglers down */ + for (j = i-n2; j <= mop_len; ) + mop[j++] = mop[++i]; + goto search_done; + } + } while (--i >= (unsigned)num); + if (Max_retries < INT_MAX && --retry < 0) + break; + } + + if (op == MDB_FIRST) { /* 1st iteration */ + /* Prepare to fetch more and coalesce */ + oldest = mdb_find_oldest(txn); + last = env->me_pglast; mdb_cursor_init(&m2, txn, FREE_DBI, NULL); - if (!txn->mt_env->me_pglast) { - mdb_page_search(&m2, NULL, 0); - leaf = NODEPTR(m2.mc_pg[m2.mc_top], 0); - kptr = (txnid_t *)NODEKEY(leaf); - last = *kptr; - } else { - MDB_val key; -again: - last = txn->mt_env->me_pglast + 1; - leaf = NULL; - key.mv_data = &last; + if (last) { + op = MDB_SET_RANGE; + key.mv_data = &last; /* will loop up last+1 */ key.mv_size = sizeof(last); - rc = mdb_cursor_set(&m2, &key, &data, MDB_SET_RANGE, NULL); - if (rc) - goto none; - last = *(txnid_t *)key.mv_data; - } - - if (!oldest) - oldest = mdb_find_oldest(txn); - - if (oldest > last) { - /* It's usable, grab it. 
- */ - pgno_t *idl; - - if (!txn->mt_env->me_pglast) { - mdb_node_read(txn, leaf, &data); - } - idl = (MDB_ID *) data.mv_data; - mop_len = idl[0]; - if (!mop) { - if (!(env->me_pghead = mop = mdb_midl_alloc(mop_len))) - return ENOMEM; - } else if (mop_len > mop[-1]) { - if ((rc = mdb_midl_grow(&env->me_pghead, mop_len)) != 0) - return rc; - mop = env->me_pghead; - } - txn->mt_env->me_pglast = last; - memcpy(mop, idl, MDB_IDL_SIZEOF(idl)); - -#if MDB_DEBUG > 1 - { - unsigned int i; - DPRINTF("IDL read txn %zu root %zu num %zu", - last, txn->mt_dbs[FREE_DBI].md_root, idl[0]); - for (i = idl[0]; i; i--) - DPRINTF("IDL %zu", idl[i]); - } -#endif - /* We might have a zero-length IDL due to freelist growth - * during a prior commit - */ - if (!mop_len) - goto again; } + if (Paranoid && mc->mc_dbi == FREE_DBI) + retry = -1; } -none: - if (mop_len) { - if (num > 1) { - MDB_cursor m2; - int retry = 1, readit = 0, n2 = num-1; - unsigned int i, j, k; + if (Paranoid && retry < 0 && mop_len) + break; - /* If current list is too short, must fetch more and coalesce */ - if (mop[0] < (unsigned)num) - readit = 1; - - mdb_cursor_init(&m2, txn, FREE_DBI, NULL); - do { -#ifdef MDB_PARANOID /* Seems like we can ignore this now */ - /* If on freelist, don't try to read more. If what we have - * right now isn't enough just use new pages. - * TODO: get all of this working. Many circular dependencies... - */ - if (mc->mc_dbi == FREE_DBI) { - retry = 0; - readit = 0; - } -#endif - if (readit) { - MDB_val key, data; - pgno_t *idl, old_id, new_id; - - last = txn->mt_env->me_pglast + 1; - - /* We haven't hit the readers list yet? 
*/ - if (!oldest) { - oldest = mdb_find_oldest(txn); - } - - /* There's nothing we can use on the freelist */ - if (oldest - last < 1) - break; - - key.mv_data = &last; - key.mv_size = sizeof(last); - rc = mdb_cursor_set(&m2,&key,&data,MDB_SET_RANGE,NULL); - if (rc) { - if (rc == MDB_NOTFOUND) - break; - return rc; - } - last = *(txnid_t*)key.mv_data; - if (oldest <= last) - break; - idl = (MDB_ID *) data.mv_data; - i = idl[0]; - if (mop_len+i > mop[-1]) { - if ((rc = mdb_midl_grow(&env->me_pghead, i)) != 0) - return rc; - mop = env->me_pghead; - } -#if MDB_DEBUG > 1 - DPRINTF("IDL read txn %zu root %zu num %u", - last, txn->mt_dbs[FREE_DBI].md_root, i); - for (k = i; k; k--) - DPRINTF("IDL %zu", idl[k]); -#endif - /* merge in sorted order */ - j = mop_len; - k = mop_len += i; - mop[0] = P_INVALID; - old_id = mop[j]; - while (i) { - new_id = idl[i--]; - for (; old_id < new_id; old_id = mop[--j]) - mop[k--] = old_id; - mop[k--] = new_id; - } - mop[0] = mop_len; - txn->mt_env->me_pglast = last; - /* Keep trying to read until we have enough */ - if (mop[0] < (unsigned)num) { - continue; - } - } - - /* current list has enough pages, but are they contiguous? 
*/ - for (i=mop[0]; i>=(unsigned)num; i--) { - if (mop[i-n2] == mop[i] + n2) { - pgno = mop[i]; - i -= n2; - /* move any stragglers down */ - for (j=i+num; j<=mop[0]; j++) - mop[i++] = mop[j]; - mop[0] -= num; - break; - } - } - - /* Stop if we succeeded, or no retries */ - if (!retry || pgno != P_INVALID) - break; - readit = 1; - - } while (1); - } else { - /* peel pages off tail, so we only have to truncate the list */ - pgno = MDB_IDL_LAST(mop); - mop[0]--; - } + last++; + /* Do not fetch more if the record will be too recent */ + if (oldest <= last) + break; + rc = mdb_cursor_get(&m2, &key, NULL, op); + if (rc) { + if (rc == MDB_NOTFOUND) + break; + return rc; } + last = *(txnid_t*)key.mv_data; + if (oldest <= last) + break; + np = m2.mc_pg[m2.mc_top]; + leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); + if ((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS) + return rc; + + idl = (MDB_ID *) data.mv_data; + i = idl[0]; + if (!mop) { + if (!(env->me_pghead = mop = mdb_midl_alloc(i))) + return ENOMEM; + } else if (mop_len+i > mop[-1]) { + if ((rc = mdb_midl_grow(&env->me_pghead, i)) != 0) + return rc; + mop = env->me_pghead; + } + env->me_pglast = last; +#if MDB_DEBUG > 1 + DPRINTF("IDL read txn %zu root %zu num %u", + last, txn->mt_dbs[FREE_DBI].md_root, i); + for (k = i; k; k--) + DPRINTF("IDL %zu", idl[k]); +#endif + /* Merge in descending sorted order */ + j = mop_len; + k = mop_len += i; + mop[0] = (pgno_t)-1; + old_id = mop[j]; + while (i) { + new_id = idl[i--]; + for (; old_id < new_id; old_id = mop[--j]) + mop[k--] = old_id; + mop[k--] = new_id; + } + mop[0] = mop_len; } - if (pgno == P_INVALID) { - /* DB size is maxed out */ - if (txn->mt_next_pgno + num >= txn->mt_env->me_maxpg) { + /* Use new pages from the map when nothing suitable in the freeDB */ + pgno = P_INVALID; + if (txn->mt_next_pgno + num >= env->me_maxpg) { DPUTS("DB size maxed out"); return MDB_MAP_FULL; - } } - if (txn->mt_env->me_flags & MDB_WRITEMAP) { + +search_done: + if (env->me_flags & 
MDB_WRITEMAP) { if (pgno == P_INVALID) { pgno = txn->mt_next_pgno; txn->mt_next_pgno += num; } - np = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); - np->mp_pgno = pgno; + np = (MDB_page *)(env->me_map + env->me_psize * pgno); } else { if (!(np = mdb_page_malloc(mc, num))) return ENOMEM; if (pgno == P_INVALID) { - np->mp_pgno = txn->mt_next_pgno; + pgno = txn->mt_next_pgno; txn->mt_next_pgno += num; - } else { - np->mp_pgno = pgno; } } - mid.mid = np->mp_pgno; + mid.mid = np->mp_pgno = pgno; mid.mptr = np; - if (txn->mt_env->me_flags & MDB_WRITEMAP) { + if (env->me_flags & MDB_WRITEMAP) { mdb_mid2l_append(txn->mt_u.dirty_list, &mid); } else { mdb_mid2l_insert(txn->mt_u.dirty_list, &mid); From 9e6ef6c88f842dbd43d8f671d169ca3dc909b3ce Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 22 Jun 2013 11:56:04 +0200 Subject: [PATCH 34/45] Rearrange MDB dirty page code. Split out mdb_dpage_free(), mdb_page_flush() and clean up. --- libraries/liblmdb/mdb.c | 271 +++++++++++++++++++--------------------- 1 file changed, 130 insertions(+), 141 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 5937f0905a..c0d8c30695 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1289,6 +1289,19 @@ mdb_page_free(MDB_env *env, MDB_page *mp) env->me_dpages = mp; } +/* Free a dirty page */ +static void +mdb_dpage_free(MDB_env *env, MDB_page *dp) +{ + if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { + mdb_page_free(env, dp); + } else { + /* large pages just get freed directly */ + VGMEMP_FREE(env, dp); + free(dp); + } +} + /* Return all dirty pages to dpage list */ static void mdb_dlist_free(MDB_txn *txn) @@ -1298,14 +1311,7 @@ mdb_dlist_free(MDB_txn *txn) unsigned i, n = dl[0].mid; for (i = 1; i <= n; i++) { - MDB_page *dp = dl[i].mptr; - if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) { - mdb_page_free(env, dp); - } else { - /* large pages just get freed directly */ - VGMEMP_FREE(env, dp); - free(dp); - } + 
mdb_dpage_free(env, dl[i].mptr); } dl[0].mid = 0; } @@ -2191,16 +2197,111 @@ mdb_freelist_save(MDB_txn *txn) return rc; } +/** Flush dirty pages to the map, after clearing their dirty flag. + */ +static int +mdb_page_flush(MDB_txn *txn) +{ + MDB_env *env = txn->mt_env; + MDB_ID2L dl = txn->mt_u.dirty_list; + unsigned psize = env->me_psize; + int i, pagecount = dl[0].mid, rc; + size_t size, pos = 0; + pgno_t pgno; + MDB_page *dp; +#ifdef _WIN32 + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); +#else + struct iovec iov[MDB_COMMIT_PAGES]; + ssize_t wpos, wsize, wres; + size_t next_pos = 1; /* impossible pos, so pos != next_pos */ + int n = 0; +#endif + + if (env->me_flags & MDB_WRITEMAP) { + /* Clear dirty flags */ + for (i = pagecount; i; i--) { + dp = dl[i].mptr; + dp->mp_flags &= ~P_DIRTY; + } + dl[0].mid = 0; + return MDB_SUCCESS; + } + + /* Write the pages */ + for (i = 1;; i++) { + if (i <= pagecount) { + dp = dl[i].mptr; + pgno = dl[i].mid; + /* clear dirty flag */ + dp->mp_flags &= ~P_DIRTY; + pos = pgno * psize; + size = psize; + if (IS_OVERFLOW(dp)) size *= dp->mp_pages; + } +#ifdef _WIN32 + else break; + + /* Windows actually supports scatter/gather I/O, but only on + * unbuffered file handles. Since we're relying on the OS page + * cache for all our data, that's self-defeating. So we just + * write pages one at a time. We use the ov structure to set + * the write offset, to at least save the overhead of a Seek + * system call. + */ + DPRINTF("committing page %zu", pgno); + ov.Offset = pos & 0xffffffff; + ov.OffsetHigh = pos >> 16; + ov.OffsetHigh >>= 16; + if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { + rc = ErrCode(); + DPRINTF("WriteFile: %d", rc); + return rc; + } +#else + /* Write up to MDB_COMMIT_PAGES dirty pages at a time. 
*/ + if (pos != next_pos || n == MDB_COMMIT_PAGES) { + if (n) { + /* Write previous page(s) */ + lseek(env->me_fd, wpos, SEEK_SET); + wres = writev(env->me_fd, iov, n); + if (wres != wsize) { + rc = ErrCode(); + if (wres < 0) { + DPRINTF("writev: %s", strerror(rc)); + } else { + DPUTS("short write, filesystem full?"); + } + return rc; + } + n = 0; + } + if (i > pagecount) + break; + wpos = pos; + wsize = 0; + } + DPRINTF("committing page %zu", pgno); + next_pos = pos + size; + iov[n].iov_len = size; + iov[n].iov_base = (char *)dp; + wsize += size; + n++; +#endif /* _WIN32 */ + } + + mdb_dlist_free(txn); + + return MDB_SUCCESS; +} + int mdb_txn_commit(MDB_txn *txn) { - int n, done; + int rc; unsigned int i; - ssize_t rc; - off_t size; - MDB_page *dp; MDB_env *env; - pgno_t next; assert(txn != NULL); assert(txn->mt_env != NULL); @@ -2208,10 +2309,8 @@ mdb_txn_commit(MDB_txn *txn) if (txn->mt_child) { rc = mdb_txn_commit(txn->mt_child); txn->mt_child = NULL; - if (rc) { - mdb_txn_abort(txn); - return rc; - } + if (rc) + goto fail; } env = txn->mt_env; @@ -2227,8 +2326,8 @@ mdb_txn_commit(MDB_txn *txn) DPUTS("error flag is set, can't commit"); if (txn->mt_parent) txn->mt_parent->mt_flags |= MDB_TXN_ERROR; - mdb_txn_abort(txn); - return EINVAL; + rc = EINVAL; + goto fail; } if (txn->mt_parent) { @@ -2237,10 +2336,9 @@ mdb_txn_commit(MDB_txn *txn) MDB_ID2L dst, src; /* Append our free list to parent's */ - if (mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs)) { - mdb_txn_abort(txn); - return ENOMEM; - } + rc = mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs); + if (rc) + goto fail; mdb_midl_free(txn->mt_free_pgs); parent->mt_next_pgno = txn->mt_next_pgno; @@ -2302,8 +2400,8 @@ mdb_txn_commit(MDB_txn *txn) if (txn != env->me_txn) { DPUTS("attempt to commit unknown transaction"); - mdb_txn_abort(txn); - return EINVAL; + rc = EINVAL; + goto fail; } mdb_cursors_close(txn, 0); @@ -2338,125 +2436,17 @@ mdb_txn_commit(MDB_txn *txn) 
mdb_midl_free(env->me_pghead); env->me_pghead = NULL; - if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { - if (mdb_midl_shrink(&txn->mt_free_pgs)) - env->me_free_pgs = txn->mt_free_pgs; - } + if (mdb_midl_shrink(&txn->mt_free_pgs)) + env->me_free_pgs = txn->mt_free_pgs; #if MDB_DEBUG > 2 mdb_audit(txn); #endif - if (env->me_flags & MDB_WRITEMAP) { - for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) { - dp = txn->mt_u.dirty_list[i].mptr; - /* clear dirty flag */ - dp->mp_flags &= ~P_DIRTY; - } - txn->mt_u.dirty_list[0].mid = 0; - goto sync; - } - - /* Commit up to MDB_COMMIT_PAGES dirty pages to disk until done. - */ - next = 0; - i = 1; - do { -#ifdef _WIN32 - /* Windows actually supports scatter/gather I/O, but only on - * unbuffered file handles. Since we're relying on the OS page - * cache for all our data, that's self-defeating. So we just - * write pages one at a time. We use the ov structure to set - * the write offset, to at least save the overhead of a Seek - * system call. - */ - OVERLAPPED ov; - memset(&ov, 0, sizeof(ov)); - for (; i<=txn->mt_u.dirty_list[0].mid; i++) { - size_t wsize; - dp = txn->mt_u.dirty_list[i].mptr; - DPRINTF("committing page %zu", dp->mp_pgno); - size = dp->mp_pgno * env->me_psize; - ov.Offset = size & 0xffffffff; - ov.OffsetHigh = size >> 16; - ov.OffsetHigh >>= 16; - /* clear dirty flag */ - dp->mp_flags &= ~P_DIRTY; - wsize = env->me_psize; - if (IS_OVERFLOW(dp)) wsize *= dp->mp_pages; - rc = WriteFile(env->me_fd, dp, wsize, NULL, &ov); - if (!rc) { - n = ErrCode(); - DPRINTF("WriteFile: %d", n); - mdb_txn_abort(txn); - return n; - } - } - done = 1; -#else - struct iovec iov[MDB_COMMIT_PAGES]; - n = 0; - done = 1; - size = 0; - for (; i<=txn->mt_u.dirty_list[0].mid; i++) { - dp = txn->mt_u.dirty_list[i].mptr; - if (dp->mp_pgno != next) { - if (n) { - rc = writev(env->me_fd, iov, n); - if (rc != size) { - n = ErrCode(); - if (rc > 0) - DPUTS("short write, filesystem full?"); - else - DPRINTF("writev: %s", strerror(n)); - 
mdb_txn_abort(txn); - return n; - } - n = 0; - size = 0; - } - lseek(env->me_fd, dp->mp_pgno * env->me_psize, SEEK_SET); - next = dp->mp_pgno; - } - DPRINTF("committing page %zu", dp->mp_pgno); - iov[n].iov_len = env->me_psize; - if (IS_OVERFLOW(dp)) iov[n].iov_len *= dp->mp_pages; - iov[n].iov_base = (char *)dp; - size += iov[n].iov_len; - next = dp->mp_pgno + (IS_OVERFLOW(dp) ? dp->mp_pages : 1); - /* clear dirty flag */ - dp->mp_flags &= ~P_DIRTY; - if (++n >= MDB_COMMIT_PAGES) { - done = 0; - i++; - break; - } - } - - if (n == 0) - break; - - rc = writev(env->me_fd, iov, n); - if (rc != size) { - n = ErrCode(); - if (rc > 0) - DPUTS("short write, filesystem full?"); - else - DPRINTF("writev: %s", strerror(n)); - mdb_txn_abort(txn); - return n; - } -#endif - } while (!done); - - mdb_dlist_free(txn); - -sync: - if ((n = mdb_env_sync(env, 0)) != 0 || - (n = mdb_env_write_meta(txn)) != MDB_SUCCESS) { - mdb_txn_abort(txn); - return n; - } + if ((rc = mdb_page_flush(txn)) || + (rc = mdb_env_sync(env, 0)) || + (rc = mdb_env_write_meta(txn))) + goto fail; done: env->me_pglast = 0; @@ -3499,8 +3489,7 @@ mdb_env_close0(MDB_env *env, int excl) free(env->me_dbxs); free(env->me_path); free(env->me_dirty_list); - if (env->me_free_pgs) - mdb_midl_free(env->me_free_pgs); + mdb_midl_free(env->me_free_pgs); if (env->me_flags & MDB_ENV_TXKEY) { pthread_key_delete(env->me_txkey); From 6b200e3beb55322f9d9ae0e42afe3090cc2f13a6 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 22 Jun 2013 12:30:04 +0200 Subject: [PATCH 35/45] Factor out MDB variables/expressions, cleanup. mdb_page_malloc(): Take a txn arg instead of a cursor. 
--- libraries/liblmdb/mdb.c | 58 +++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index c0d8c30695..8086bf0517 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1256,9 +1256,9 @@ mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) * Re-use old malloc'd pages first for singletons, otherwise just malloc. */ static MDB_page * -mdb_page_malloc(MDB_cursor *mc, unsigned num) +mdb_page_malloc(MDB_txn *txn, unsigned num) { - MDB_env *env = mc->mc_txn->mt_env; + MDB_env *env = txn->mt_env; MDB_page *ret = env->me_dpages; size_t sz = env->me_psize; if (num == 1) { @@ -1479,7 +1479,7 @@ search_done: } np = (MDB_page *)(env->me_map + env->me_psize * pgno); } else { - if (!(np = mdb_page_malloc(mc, num))) + if (!(np = mdb_page_malloc(txn, num))) return ENOMEM; if (pgno == P_INVALID) { pgno = txn->mt_next_pgno; @@ -1531,6 +1531,7 @@ static int mdb_page_touch(MDB_cursor *mc) { MDB_page *mp = mc->mc_pg[mc->mc_top], *np; + MDB_txn *txn = mc->mc_txn; MDB_cursor *m2, *m3; MDB_dbi dbi; pgno_t pgno; @@ -1542,7 +1543,7 @@ mdb_page_touch(MDB_cursor *mc) pgno = np->mp_pgno; DPRINTF("touched db %u page %zu -> %zu", mc->mc_dbi,mp->mp_pgno,pgno); assert(mp->mp_pgno != pgno); - mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + mdb_midl_append(&txn->mt_free_pgs, mp->mp_pgno); /* Update the parent page, if any, to point to the new page */ if (mc->mc_top) { MDB_page *parent = mc->mc_pg[mc->mc_top-1]; @@ -1551,8 +1552,8 @@ mdb_page_touch(MDB_cursor *mc) } else { mc->mc_db->md_root = pgno; } - } else if (mc->mc_txn->mt_parent && !(mp->mp_flags & P_SUBP)) { - MDB_ID2 mid, *dl = mc->mc_txn->mt_u.dirty_list; + } else if (txn->mt_parent && !IS_SUBP(mp)) { + MDB_ID2 mid, *dl = txn->mt_u.dirty_list; pgno = mp->mp_pgno; /* If txn has a parent, make sure the page is in our * dirty list. 
@@ -1568,7 +1569,7 @@ mdb_page_touch(MDB_cursor *mc) } assert(dl[0].mid < MDB_IDL_UM_MAX); /* No - copy it */ - np = mdb_page_malloc(mc, 1); + np = mdb_page_malloc(txn, 1); if (!np) return ENOMEM; mid.mid = pgno; @@ -1578,7 +1579,7 @@ mdb_page_touch(MDB_cursor *mc) return 0; } - mdb_page_copy(np, mp, mc->mc_txn->mt_env->me_psize); + mdb_page_copy(np, mp, txn->mt_env->me_psize); np->mp_pgno = pgno; np->mp_flags |= P_DIRTY; @@ -1587,14 +1588,14 @@ mdb_page_touch(MDB_cursor *mc) dbi = mc->mc_dbi; if (mc->mc_flags & C_SUB) { dbi--; - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + for (m2 = txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { m3 = &m2->mc_xcursor->mx_cursor; if (m3->mc_snum < mc->mc_snum) continue; if (m3->mc_pg[mc->mc_top] == mp) m3->mc_pg[mc->mc_top] = np; } } else { - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { + for (m2 = txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (m2->mc_snum < mc->mc_snum) continue; if (m2->mc_pg[mc->mc_top] == mp) { m2->mc_pg[mc->mc_top] = np; @@ -2002,13 +2003,12 @@ mdb_txn_reset0(MDB_txn *txn) mdb_midl_free(txn->mt_free_pgs); free(txn->mt_u.dirty_list); return; - } else { - if (mdb_midl_shrink(&txn->mt_free_pgs)) - env->me_free_pgs = txn->mt_free_pgs; } - txn->mt_env->me_pghead = NULL; - txn->mt_env->me_pglast = 0; + if (mdb_midl_shrink(&txn->mt_free_pgs)) + env->me_free_pgs = txn->mt_free_pgs; + env->me_pghead = NULL; + env->me_pglast = 0; env->me_txn = NULL; /* The writer mutex was locked in mdb_txn_begin. */ @@ -2699,7 +2699,7 @@ done: * readers will get consistent data regardless of how fresh or * how stale their view of these values is. 
*/ - txn->mt_env->me_txns->mti_txnid = txn->mt_txnid; + env->me_txns->mti_txnid = txn->mt_txnid; return MDB_SUCCESS; } @@ -5203,7 +5203,7 @@ current: if (level > 1) { /* It is writable only in a parent txn */ size_t sz = (size_t) psize * ovpages, off; - MDB_page *np = mdb_page_malloc(mc, ovpages); + MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages); MDB_ID2 id2; if (!np) return ENOMEM; @@ -6878,7 +6878,7 @@ newsep: /* Move half of the keys to the right sibling. */ /* grab a page to hold a temporary copy */ - copy = mdb_page_malloc(mc, 1); + copy = mdb_page_malloc(mc->mc_txn, 1); if (copy == NULL) return ENOMEM; @@ -7295,6 +7295,7 @@ mdb_drop0(MDB_cursor *mc, int subs) rc = mdb_page_search(mc, NULL, 0); if (rc == MDB_SUCCESS) { + MDB_txn *txn = mc->mc_txn; MDB_node *ni; MDB_cursor mx; unsigned int i; @@ -7305,21 +7306,23 @@ mdb_drop0(MDB_cursor *mc, int subs) mdb_cursor_copy(mc, &mx); while (mc->mc_snum > 0) { - if (IS_LEAF(mc->mc_pg[mc->mc_top])) { - for (i=0; imc_pg[mc->mc_top]); i++) { - ni = NODEPTR(mc->mc_pg[mc->mc_top], i); + MDB_page *mp = mc->mc_pg[mc->mc_top]; + unsigned n = NUMKEYS(mp); + if (IS_LEAF(mp)) { + for (i=0; imn_flags & F_BIGDATA) { int j, ovpages; MDB_page *omp; pgno_t pg; memcpy(&pg, NODEDATA(ni), sizeof(pg)); - rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL); + rc = mdb_page_get(txn, pg, &omp, NULL); if (rc != 0) return rc; assert(IS_OVERFLOW(omp)); ovpages = omp->mp_pages; for (j=0; jmc_txn->mt_free_pgs, pg); + mdb_midl_append(&txn->mt_free_pgs, pg); pg++; } } else if (subs && (ni->mn_flags & F_SUBDATA)) { @@ -7330,12 +7333,12 @@ mdb_drop0(MDB_cursor *mc, int subs) } } } else { - for (i=0; imc_pg[mc->mc_top]); i++) { + for (i=0; imc_pg[mc->mc_top], i); + ni = NODEPTR(mp, i); pg = NODEPGNO(ni); /* free it */ - mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg); + mdb_midl_append(&txn->mt_free_pgs, pg); } } if (!mc->mc_top) @@ -7355,8 +7358,7 @@ mdb_drop0(MDB_cursor *mc, int subs) } } /* free it */ - mdb_midl_append(&mc->mc_txn->mt_free_pgs, - 
mc->mc_db->md_root); + mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); } return 0; } From 51ff20a4d6481e8a90ef2c7fc80f71ae8609f74c Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 22 Jun 2013 22:10:43 +0200 Subject: [PATCH 36/45] Tweak MIDLs, catch errors. Grow midls earlier in order to catch errors earlier. Use mdb_midl_need() instead of mdb_midl_grow(), then mdb_midl_xappend() needs no error checks. Factor out mdb_midl_append_range(). --- libraries/liblmdb/mdb.c | 72 ++++++++++++++++++++++------------------ libraries/liblmdb/midl.c | 37 +++++++++++++++++++-- libraries/liblmdb/midl.h | 29 +++++++++++----- 3 files changed, 94 insertions(+), 44 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 8086bf0517..4ab4058277 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1438,8 +1438,8 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) if (!mop) { if (!(env->me_pghead = mop = mdb_midl_alloc(i))) return ENOMEM; - } else if (mop_len+i > mop[-1]) { - if ((rc = mdb_midl_grow(&env->me_pghead, i)) != 0) + } else { + if ((rc = mdb_midl_need(&env->me_pghead, i)) != 0) return rc; mop = env->me_pghead; } @@ -1538,12 +1538,13 @@ mdb_page_touch(MDB_cursor *mc) int rc; if (!F_ISSET(mp->mp_flags, P_DIRTY)) { - if ((rc = mdb_page_alloc(mc, 1, &np))) + if ((rc = mdb_midl_need(&txn->mt_free_pgs, 1)) || + (rc = mdb_page_alloc(mc, 1, &np))) return rc; pgno = np->mp_pgno; DPRINTF("touched db %u page %zu -> %zu", mc->mc_dbi,mp->mp_pgno,pgno); assert(mp->mp_pgno != pgno); - mdb_midl_append(&txn->mt_free_pgs, mp->mp_pgno); + mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); /* Update the parent page, if any, to point to the new page */ if (mc->mc_top) { MDB_page *parent = mc->mc_pg[mc->mc_top-1]; @@ -4207,16 +4208,11 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) */ if ((mp->mp_flags & P_DIRTY) && !txn->mt_parent && txn->mt_env->me_pghead) { unsigned j, x; - pgno_t *mop = txn->mt_env->me_pghead; + pgno_t *mop; 
MDB_ID2 *dl, ix, iy; - /* Prepare to insert pg */ - j = mop[0] + ovpages; - if (j > mop[-1]) { - rc = mdb_midl_grow(&mop, ovpages); - if (rc) - return rc; - txn->mt_env->me_pghead = mop; - } + rc = mdb_midl_need(&txn->mt_env->me_pghead, ovpages); + if (rc) + return rc; /* Remove from dirty list */ dl = txn->mt_u.dirty_list; x = dl[0].mid--; @@ -4231,16 +4227,17 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) } } /* Insert in me_pghead */ + mop = txn->mt_env->me_pghead; + j = mop[0] + ovpages; for (i = mop[0]; i && mop[i] < pg; i--) mop[j--] = mop[i]; while (j>i) mop[j--] = pg++; mop[0] += ovpages; } else { - for (i=0; imt_free_pgs, pg); - pg++; - } + rc = mdb_midl_append_range(&txn->mt_free_pgs, pg, ovpages); + if (rc) + return rc; } mc->mc_db->md_overflow_pages -= ovpages; return 0; @@ -5229,7 +5226,8 @@ current: memcpy(METADATA(omp), data->mv_data, data->mv_size); goto done; } else { - mdb_ovpage_free(mc, omp); + if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS) + return rc2; } } else if (NODEDSZ(leaf) == data->mv_size) { /* same size, just replace it. 
Note that we could @@ -6308,7 +6306,10 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) return rc; } - mdb_midl_append(&csrc->mc_txn->mt_free_pgs, csrc->mc_pg[csrc->mc_top]->mp_pgno); + rc = mdb_midl_append(&csrc->mc_txn->mt_free_pgs, + csrc->mc_pg[csrc->mc_top]->mp_pgno); + if (rc) + return rc; if (IS_LEAF(csrc->mc_pg[csrc->mc_top])) csrc->mc_db->md_leaf_pages--; else @@ -6409,7 +6410,9 @@ mdb_rebalance(MDB_cursor *mc) mc->mc_db->md_root = P_INVALID; mc->mc_db->md_depth = 0; mc->mc_db->md_leaf_pages = 0; - mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; /* Adjust cursors pointing to mp */ mc->mc_snum = 0; mc->mc_top = 0; @@ -6434,7 +6437,9 @@ mdb_rebalance(MDB_cursor *mc) } } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { DPUTS("collapsing root page!"); - mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno); + if (rc) + return rc; mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL); if (rc) @@ -6539,10 +6544,9 @@ mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf) pgno_t pg; memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) != 0) + if ((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || + (rc = mdb_ovpage_free(mc, omp))) return rc; - assert(IS_OVERFLOW(omp)); - mdb_ovpage_free(mc, omp); } mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], mc->mc_db->md_pad); mc->mc_db->md_entries--; @@ -7312,7 +7316,6 @@ mdb_drop0(MDB_cursor *mc, int subs) for (i=0; imn_flags & F_BIGDATA) { - int j, ovpages; MDB_page *omp; pgno_t pg; memcpy(&pg, NODEDATA(ni), sizeof(pg)); @@ -7320,11 +7323,10 @@ mdb_drop0(MDB_cursor *mc, int subs) if (rc != 0) return rc; assert(IS_OVERFLOW(omp)); - ovpages = omp->mp_pages; - for (j=0; jmt_free_pgs, pg); - pg++; - } + rc = mdb_midl_append_range(&txn->mt_free_pgs, + pg, 
omp->mp_pages); + if (rc) + return rc; } else if (subs && (ni->mn_flags & F_SUBDATA)) { mdb_xcursor_init1(mc, ni); rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0); @@ -7333,12 +7335,14 @@ mdb_drop0(MDB_cursor *mc, int subs) } } } else { + if ((rc = mdb_midl_need(&txn->mt_free_pgs, n)) != 0) + return rc; for (i=0; imt_free_pgs, pg); + mdb_midl_xappend(txn->mt_free_pgs, pg); } } if (!mc->mc_top) @@ -7358,9 +7362,11 @@ mdb_drop0(MDB_cursor *mc, int subs) } } /* free it */ - mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); + rc = mdb_midl_append(&txn->mt_free_pgs, mc->mc_db->md_root); + } else if (rc == MDB_NOTFOUND) { + rc = MDB_SUCCESS; } - return 0; + return rc; } int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) diff --git a/libraries/liblmdb/midl.c b/libraries/liblmdb/midl.c index 00df385cdc..0fcedbe661 100644 --- a/libraries/liblmdb/midl.c +++ b/libraries/liblmdb/midl.c @@ -120,8 +120,9 @@ void mdb_midl_free(MDB_IDL ids) int mdb_midl_shrink( MDB_IDL *idp ) { MDB_IDL ids = *idp; - if (*(--ids) > MDB_IDL_UM_MAX) { - ids = realloc(ids, (MDB_IDL_UM_MAX+1) * sizeof(MDB_ID)); + if (*(--ids) > MDB_IDL_UM_MAX && + (ids = realloc(ids, (MDB_IDL_UM_MAX+1) * sizeof(MDB_ID)))) + { *ids++ = MDB_IDL_UM_MAX; *idp = ids; return 1; @@ -129,7 +130,7 @@ int mdb_midl_shrink( MDB_IDL *idp ) return 0; } -int mdb_midl_grow( MDB_IDL *idp, int num ) +static int mdb_midl_grow( MDB_IDL *idp, int num ) { MDB_IDL idn = *idp-1; /* grow it */ @@ -141,6 +142,20 @@ int mdb_midl_grow( MDB_IDL *idp, int num ) return 0; } +int mdb_midl_need( MDB_IDL *idp, unsigned num ) +{ + MDB_IDL ids = *idp; + num += ids[0]; + if (num > ids[-1]) { + num = (num + num/4 + (256 + 2)) & -256; + if (!(ids = realloc(ids-1, num * sizeof(MDB_ID)))) + return ENOMEM; + *ids++ += num -= 2; + *idp = ids; + } + return 0; +} + int mdb_midl_append( MDB_IDL *idp, MDB_ID id ) { MDB_IDL ids = *idp; @@ -169,6 +184,22 @@ int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app ) return 0; } +int mdb_midl_append_range( MDB_IDL *idp, 
MDB_ID id, unsigned n ) +{ + MDB_ID *ids = *idp, len = ids[0]; + /* Too big? */ + if (len + n > ids[-1]) { + if (mdb_midl_grow(idp, n | MDB_IDL_UM_MAX)) + return ENOMEM; + ids = *idp; + } + ids[0] = len + n; + ids += len; + while (n) + ids[n--] = id++; + return 0; +} + /* Quicksort + Insertion sort for small arrays */ #define SMALL 8 diff --git a/libraries/liblmdb/midl.h b/libraries/liblmdb/midl.h index 019d92849e..9ce7133c6e 100644 --- a/libraries/liblmdb/midl.h +++ b/libraries/liblmdb/midl.h @@ -68,6 +68,12 @@ typedef MDB_ID *MDB_IDL; #define MDB_IDL_FIRST( ids ) ( (ids)[1] ) #define MDB_IDL_LAST( ids ) ( (ids)[(ids)[0]] ) + /** Append ID to IDL. The IDL must be big enough. */ +#define mdb_midl_xappend(idl, id) do { \ + MDB_ID *xidl = (idl), xlen = ++(xidl[0]); \ + xidl[xlen] = (id); \ + } while (0) + #if 0 /* superseded by append/sort */ /** Insert an ID into an IDL. * @param[in,out] ids The IDL to insert into. @@ -95,28 +101,35 @@ void mdb_midl_free(MDB_IDL ids); */ int mdb_midl_shrink(MDB_IDL *idp); - /** Grow an IDL. - * Add room for num additional elements. - * @param[in,out] idp Address of the IDL to grow. - * @param[in] num Number of elements to add. - * @return 0 on success, -1 on failure. + /** Make room for num additional elements in an IDL. + * @param[in,out] idp Address of the IDL. + * @param[in] num Number of elements to make room for. + * @return 0 on success, ENOMEM on failure. */ -int mdb_midl_grow(MDB_IDL *idp, int num); +int mdb_midl_need(MDB_IDL *idp, unsigned num); /** Append an ID onto an IDL. * @param[in,out] idp Address of the IDL to append to. * @param[in] id The ID to append. - * @return 0 on success, -1 if the IDL is too large. + * @return 0 on success, ENOMEM if the IDL is too large. */ int mdb_midl_append( MDB_IDL *idp, MDB_ID id ); /** Append an IDL onto an IDL. * @param[in,out] idp Address of the IDL to append to. * @param[in] app The IDL to append. - * @return 0 on success, -1 if the IDL is too large. 
+ * @return 0 on success, ENOMEM if the IDL is too large. */ int mdb_midl_append_list( MDB_IDL *idp, MDB_IDL app ); + /** Append an ID range onto an IDL. + * @param[in,out] idp Address of the IDL to append to. + * @param[in] id The lowest ID to append. + * @param[in] n Number of IDs to append. + * @return 0 on success, ENOMEM if the IDL is too large. + */ +int mdb_midl_append_range( MDB_IDL *idp, MDB_ID id, unsigned n ); + /** Sort an IDL. * @param[in,out] ids The IDL to sort. */ From 3d4ba01e8b8fd37e8323369c345838cbd1522b51 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 22 Jun 2013 22:17:41 +0200 Subject: [PATCH 37/45] Catch more MDB errors. DPRINTF in mdb_txn_reset0. --- libraries/liblmdb/mdb.c | 45 +++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 4ab4058277..071d6f90dd 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1726,8 +1726,11 @@ mdb_cursors_close(MDB_txn *txn, unsigned merge) } } +#ifdef MDB_DEBUG_SKIP +#define mdb_txn_reset0(txn, act) mdb_txn_reset0(txn) +#endif static void -mdb_txn_reset0(MDB_txn *txn); +mdb_txn_reset0(MDB_txn *txn, const char *act); /** Common code for #mdb_txn_begin() and #mdb_txn_renew(). 
* @param[in] txn the transaction handle to initialize @@ -1816,7 +1819,7 @@ mdb_txn_renew0(MDB_txn *txn) txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID; if (env->me_maxpg < txn->mt_next_pgno) { - mdb_txn_reset0(txn); + mdb_txn_reset0(txn, "renew0-mapfail"); if (new_notls) { txn->mt_u.reader->mr_pid = 0; txn->mt_u.reader = NULL; @@ -1928,7 +1931,7 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) if (!rc) rc = mdb_cursor_shadow(parent, txn); if (rc) - mdb_txn_reset0(txn); + mdb_txn_reset0(txn, "beginchild-fail"); } else { rc = mdb_txn_renew0(txn); } @@ -1975,13 +1978,17 @@ mdb_dbis_update(MDB_txn *txn, int keep) * @param[in] txn the transaction handle to reset */ static void -mdb_txn_reset0(MDB_txn *txn) +mdb_txn_reset0(MDB_txn *txn, const char *act) { MDB_env *env = txn->mt_env; /* Close any DBI handles opened in this txn */ mdb_dbis_update(txn, 0); + DPRINTF("%s txn %zu%c %p on mdbenv %p, root page %zu", + act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', + (void *) txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); + if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { if (txn->mt_u.reader) { txn->mt_u.reader->mr_txnid = (txnid_t)-1; @@ -2023,15 +2030,11 @@ mdb_txn_reset(MDB_txn *txn) if (txn == NULL) return; - DPRINTF("reset txn %zu%c %p on mdbenv %p, root page %zu", - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', - (void *) txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); - /* This call is only valid for read-only txns */ if (!(txn->mt_flags & MDB_TXN_RDONLY)) return; - mdb_txn_reset0(txn); + mdb_txn_reset0(txn, "reset"); } void @@ -2040,14 +2043,10 @@ mdb_txn_abort(MDB_txn *txn) if (txn == NULL) return; - DPRINTF("abort txn %zu%c %p on mdbenv %p, root page %zu", - txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 
'r' : 'w', - (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); - if (txn->mt_child) mdb_txn_abort(txn->mt_child); - mdb_txn_reset0(txn); + mdb_txn_reset0(txn, "abort"); /* Free reader slot tied to this txn (if MDB_NOTLS && writable FS) */ if ((txn->mt_flags & MDB_TXN_RDONLY) && txn->mt_u.reader) txn->mt_u.reader->mr_pid = 0; @@ -3578,7 +3577,7 @@ mdb_env_copyfd(MDB_env *env, HANDLE fd) if (env->me_txns) { /* We must start the actual read txn after blocking writers */ - mdb_txn_reset0(txn); + mdb_txn_reset0(txn, "reset-stage1"); /* Temporarily block writers until we snapshot the meta pages */ LOCK_MUTEX_W(env); @@ -4146,7 +4145,9 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) &mc->mc_dbx->md_name, &exact); if (!exact) return MDB_NOTFOUND; - mdb_node_read(mc->mc_txn, leaf, &data); + rc = mdb_node_read(mc->mc_txn, leaf, &data); + if (rc) + return rc; memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), sizeof(uint16_t)); /* The txn may not know this DBI, or another process may @@ -4374,7 +4375,7 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { if (op == MDB_NEXT || op == MDB_NEXT_DUP) { rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT); - if (op != MDB_NEXT || rc == MDB_SUCCESS) + if (op != MDB_NEXT || rc != MDB_NOTFOUND) return rc; } } else { @@ -4388,10 +4389,10 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) { DPUTS("=====> move to next sibling page"); - if (mdb_cursor_sibling(mc, 1) != MDB_SUCCESS) { + if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS) { mc->mc_flags |= C_EOF; mc->mc_flags &= ~C_INITIALIZED; - return MDB_NOTFOUND; + return rc; } mp = mc->mc_pg[mc->mc_top]; DPRINTF("next page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]); @@ -4445,7 +4446,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, 
MDB_cursor_op op) if (op == MDB_PREV || op == MDB_PREV_DUP) { if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV); - if (op != MDB_PREV || rc == MDB_SUCCESS) + if (op != MDB_PREV || rc != MDB_NOTFOUND) return rc; } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; @@ -4459,9 +4460,9 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) if (mc->mc_ki[mc->mc_top] == 0) { DPUTS("=====> move to prev sibling page"); - if (mdb_cursor_sibling(mc, 0) != MDB_SUCCESS) { + if ((rc = mdb_cursor_sibling(mc, 0)) != MDB_SUCCESS) { mc->mc_flags &= ~C_INITIALIZED; - return MDB_NOTFOUND; + return rc; } mp = mc->mc_pg[mc->mc_top]; mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; From a5701cf2fe2b577488f4f9d2e616d5872e3c3e14 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 22 Jun 2013 23:01:30 +0200 Subject: [PATCH 38/45] Fix Windows I/O. Don't put a 64-bit filesize in a 32-bit int before shifting down. Always pass &sizehi to SetFilePointer->maxsize, so sizelo is not treated as a signed distance. Hide unused vars when _WIN32. Reinitialize OVERLAPPED before reuse. --- libraries/liblmdb/mdb.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 071d6f90dd..2c6a47f31c 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2211,7 +2211,6 @@ mdb_page_flush(MDB_txn *txn) MDB_page *dp; #ifdef _WIN32 OVERLAPPED ov; - memset(&ov, 0, sizeof(ov)); #else struct iovec iov[MDB_COMMIT_PAGES]; ssize_t wpos, wsize, wres; @@ -2251,9 +2250,9 @@ mdb_page_flush(MDB_txn *txn) * system call. 
*/ DPRINTF("committing page %zu", pgno); + memset(&ov, 0, sizeof(ov)); ov.Offset = pos & 0xffffffff; - ov.OffsetHigh = pos >> 16; - ov.OffsetHigh >>= 16; + ov.OffsetHigh = pos >> 16 >> 16; if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) { rc = ErrCode(); DPRINTF("WriteFile: %d", rc); @@ -2606,6 +2605,8 @@ mdb_env_write_meta(MDB_txn *txn) HANDLE mfd; #ifdef _WIN32 OVERLAPPED ov; +#else + int r2; #endif assert(txn != NULL); @@ -2674,7 +2675,6 @@ mdb_env_write_meta(MDB_txn *txn) rc = pwrite(mfd, ptr, len, off); #endif if (rc != len) { - int r2; rc = ErrCode(); DPUTS("write failed, disk error?"); /* On a failure, the pagecache still contains the new data. @@ -2782,9 +2782,12 @@ static int mdb_env_open2(MDB_env *env) { unsigned int flags = env->me_flags; - int i, newenv = 0, prot; + int i, newenv = 0; MDB_meta meta; MDB_page *p; +#ifndef _WIN32 + int prot; +#endif memset(&meta, 0, sizeof(meta)); @@ -2815,16 +2818,15 @@ mdb_env_open2(MDB_env *env) HANDLE mh; LONG sizelo, sizehi; sizelo = env->me_mapsize & 0xffffffff; - sizehi = env->me_mapsize >> 16; /* pointless on WIN32, only needed on W64 */ - sizehi >>= 16; + sizehi = env->me_mapsize >> 16 >> 16; /* only needed on Win64 */ /* Windows won't create mappings for zero length files. * Just allocate the maxsize right now. */ if (newenv) { - SetFilePointer(env->me_fd, sizelo, sizehi ? &sizehi : NULL, 0); - if (!SetEndOfFile(env->me_fd)) + if (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo + || !SetEndOfFile(env->me_fd) + || SetFilePointer(env->me_fd, 0, NULL, 0) != 0) return ErrCode(); - SetFilePointer(env->me_fd, 0, NULL, 0); } mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ? PAGE_READWRITE : PAGE_READONLY, From d6d2638acc245116b8f091ac425b6700d06c4713 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 22 Jun 2013 23:15:10 +0200 Subject: [PATCH 39/45] Improve MDB error handling, drop seek calls. Catch I/O errors. Do nothing between OS call failure and ErrCode(). 
Do not use errno after non-OS-errors like write() >= 0, which could give a failure return of success (errno 0) or some irrelevant error code. Drop seek calls, use pwrite/pread/Windows OVERLAPPED offset. --- libraries/liblmdb/mdb.c | 108 +++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 51 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 2c6a47f31c..84f32c68c2 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -160,7 +160,7 @@ #define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) #define ErrCode() GetLastError() #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} -#define close(fd) CloseHandle(fd) +#define close(fd) (CloseHandle(fd) ? 0 : -1) #define munmap(ptr,len) UnmapViewOfFile(ptr) #else @@ -2263,13 +2263,22 @@ mdb_page_flush(MDB_txn *txn) if (pos != next_pos || n == MDB_COMMIT_PAGES) { if (n) { /* Write previous page(s) */ - lseek(env->me_fd, wpos, SEEK_SET); - wres = writev(env->me_fd, iov, n); - if (wres != wsize) { +#ifdef HAVE_PWRITEV + wres = pwritev(env->me_fd, iov, n, wpos); +#else + if (lseek(env->me_fd, wpos, SEEK_SET) < 0) { rc = ErrCode(); + DPRINTF("lseek: %s", strerror(rc)); + return rc; + } + wres = writev(env->me_fd, iov, n); +#endif + if (wres != wsize) { if (wres < 0) { + rc = ErrCode(); DPRINTF("writev: %s", strerror(rc)); } else { + rc = EIO; /* TODO: Use which error code? */ DPUTS("short write, filesystem full?"); } return rc; @@ -2474,27 +2483,28 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) MDB_pagebuf pbuf; MDB_page *p; MDB_meta *m; - int i, rc, err; + int i, rc, off; /* We don't know the page size yet, so use a minimum value. * Read both meta pages so we can use the latest one. 
*/ - for (i=0; i<2; i++) { + for (i=off=0; i<2; i++, off = meta->mm_psize) { #ifdef _WIN32 - if (!ReadFile(env->me_fd, &pbuf, MDB_PAGESIZE, (DWORD *)&rc, NULL) || rc == 0) + DWORD len; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; + rc = ReadFile(env->me_fd,&pbuf,MDB_PAGESIZE,&len,&ov) ? (int)len : -1; #else - if ((rc = read(env->me_fd, &pbuf, MDB_PAGESIZE)) == 0) + rc = pread(env->me_fd, &pbuf, MDB_PAGESIZE, off); #endif - { - return ENOENT; - } - else if (rc != MDB_PAGESIZE) { - err = ErrCode(); - if (rc > 0) - err = MDB_INVALID; - DPRINTF("read: %s", strerror(err)); - return err; + if (rc != MDB_PAGESIZE) { + if (rc == 0 && off == 0) + return ENOENT; + rc = rc < 0 ? (int) ErrCode() : MDB_INVALID; + DPRINTF("read: %s", mdb_strerror(rc)); + return rc; } p = (MDB_page *)&pbuf; @@ -2516,18 +2526,8 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) return MDB_VERSION_MISMATCH; } - if (i) { - if (m->mm_txnid > meta->mm_txnid) - memcpy(meta, m, sizeof(*m)); - } else { + if (off == 0 || m->mm_txnid > meta->mm_txnid) memcpy(meta, m, sizeof(*m)); -#ifdef _WIN32 - if (SetFilePointer(env->me_fd, meta->mm_psize, NULL, FILE_BEGIN) != meta->mm_psize) -#else - if (lseek(env->me_fd, meta->mm_psize, SEEK_SET) != meta->mm_psize) -#endif - return ErrCode(); - } } return 0; } @@ -2577,14 +2577,14 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) #ifdef _WIN32 { DWORD len; - SetFilePointer(env->me_fd, 0, NULL, FILE_BEGIN); - rc = WriteFile(env->me_fd, p, psize * 2, &len, NULL); - rc = (len == psize * 2) ? MDB_SUCCESS : ErrCode(); + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + rc = WriteFile(env->me_fd, p, psize * 2, &len, &ov); + rc = rc ? (len == psize * 2 ? MDB_SUCCESS : EIO) : ErrCode(); } #else - lseek(env->me_fd, 0, SEEK_SET); - rc = write(env->me_fd, p, psize * 2); - rc = (rc == (int)psize * 2) ? MDB_SUCCESS : ErrCode(); + rc = pwrite(env->me_fd, p, psize * 2, 0); + rc = (rc == (int)psize * 2) ? MDB_SUCCESS : rc < 0 ? 
ErrCode() : EIO; #endif free(p); return rc; @@ -2669,13 +2669,14 @@ mdb_env_write_meta(MDB_txn *txn) { memset(&ov, 0, sizeof(ov)); ov.Offset = off; - WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov); + if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov)) + rc = -1; } #else rc = pwrite(mfd, ptr, len, off); #endif if (rc != len) { - rc = ErrCode(); + rc = rc < 0 ? ErrCode() : EIO; DPUTS("write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Write some old data back, to prevent it from being used. @@ -2815,6 +2816,7 @@ mdb_env_open2(MDB_env *env) #ifdef _WIN32 { + int rc; HANDLE mh; LONG sizelo, sizehi; sizelo = env->me_mapsize & 0xffffffff; @@ -2836,9 +2838,10 @@ mdb_env_open2(MDB_env *env) env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? FILE_MAP_WRITE : FILE_MAP_READ, 0, 0, env->me_mapsize, meta.mm_address); + rc = env->me_map ? 0 : ErrCode(); CloseHandle(mh); - if (!env->me_map) - return ErrCode(); + if (rc) + return rc; } #else i = MAP_SHARED; @@ -3207,12 +3210,14 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) size = GetFileSize(env->me_lfd, NULL); #else size = lseek(env->me_lfd, 0, SEEK_END); + if (size == -1) goto fail_errno; #endif rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); if (size < rsize && *excl > 0) { #ifdef _WIN32 - SetFilePointer(env->me_lfd, rsize, NULL, 0); - if (!SetEndOfFile(env->me_lfd)) goto fail_errno; + if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != rsize + || !SetEndOfFile(env->me_lfd)) + goto fail_errno; #else if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno; #endif @@ -3327,7 +3332,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) goto fail; } rc = ErrCode(); - if (rc != EACCES && rc != EAGAIN) { + if (rc && rc != EACCES && rc != EAGAIN) { goto fail; } #ifdef _WIN32 @@ -3510,9 +3515,9 @@ mdb_env_close0(MDB_env *env, int excl) munmap(env->me_map, env->me_mapsize); } if (env->me_mfd != env->me_fd && env->me_mfd 
!= INVALID_HANDLE_VALUE) - close(env->me_mfd); + (void) close(env->me_mfd); if (env->me_fd != INVALID_HANDLE_VALUE) - close(env->me_fd); + (void) close(env->me_fd); if (env->me_txns) { pid_t pid = env->me_pid; /* Clearing readers is done in this function because @@ -3556,7 +3561,7 @@ mdb_env_close0(MDB_env *env, int excl) UnlockFile(env->me_lfd, 0, 0, 1, 0); } #endif - close(env->me_lfd); + (void) close(env->me_lfd); } env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); @@ -3596,11 +3601,11 @@ mdb_env_copyfd(MDB_env *env, HANDLE fd) { DWORD len; rc = WriteFile(fd, env->me_map, wsize, &len, NULL); - rc = (len == wsize) ? MDB_SUCCESS : ErrCode(); + rc = rc ? (len == wsize ? MDB_SUCCESS : EIO) : ErrCode(); } #else rc = write(fd, env->me_map, wsize); - rc = (rc == (int)wsize) ? MDB_SUCCESS : ErrCode(); + rc = rc == (int)wsize ? MDB_SUCCESS : rc < 0 ? ErrCode() : EIO; #endif if (env->me_txns) UNLOCK_MUTEX_W(env); @@ -3619,7 +3624,7 @@ mdb_env_copyfd(MDB_env *env, HANDLE fd) else w2 = wsize; rc = WriteFile(fd, ptr, w2, &len, NULL); - rc = (len == w2) ? MDB_SUCCESS : ErrCode(); + rc = rc ? (len == w2 ? MDB_SUCCESS : EIO) : ErrCode(); if (rc) break; wsize -= w2; ptr += w2; @@ -3633,7 +3638,7 @@ mdb_env_copyfd(MDB_env *env, HANDLE fd) else w2 = wsize; wres = write(fd, ptr, w2); - rc = (wres > 0) ? MDB_SUCCESS : ErrCode(); + rc = wres == (ssize_t)w2 ? MDB_SUCCESS : rc < 0 ? 
ErrCode() : EIO; if (rc) break; wsize -= wres; ptr += wres; @@ -3677,8 +3682,6 @@ mdb_env_copy(MDB_env *env, const char *path) #endif , 0666); #endif - if (!(env->me_flags & MDB_NOSUBDIR)) - free(lpath); if (newfd == INVALID_HANDLE_VALUE) { rc = ErrCode(); goto leave; @@ -3695,8 +3698,11 @@ mdb_env_copy(MDB_env *env, const char *path) rc = mdb_env_copyfd(env, newfd); leave: + if (!(env->me_flags & MDB_NOSUBDIR)) + free(lpath); if (newfd != INVALID_HANDLE_VALUE) - close(newfd); + if (close(newfd) < 0 && rc == MDB_SUCCESS) + rc = ErrCode(); return rc; } From 26a25df5fcc2fcddae6597a61c1b867fc27c568b Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 26 Jun 2013 18:02:17 +0200 Subject: [PATCH 40/45] Tweak I/O, fix last commit. --- libraries/liblmdb/mdb.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 84f32c68c2..c0f43984d6 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -995,6 +995,9 @@ typedef struct MDB_ntxn { #define MDB_COMMIT_PAGES IOV_MAX #endif + /* max bytes to write in one call */ +#define MAX_WRITE (0x80000000U >> (sizeof(ssize_t) == 4)) + static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp); static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); static int mdb_page_touch(MDB_cursor *mc); @@ -2260,23 +2263,27 @@ mdb_page_flush(MDB_txn *txn) } #else /* Write up to MDB_COMMIT_PAGES dirty pages at a time. 
*/ - if (pos != next_pos || n == MDB_COMMIT_PAGES) { + if (pos!=next_pos || n==MDB_COMMIT_PAGES || wsize+size>MAX_WRITE) { if (n) { /* Write previous page(s) */ -#ifdef HAVE_PWRITEV +#ifdef MDB_USE_PWRITEV wres = pwritev(env->me_fd, iov, n, wpos); #else - if (lseek(env->me_fd, wpos, SEEK_SET) < 0) { - rc = ErrCode(); - DPRINTF("lseek: %s", strerror(rc)); - return rc; + if (n == 1) { + wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); + } else { + if (lseek(env->me_fd, wpos, SEEK_SET) < 0) { + rc = ErrCode(); + DPRINTF("lseek: %s", strerror(rc)); + return rc; + } + wres = writev(env->me_fd, iov, n); } - wres = writev(env->me_fd, iov, n); #endif if (wres != wsize) { if (wres < 0) { rc = ErrCode(); - DPRINTF("writev: %s", strerror(rc)); + DPRINTF("Write error: %s", strerror(rc)); } else { rc = EIO; /* TODO: Use which error code? */ DPUTS("short write, filesystem full?"); @@ -2685,6 +2692,8 @@ mdb_env_write_meta(MDB_txn *txn) meta.mm_last_pg = metab.mm_last_pg; meta.mm_txnid = metab.mm_txnid; #ifdef _WIN32 + memset(&ov, 0, sizeof(ov)); + ov.Offset = off; WriteFile(env->me_fd, ptr, len, NULL, &ov); #else r2 = pwrite(env->me_fd, ptr, len, off); @@ -3615,7 +3624,6 @@ mdb_env_copyfd(MDB_env *env, HANDLE fd) ptr = env->me_map + wsize; wsize = txn->mt_next_pgno * env->me_psize - wsize; -#define MAX_WRITE 2147483648U #ifdef _WIN32 while (wsize > 0) { DWORD len, w2; @@ -3638,7 +3646,7 @@ mdb_env_copyfd(MDB_env *env, HANDLE fd) else w2 = wsize; wres = write(fd, ptr, w2); - rc = wres == (ssize_t)w2 ? MDB_SUCCESS : rc < 0 ? ErrCode() : EIO; + rc = wres == (ssize_t)w2 ? MDB_SUCCESS : wres < 0 ? ErrCode() : EIO; if (rc) break; wsize -= wres; ptr += wres; From b7ce06f5c56e452bca6c5a276da7915b8517d80b Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 26 Jun 2013 18:02:26 +0200 Subject: [PATCH 41/45] Makefile/user-macro comments. 
--- libraries/liblmdb/Makefile | 33 +++++++++++++++++++++++++++++++++ libraries/liblmdb/mdb.c | 3 +++ 2 files changed, 36 insertions(+) diff --git a/libraries/liblmdb/Makefile b/libraries/liblmdb/Makefile index 7c3903bdff..25c52ada8e 100644 --- a/libraries/liblmdb/Makefile +++ b/libraries/liblmdb/Makefile @@ -1,3 +1,34 @@ +# Makefile for liblmdb (Lightning memory-mapped database library). + +######################################################################## +# Configuration. The compiler options must enable threaded compilation. +# +# Preprocessor macros (for CPPFLAGS) of interest: +# +# To compile successfully if the default does not: +# - MDB_USE_POSIX_SEM (enabled by default on BSD, Apple) +# Define if shared mutexes are unsupported. Note that Posix +# semaphores and shared mutexes have different behaviors and +# different problems, see the Caveats section in lmdb.h. +# +# For best performance or to compile successfully: +# - MDB_DSYNC = "O_DSYNC" (default) or "O_SYNC" (less efficient) +# If O_DSYNC is undefined but exists in /usr/include, +# preferably set some compiler flag to get the definition. +# - MDB_FDATASYNC = "fdatasync" or "fsync" +# Function for flushing the data of a file. Define this to +# "fsync" if fdatasync() is not supported. fdatasync is +# default except on BSD, Apple, Android which use fsync. +# - MDB_USE_PWRITEV +# Define if the pwritev() function is supported. +# +# Data format: +# - MDB_MAXKEYSIZE +# Controls data packing and limits, see mdb.c. +# +# Debugging: +# - MDB_DEBUG, MDB_PARANOID.
+# CC = gcc W = -W -Wall -Wno-unused-parameter -Wbad-function-cast OPT = -O2 -g @@ -6,6 +37,8 @@ LDLIBS = SOLIBS = prefix = /usr/local +######################################################################## + IHDRS = lmdb.h ILIBS = liblmdb.a liblmdb.so IPROGS = mdb_stat mdb_copy diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index c0f43984d6..c4a80add33 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -348,6 +348,9 @@ static txnid_t mdb_debug_start; #define MDB_VERSION 1 /** @brief The maximum size of a key in the database. + * + * The library rejects bigger keys, and cannot deal with records + * with bigger keys stored by a library with bigger max keysize. * * We require that keys all fit onto a regular page. This limit * could be raised a bit further if needed; to something just From 12c558fe13fbba4c370b0b669271d8ccaad7819a Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 26 Jun 2013 18:02:48 +0200 Subject: [PATCH 42/45] Factor out some vars, simplify. --- libraries/liblmdb/mdb.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index c4a80add33..53568dc92b 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1993,7 +1993,7 @@ mdb_txn_reset0(MDB_txn *txn, const char *act) DPRINTF("%s txn %zu%c %p on mdbenv %p, root page %zu", act, txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 
'r' : 'w', - (void *) txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root); + (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) { if (txn->mt_u.reader) { @@ -2537,7 +2537,7 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) } if (off == 0 || m->mm_txnid > meta->mm_txnid) - memcpy(meta, m, sizeof(*m)); + *meta = *m; } return 0; } @@ -2551,7 +2551,6 @@ static int mdb_env_init_meta(MDB_env *env, MDB_meta *meta) { MDB_page *p, *q; - MDB_meta *m; int rc; unsigned int psize; @@ -2572,17 +2571,12 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) p = calloc(2, psize); p->mp_pgno = 0; p->mp_flags = P_META; - - m = METADATA(p); - memcpy(m, meta, sizeof(*meta)); + *(MDB_meta *)METADATA(p) = *meta; q = (MDB_page *)((char *)p + psize); - q->mp_pgno = 1; q->mp_flags = P_META; - - m = METADATA(q); - memcpy(m, meta, sizeof(*meta)); + *(MDB_meta *)METADATA(q) = *meta; #ifdef _WIN32 { @@ -4218,6 +4212,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) MDB_txn *txn = mc->mc_txn; pgno_t pg = mp->mp_pgno; unsigned i, ovpages = mp->mp_pages; + MDB_env *env = txn->mt_env; int rc; DPRINTF("free ov page %zu (%d)", pg, ovpages); @@ -4226,11 +4221,11 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) * Not currently supported in nested txns. * Otherwise put it onto the list of pages we freed in this txn. 
*/ - if ((mp->mp_flags & P_DIRTY) && !txn->mt_parent && txn->mt_env->me_pghead) { + if ((mp->mp_flags & P_DIRTY) && !txn->mt_parent && env->me_pghead) { unsigned j, x; pgno_t *mop; MDB_ID2 *dl, ix, iy; - rc = mdb_midl_need(&txn->mt_env->me_pghead, ovpages); + rc = mdb_midl_need(&env->me_pghead, ovpages); if (rc) return rc; /* Remove from dirty list */ @@ -4247,7 +4242,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) } } /* Insert in me_pghead */ - mop = txn->mt_env->me_pghead; + mop = env->me_pghead; j = mop[0] + ovpages; for (i = mop[0]; i && mop[i] < pg; i--) mop[j--] = mop[i]; @@ -7406,7 +7401,7 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT); /* Invalidate the dropped DB's cursors */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) + for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) m2->mc_flags &= ~C_INITIALIZED; if (rc) goto leave; From 7f6738355284dbee8bf97a23396a6380e91dfa73 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 26 Jun 2013 18:02:52 +0200 Subject: [PATCH 43/45] Fix alloc/free issues. Page leak, mdb_page_alloc(). On error, don't shorten me_pghead. Memleak, mdb_ovpage_free(). Free page or keep it in dirty_list. Bad MIDL, mdb_midl_need(). Fix midl[-1] (allocated size). --- libraries/liblmdb/mdb.c | 47 ++++++++++++++++++++-------------------- libraries/liblmdb/midl.c | 2 +- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 53568dc92b..d9cac574cf 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1367,12 +1367,13 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) MDB_txn *txn = mc->mc_txn; MDB_env *env = txn->mt_env; pgno_t pgno, *mop = env->me_pghead; - unsigned mop_len = mop ? mop[0] : 0; + unsigned i, j, k, mop_len = mop ? 
mop[0] : 0; MDB_page *np; MDB_ID2 mid; txnid_t oldest = 0, last; MDB_cursor_op op; MDB_cursor m2; + int (*insert)(MDB_ID2L, MDB_ID2 *); *mp = NULL; @@ -1381,7 +1382,6 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) return MDB_TXN_FULL; for (op = MDB_FIRST;; op = MDB_NEXT) { - unsigned int i, j, k; MDB_val key, data; MDB_node *leaf; pgno_t *idl, old_id, new_id; @@ -1393,13 +1393,8 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) i = mop_len; do { pgno = mop[i]; - if (mop[i-n2] == pgno+n2) { - mop[0] = mop_len -= num; - /* Move any stragglers down */ - for (j = i-n2; j <= mop_len; ) - mop[j++] = mop[++i]; + if (mop[i-n2] == pgno+n2) goto search_done; - } } while (--i >= (unsigned)num); if (Max_retries < INT_MAX && --retry < 0) break; @@ -1471,34 +1466,33 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) } /* Use new pages from the map when nothing suitable in the freeDB */ - pgno = P_INVALID; - if (txn->mt_next_pgno + num >= env->me_maxpg) { + i = 0; + pgno = txn->mt_next_pgno; + if (pgno + num >= env->me_maxpg) { DPUTS("DB size maxed out"); return MDB_MAP_FULL; } search_done: if (env->me_flags & MDB_WRITEMAP) { - if (pgno == P_INVALID) { - pgno = txn->mt_next_pgno; - txn->mt_next_pgno += num; - } np = (MDB_page *)(env->me_map + env->me_psize * pgno); + insert = mdb_mid2l_append; } else { if (!(np = mdb_page_malloc(txn, num))) return ENOMEM; - if (pgno == P_INVALID) { - pgno = txn->mt_next_pgno; - txn->mt_next_pgno += num; - } + insert = mdb_mid2l_insert; + } + if (i) { + mop[0] = mop_len -= num; + /* Move any stragglers down */ + for (j = i-num; j < mop_len; ) + mop[++j] = mop[++i]; + } else { + txn->mt_next_pgno = pgno + num; } mid.mid = np->mp_pgno = pgno; mid.mptr = np; - if (env->me_flags & MDB_WRITEMAP) { - mdb_mid2l_append(txn->mt_u.dirty_list, &mid); - } else { - mdb_mid2l_insert(txn->mt_u.dirty_list, &mid); - } + insert(txn->mt_u.dirty_list, &mid); txn->mt_dirty_room--; *mp = np; @@ -4231,16 +4225,21 @@ mdb_ovpage_free(MDB_cursor 
*mc, MDB_page *mp) /* Remove from dirty list */ dl = txn->mt_u.dirty_list; x = dl[0].mid--; - for (ix = dl[x]; ix.mid != pg; ix = iy) { + for (ix = dl[x]; ix.mptr != mp; ix = iy) { if (x > 1) { x--; iy = dl[x]; dl[x] = ix; } else { assert(x > 1); + j = ++(dl[0].mid); + dl[j] = ix; /* Unsorted. OK when MDB_TXN_ERROR. */ + txn->mt_flags |= MDB_TXN_ERROR; return MDB_CORRUPTED; } } + if (!(env->me_flags & MDB_WRITEMAP)) + mdb_dpage_free(env, mp); /* Insert in me_pghead */ mop = env->me_pghead; j = mop[0] + ovpages; diff --git a/libraries/liblmdb/midl.c b/libraries/liblmdb/midl.c index 0fcedbe661..e7bd680cb0 100644 --- a/libraries/liblmdb/midl.c +++ b/libraries/liblmdb/midl.c @@ -150,7 +150,7 @@ int mdb_midl_need( MDB_IDL *idp, unsigned num ) num = (num + num/4 + (256 + 2)) & -256; if (!(ids = realloc(ids-1, num * sizeof(MDB_ID)))) return ENOMEM; - *ids++ += num -= 2; + *ids++ = num -= 2; *idp = ids; } return 0; From 3347a029051c2154b4dbd9c618f8f11fe29db0c7 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Sun, 30 Jun 2013 07:40:02 -0700 Subject: [PATCH 44/45] Fix uninit warnings, lseek usage --- libraries/liblmdb/mdb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index d9cac574cf..40b46e8b9c 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2206,14 +2206,14 @@ mdb_page_flush(MDB_txn *txn) MDB_ID2L dl = txn->mt_u.dirty_list; unsigned psize = env->me_psize; int i, pagecount = dl[0].mid, rc; - size_t size, pos = 0; + size_t size = 0, pos = 0; pgno_t pgno; - MDB_page *dp; + MDB_page *dp = NULL; #ifdef _WIN32 OVERLAPPED ov; #else struct iovec iov[MDB_COMMIT_PAGES]; - ssize_t wpos, wsize, wres; + ssize_t wpos, wsize = 0, wres; size_t next_pos = 1; /* impossible pos, so pos != next_pos */ int n = 0; #endif @@ -2269,7 +2269,7 @@ mdb_page_flush(MDB_txn *txn) if (n == 1) { wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos); } else { - if (lseek(env->me_fd, wpos, SEEK_SET) < 0) 
{ + if (lseek(env->me_fd, wpos, SEEK_SET) == -1) { rc = ErrCode(); DPRINTF("lseek: %s", strerror(rc)); return rc; From 9474c1a0b62cad57b62855d09a201670385caa80 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Mon, 1 Jul 2013 13:41:23 -0700 Subject: [PATCH 45/45] ITS#7635 fix read txn potential data race --- libraries/liblmdb/mdb.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 40b46e8b9c..84405e8963 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1743,6 +1743,7 @@ mdb_txn_renew0(MDB_txn *txn) unsigned int i; uint16_t x; int rc, new_notls = 0; + pgno_t lastpg2; /* Setup db info */ txn->mt_numdbs = env->me_numdbs; @@ -1811,6 +1812,17 @@ mdb_txn_renew0(MDB_txn *txn) /* Copy the DB info and flags */ memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db)); + /* In a read txn, there is a data race here. Make sure our + * last_pg/next_pg are up to date. + */ + lastpg2 = env->me_metas[txn->mt_toggle]->mm_last_pg+1; + if (lastpg2 != txn->mt_next_pgno) { + txn->mt_next_pgno = lastpg2; + /* When this situation occurs, the txnid will certainly also + * be out of date. But as noted before, we don't care about having + * up to date read txn IDs. + */ + } for (i=2; imt_numdbs; i++) { x = env->me_dbflags[i]; txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;