From 1b8bfc575694fe4ee4e40eefd444873da5655086 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 23 Jan 2013 15:47:35 +0100 Subject: [PATCH 01/22] Freelist cleanup/streamlining Drop unneeded definitions, redundant code. --- libraries/liblmdb/mdb.c | 202 +++++++++++++++++----------------------- 1 file changed, 83 insertions(+), 119 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 68f60832b4..ef414583c2 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -911,18 +911,6 @@ typedef struct MDB_xcursor { unsigned char mx_dbflag; } MDB_xcursor; - /** A set of pages freed by an earlier transaction. */ -typedef struct MDB_oldpages { - /** Usually we only read one record from the FREEDB at a time, but - * in case we read more, this will chain them together. - */ - struct MDB_oldpages *mo_next; - /** The ID of the transaction in which these pages were freed. */ - txnid_t mo_txnid; - /** An #MDB_IDL of the pages */ - pgno_t mo_pages[1]; /* dynamic */ -} MDB_oldpages; - /** The database environment. */ struct MDB_env { HANDLE me_fd; /**< The main data file */ @@ -949,12 +937,10 @@ struct MDB_env { size_t me_mapsize; /**< size of the data memory map */ off_t me_size; /**< current file size */ pgno_t me_maxpg; /**< me_mapsize / me_psize */ - txnid_t me_pgfirst; /**< ID of first old page record we used */ txnid_t me_pglast; /**< ID of last old page record we used */ MDB_dbx *me_dbxs; /**< array of static DB info */ uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ - MDB_oldpages *me_pghead; /**< list of old page records */ - MDB_oldpages *me_pgfree; /**< list of page records to free */ + pgno_t *me_pghead; /**< old pages reclaimed from freelist */ pthread_key_t me_txkey; /**< thread-key for readers */ MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ /** IDL of pages that became unused in a write txn */ @@ -1287,7 +1273,6 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) * after txn 3 commits, and so will be safe to re-use in txn 4. */ if (txn->mt_txnid > 3) { - if (!txn->mt_env->me_pghead && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) { /* See if there's anything in the free DB */ @@ -1298,7 +1283,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) txnid_t *kptr; mdb_cursor_init(&m2, txn, FREE_DBI, NULL); - if (!txn->mt_env->me_pgfirst) { + if (!txn->mt_env->me_pglast) { mdb_page_search(&m2, NULL, 0); leaf = NODEPTR(m2.mc_pg[m2.mc_top], 0); kptr = (txnid_t *)NODEKEY(leaf); @@ -1335,10 +1320,9 @@ again: if (oldest > last) { /* It's usable, grab it. */ - MDB_oldpages *mop; - pgno_t *idl; + pgno_t *idl, *mop; - if (!txn->mt_env->me_pgfirst) { + if (!txn->mt_env->me_pglast) { mdb_node_read(txn, leaf, &data); } idl = (MDB_ID *) data.mv_data; @@ -1347,26 +1331,20 @@ again: */ if (!idl[0]) { txn->mt_env->me_pglast = last; - if (!txn->mt_env->me_pgfirst) - txn->mt_env->me_pgfirst = last; goto again; } - mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - sizeof(pgno_t)); + mop = malloc(MDB_IDL_SIZEOF(idl)); if (!mop) return ENOMEM; - mop->mo_next = txn->mt_env->me_pghead; - mop->mo_txnid = last; txn->mt_env->me_pglast = last; - if (!txn->mt_env->me_pgfirst) - txn->mt_env->me_pgfirst = last; txn->mt_env->me_pghead = mop; - memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl)); + memcpy(mop, idl, MDB_IDL_SIZEOF(idl)); #if MDB_DEBUG > 1 { unsigned int i; DPRINTF("IDL read txn %zu root %zu num %zu", - mop->mo_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]); + last, txn->mt_dbs[FREE_DBI].md_root, idl[0]); for (i=0; imt_env->me_pghead) { - MDB_oldpages *mop = txn->mt_env->me_pghead; + pgno_t *mop = txn->mt_env->me_pghead; if (num > 1) { MDB_cursor m2; int retry = 500, readit = 0, n2 = num-1; unsigned int i, j, k; /* If current list is too short, must fetch more and coalesce */ - if (mop->mo_pages[0] < (unsigned)num) + if (mop[0] < (unsigned)num) readit = 1; mdb_cursor_init(&m2, txn, FREE_DBI, NULL); @@ -1398,11 +1376,10 @@ none: } if (readit) { MDB_val key, data; - MDB_oldpages *mop2; - pgno_t *idl; + pgno_t *idl, *mop2; int exact; - last = mop->mo_txnid + 1; + last = txn->mt_env->me_pglast + 1; /* We haven't hit the readers list yet? */ if (!oldest) { @@ -1432,39 +1409,37 @@ none: if (rc) return rc; idl = (MDB_ID *) data.mv_data; - mop2 = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - 2*sizeof(pgno_t) + MDB_IDL_SIZEOF(mop->mo_pages)); + mop2 = malloc(MDB_IDL_SIZEOF(idl) + MDB_IDL_SIZEOF(mop)); if (!mop2) return ENOMEM; /* merge in sorted order */ - i = idl[0]; j = mop->mo_pages[0]; mop2->mo_pages[0] = k = i+j; - mop->mo_pages[0] = P_INVALID; + i = idl[0]; j = mop[0]; mop2[0] = k = i+j; + mop[0] = P_INVALID; while (i>0 || j>0) { - if (i && idl[i] < mop->mo_pages[j]) - mop2->mo_pages[k--] = idl[i--]; + if (i && idl[i] < mop[j]) + mop2[k--] = idl[i--]; else - mop2->mo_pages[k--] = mop->mo_pages[j--]; + mop2[k--] = mop[j--]; } txn->mt_env->me_pglast = last; - mop2->mo_txnid = last; - mop2->mo_next = mop->mo_next; txn->mt_env->me_pghead = mop2; free(mop); mop = mop2; /* Keep trying to read until we have enough */ - if (mop->mo_pages[0] < (unsigned)num) { + if (mop[0] < (unsigned)num) { continue; } } /* current list has enough pages, but are they contiguous? */ - for (i=mop->mo_pages[0]; i>=(unsigned)num; i--) { - if (mop->mo_pages[i-n2] == mop->mo_pages[i] + n2) { - pgno = mop->mo_pages[i]; + for (i=mop[0]; i>=(unsigned)num; i--) { + if (mop[i-n2] == mop[i] + n2) { + pgno = mop[i]; i -= n2; /* move any stragglers down */ - for (j=i+num; j<=mop->mo_pages[0]; j++) - mop->mo_pages[i++] = mop->mo_pages[j]; - mop->mo_pages[0] -= num; + for (j=i+num; j<=mop[0]; j++) + mop[i++] = mop[j]; + mop[0] -= num; break; } } @@ -1478,17 +1453,12 @@ none: } while (1); } else { /* peel pages off tail, so we only have to truncate the list */ - pgno = MDB_IDL_LAST(mop->mo_pages); - mop->mo_pages[0]--; + pgno = MDB_IDL_LAST(mop); + mop[0]--; } - if (MDB_IDL_IS_ZERO(mop->mo_pages)) { - txn->mt_env->me_pghead = mop->mo_next; - if (mc->mc_dbi == FREE_DBI) { - mop->mo_next = txn->mt_env->me_pgfree; - txn->mt_env->me_pgfree = mop; - } else { - free(mop); - } + if (MDB_IDL_IS_ZERO(mop)) { + txn->mt_env->me_pghead = NULL; + free(mop); } } } @@ -1961,7 +1931,7 @@ mdb_txn_reset0(MDB_txn *txn) if (!(env->me_flags & MDB_ROFS)) txn->mt_u.reader->mr_txnid = (txnid_t)-1; } else { - MDB_oldpages *mop; + pgno_t *mop; MDB_page *dp; unsigned int i; @@ -2001,11 +1971,10 @@ mdb_txn_reset0(MDB_txn *txn) env->me_free_pgs = txn->mt_free_pgs; } - while ((mop = txn->mt_env->me_pghead)) { - txn->mt_env->me_pghead = mop->mo_next; + if ((mop = txn->mt_env->me_pghead) != NULL) { + txn->mt_env->me_pghead = NULL; free(mop); } - txn->mt_env->me_pgfirst = 0; txn->mt_env->me_pglast = 0; env->me_txn = NULL; @@ -2054,6 +2023,7 @@ mdb_txn_commit(MDB_txn *txn) MDB_page *dp; MDB_env *env; pgno_t next, freecnt; + txnid_t oldpg_txnid, id; MDB_cursor mc; assert(txn != NULL); @@ -2165,10 +2135,21 @@ mdb_txn_commit(MDB_txn *txn) } } + /* Save the freelist as of this transaction to the freeDB. This + * can change the freelist, so keep trying until it stabilizes. + * + * env->me_pglast and the length of txn->mt_free_pgs cannot decrease. + * Page numbers cannot disappear from txn->mt_free_pgs. New pages + * can only appear in env->me_pghead when env->me_pglast increases. + * Until then, the me_pghead pointer won't move but can become NULL. + */ + mdb_cursor_init(&mc, txn, FREE_DBI, NULL); + oldpg_txnid = id = 0; + freecnt = 0; /* should only be one record now */ - if (env->me_pghead || env->me_pgfirst) { + if (env->me_pghead || env->me_pglast) { /* make sure first page of freeDB is touched and on freelist */ rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY); if (rc && rc != MDB_NOTFOUND) { @@ -2179,28 +2160,27 @@ fail: } /* Delete IDLs we used from the free list */ - if (env->me_pgfirst) { - txnid_t cur; + if (env->me_pglast) { MDB_val key; - int exact = 0; - key.mv_size = sizeof(cur); - for (cur = env->me_pgfirst; cur <= env->me_pglast; cur++) { - key.mv_data = &cur; - - mdb_cursor_set(&mc, &key, NULL, MDB_SET, &exact); + do { +free_pgfirst: + rc = mdb_cursor_first(&mc, &key, NULL); + if (rc) + goto fail; + oldpg_txnid = *(txnid_t *)key.mv_data; +again: + assert(oldpg_txnid <= env->me_pglast); + id = 0; rc = mdb_cursor_del(&mc, 0); if (rc) goto fail; - } - env->me_pgfirst = 0; - env->me_pglast = 0; + } while (oldpg_txnid < env->me_pglast); } - /* save to free list */ + /* Save IDL of pages freed by this txn, to freeDB */ free2: - freecnt = txn->mt_free_pgs[0]; - if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { + if (freecnt != txn->mt_free_pgs[0]) { MDB_val key, data; /* make sure last page of freeDB is touched and on freelist */ @@ -2225,61 +2205,50 @@ free2: /* write to last page of freeDB */ key.mv_size = sizeof(pgno_t); key.mv_data = &txn->mt_txnid; - data.mv_data = txn->mt_free_pgs; /* The free list can still grow during this call, - * despite the pre-emptive touches above. So check - * and make sure the entire thing got written. + * despite the pre-emptive touches above. So retry + * until the reserved space remains big enough. */ do { + assert(freecnt < txn->mt_free_pgs[0]); freecnt = txn->mt_free_pgs[0]; data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs); - mdb_midl_sort(txn->mt_free_pgs); - rc = mdb_cursor_put(&mc, &key, &data, 0); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); if (rc) goto fail; } while (freecnt != txn->mt_free_pgs[0]); + mdb_midl_sort(txn->mt_free_pgs); + memcpy(data.mv_data, txn->mt_free_pgs, data.mv_size); + if (oldpg_txnid < env->me_pglast || (!env->me_pghead && id)) + goto free_pgfirst; /* used up freeDB[oldpg_txnid] */ } - /* should only be one record now */ -again: + + /* Put back page numbers we took from freeDB but did not use */ if (env->me_pghead) { MDB_val key, data; - MDB_oldpages *mop; - pgno_t orig; - txnid_t id; + pgno_t orig, *mop; mop = env->me_pghead; - id = mop->mo_txnid; + id = env->me_pglast; key.mv_size = sizeof(id); key.mv_data = &id; - data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages); - data.mv_data = mop->mo_pages; - orig = mop->mo_pages[0]; /* These steps may grow the freelist again * due to freed overflow pages... */ - rc = mdb_cursor_put(&mc, &key, &data, 0); - if (rc) - goto fail; - if (mop == env->me_pghead && env->me_pghead->mo_txnid == id) { - /* could have been used again here */ - if (mop->mo_pages[0] != orig) { - data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages); - data.mv_data = mop->mo_pages; - id = mop->mo_txnid; - rc = mdb_cursor_put(&mc, &key, &data, 0); - if (rc) - goto fail; - } - } else { - /* was completely used up */ - rc = mdb_cursor_del(&mc, 0); + i = 2; + do { + orig = mop[0]; + data.mv_size = MDB_IDL_SIZEOF(mop); + rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); if (rc) goto fail; - if (env->me_pghead) - goto again; - } - env->me_pgfirst = 0; - env->me_pglast = 0; + assert(!env->me_pghead || env->me_pglast); + /* mop could have been used again here */ + if (id != env->me_pglast || env->me_pghead == NULL) + goto again; /* was completely used up */ + assert(mop == env->me_pghead && mop[0] <= orig); + } while (mop[0] != orig && --i); + memcpy(data.mv_data, mop, data.mv_size); } /* Check for growth of freelist again */ @@ -2291,12 +2260,6 @@ again: env->me_pghead = NULL; } - while (env->me_pgfree) { - MDB_oldpages *mop = env->me_pgfree; - env->me_pgfree = mop->mo_next; - free(mop); - } - if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { if (mdb_midl_shrink(&txn->mt_free_pgs)) env->me_free_pgs = txn->mt_free_pgs; @@ -2431,6 +2394,7 @@ sync: } done: + env->me_pglast = 0; env->me_txn = NULL; if (txn->mt_numdbs > env->me_numdbs) { /* update the DB flags */ From 52ecd38e18ac25329e5397d34d60387008f12559 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 12 Feb 2013 12:27:59 +0100 Subject: [PATCH 02/22] ITS#7455 Save freelist in single-page chunks --- libraries/liblmdb/mdb.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index ef414583c2..e1f6357c7f 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2022,7 +2022,7 @@ mdb_txn_commit(MDB_txn *txn) off_t size; MDB_page *dp; MDB_env *env; - pgno_t next, freecnt; + pgno_t next, freecnt, maxfree_1pg; txnid_t oldpg_txnid, id; MDB_cursor mc; @@ -2138,7 +2138,8 @@ mdb_txn_commit(MDB_txn *txn) /* Save the freelist as of this transaction to the freeDB. This * can change the freelist, so keep trying until it stabilizes. * - * env->me_pglast and the length of txn->mt_free_pgs cannot decrease. + * env->me_pglast and the length of txn->mt_free_pgs cannot decrease, + * except the code below can decrease env->me_pglast to split pghead. * Page numbers cannot disappear from txn->mt_free_pgs. New pages * can only appear in env->me_pghead when env->me_pglast increases. * Until then, the me_pghead pointer won't move but can become NULL. @@ -2147,6 +2148,12 @@ mdb_txn_commit(MDB_txn *txn) mdb_cursor_init(&mc, txn, FREE_DBI, NULL); oldpg_txnid = id = 0; freecnt = 0; + /* Preferred max #items per freelist entry, to avoid overflow pages. + * Leave room for headers, key (txnid), pagecount (pageno_t), and + * FIXME: a bit more in case there is some delimiter I don't know about. + */ + maxfree_1pg = (env->me_psize - (PAGEHDRSZ + NODESIZE + 3*sizeof(MDB_ID))) + / sizeof(pgno_t); /* should only be one record now */ if (env->me_pghead || env->me_pglast) { @@ -2225,6 +2232,7 @@ free2: /* Put back page numbers we took from freeDB but did not use */ if (env->me_pghead) { + for (;;) { MDB_val key, data; pgno_t orig, *mop; @@ -2238,7 +2246,9 @@ free2: i = 2; do { orig = mop[0]; - data.mv_size = MDB_IDL_SIZEOF(mop); + if (orig > maxfree_1pg && id > 4) + orig = maxfree_1pg; /* Do not use an overflow page */ + data.mv_size = (orig + 1) * sizeof(pgno_t); rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); if (rc) goto fail; @@ -2246,9 +2256,19 @@ free2: /* mop could have been used again here */ if (id != env->me_pglast || env->me_pghead == NULL) goto again; /* was completely used up */ - assert(mop == env->me_pghead && mop[0] <= orig); - } while (mop[0] != orig && --i); + assert(mop == env->me_pghead); + } while (mop[0] < orig && --i); memcpy(data.mv_data, mop, data.mv_size); + if (mop[0] <= orig) + break; + *(pgno_t *)data.mv_data = orig; + mop[0] -= orig; + memmove(&mop[1], &mop[1 + orig], + mop[0] * sizeof(pgno_t)); + /* Save more oldpages at the previous txnid. */ + assert(env->me_pglast == id && id == oldpg_txnid); + env->me_pglast = --oldpg_txnid; + } } /* Check for growth of freelist again */ From 5e59695b8d8825ed85e0ebd59aa2da168e869075 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 14 Feb 2013 19:20:45 +0000 Subject: [PATCH 03/22] Don't memmove freelist entry when chunking it Just advance the pointer, now that it's no longer a complex struct --- libraries/liblmdb/mdb.c | 44 +++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index e1f6357c7f..7bfdfc27bc 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -941,12 +941,15 @@ struct MDB_env { MDB_dbx *me_dbxs; /**< array of static DB info */ uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ pgno_t *me_pghead; /**< old pages reclaimed from freelist */ + pgno_t *me_pgfree; /**< memory to free when dropping me_pghead */ pthread_key_t me_txkey; /**< thread-key for readers */ MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ /** IDL of pages that became unused in a write txn */ MDB_IDL me_free_pgs; /** ID2L of pages that were written during a write txn */ MDB_ID2 me_dirty_list[MDB_IDL_UM_SIZE]; + /** Max number of freelist items that can fit in a single overflow page */ + unsigned int me_maxfree_1pg; #ifdef _WIN32 HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */ HANDLE me_wmutex; @@ -1337,7 +1340,7 @@ again: if (!mop) return ENOMEM; txn->mt_env->me_pglast = last; - txn->mt_env->me_pghead = mop; + txn->mt_env->me_pghead = txn->mt_env->me_pgfree = mop; memcpy(mop, idl, MDB_IDL_SIZEOF(idl)); #if MDB_DEBUG > 1 @@ -1422,8 +1425,8 @@ none: mop2[k--] = mop[j--]; } txn->mt_env->me_pglast = last; - txn->mt_env->me_pghead = mop2; - free(mop); + free(txn->mt_env->me_pgfree); + txn->mt_env->me_pghead = txn->mt_env->me_pgfree = mop2; mop = mop2; /* Keep trying to read until we have enough */ if (mop[0] < (unsigned)num) { @@ -1457,8 +1460,8 @@ none: mop[0]--; } if (MDB_IDL_IS_ZERO(mop)) { - txn->mt_env->me_pghead = NULL; - free(mop); + free(txn->mt_env->me_pgfree); + txn->mt_env->me_pghead = txn->mt_env->me_pgfree = NULL; } } } @@ -1931,7 +1934,6 @@ mdb_txn_reset0(MDB_txn *txn) if (!(env->me_flags & MDB_ROFS)) txn->mt_u.reader->mr_txnid = (txnid_t)-1; } else { - pgno_t *mop; MDB_page *dp; unsigned int i; @@ -1971,10 +1973,8 @@ mdb_txn_reset0(MDB_txn *txn) env->me_free_pgs = txn->mt_free_pgs; } - if ((mop = txn->mt_env->me_pghead) != NULL) { - txn->mt_env->me_pghead = NULL; - free(mop); - } + free(txn->mt_env->me_pgfree); + txn->mt_env->me_pghead = txn->mt_env->me_pgfree = NULL; txn->mt_env->me_pglast = 0; env->me_txn = NULL; @@ -2022,7 +2022,7 @@ mdb_txn_commit(MDB_txn *txn) off_t size; MDB_page *dp; MDB_env *env; - pgno_t next, freecnt, maxfree_1pg; + pgno_t next, freecnt; txnid_t oldpg_txnid, id; MDB_cursor mc; @@ -2148,12 +2148,6 @@ mdb_txn_commit(MDB_txn *txn) mdb_cursor_init(&mc, txn, FREE_DBI, NULL); oldpg_txnid = id = 0; freecnt = 0; - /* Preferred max #items per freelist entry, to avoid overflow pages. - * Leave room for headers, key (txnid), pagecount (pageno_t), and - * FIXME: a bit more in case there is some delimiter I don't know about. - */ - maxfree_1pg = (env->me_psize - (PAGEHDRSZ + NODESIZE + 3*sizeof(MDB_ID))) - / sizeof(pgno_t); /* should only be one record now */ if (env->me_pghead || env->me_pglast) { @@ -2246,8 +2240,8 @@ free2: i = 2; do { orig = mop[0]; - if (orig > maxfree_1pg && id > 4) - orig = maxfree_1pg; /* Do not use an overflow page */ + if (orig > env->me_maxfree_1pg && id > 4) + orig = env->me_maxfree_1pg; /* Do not use more than 1 page */ data.mv_size = (orig + 1) * sizeof(pgno_t); rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE); if (rc) @@ -2262,9 +2256,8 @@ free2: if (mop[0] <= orig) break; *(pgno_t *)data.mv_data = orig; - mop[0] -= orig; - memmove(&mop[1], &mop[1 + orig], - mop[0] * sizeof(pgno_t)); + mop[orig] = mop[0] - orig; + env->me_pghead = mop += orig; /* Save more oldpages at the previous txnid. */ assert(env->me_pglast == id && id == oldpg_txnid); env->me_pglast = --oldpg_txnid; @@ -2275,10 +2268,8 @@ free2: if (freecnt != txn->mt_free_pgs[0]) goto free2; - if (env->me_pghead) { - free(env->me_pghead); - env->me_pghead = NULL; - } + free(env->me_pgfree); + env->me_pghead = env->me_pgfree = NULL; if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) { if (mdb_midl_shrink(&txn->mt_free_pgs)) @@ -2842,6 +2833,7 @@ mdb_env_open2(MDB_env *env) return EBUSY; /* TODO: Make a new MDB_* error code? */ } env->me_psize = meta.mm_psize; + env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1; env->me_maxpg = env->me_mapsize / env->me_psize; From 7aba5f5ab92a3fa92de001edb2c5ff4b04fd7d0d Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 16 Feb 2013 19:06:28 +0100 Subject: [PATCH 04/22] Add error code MDB_MAP_RESIZED. --- libraries/liblmdb/lmdb.h | 8 ++++++-- libraries/liblmdb/mdb.c | 8 +++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 7719e3294a..aaaa2922f1 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -353,7 +353,9 @@ typedef enum MDB_cursor_op { #define MDB_CURSOR_FULL (-30787) /** Page has not enough space - internal error */ #define MDB_PAGE_FULL (-30786) -#define MDB_LAST_ERRCODE MDB_PAGE_FULL + /** Database contents grew beyond environment mapsize */ +#define MDB_MAP_RESIZED (-30785) +#define MDB_LAST_ERRCODE MDB_MAP_RESIZED /** @} */ /** @brief Statistics for a database in the environment */ @@ -675,7 +677,9 @@ int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); * errors are: *
    *
  • #MDB_PANIC - a fatal error occurred earlier and the environment - * must be shut down. +- * must be shut down. + *
  • #MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's + * mapsize and the environment must be shut down. *
  • ENOMEM - out of memory, or a read-only transaction was requested and * the reader lock table is full. See #mdb_env_set_maxreaders(). *
diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 7bfdfc27bc..40b9bda484 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1054,7 +1054,8 @@ static char *const mdb_errstr[] = { "MDB_TLS_FULL: Thread-local storage keys full - too many environments open", "MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big", "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", - "MDB_PAGE_FULL: Internal error - page has no more space" + "MDB_PAGE_FULL: Internal error - page has no more space", + "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", }; char * @@ -1818,6 +1819,11 @@ mdb_txn_renew0(MDB_txn *txn) if (txn->mt_numdbs > 2) memset(txn->mt_dbflags+2, DB_STALE, txn->mt_numdbs-2); + if (env->me_maxpg < txn->mt_next_pgno) { + mdb_txn_reset0(txn); + return MDB_MAP_RESIZED; + } + return MDB_SUCCESS; } From 8e1bbdf0dd2c05f56937c6293dc23d7c4945fbe2 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 16 Feb 2013 19:07:16 +0100 Subject: [PATCH 05/22] mdb_stat -ff[f]: show contiguous page spans. --- libraries/liblmdb/mdb_stat.1 | 7 ++++--- libraries/liblmdb/mdb_stat.c | 30 +++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/libraries/liblmdb/mdb_stat.1 b/libraries/liblmdb/mdb_stat.1 index 205fc88dee..1307c39d09 100644 --- a/libraries/liblmdb/mdb_stat.1 +++ b/libraries/liblmdb/mdb_stat.1 @@ -9,7 +9,7 @@ mdb_stat \- LMDB environment status tool [\c .BR \-e ] [\c -.BR \-f [ f ]] +.BR \-f [ f [ f ]]] [\c .BR \-n ] [\c @@ -25,8 +25,9 @@ utility displays the status of an LMDB environment. Display information about the database environment. .TP .BR \-f -Display information about the environment freelist. If \fB\-ff\fP is given, -display the full list of page IDs in the freelist. +Display information about the environment freelist. +If \fB\-ff\fP is given, summarize each freelist entry. +If \fB\-fff\fP is given, display the full list of page IDs in the freelist. .TP .BR \-n Display the status of an LMDB database which does not use subdirectories. diff --git a/libraries/liblmdb/mdb_stat.c b/libraries/liblmdb/mdb_stat.c index 13a24f9cbd..dd0735f242 100644 --- a/libraries/liblmdb/mdb_stat.c +++ b/libraries/liblmdb/mdb_stat.c @@ -1,6 +1,6 @@ /* mdb_stat.c - memory-mapped database status tool */ /* - * Copyright 2011 Howard Chu, Symas Corp. + * Copyright 2011-2013 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,7 +31,7 @@ static void prstat(MDB_stat *ms) static void usage(char *prog) { - fprintf(stderr, "usage: %s dbpath [-e] [-f[f]] [-n] [-a|-s subdb]\n", prog); + fprintf(stderr, "usage: %s dbpath [-e] [-f[f[f]]] [-n] [-a|-s subdb]\n", prog); exit(EXIT_FAILURE); } @@ -142,12 +142,28 @@ int main(int argc, char *argv[]) iptr = data.mv_data; pages += *iptr; if (freinfo > 1) { - size_t i, j; + char *bad = ""; + size_t pg, prev; + ssize_t i, j, span = 0; j = *iptr++; - printf(" Transaction %zu, %zu pages\n", - *(size_t *)key.mv_data, j); - for (i=0; i= 0; ) { + pg = iptr[i]; + if (pg <= prev) + bad = " [bad sequence]"; + prev = pg; + pg += span; + for (; i >= span && iptr[i-span] == pg; span++, pg++) ; + } + printf(" Transaction %zu, %zd pages, maxspan %zd%s\n", + *(size_t *)key.mv_data, j, span, bad); + if (freinfo > 2) { + for (--j; j >= 0; ) { + pg = iptr[j]; + for (span=1; --j >= 0 && iptr[j] == pg+span; span++) ; + printf(span>1 ? " %9zu[%zd]\n" : " %9zu\n", + pg, span); + } + } } } mdb_cursor_close(cursor); From c7db955a94344b028ffba506bea1b5c2ecd564ca Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 16 Feb 2013 19:08:37 +0100 Subject: [PATCH 06/22] ITS#7517 Don't save dropped dirty MDB databases. mdb_txn_commit's attempt to save the DB could corrupt another DB if another thread had called mdb_dbi_open and reused the closed DBI. --- libraries/liblmdb/mdb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 40b9bda484..715d1c811a 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -7146,8 +7146,10 @@ int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del) /* Can't delete the main DB */ if (del && dbi > MAIN_DBI) { rc = mdb_del(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL); - if (!rc) + if (!rc) { + txn->mt_dbflags[dbi] = DB_STALE; mdb_dbi_close(txn->mt_env, dbi); + } } else { /* reset the DB record, mark it dirty */ txn->mt_dbflags[dbi] |= DB_DIRTY; From d90581fa5a9ae42b87325f18259553bc80d89a14 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 16 Feb 2013 19:08:54 +0100 Subject: [PATCH 07/22] ITS#7377 Catch MDB failure updating root pointers. "cannot fail" was wrong, it fails at least when exceeding mapsize. --- libraries/liblmdb/mdb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 715d1c811a..083b25bb8d 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2124,9 +2124,7 @@ mdb_txn_commit(MDB_txn *txn) DPRINTF("committing txn %zu %p on mdbenv %p, root page %zu", txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root); - /* Update DB root pointers. Their pages have already been - * touched so this is all in-place and cannot fail. - */ + /* Update DB root pointers */ if (txn->mt_numdbs > 2) { MDB_dbi i; MDB_val data; @@ -2136,7 +2134,9 @@ mdb_txn_commit(MDB_txn *txn) for (i = 2; i < txn->mt_numdbs; i++) { if (txn->mt_dbflags[i] & DB_DIRTY) { data.mv_data = &txn->mt_dbs[i]; - mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0); + rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0); + if (rc) + goto fail; } } } From 00d7a96bd584412016036b1029fd31e441f5dc34 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 16 Feb 2013 19:11:20 +0100 Subject: [PATCH 08/22] ITS#7515 Fix MDB parent/child txn interaction. mdb_txn_commit(child): Copy more state. Copy all of mt_dbs: Include mainDB, and even freeDB since mdb_drop() can update it. Don't skip DBs with unchanged root, this could break when the new was newly opened and the old unused junk. mdb_page_get(): Search parents' dirty lists. --- libraries/liblmdb/mdb.c | 56 ++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 083b25bb8d..40d0207f20 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2063,22 +2063,19 @@ mdb_txn_commit(MDB_txn *txn) } if (txn->mt_parent) { - MDB_db *ip, *jp; - MDB_dbi i; + MDB_txn *parent = txn->mt_parent; unsigned x, y; MDB_ID2L dst, src; + parent->mt_next_pgno = txn->mt_next_pgno; + parent->mt_flags = txn->mt_flags; + /* Merge (and close) our cursors with parent's */ mdb_cursor_merge(txn); - /* Update parent's DB table */ - ip = &txn->mt_parent->mt_dbs[2]; - jp = &txn->mt_dbs[2]; - for (i = 2; i < txn->mt_numdbs; i++) { - if (ip->md_root != jp->md_root) - *ip = *jp; - ip++; jp++; - } + /* Update parent's DB table. */ + memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); + memcpy(parent->mt_dbflags, txn->mt_dbflags, txn->mt_numdbs); txn->mt_parent->mt_numdbs = txn->mt_numdbs; /* Append our free list to parent's */ @@ -3910,28 +3907,31 @@ mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret) { MDB_page *p = NULL; - if (txn->mt_env->me_flags & MDB_WRITEMAP) { - if (pgno < txn->mt_next_pgno) - p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); - goto done; + if (!((txn->mt_flags & MDB_TXN_RDONLY) | + (txn->mt_env->me_flags & MDB_WRITEMAP))) + { + MDB_txn *tx2 = txn; + do { + MDB_ID2L dl = tx2->mt_u.dirty_list; + if (dl[0].mid) { + unsigned x = mdb_mid2l_search(dl, pgno); + if (x <= dl[0].mid && dl[x].mid == pgno) { + p = dl[x].mptr; + goto done; + } + } + } while ((tx2 = tx2->mt_parent) != NULL); } - if (!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY) && txn->mt_u.dirty_list[0].mid) { - unsigned x; - x = mdb_mid2l_search(txn->mt_u.dirty_list, pgno); - if (x <= txn->mt_u.dirty_list[0].mid && txn->mt_u.dirty_list[x].mid == pgno) { - p = txn->mt_u.dirty_list[x].mptr; - } - } - if (!p) { - if (pgno < txn->mt_next_pgno) - p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); - } -done: - *ret = p; - if (!p) { + + if (pgno < txn->mt_next_pgno) { + p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); + } else { DPRINTF("page %zu not found", pgno); assert(p != NULL); } + +done: + *ret = p; return (p != NULL) ? MDB_SUCCESS : MDB_PAGE_NOTFOUND; } From 890f1da3eeedab99aecc63c85b9792aa17faf537 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Sun, 17 Feb 2013 00:48:43 +0000 Subject: [PATCH 09/22] Don't limit retries when coalescing freelist Try to use whatever's available. --- libraries/liblmdb/mdb.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 40d0207f20..8d7b97e099 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1361,7 +1361,7 @@ none: pgno_t *mop = txn->mt_env->me_pghead; if (num > 1) { MDB_cursor m2; - int retry = 500, readit = 0, n2 = num-1; + int retry = 1, readit = 0, n2 = num-1; unsigned int i, j, k; /* If current list is too short, must fetch more and coalesce */ @@ -1448,11 +1448,10 @@ none: } } - /* Stop if we succeeded, or no more retries */ + /* Stop if we succeeded, or no retries */ if (!retry || pgno != P_INVALID) break; readit = 1; - retry--; } while (1); } else { From fd4861bf00fb0b86a9f3b80d16cbe363a8eac227 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Sun, 17 Feb 2013 00:49:53 +0000 Subject: [PATCH 10/22] ITS#7515 update parent's mt_next_pgno on child commit --- libraries/liblmdb/mdb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 8d7b97e099..7c766aad54 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2104,6 +2104,7 @@ mdb_txn_commit(MDB_txn *txn) dst[0].mid = x; free(txn->mt_u.dirty_list); txn->mt_parent->mt_child = NULL; + txn->mt_parent->mt_next_pgno = txn->mt_next_pgno; free(txn); return MDB_SUCCESS; } From ef25056cfb072d6cbeb29c7e6adbdb8a0a1bc382 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sun, 17 Feb 2013 08:42:14 +0100 Subject: [PATCH 11/22] Revert "ITS#7515 update parent's mt_next_pgno on child commit" This reverts commit fd4861bf00fb0b86a9f3b80d16cbe363a8eac227. It duplicated earlier code. --- libraries/liblmdb/mdb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 7c766aad54..8d7b97e099 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -2104,7 +2104,6 @@ mdb_txn_commit(MDB_txn *txn) dst[0].mid = x; free(txn->mt_u.dirty_list); txn->mt_parent->mt_child = NULL; - txn->mt_parent->mt_next_pgno = txn->mt_next_pgno; free(txn); return MDB_SUCCESS; } From 8ad25001ffed13d12d1e84705002cc799b6c8676 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 19 Feb 2013 21:14:23 +0100 Subject: [PATCH 12/22] ITS#7485 Document key/data size limits in lmdb.h. mdb.c already describes them. The user doc should too. --- libraries/liblmdb/lmdb.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index aaaa2922f1..220eea1c6a 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -192,7 +192,14 @@ typedef unsigned int MDB_dbi; /** @brief Opaque structure for navigating through a database */ typedef struct MDB_cursor MDB_cursor; -/** @brief Generic structure used for passing keys and data in and out of the database. */ +/** @brief Generic structure used for passing keys and data in and out + * of the database. + * + * Key sizes must be between 1 and the liblmdb build-time constant + * #MDB_MAXKEYSIZE inclusive. This currently defaults to 511. The + * same applies to data sizes in databases with the #MDB_DUPSORT flag. + * Other data items can in theory be from 0 to 0xffffffff bytes long. + */ typedef struct MDB_val { size_t mv_size; /**< size of the data item */ void *mv_data; /**< address of the data item */ From f19655eabc49d563878b88db8723478d2ee2cba7 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 19 Feb 2013 21:15:26 +0100 Subject: [PATCH 13/22] ITS#7517 Document that dirty DBs may not be closed --- libraries/liblmdb/lmdb.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 220eea1c6a..646d456e31 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -827,7 +827,8 @@ int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat); * * This call is not mutex protected. Handles should only be closed by * a single thread, and only if no other threads are going to reference - * the database handle or one of its cursors any further. + * the database handle or one of its cursors any further. Do not close + * a handle if an existing transaction has modified its database. * @param[in] env An environment handle returned by #mdb_env_create() * @param[in] dbi A database handle returned by #mdb_dbi_open() */ From 4b6727037434c32038661be08fb2b55914596d28 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 19 Feb 2013 21:17:33 +0100 Subject: [PATCH 14/22] mdb_page_alloc(): Handle freeDB txnid range holes. A txn writes no freeDB entry if previous txn dropped mainDB and a read txn prevents freelist entry reuse. This surprised mdb_page_alloc (and mdb_txn_commit too before 65c053a6e7f6973c1d09710aa1bd57b218206fcb). --- libraries/liblmdb/mdb.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 8d7b97e099..368cf5227f 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1294,14 +1294,12 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) last = *kptr; } else { MDB_val key; - int exact; again: - exact = 0; last = txn->mt_env->me_pglast + 1; leaf = NULL; key.mv_data = &last; key.mv_size = sizeof(last); - rc = mdb_cursor_set(&m2, &key, &data, MDB_SET, &exact); + rc = mdb_cursor_set(&m2, &key, &data, MDB_SET_RANGE, NULL); if (rc) goto none; last = *(txnid_t *)key.mv_data; @@ -1381,7 +1379,6 @@ none: if (readit) { MDB_val key, data; pgno_t *idl, *mop2; - int exact; last = txn->mt_env->me_pglast + 1; @@ -1406,12 +1403,17 @@ none: if (oldest - last < 1) break; - exact = 0; key.mv_data = &last; key.mv_size = sizeof(last); - rc = mdb_cursor_set(&m2, &key, &data, MDB_SET, &exact); - if (rc) + rc = mdb_cursor_set(&m2,&key,&data,MDB_SET_RANGE,NULL); + if (rc) { + if (rc == MDB_NOTFOUND) + break; return rc; + } + last = *(txnid_t*)key.mv_data; + if (oldest <= last) + break; idl = (MDB_ID *) data.mv_data; mop2 = malloc(MDB_IDL_SIZEOF(idl) + MDB_IDL_SIZEOF(mop)); if (!mop2) From e4af9ee5daa207dd726091e9878750e1dbc0c58b Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 19 Feb 2013 22:01:29 +0100 Subject: [PATCH 15/22] ITS#7515 mdb_dbi_open(): Also open in parent txns. This makes aborting nested and non-nested txns more similar: The new DBI is available to the surrounding context (parent txn and MDB_env respectively). --- libraries/liblmdb/lmdb.h | 7 ++++++- libraries/liblmdb/mdb.c | 10 +++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 646d456e31..b91de4df96 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -757,10 +757,15 @@ int mdb_txn_renew(MDB_txn *txn); /** @brief Open a database in the environment. * - * The database handle may be discarded by calling #mdb_dbi_close(). The + * The database handle may be discarded by calling #mdb_dbi_close(). + * It denotes the name and parameters of a database, independently of + * whether such a database exists. It will not exist if the transaction + * which created it aborted, nor if another process deleted it. The * database handle resides in the shared environment, it is not owned * by the given transaction. Only one thread should call this function; * it is not mutex-protected in a read-only transaction. + * Preexisting transactions, other than the current transaction and + * any parents, must not use the new handle. Nor must their children. * To use named databases (with name != NULL), #mdb_env_set_maxdbs() * must be called before opening the environment. * @param[in] txn A transaction handle returned by #mdb_txn_begin() diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 368cf5227f..60189683ed 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -6953,6 +6953,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db MDB_val key, data; MDB_dbi i; MDB_cursor mc; + uint16_t mdflags; int rc, dbflag, exact; unsigned int unused = 0; size_t len; @@ -7035,12 +7036,19 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db txn->mt_dbflags[slot] = dbflag; memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db)); *dbi = slot; - txn->mt_env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags; + txn->mt_env->me_dbflags[slot] = mdflags = txn->mt_dbs[slot].md_flags; mdb_default_cmp(txn, slot); if (!unused) { txn->mt_numdbs++; txn->mt_env->me_numdbs++; } + /* Open the DB in parent txns as well */ + while ((txn = txn->mt_parent) != NULL) { + txn->mt_dbflags[slot] = DB_STALE; + txn->mt_dbs[slot].md_flags = mdflags; + if (!unused) + txn->mt_numdbs++; + } } return rc; From 2dbb8bb833f8495f3638900ca8aaad67e8817658 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 19 Feb 2013 22:02:15 +0100 Subject: [PATCH 16/22] mdb_cursor_prev,mdb_cursor_next: Fix return value. Return mdb_node_read()'s return value if it fails, not 1. (Can happen if mdb_page_get() fails and NDEBUG is #defined.) --- libraries/liblmdb/mdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index 60189683ed..c9dc1a5ee3 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -4262,7 +4262,7 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) mdb_xcursor_init1(mc, leaf); } if (data) { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data) != MDB_SUCCESS)) + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) return rc; if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -4335,7 +4335,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) mdb_xcursor_init1(mc, leaf); } if (data) { - if ((rc = mdb_node_read(mc->mc_txn, leaf, data) != MDB_SUCCESS)) + if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS) return rc; if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { From f97552a83abb085bc44b1b578e550d64c1313a4b Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 19 Feb 2013 22:02:37 +0100 Subject: [PATCH 17/22] Check DB flags when refreshing a stale MDB DBI. It's hairy to figure out when a DBI is valid. Catch destructive user errors, and flags which another process changed under us. --- libraries/liblmdb/lmdb.h | 4 +++- libraries/liblmdb/mdb.c | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index b91de4df96..5b6ff6ef58 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -362,7 +362,9 @@ typedef enum MDB_cursor_op { #define MDB_PAGE_FULL (-30786) /** Database contents grew beyond environment mapsize */ #define MDB_MAP_RESIZED (-30785) -#define MDB_LAST_ERRCODE MDB_MAP_RESIZED + /** Operation is incompatible with database */ +#define MDB_INCOMPATIBLE (-30784) +#define MDB_LAST_ERRCODE MDB_INCOMPATIBLE /** @} */ /** @brief Statistics for a database in the environment */ diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index c9dc1a5ee3..f1b7480673 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1056,6 +1056,7 @@ static char *const mdb_errstr[] = { "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", "MDB_PAGE_FULL: Internal error - page has no more space", "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", + "MDB_INCOMPATIBLE: Operation is incompatible with database", }; char * @@ -4056,6 +4057,12 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) if (!exact) return MDB_NOTFOUND; mdb_node_read(mc->mc_txn, leaf, &data); + /* The txn may not know this DBI, or another process may + * have dropped and recreated the DB with other flags. + */ + if (mc->mc_db->md_flags != *(uint16_t *) + ((char *) data.mv_data + offsetof(MDB_db, md_flags))) + return MDB_INCOMPATIBLE; memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db)); } if (flags & MDB_PS_MODIFY) From f43ae20be7cb1b45b9a4156a32e6fdcefbe4f63d Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 19 Feb 2013 22:03:04 +0100 Subject: [PATCH 18/22] ITS#7512 Plug mdb_txn_abort(nested txn) page leaks. Also catch mdb_cursor_shadow() errors. --- libraries/liblmdb/mdb.c | 50 ++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index f1b7480673..ab1e497395 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -911,6 +911,13 @@ typedef struct MDB_xcursor { unsigned char mx_dbflag; } MDB_xcursor; + /** State of FreeDB old pages, stored in the MDB_env */ +typedef struct MDB_pgstate { + txnid_t mf_pglast; /**< ID of last old page record we used */ + pgno_t *mf_pghead; /**< old pages reclaimed from freelist */ + pgno_t *mf_pgfree; /**< memory to free when dropping me_pghead */ +} MDB_pgstate; + /** The database environment. */ struct MDB_env { HANDLE me_fd; /**< The main data file */ @@ -937,12 +944,13 @@ struct MDB_env { size_t me_mapsize; /**< size of the data memory map */ off_t me_size; /**< current file size */ pgno_t me_maxpg; /**< me_mapsize / me_psize */ - txnid_t me_pglast; /**< ID of last old page record we used */ MDB_dbx *me_dbxs; /**< array of static DB info */ uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ - pgno_t *me_pghead; /**< old pages reclaimed from freelist */ - pgno_t *me_pgfree; /**< memory to free when dropping me_pghead */ pthread_key_t me_txkey; /**< thread-key for readers */ + MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ +# define me_pglast me_pgstate.mf_pglast +# define me_pghead me_pgstate.mf_pghead +# define me_pgfree me_pgstate.mf_pgfree MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ /** IDL of pages that became unused in a write txn */ MDB_IDL me_free_pgs; @@ -958,6 +966,13 @@ struct MDB_env { sem_t *me_wmutex; #endif }; + + /** Nested transaction */ +typedef struct MDB_ntxn { + MDB_txn mnt_txn; /* the transaction */ + MDB_pgstate mnt_pgstate; /* parent transaction's saved freestate */ +} MDB_ntxn; + /** max number of pages to commit in one writev() call */ #define MDB_COMMIT_PAGES 64 #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES @@ -1855,7 +1870,8 @@ int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) { MDB_txn *txn; - int rc, size; + MDB_ntxn *ntxn; + int rc, size, tsize = sizeof(MDB_txn); if (env->me_flags & MDB_FATAL_ERROR) { DPUTS("environment had fatal error, must shutdown!"); @@ -1871,8 +1887,9 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) { return EINVAL; } + tsize = sizeof(MDB_ntxn); } - size = sizeof(MDB_txn) + env->me_maxdbs * (sizeof(MDB_db)+1); + size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1); if (!(flags & MDB_RDONLY)) size += env->me_maxdbs * sizeof(MDB_cursor *); @@ -1880,7 +1897,7 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) DPRINTF("calloc: %s", strerror(ErrCode())); return ENOMEM; } - txn->mt_dbs = (MDB_db *)(txn+1); + txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); if (flags & MDB_RDONLY) { txn->mt_flags |= MDB_TXN_RDONLY; txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs); @@ -1913,8 +1930,22 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) txn->mt_dbxs = parent->mt_dbxs; memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); memcpy(txn->mt_dbflags, parent->mt_dbflags, txn->mt_numdbs); - mdb_cursor_shadow(parent, txn); rc = 0; + ntxn = (MDB_ntxn *)txn; + ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */ + if (env->me_pghead) { + size = MDB_IDL_SIZEOF(env->me_pghead); + env->me_pghead = malloc(size); + if (env->me_pghead) + memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size); + else + rc = ENOMEM; + } + env->me_pgfree = env->me_pghead; + if (!rc) + rc = mdb_cursor_shadow(parent, txn); + if (rc) + mdb_txn_reset0(txn); } else { rc = mdb_txn_renew0(txn); } @@ -1971,8 +2002,11 @@ mdb_txn_reset0(MDB_txn *txn) } } + free(env->me_pgfree); + if (txn->mt_parent) { txn->mt_parent->mt_child = NULL; + env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; mdb_midl_free(txn->mt_free_pgs); free(txn->mt_u.dirty_list); return; @@ -1981,7 +2015,6 @@ mdb_txn_reset0(MDB_txn *txn) env->me_free_pgs = txn->mt_free_pgs; } - free(txn->mt_env->me_pgfree); txn->mt_env->me_pghead = txn->mt_env->me_pgfree = NULL; txn->mt_env->me_pglast = 0; @@ -2107,6 +2140,7 @@ mdb_txn_commit(MDB_txn *txn) dst[0].mid = x; free(txn->mt_u.dirty_list); txn->mt_parent->mt_child = NULL; + free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pgfree); free(txn); return MDB_SUCCESS; } From 208e5c614dcbb67ac276f45aa7ebabf63565c53c Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 19 Feb 2013 22:03:41 +0100 Subject: [PATCH 19/22] ITS#7515 Fix mdb_txn_commit(nested txn). Don't modify the parent txn until the current txn cannot fail. Don't assume new dirty child pgnos > dirty parent pgnos. Page alloc/touch: Fail if child+parent dirty pages would exceed dirty_list's maxsize. Avoids an error situation in commit. --- libraries/liblmdb/mdb.c | 66 ++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index ab1e497395..d2cbba2648 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -844,6 +844,8 @@ struct MDB_txn { #define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ /** @} */ unsigned int mt_flags; /**< @ref mdb_txn */ + /** dirty_list maxsize - #allocated pages including in parent txns */ + unsigned int mt_dirty_room; /** Tracks which of the two meta pages was used at the start * of this transaction. */ @@ -1285,7 +1287,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) *mp = NULL; /* If our dirty list is already full, we can't do anything */ - if (txn->mt_u.dirty_list[0].mid >= MDB_IDL_UM_MAX) + if (txn->mt_dirty_room == 0) return MDB_TXN_FULL; /* The free list won't have any content at all until txn 2 has @@ -1524,6 +1526,7 @@ none: } else { mdb_mid2l_insert(txn->mt_u.dirty_list, &mid); } + txn->mt_dirty_room--; *mp = np; return MDB_SUCCESS; @@ -1628,8 +1631,6 @@ finish: return 0; } } - if (mc->mc_txn->mt_u.dirty_list[0].mid >= MDB_IDL_UM_MAX) - return MDB_TXN_FULL; /* No - copy it */ np = mdb_page_malloc(mc); if (!np) @@ -1821,6 +1822,7 @@ mdb_txn_renew0(MDB_txn *txn) if (txn->mt_txnid == mdb_debug_start) mdb_debug = 1; #endif + txn->mt_dirty_room = MDB_IDL_UM_MAX; txn->mt_u.dirty_list = env->me_dirty_list; txn->mt_u.dirty_list[0].mid = 0; txn->mt_free_pgs = env->me_free_pgs; @@ -1921,6 +1923,7 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) } txn->mt_txnid = parent->mt_txnid; txn->mt_toggle = parent->mt_toggle; + txn->mt_dirty_room = parent->mt_dirty_room; txn->mt_u.dirty_list[0].mid = 0; txn->mt_free_pgs[0] = 0; txn->mt_next_pgno = parent->mt_next_pgno; @@ -2099,9 +2102,16 @@ mdb_txn_commit(MDB_txn *txn) if (txn->mt_parent) { MDB_txn *parent = txn->mt_parent; - unsigned x, y; + unsigned x, y, len; MDB_ID2L dst, src; + /* Append our free list to parent's */ + if (mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs)) { + mdb_txn_abort(txn); + return ENOMEM; + } + mdb_midl_free(txn->mt_free_pgs); + parent->mt_next_pgno = txn->mt_next_pgno; parent->mt_flags = txn->mt_flags; @@ -2113,32 +2123,40 @@ mdb_txn_commit(MDB_txn *txn) memcpy(parent->mt_dbflags, txn->mt_dbflags, txn->mt_numdbs); txn->mt_parent->mt_numdbs = txn->mt_numdbs; - /* Append our free list to parent's */ - mdb_midl_append_list(&txn->mt_parent->mt_free_pgs, - txn->mt_free_pgs); - mdb_midl_free(txn->mt_free_pgs); - - /* Merge our dirty list with parent's */ dst = txn->mt_parent->mt_u.dirty_list; src = txn->mt_u.dirty_list; - x = mdb_mid2l_search(dst, src[1].mid); - for (y=1; y<=src[0].mid; y++) { - while (x <= dst[0].mid && dst[x].mid != src[y].mid) x++; - if (x > dst[0].mid) - break; - free(dst[x].mptr); - dst[x].mptr = src[y].mptr; - } + /* Find len = length of merging our dirty list with parent's */ x = dst[0].mid; - for (; y<=src[0].mid; y++) { - if (++x >= MDB_IDL_UM_MAX) { - mdb_txn_abort(txn); - return MDB_TXN_FULL; + dst[0].mid = 0; /* simplify loops */ + if (parent->mt_parent) { + len = x + src[0].mid; + y = mdb_mid2l_search(src, dst[x].mid + 1) - 1; + for (i = x; y && i; y--) { + pgno_t yp = src[y].mid; + while (yp < dst[i].mid) + i--; + if (yp == dst[i].mid) { + i--; + len--; + } } - dst[x] = src[y]; + } else { /* Simplify the above for single-ancestor case */ + len = MDB_IDL_UM_MAX - txn->mt_dirty_room; } - dst[0].mid = x; + /* Merge our dirty list with parent's */ + y = src[0].mid; + for (i = len; y; dst[i--] = src[y--]) { + pgno_t yp = src[y].mid; + while (yp < dst[x].mid) + dst[i--] = dst[x--]; + if (yp == dst[x].mid) + free(dst[x--].mptr); + } + assert(i == x); + dst[0].mid = len; free(txn->mt_u.dirty_list); + parent->mt_dirty_room = txn->mt_dirty_room; + txn->mt_parent->mt_child = NULL; free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pgfree); free(txn); From fbd76c44e4d67ed0ffce7a8387fbf505775356d7 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 20 Feb 2013 09:08:41 +0100 Subject: [PATCH 20/22] Tweak prev commit: Restore if-test as an assert --- libraries/liblmdb/mdb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index d2cbba2648..a6193c704d 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1631,6 +1631,7 @@ finish: return 0; } } + assert(mc->mc_txn->mt_u.dirty_list[0].mid < MDB_IDL_UM_MAX); /* No - copy it */ np = mdb_page_malloc(mc); if (!np) From 80cd88118406acdd5e153c91a6a42c7c90804ac6 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 20 Feb 2013 12:19:45 +0100 Subject: [PATCH 21/22] Tweak MDB_INCOMPATIBLE description --- libraries/liblmdb/lmdb.h | 2 +- libraries/liblmdb/mdb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 5b6ff6ef58..029945a608 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -362,7 +362,7 @@ typedef enum MDB_cursor_op { #define MDB_PAGE_FULL (-30786) /** Database contents grew beyond environment mapsize */ #define MDB_MAP_RESIZED (-30785) - /** Operation is incompatible with database */ + /** Database flags changed or would change */ #define MDB_INCOMPATIBLE (-30784) #define MDB_LAST_ERRCODE MDB_INCOMPATIBLE /** @} */ diff --git a/libraries/liblmdb/mdb.c b/libraries/liblmdb/mdb.c index a6193c704d..041acfa8e8 100644 --- a/libraries/liblmdb/mdb.c +++ b/libraries/liblmdb/mdb.c @@ -1073,7 +1073,7 @@ static char *const mdb_errstr[] = { "MDB_CURSOR_FULL: Internal error - cursor stack limit reached", "MDB_PAGE_FULL: Internal error - page has no more space", "MDB_MAP_RESIZED: Database contents grew beyond environment mapsize", - "MDB_INCOMPATIBLE: Operation is incompatible with database", + "MDB_INCOMPATIBLE: Database flags changed or would change", }; char * From 3394bac2c0fcb69f29da5bd182fdb025b6ac42f6 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 20 Feb 2013 05:08:52 -0800 Subject: [PATCH 22/22] Update error code instances --- libraries/liblmdb/lmdb.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/libraries/liblmdb/lmdb.h b/libraries/liblmdb/lmdb.h index 029945a608..b94bd2d3b8 100644 --- a/libraries/liblmdb/lmdb.h +++ b/libraries/liblmdb/lmdb.h @@ -492,7 +492,7 @@ int mdb_env_create(MDB_env **env); *
    *
  • #MDB_VERSION_MISMATCH - the version of the MDB library doesn't match the * version that created the database environment. - *
  • EINVAL - the environment file headers are corrupted. + *
  • #MDB_INVALID - the environment file headers are corrupted. *
  • ENOENT - the directory specified by the path parameter doesn't exist. *
  • EACCES - the user didn't have permission to access the environment files. *
  • EAGAIN - the environment was locked by another process. @@ -689,8 +689,9 @@ int mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs); - * must be shut down. *
  • #MDB_MAP_RESIZED - another process wrote data beyond this MDB_env's * mapsize and the environment must be shut down. - *
  • ENOMEM - out of memory, or a read-only transaction was requested and + *
  • #MDB_READERS_FULL - a read-only transaction was requested and * the reader lock table is full. See #mdb_env_set_maxreaders(). + *
  • ENOMEM - out of memory. *
*/ int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **txn); @@ -706,7 +707,7 @@ int mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn ** *
  • EINVAL - an invalid parameter was specified. *
  • ENOSPC - no more disk space. *
  • EIO - a low-level I/O error occurred while writing. - *
  • ENOMEM - the transaction is nested and could not be merged into its parent. + *
  • ENOMEM - out of memory. * */ int mdb_txn_commit(MDB_txn *txn); @@ -811,7 +812,7 @@ int mdb_txn_renew(MDB_txn *txn); *
      *
    • #MDB_NOTFOUND - the specified database doesn't exist in the environment * and #MDB_CREATE was not specified. - *
    • ENFILE - too many databases have been opened. See #mdb_env_set_maxdbs(). + *
    • #MDB_DBS_FULL - too many databases have been opened. See #mdb_env_set_maxdbs(). *
    */ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi); @@ -997,9 +998,10 @@ int mdb_get(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data); * @return A non-zero error value on failure and 0 on success. Some possible * errors are: *
      + *
    • #MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). + *
    • #MDB_TXN_FULL - the transaction has too many dirty pages. *
    • EACCES - an attempt was made to write in a read-only transaction. *
    • EINVAL - an invalid parameter was specified. - *
    • ENOMEM - the database is full, see #mdb_env_set_mapsize(). *
    */ int mdb_put(MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, @@ -1139,6 +1141,8 @@ int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data, * @return A non-zero error value on failure and 0 on success. Some possible * errors are: *
      + *
    • #MDB_MAP_FULL - the database is full, see #mdb_env_set_mapsize(). + *
    • #MDB_TXN_FULL - the transaction has too many dirty pages. *
    • EACCES - an attempt was made to modify a read-only database. *
    • EINVAL - an invalid parameter was specified. *