From b802bcf7040110e1f0768d9d9ffa88dabd4bd5e7 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 5 Jul 2012 16:40:46 -0700 Subject: [PATCH 1/4] Partial revert of 13c663f666ac28d7a72cbe644d393fc8d2dd9881 Don't re-use free pages so soon; that leaves us vulnerable to DB corruption if data syncs successfully but meta doesn't. --- libraries/libmdb/mdb.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c index 3b0b66fa19..fe89a7a45a 100644 --- a/libraries/libmdb/mdb.c +++ b/libraries/libmdb/mdb.c @@ -1220,7 +1220,11 @@ mdb_page_alloc(MDB_cursor *mc, int num) pgno_t pgno = P_INVALID; MDB_ID2 mid; - if (txn->mt_txnid > 2) { + /* The free list won't have any content at all until txn 2 has + * committed. The pages from txn 1 will be free after txn 3 has + * committed. It will be safe to re-use them during txn 4. + */ + if (txn->mt_txnid > 3) { if (!txn->mt_env->me_pghead && txn->mt_dbs[FREE_DBI].md_root != P_INVALID) { @@ -1253,7 +1257,7 @@ again: { unsigned int i; - oldest = txn->mt_txnid - 1; + oldest = txn->mt_txnid - 2; for (i=0; imt_env->me_txns->mti_numreaders; i++) { txnid_t mr = txn->mt_env->me_txns->mti_readers[i].mr_txnid; if (mr && mr < oldest) From 38560c2517d53bd43b530361ec7682657583aded Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 5 Jul 2012 17:43:29 -0700 Subject: [PATCH 2/4] Tweak b802bcf7040110e1f0768d9d9ffa88dabd4bd5e7 Clarify prev commit, fix Doxygen comments broken by earlier changes --- libraries/libmdb/mdb.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c index fe89a7a45a..729725dd92 100644 --- a/libraries/libmdb/mdb.c +++ b/libraries/libmdb/mdb.c @@ -926,7 +926,7 @@ typedef struct MDB_oldpages { struct MDB_oldpages *mo_next; /** The ID of the transaction in which these pages were freed. */ txnid_t mo_txnid; - /** An #IDL of the pages */ + /** An #MDB_IDL of the pages */ pgno_t mo_pages[1]; /* dynamic */ } MDB_oldpages; @@ -1221,8 +1221,8 @@ mdb_page_alloc(MDB_cursor *mc, int num) MDB_ID2 mid; /* The free list won't have any content at all until txn 2 has - * committed. The pages from txn 1 will be free after txn 3 has - * committed. It will be safe to re-use them during txn 4. + * committed. The pages freed by txn 2 will be unreferenced + * after txn 3 commits, and so will be safe to re-use in txn 4. */ if (txn->mt_txnid > 3) { @@ -1257,7 +1257,7 @@ again: { unsigned int i; - oldest = txn->mt_txnid - 2; + oldest = txn->mt_txnid - 1; for (i=0; imt_env->me_txns->mti_numreaders; i++) { txnid_t mr = txn->mt_env->me_txns->mti_readers[i].mr_txnid; if (mr && mr < oldest) @@ -5715,6 +5715,7 @@ mdb_del(MDB_txn *txn, MDB_dbi dbi, * @param[in] newkey The key for the newly inserted node. * @param[in] newdata The data for the newly inserted node. * @param[in] newpgno The page number, if the new node is a branch node. + * @param[in] nflags The #NODE_ADD_FLAGS for the new node. * @return 0 on success, non-zero on failure. */ static int From df7ddb6bf44a6ceec7f3974293fdc0669b68251f Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 5 Jul 2012 18:11:18 -0700 Subject: [PATCH 3/4] Add MDB_NOMETASYNC env option. Just a trial. This may not make sense if we decide to split the meta pages into their own separate file, to allow meta traffic to reside on a separate spindle. --- libraries/libmdb/mdb.c | 4 ++-- libraries/libmdb/mdb.h | 12 ++++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c index 729725dd92..c32057c8a1 100644 --- a/libraries/libmdb/mdb.c +++ b/libraries/libmdb/mdb.c @@ -3037,7 +3037,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode) } if ((rc = mdb_env_open2(env, flags)) == MDB_SUCCESS) { - if (flags & (MDB_RDONLY|MDB_NOSYNC)) { + if (flags & (MDB_RDONLY|MDB_NOSYNC|MDB_NOMETASYNC)) { env->me_mfd = env->me_fd; } else { /* synchronous fd for meta writes */ @@ -6096,7 +6096,7 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, * at runtime. Changing other flags requires closing the environment * and re-opening it with the new flags. */ -#define CHANGEABLE (MDB_NOSYNC) +#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC) int mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff) { diff --git a/libraries/libmdb/mdb.h b/libraries/libmdb/mdb.h index bcf36659c4..587013f6f4 100644 --- a/libraries/libmdb/mdb.h +++ b/libraries/libmdb/mdb.h @@ -159,6 +159,8 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_NOSYNC 0x10000 /** read only */ #define MDB_RDONLY 0x20000 + /** don't fsync metapage after commit */ +#define MDB_NOMETASYNC 0x40000 /** @} */ /** @defgroup mdb_open Database Flags @@ -334,6 +336,13 @@ int mdb_env_create(MDB_env **env); * at risk is governed by how often the system flushes dirty buffers to disk * and how often #mdb_env_sync() is called. This flag may be changed * at any time using #mdb_env_set_flags(). + *
  • #MDB_NOMETASYNC + * Don't perform a synchronous flush of the meta page after committing + * a transaction. This is similar to the #MDB_NOSYNC case, but safer + * because the transaction data is still flushed. The meta page for any + * transaction N will be flushed by the data flush of transaction N+1. + * In case of a system crash, the last committed transaction may be + * lost. This flag may be changed at any time using #mdb_env_set_flags(). *
  • #MDB_RDONLY * Open the environment in read-only mode. No write operations will be allowed. * @@ -392,8 +401,7 @@ void mdb_env_close(MDB_env *env); /** @brief Set environment flags. * * This may be used to set some flags that weren't already set during - * #mdb_env_open(), or to unset these flags. Currently only the - * #MDB_NOSYNC flag setting may be changed with this function. + * #mdb_env_open(), or to unset these flags. * @param[in] env An environment handle returned by #mdb_env_create() * @param[in] flags The flags to change, bitwise OR'ed together * @param[in] onoff A non-zero value sets the flags, zero clears them. From 433105f09b33b28065a06185f628b4b38e0c4c22 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Fri, 6 Jul 2012 17:38:03 -0700 Subject: [PATCH 4/4] Fix: avoid direct reference to meta pages Relevant info should be copied during txn_begin only. --- libraries/libmdb/mdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libraries/libmdb/mdb.c b/libraries/libmdb/mdb.c index c32057c8a1..0c8e6aa0e5 100644 --- a/libraries/libmdb/mdb.c +++ b/libraries/libmdb/mdb.c @@ -1608,9 +1608,9 @@ mdb_txn_renew0(MDB_txn *txn) txn->mt_u.dirty_list[0].mid = 0; txn->mt_free_pgs = env->me_free_pgs; txn->mt_free_pgs[0] = 0; - txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1; env->me_txn = txn; } + txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1; /* Copy the DB arrays */ LAZY_RWLOCK_RDLOCK(&env->me_dblock); @@ -3380,7 +3380,7 @@ mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret) } } if (!p) { - if (pgno <= txn->mt_env->me_metas[txn->mt_toggle]->mm_last_pg) + if (pgno < txn->mt_next_pgno) p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno); } *ret = p;