ITS#9224 lmdb: add support for PREPARE/2-phase commit

This commit is contained in:
Howard Chu 2025-12-08 16:07:00 +00:00 committed by Quanah Gibson-Mount
parent be217bf254
commit 425525fbec
2 changed files with 107 additions and 5 deletions

View file

@ -524,8 +524,12 @@ typedef enum MDB_cursor_op {
#define MDB_CRYPTO_FAIL (-30777)
/** Environment encryption mismatch */
#define MDB_ENV_ENCRYPTION (-30776)
/** Transaction was already prepared */
#define MDB_TXN_PENDING (-30775)
/** Environment can't rollback the last transaction */
#define MDB_CANT_ROLLBACK (-30774)
/** The last defined error code */
#define MDB_LAST_ERRCODE MDB_ENV_ENCRYPTION
#define MDB_LAST_ERRCODE MDB_CANT_ROLLBACK
/** @} */
/** @brief Statistics for a database in the environment */
@ -1128,6 +1132,55 @@ mdb_size_t mdb_txn_id(MDB_txn *txn);
*/
int mdb_txn_commit(MDB_txn *txn);
/** @brief Prepare to commit all the operations of a transaction into the database.
*
* This function exists to support two-phase commit protocols.
* All writes in the transaction are persisted to storage, but the final
* metapage update is not performed. All cursors on the transaction will be
* closed. Only #mdb_txn_abort() or #mdb_txn_commit() are valid after this
* call. It is assumed that once the regular data pages are successfully written
* by this call, the metapage update from a subsequent commit cannot fail, but
* hardware-level media failures could still break this assumption.
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @return A non-zero error value on failure and 0 on success. Some possible
* errors are:
* <ul>
* <li>EINVAL - an invalid parameter was specified.
* <li>ENOSPC - no more disk space.
* <li>EIO - a low-level I/O error occurred while writing.
* <li>ENOMEM - out of memory.
* <li>#MDB_TXN_PENDING - the transaction has already been prepared.
* It can only be aborted or committed.
* </ul>
*/
int mdb_txn_prepare(MDB_txn *txn);
/** @brief Rollback the last committed transaction in the environment.
*
* This function exists to support two-phase commit protocols.
* The metapage update for the last committed transaction will be zeroed,
* so its changes will be ignored. It should only be used when the local
* phase of a multi-phase transaction has fully committed, but some other
* remote phase which successfully prepared has failed to commit.
* This function may not be called twice in a row. No other operations
* may be performed on the environment, by any processes, between the
* preceding #mdb_txn_commit() and this call.
* @param[in] env An environment handle returned by #mdb_env_create().
* @param[in] txnid The ID of the transaction to roll back, obtained from
* #mdb_txn_id() on the previous transaction.
* @return A non-zero error value on failure and 0 on success. Some possible
* errors are:
* <ul>
* <li>EINVAL - an invalid parameter was specified.
* <li>ENOSPC - no more disk space.
* <li>EIO - a low-level I/O error occurred while writing.
* <li>#MDB_CANT_ROLLBACK - a rollback has already been done, there is
* no other valid metapage to roll back to, or another transaction
* has already been committed over the specified txnid.
* </ul>
*/
int mdb_env_rollback(MDB_env *env, mdb_size_t txnid);
/** @brief Abandon all the operations of the transaction instead of saving them.
*
* The transaction handle is freed. It and its cursors must not be used

View file

@ -1500,8 +1500,9 @@ struct MDB_txn {
#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */
#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */
#define MDB_TXN_DIRTYNUM 0x20 /**< dirty list uses nump list */
#define MDB_TXN_PREPARE 0x40 /**< prepare txn, don't fully commit */
/** most operations on the txn are currently illegal */
#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD)
#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED|MDB_TXN_ERROR|MDB_TXN_HAS_CHILD|MDB_TXN_PREPARE)
/** @} */
unsigned int mt_flags; /**< @ref mdb_txn */
/** #dirty_list room: Array size - \#dirty pages visible to this txn.
@ -1888,6 +1889,8 @@ static char *const mdb_errstr[] = {
"MDB_BAD_CHECKSUM: Page checksum mismatch",
"MDB_CRYPTO_FAIL: Page encryption or decryption failed",
"MDB_ENV_ENCRYPTION: Environment encryption mismatch",
"MDB_TXN_PENDING: Transaction already prepared, must abort or commit",
"MDB_CANT_ROLLBACK: Environment can't rollback last transaction",
};
char *
@ -4263,7 +4266,7 @@ done:
static int ESECT mdb_env_share_locks(MDB_env *env, int *excl);
static int
_mdb_txn_commit(MDB_txn *txn)
_mdb_txn_commit(MDB_txn *txn, int flag)
{
int rc;
unsigned int i, end_mode;
@ -4276,7 +4279,7 @@ _mdb_txn_commit(MDB_txn *txn)
end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE;
if (txn->mt_child) {
rc = _mdb_txn_commit(txn->mt_child);
rc = _mdb_txn_commit(txn->mt_child, 0);
if (rc)
goto fail;
}
@ -4287,6 +4290,10 @@ _mdb_txn_commit(MDB_txn *txn)
goto done;
}
if (F_ISSET(txn->mt_flags, MDB_TXN_PREPARE)) {
goto prepared;
}
if (txn->mt_flags & (MDB_TXN_FINISHED|MDB_TXN_ERROR)) {
DPUTS("txn has failed/finished, can't commit");
if (txn->mt_parent)
@ -4486,6 +4493,12 @@ _mdb_txn_commit(MDB_txn *txn)
if (!F_ISSET(txn->mt_flags, MDB_TXN_NOSYNC) &&
(rc = mdb_env_sync0(env, 0, txn->mt_next_pgno)))
goto fail;
if (F_ISSET(flag, MDB_TXN_PREPARE)) {
txn->mt_flags |= MDB_TXN_PREPARE;
return MDB_SUCCESS;
}
prepared:
if ((rc = mdb_env_write_meta(txn)))
goto fail;
end_mode = MDB_END_COMMITTED|MDB_END_UPDATE;
@ -4512,7 +4525,43 @@ int
mdb_txn_commit(MDB_txn *txn)
{
MDB_TRACE(("%p", txn));
return _mdb_txn_commit(txn);
return _mdb_txn_commit(txn, 0);
}
/** Prepare phase of a two-phase commit.
 *
 * Runs the shared commit path with the #MDB_TXN_PREPARE flag, which makes
 * #_mdb_txn_commit() stop after syncing the data pages, set
 * #MDB_TXN_PREPARE on the txn and return before the metapage is written.
 *
 * @param[in] txn A transaction handle.
 * @return 0 on success, EINVAL if \b txn is NULL, #MDB_TXN_PENDING if the
 * transaction was already prepared, otherwise whatever
 * #_mdb_txn_commit() returns.
 */
int
mdb_txn_prepare(MDB_txn *txn)
{
	MDB_TRACE(("%p", txn));

	/* The public contract documents EINVAL for an invalid parameter;
	 * reject a NULL handle here because mt_flags is read just below.
	 */
	if (txn == NULL)
		return EINVAL;

	/* A prepared txn may only be committed or aborted, never re-prepared. */
	if (F_ISSET(txn->mt_flags, MDB_TXN_PREPARE))
		return MDB_TXN_PENDING;

	return _mdb_txn_commit(txn, MDB_TXN_PREPARE);
}
/** Undo the most recent commit by rewriting a metapage.
 *
 * Compares the txnids of the two meta pages to find the newest one, checks
 * that it belongs to \b txnid and that the other meta page holds a non-zero
 * txnid, then calls #mdb_env_write_meta() with a zero-filled scratch txn.
 * On success the txnid recorded in env->me_txns is reset to that of the
 * older meta page.
 *
 * @param[in] env An environment handle.
 * @param[in] txnid ID of the transaction to roll back.
 * @return 0 on success, EINVAL if \b env is NULL, #MDB_CANT_ROLLBACK if the
 * newest meta page is not \b txnid or the other meta page has txnid 0,
 * otherwise the error from taking the write mutex or from
 * #mdb_env_write_meta().
 */
int
mdb_env_rollback(MDB_env *env, mdb_size_t txnid)
{
	MDB_meta **metas;
	int newest, previous, rc = 0;

	/* The public contract documents EINVAL for an invalid parameter;
	 * env is dereferenced immediately below.
	 */
	if (env == NULL)
		return EINVAL;
	metas = env->me_metas;

	/* The write mutex is only taken when env->me_txns is set; a non-zero
	 * LOCK_MUTEX result is treated as failure and rc is returned as-is
	 * (presumably LOCK_MUTEX stores the error in rc - verify at the macro).
	 */
	if (env->me_txns && LOCK_MUTEX(rc, env, env->me_wmutex))
		return rc;

	/* Index (0 or 1) of the meta page with the larger txnid. */
	newest = metas[0]->mm_txnid < metas[1]->mm_txnid;
	previous = newest ^ 1;

	if (!metas[previous]->mm_txnid || metas[newest]->mm_txnid != txnid)
		rc = MDB_CANT_ROLLBACK;
	else {
		/* Scratch txn and DB records, all zero except for the env
		 * pointer and the dbs array.
		 * NOTE(review): mt_txnid is left at 0 here; confirm that
		 * mdb_env_write_meta() selects the newest meta page slot
		 * in that case rather than deriving the slot from mt_txnid.
		 */
		MDB_txn txn = {0};
		MDB_db dbs[2] = {0};

		txn.mt_env = env;
		txn.mt_dbs = dbs;
		rc = mdb_env_write_meta(&txn);
	}

	if (env->me_txns) {
		if (rc == MDB_SUCCESS)
			env->me_txns->mti_txnid = metas[previous]->mm_txnid;
		UNLOCK_MUTEX(env->me_wmutex);
	}
	return rc;
}
static int ESECT mdb_env_map(MDB_env *env, void *addr);