ITS#10395 LMDB: Allow multiple nested read txns from a write txn

This commit is contained in:
Kerollmops 2026-03-30 17:48:04 +02:00 committed by Howard Chu
parent 425525fbec
commit 10161b20fe
2 changed files with 91 additions and 31 deletions

View file

@ -1070,6 +1070,10 @@ int mdb_env_set_checksum(MDB_env *env, MDB_sum_func *func, unsigned int size);
* as its parent. Transactions may be nested to any level. A parent
* transaction and its cursors may not issue any other operations than
* mdb_txn_commit and mdb_txn_abort while it has active child transactions.
* @note A parent transaction with read-only child transactions must not
* issue any operations while it has active child transactions, not even
* #mdb_txn_commit() or #mdb_txn_abort(). All such child transactions must
* be aborted before any operation is performed on the parent.
* @param[in] flags Special options for this transaction. This parameter
* must be set to 0 or by bitwise OR'ing together one or more of the
* values described here.
@ -1191,7 +1195,7 @@ int mdb_env_rollback(MDB_env *env, mdb_size_t txnid);
*/
void mdb_txn_abort(MDB_txn *txn);
/** @brief Reset a read-only transaction.
/** @brief Reset a non-nested read-only transaction.
*
* Abort the transaction like #mdb_txn_abort(), but keep the transaction
* handle. #mdb_txn_renew() may reuse the handle. This saves allocation
@ -1210,11 +1214,11 @@ void mdb_txn_abort(MDB_txn *txn);
*/
void mdb_txn_reset(MDB_txn *txn);
/** @brief Renew a read-only transaction.
/** @brief Renew a non-nested read-only transaction.
*
* This acquires a new reader lock for a transaction handle that had been
* released by #mdb_txn_reset(). It must be called before a reset transaction
* may be used again.
* may be used again. Nested read-only transactions cannot be renewed.
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @return A non-zero error value on failure and 0 on success. Some possible
* errors are:

View file

@ -374,6 +374,8 @@ typedef HANDLE mdb_mutex_t, mdb_mutexref_t;
/* Win32 stand-ins for the pthread calls used in this file.
 * As with the pthread originals, the mutex/cond arguments are pointers to
 * the underlying object (here a HANDLE). Every dereferenced parameter is
 * parenthesized so that any pointer expression (e.g. `p + 1`) expands to
 * the intended handle.
 * NOTE: unlock, lock and cond_signal yield the raw Win32 return value, not
 * a pthread-style error code; setspecific, mutex_init and mutex_destroy
 * yield 0 on success and ErrCode() on failure.
 */
#define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode())
#define pthread_mutex_unlock(x) ReleaseMutex(*(x))
#define pthread_mutex_lock(x) WaitForSingleObject(*(x), INFINITE)
/* Stores a new unnamed, initially unowned mutex in *x; the attribute argument y is ignored. */
#define pthread_mutex_init(x,y) ((*(x) = CreateMutex(NULL, FALSE, NULL)) ? 0 : ErrCode())
/* Closes the handle stored in *x. */
#define pthread_mutex_destroy(x) (CloseHandle(*(x)) ? 0 : ErrCode())
#define pthread_cond_signal(x) SetEvent(*(x))
/* Signals *mutex and waits on *cond, then waits to re-acquire *mutex before continuing. */
#define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*(mutex), *(cond), INFINITE, FALSE); WaitForSingleObject(*(mutex), INFINITE);}while(0)
#define THREAD_CREATE(thr,start,arg) \
@ -1391,6 +1393,10 @@ struct MDB_txn {
MDB_txn *mt_parent; /**< parent of a nested txn */
/** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */
MDB_txn *mt_child;
/** The count of nested RDONLY txns under this txn */
unsigned int mt_rdonly_child_count;
/** Mutex protecting mt_rdonly_child_count */
pthread_mutex_t mt_child_mutex;
pgno_t mt_next_pgno; /**< next unallocated page */
#if MDB_RPAGE_CACHE
pgno_t mt_last_pgno; /**< last written page */
@ -3347,6 +3353,8 @@ mdb_txn_renew0(MDB_txn *txn)
mdb_debug = MDB_DBG_INFO;
#endif
txn->mt_child = NULL;
txn->mt_rdonly_child_count = 0;
pthread_mutex_init(&txn->mt_child_mutex, NULL);
txn->mt_loose_pgs = NULL;
txn->mt_loose_count = 0;
if (env->me_flags & MDB_WRITEMAP) {
@ -3461,11 +3469,22 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
return EACCES;
if (parent) {
/* Nested transactions: Max 1 child, write txns only, no writemap */
/* Nested transactions:
* Only write txns may have nested txns;
* if the nested txn is a write txn there may only be 1, no writemap;
* if the nested txn is a read txn there may be arbitrarily many.
*/
pthread_mutex_lock(&parent->mt_child_mutex);
flags |= parent->mt_flags;
if (flags & (MDB_RDONLY|MDB_WRITEMAP|MDB_TXN_BLOCKED)) {
return (parent->mt_flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN;
if (parent->mt_child && F_ISSET(parent->mt_child->mt_flags, MDB_RDONLY) && F_ISSET(flags, MDB_RDONLY)) {
flags &= ~MDB_TXN_HAS_CHILD;
}
if ((F_ISSET(flags, MDB_WRITEMAP) && !F_ISSET(flags, MDB_RDONLY)) || F_ISSET(flags, MDB_TXN_BLOCKED)) {
flags = parent->mt_flags;
pthread_mutex_unlock(&parent->mt_child_mutex);
return (flags & MDB_TXN_RDONLY) ? EINVAL : MDB_BAD_TXN;
}
pthread_mutex_unlock(&parent->mt_child_mutex);
/* Child txns save MDB_pgstate and use own copy of cursors */
size = env->me_maxdbs * (sizeof(MDB_db)+sizeof(MDB_cursor *)+1);
size += tsize = sizeof(MDB_ntxn);
@ -3508,24 +3527,35 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
txn->mt_workid = txn->mt_last_workid = workid;
txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
txn->mt_dbiseqs = parent->mt_dbiseqs;
txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
if (!txn->mt_u.dirty_list ||
!(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)))
{
free(txn->mt_u.dirty_list);
free(txn);
return ENOMEM;
/* a nested RDONLY txn shares its parent's dirty list and free-page list */
if (F_ISSET(flags, MDB_RDONLY)) {
txn->mt_u.dirty_list = parent->mt_u.dirty_list;
txn->mt_free_pgs = parent->mt_free_pgs;
} else {
txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
if (!txn->mt_u.dirty_list ||
!(txn->mt_free_pgs = mdb_midl_alloc(MDB_IDL_UM_MAX)))
{
free(txn->mt_u.dirty_list);
free(txn);
return ENOMEM;
}
txn->mt_u.dirty_list[0].mid = 0;
}
txn->mt_txnid = parent->mt_txnid;
txn->mt_dirty_room = parent->mt_dirty_room;
txn->mt_u.dirty_list[0].mid = 0;
txn->mt_spill_pgs = NULL;
#if OVERFLOW_NOTYET
txn->mt_dirty_ovs = NULL;
#endif
txn->mt_next_pgno = parent->mt_next_pgno;
pthread_mutex_lock(&parent->mt_child_mutex);
parent->mt_flags |= MDB_TXN_HAS_CHILD;
parent->mt_child = txn;
if (F_ISSET(flags, MDB_RDONLY)) {
parent->mt_rdonly_child_count++;
}
pthread_mutex_unlock(&parent->mt_child_mutex);
txn->mt_parent = parent;
txn->mt_numdbs = parent->mt_numdbs;
#if MDB_RPAGE_CACHE
@ -3538,19 +3568,22 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
rc = 0;
ntxn = (MDB_ntxn *)txn;
ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
if (env->me_pghead) {
size = MDB_IDL_SIZEOF(env->me_pghead);
env->me_pghead = mdb_midl_alloc(env->me_pghead[0]);
if (env->me_pghead)
memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
else
rc = ENOMEM;
if (!F_ISSET(flags, MDB_RDONLY)) {
ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
/* Do not copy parent me_pghead when nested and RDONLY */
if (env->me_pghead) {
size = MDB_IDL_SIZEOF(env->me_pghead);
env->me_pghead = mdb_midl_alloc(env->me_pghead[0]);
if (env->me_pghead)
memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
else
rc = ENOMEM;
}
if (!rc)
rc = mdb_cursor_shadow(parent, txn);
if (rc)
mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD);
}
if (!rc)
rc = mdb_cursor_shadow(parent, txn);
if (rc)
mdb_txn_end(txn, MDB_END_FAIL_BEGINCHILD);
} else { /* MDB_RDONLY */
txn->mt_dbiseqs = env->me_dbiseqs;
renew:
@ -3559,7 +3592,9 @@ renew:
if (rc) {
if (txn != env->me_txn0) {
/* mt_rpages belongs to parent */
free(txn->mt_u.dirty_list);
if (!F_ISSET(flags, MDB_RDONLY)) {
free(txn->mt_u.dirty_list);
}
free(txn);
}
} else {
@ -3639,6 +3674,16 @@ mdb_txn_end(MDB_txn *txn, unsigned mode)
(void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root));
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
if (txn->mt_parent) {
pthread_mutex_lock(&txn->mt_parent->mt_child_mutex);
txn->mt_parent->mt_rdonly_child_count--;
/* mark parent txn as no longer having children if this is the last nested txn */
if (txn->mt_parent->mt_rdonly_child_count == 0) {
txn->mt_parent->mt_child = NULL;
txn->mt_parent->mt_flags &= ~MDB_TXN_HAS_CHILD;
}
pthread_mutex_unlock(&txn->mt_parent->mt_child_mutex);
} else
if (txn->mt_u.reader) {
txn->mt_u.reader->mr_txnid = (txnid_t)-1;
if (!(env->me_flags & MDB_NOTLS)) {
@ -3725,8 +3770,11 @@ mdb_txn_end(MDB_txn *txn, unsigned mode)
free(tl);
}
#endif
if (mode & MDB_END_FREE)
if (mode & MDB_END_FREE) {
if (!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
pthread_mutex_destroy(&txn->mt_child_mutex);
free(txn);
}
}
void
@ -3735,8 +3783,8 @@ mdb_txn_reset(MDB_txn *txn)
if (txn == NULL)
return;
/* This call is only valid for read-only txns */
if (!(txn->mt_flags & MDB_TXN_RDONLY))
/* This call is only valid for non-nested read-only txns */
if (!(txn->mt_flags & MDB_TXN_RDONLY) || txn->mt_parent)
return;
mdb_txn_end(txn, MDB_END_RESET);
@ -3748,6 +3796,14 @@ _mdb_txn_abort(MDB_txn *txn)
if (txn == NULL)
return;
if (txn->mt_parent && (txn->mt_flags & MDB_RDONLY)) {
/* You must first abort the child before the parent */
pthread_mutex_lock(&txn->mt_child_mutex);
int count = txn->mt_rdonly_child_count;
pthread_mutex_unlock(&txn->mt_child_mutex);
mdb_tassert(txn, txn->mt_parent && count == 0);
}
if (txn->mt_child)
_mdb_txn_abort(txn->mt_child);
@ -7216,7 +7272,7 @@ mdb_page_get(MDB_cursor *mc, pgno_t pgno,
MDB_txn *txn = mc->mc_txn, *tx2;
MDB_page *p = NULL;
if (! (mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP))) {
if (! (( mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP) ) && mc->mc_txn->mt_parent == NULL)) {
for (tx2 = txn;; ) {
MDB_ID2L dl = tx2->mt_u.dirty_list;
MDB_IDL sl;