Merge remote-tracking branch 'origin/mdb.master'

This commit is contained in:
Quanah Gibson-Mount 2014-06-13 12:37:05 -05:00
commit ba4dd5b733
4 changed files with 3793 additions and 86 deletions

View file

@ -1,4 +1,4 @@
Copyright 2011-2013 Howard Chu, Symas Corp.
Copyright 2011-2014 Howard Chu, Symas Corp.
All rights reserved.
Redistribution and use in source and binary forms, with or without

View file

@ -408,7 +408,7 @@ typedef enum MDB_cursor_op {
#define MDB_BAD_RSLOT (-30783)
/** Transaction cannot recover - it must be aborted */
#define MDB_BAD_TXN (-30782)
/** Too big key/data, key is empty, or wrong DUPFIXED size */
/** Unsupported size of key/DB name/data, or wrong DUPFIXED size */
#define MDB_BAD_VALSIZE (-30781)
#define MDB_LAST_ERRCODE MDB_BAD_VALSIZE
/** @} */
@ -784,6 +784,10 @@ int mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers);
* environment. Simpler applications that use the environment as a single
* unnamed database can ignore this option.
* This function may only be called after #mdb_env_create() and before #mdb_env_open().
*
* Currently a moderate number of slots are cheap but a huge number gets
* expensive: 7-120 words per transaction, and every #mdb_dbi_open()
* does a linear search of the opened slots.
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] dbs The maximum number of databases
* @return A non-zero error value on failure and 0 on success. Some possible
@ -951,7 +955,7 @@ int mdb_txn_renew(MDB_txn *txn);
* independently of whether such a database exists.
* The database handle may be discarded by calling #mdb_dbi_close().
* The old database handle is returned if the database was already open.
* The handle must only be closed once.
* The handle may only be closed once.
* The database handle will be private to the current transaction until
* the transaction is successfully committed. If the transaction is
* aborted the handle will be closed automatically.
@ -963,7 +967,8 @@ int mdb_txn_renew(MDB_txn *txn);
* use this function.
*
* To use named databases (with name != NULL), #mdb_env_set_maxdbs()
* must be called before opening the environment.
* must be called before opening the environment. Database names
* are kept as keys in the unnamed database.
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @param[in] name The name of the database to open. If only a single
* database is needed in the environment, this value may be NULL.
@ -1033,12 +1038,19 @@ int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *stat);
*/
int mdb_dbi_flags(MDB_txn *txn, MDB_dbi dbi, unsigned int *flags);
/** @brief Close a database handle.
/** @brief Close a database handle. Normally unnecessary. Use with care:
*
* This call is not mutex protected. Handles should only be closed by
* a single thread, and only if no other threads are going to reference
* the database handle or one of its cursors any further. Do not close
* a handle if an existing transaction has modified its database.
* Doing so can cause misbehavior from database corruption to errors
* like MDB_BAD_VALSIZE (since the DB name is gone).
*
* Closing a database handle is not necessary, but lets #mdb_dbi_open()
* reuse the handle value. Usually it's better to set a bigger
* #mdb_env_set_maxdbs(), unless that value would be large.
*
* @param[in] env An environment handle returned by #mdb_env_create()
* @param[in] dbi A database handle returned by #mdb_dbi_open()
*/
@ -1046,6 +1058,7 @@ void mdb_dbi_close(MDB_env *env, MDB_dbi dbi);
/** @brief Empty or delete+close a database.
*
* See #mdb_dbi_close() for restrictions about closing the DB handle.
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
* @param[in] dbi A database handle returned by #mdb_dbi_open()
* @param[in] del 0 to empty the DB, 1 to delete it from the
@ -1323,9 +1336,9 @@ int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data,
/** @brief Store by cursor.
*
* This function stores key/data pairs into the database.
* If the function fails for any reason, the state of the cursor will be
* unchanged. If the function succeeds and an item is inserted into the
* database, the cursor is always positioned to refer to the newly inserted item.
* The cursor is positioned at the new item, or on failure usually near it.
* @note Earlier documentation incorrectly said errors would leave the
* state of the cursor unchanged.
* @param[in] cursor A cursor handle returned by #mdb_cursor_open()
* @param[in] key The key operated on.
* @param[in] data The data operated on.
@ -1334,7 +1347,9 @@ int mdb_cursor_get(MDB_cursor *cursor, MDB_val *key, MDB_val *data,
* <ul>
* <li>#MDB_CURRENT - overwrite the data of the key/data pair to which
* the cursor refers with the specified data item. The \b key
* parameter is ignored. The \b data size must match the original.
* parameter is not used for positioning the cursor, but should
* still be provided. If using sorted duplicates (#MDB_DUPSORT)
* the data item must still sort into the same place.
* <li>#MDB_NODUPDATA - enter the new key/data pair only if it does not
* already appear in the database. This flag may only be specified
* if the database was opened with #MDB_DUPSORT. The function will

View file

@ -1202,7 +1202,7 @@ static char *const mdb_errstr[] = {
"MDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed",
"MDB_BAD_RSLOT: Invalid reuse of reader locktable slot",
"MDB_BAD_TXN: Transaction cannot recover - it must be aborted",
"MDB_BAD_VALSIZE: Too big key/data, key is empty, or wrong DUPFIXED size",
"MDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size",
};
char *
@ -2770,23 +2770,20 @@ mdb_freelist_save(MDB_txn *txn)
mop += mop_len;
rc = mdb_cursor_first(&mc, &key, &data);
for (; !rc; rc = mdb_cursor_next(&mc, &key, &data, MDB_NEXT)) {
unsigned flags = MDB_CURRENT;
txnid_t id = *(txnid_t *)key.mv_data;
ssize_t len = (ssize_t)(data.mv_size / sizeof(MDB_ID)) - 1;
MDB_ID save;
mdb_tassert(txn, len >= 0 && id <= env->me_pglast);
key.mv_data = &id;
if (len > mop_len) {
len = mop_len;
data.mv_size = (len + 1) * sizeof(MDB_ID);
/* Drop MDB_CURRENT when changing the data size */
key.mv_data = &id;
flags = 0;
}
data.mv_data = mop -= len;
save = mop[0];
mop[0] = len;
rc = mdb_cursor_put(&mc, &key, &data, flags);
rc = mdb_cursor_put(&mc, &key, &data, MDB_CURRENT);
mop[0] = save;
if (rc || !(mop_len -= len))
break;
@ -3680,7 +3677,9 @@ static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
case DLL_THREAD_DETACH:
for (i=0; i<mdb_tls_nkeys; i++) {
MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]);
mdb_env_reader_dest(r);
if (r) {
mdb_env_reader_dest(r);
}
}
break;
case DLL_PROCESS_DETACH: break;
@ -4547,6 +4546,13 @@ mdb_cmp_cint(const MDB_val *a, const MDB_val *b)
#endif
}
/** Compare two items pointing at size_t's of unknown alignment. */
#ifdef MISALIGNED_OK
# define mdb_cmp_clong mdb_cmp_long
#else
# define mdb_cmp_clong mdb_cmp_cint
#endif
/** Compare two items lexically */
static int
mdb_cmp_memn(const MDB_val *a, const MDB_val *b)
@ -5415,7 +5421,7 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
if (!mc->mc_top) {
/* There are no other pages */
mc->mc_ki[mc->mc_top] = 0;
if (op == MDB_SET_RANGE) {
if (op == MDB_SET_RANGE && !exactp) {
rc = 0;
goto set1;
} else
@ -5800,14 +5806,14 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
uint16_t fp_flags;
MDB_val xdata, *rdata, dkey, olddata;
MDB_db dummy;
int do_sub = 0, insert;
int do_sub = 0, insert_key, insert_data;
unsigned int mcount = 0, dcount = 0, nospill;
size_t nsize;
int rc, rc2;
unsigned int nflags;
DKBUF;
if (mc == NULL)
if (mc == NULL || key == NULL)
return EINVAL;
env = mc->mc_txn->mt_env;
@ -5828,16 +5834,8 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
return (mc->mc_txn->mt_flags & MDB_TXN_RDONLY) ? EACCES : MDB_BAD_TXN;
if (flags != MDB_CURRENT) {
if (key == NULL)
return EINVAL;
if (key->mv_size-1 >= ENV_MAXKEY(env))
return MDB_BAD_VALSIZE;
} else {
/* Ignore key except in sub-cursor, where key holds the data */
if (!(mc->mc_flags & C_SUB))
key = NULL;
}
if (key->mv_size-1 >= ENV_MAXKEY(env))
return MDB_BAD_VALSIZE;
#if SIZE_MAX > MAXDATASIZE
if (data->mv_size > ((mc->mc_db->md_flags & MDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE))
@ -5927,8 +5925,8 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
return rc2;
}
insert = rc;
if (insert) {
insert_key = insert_data = rc;
if (insert_key) {
/* The key does not exist */
DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top]));
if ((mc->mc_db->md_flags & MDB_DUPSORT) &&
@ -5978,21 +5976,14 @@ more:
#if UINT_MAX < SIZE_MAX
if (mc->mc_dbx->md_dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t))
#ifdef MISALIGNED_OK
mc->mc_dbx->md_dcmp = mdb_cmp_long;
#else
mc->mc_dbx->md_dcmp = mdb_cmp_cint;
#endif
mc->mc_dbx->md_dcmp = mdb_cmp_clong;
#endif
/* if data matches, skip it */
if (!mc->mc_dbx->md_dcmp(data, &olddata)) {
if (flags & MDB_NODUPDATA)
rc = MDB_KEYEXIST;
else if (flags & MDB_MULTIPLE)
goto next_mult;
else
rc = MDB_SUCCESS;
return rc;
return MDB_KEYEXIST;
rc = MDB_SUCCESS;
goto next_sub;
}
/* Back up original data item */
@ -6089,7 +6080,7 @@ prep_subDB:
rdata = &xdata;
flags |= F_DUPDATA;
do_sub = 1;
if (!insert)
if (!insert_key)
mdb_node_del(mc, 0);
goto new_sub;
}
@ -6130,8 +6121,8 @@ current:
return ENOMEM;
id2.mid = pg;
id2.mptr = np;
rc = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
mdb_cassert(mc, rc == 0);
rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
mdb_cassert(mc, rc2 == 0);
if (!(flags & MDB_RESERVE)) {
/* Copy end of page, adjusting alignment so
* compiler may copy words instead of bytes.
@ -6149,7 +6140,7 @@ current:
data->mv_data = METADATA(omp);
else
memcpy(METADATA(omp), data->mv_data, data->mv_size);
goto done;
return MDB_SUCCESS;
}
}
if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
@ -6165,10 +6156,9 @@ current:
memcpy(olddata.mv_data, data->mv_data, data->mv_size);
else
memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
goto done;
return MDB_SUCCESS;
}
mdb_node_del(mc, 0);
mc->mc_db->md_entries--;
}
rdata = data;
@ -6178,14 +6168,14 @@ new_sub:
nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
nflags &= ~MDB_APPEND;
if (!insert)
nflags &= ~MDB_APPEND; /* sub-page may need room to grow */
if (!insert_key)
nflags |= MDB_SPLIT_REPLACE;
rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags);
} else {
/* There is room already in this leaf page. */
rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
if (rc == 0 && !do_sub && insert) {
if (rc == 0 && insert_key) {
/* Adjust other cursors pointing to mp */
MDB_cursor *m2, *m3;
MDB_dbi dbi = mc->mc_dbi;
@ -6205,9 +6195,7 @@ new_sub:
}
}
if (rc != MDB_SUCCESS)
mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
else {
if (rc == MDB_SUCCESS) {
/* Now store the actual data in the child DB. Note that we're
* storing the user data in the keys field, so there are strict
* size limits on dupdata. The actual data fields of the child
@ -6215,6 +6203,7 @@ new_sub:
*/
if (do_sub) {
int xflags;
size_t ecount;
put_sub:
xdata.mv_size = 0;
xdata.mv_data = "";
@ -6229,10 +6218,8 @@ put_sub:
/* converted, write the original data first */
if (dkey.mv_size) {
rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
if (rc) {
mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
return rc == MDB_KEYEXIST ? MDB_CORRUPTED : rc;
}
if (rc)
goto bad_sub;
{
/* Adjust other cursors pointing to mp */
MDB_cursor *m2;
@ -6250,6 +6237,7 @@ put_sub:
/* we've done our job */
dkey.mv_size = 0;
}
ecount = mc->mc_xcursor->mx_db.md_entries;
if (flags & MDB_APPENDDUP)
xflags |= MDB_APPEND;
rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
@ -6257,31 +6245,39 @@ put_sub:
void *db = NODEDATA(leaf);
memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
}
insert_data = mc->mc_xcursor->mx_db.md_entries - ecount;
}
/* sub-writes might have failed so check rc again.
* Don't increment count if we just replaced an existing item.
*/
if (!rc && !(flags & MDB_CURRENT))
/* Increment count unless we just replaced an existing item. */
if (insert_data)
mc->mc_db->md_entries++;
if (insert_key) {
/* Invalidate txn if we created an empty sub-DB */
if (rc)
goto bad_sub;
/* If we succeeded and the key didn't exist before,
* make sure the cursor is marked valid.
*/
mc->mc_flags |= C_INITIALIZED;
}
next_sub:
if (flags & MDB_MULTIPLE) {
if (!rc) {
next_mult:
mcount++;
/* let caller know how many succeeded, if any */
data[1].mv_size = mcount;
if (mcount < dcount) {
data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
insert_key = insert_data = 0;
goto more;
}
}
}
return rc;
bad_sub:
if (rc == MDB_KEYEXIST) /* should not happen, we deleted that item */
rc = MDB_CORRUPTED;
}
done:
/* If we succeeded and the key didn't exist before, make sure
* the cursor is marked valid.
*/
if (!rc && insert)
mc->mc_flags |= C_INITIALIZED;
mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
return rc;
}
@ -6314,7 +6310,10 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
if (!(flags & MDB_NODUPDATA)) {
if (flags & MDB_NODUPDATA) {
/* mdb_cursor_del0() will subtract the final entry */
mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
} else {
if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
}
@ -6353,7 +6352,6 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
if (rc)
goto fail;
mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries;
}
}
@ -6782,11 +6780,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
mx->mx_dbflag = DB_VALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */
#if UINT_MAX < SIZE_MAX
if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
#ifdef MISALIGNED_OK
mx->mx_dbx.md_cmp = mdb_cmp_long;
#else
mx->mx_dbx.md_cmp = mdb_cmp_cint;
#endif
mx->mx_dbx.md_cmp = mdb_cmp_clong;
#endif
}
@ -7295,9 +7289,18 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
}
}
}
mdb_cursor_pop(csrc);
return mdb_rebalance(csrc);
{
unsigned int snum = cdst->mc_snum;
uint16_t depth = cdst->mc_db->md_depth;
mdb_cursor_pop(cdst);
rc = mdb_rebalance(cdst);
/* Did the tree shrink? */
if (depth > cdst->mc_db->md_depth)
snum--;
cdst->mc_snum = snum;
cdst->mc_top = snum-1;
}
return rc;
}
/** Copy the contents of a cursor.
@ -7335,6 +7338,7 @@ mdb_rebalance(MDB_cursor *mc)
int rc;
unsigned int ptop, minkeys;
MDB_cursor mn;
indx_t oldki;
minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top]));
DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)",
@ -7385,6 +7389,7 @@ mdb_rebalance(MDB_cursor *mc)
}
}
} else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
int i;
DPUTS("collapsing root page!");
rc = mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
if (rc)
@ -7396,6 +7401,10 @@ mdb_rebalance(MDB_cursor *mc)
mc->mc_db->md_depth--;
mc->mc_db->md_branch_pages--;
mc->mc_ki[0] = mc->mc_ki[1];
for (i = 1; i<mc->mc_db->md_depth; i++) {
mc->mc_pg[i] = mc->mc_pg[i+1];
mc->mc_ki[i] = mc->mc_ki[i+1];
}
{
/* Adjust other cursors pointing to mp */
MDB_cursor *m2, *m3;
@ -7408,7 +7417,6 @@ mdb_rebalance(MDB_cursor *mc)
m3 = m2;
if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
if (m3->mc_pg[0] == mp) {
int i;
m3->mc_snum--;
m3->mc_top--;
for (i=0; i<m3->mc_snum; i++) {
@ -7439,6 +7447,7 @@ mdb_rebalance(MDB_cursor *mc)
mdb_cursor_copy(mc, &mn);
mn.mc_xcursor = NULL;
oldki = mc->mc_ki[mc->mc_top];
if (mc->mc_ki[ptop] == 0) {
/* We're the leftmost leaf in our parent.
*/
@ -7472,18 +7481,23 @@ mdb_rebalance(MDB_cursor *mc)
* (A branch page must never have less than 2 keys.)
*/
minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top]));
if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys)
return mdb_node_move(&mn, mc);
else {
if (mc->mc_ki[ptop] == 0)
if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) {
rc = mdb_node_move(&mn, mc);
if (mc->mc_ki[ptop]) {
oldki++;
}
} else {
if (mc->mc_ki[ptop] == 0) {
rc = mdb_page_merge(&mn, mc);
else {
} else {
oldki += NUMKEYS(mn.mc_pg[mn.mc_top]);
mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
rc = mdb_page_merge(mc, &mn);
mdb_cursor_copy(&mn, mc);
}
mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
mc->mc_flags &= ~C_EOF;
}
mc->mc_ki[mc->mc_top] = oldki;
return rc;
}

3678
libraries/libmdb/mdb.c Normal file

File diff suppressed because it is too large Load diff