From 358370410ca01ec37e55ea44c0f3cd1fa8860c5d Mon Sep 17 00:00:00 2001 From: Olivier Houchard Date: Sun, 9 Apr 2017 20:53:02 +0000 Subject: [PATCH 1/2] Import CK as of commit 6b141c0bdd21ce8b3e14147af8f87f22b20ecf32 This brings us changes we needed in ck_epoch. --- include/ck_epoch.h | 95 +++++++++++++++++++++++++++++++---- src/ck_epoch.c | 122 ++++++++++++++++++++++++++++++--------------- 2 files changed, 165 insertions(+), 52 deletions(-) diff --git a/include/ck_epoch.h b/include/ck_epoch.h index e7ce5bc0581..9e166e5753a 100644 --- a/include/ck_epoch.h +++ b/include/ck_epoch.h @@ -83,6 +83,7 @@ struct ck_epoch_ref { }; struct ck_epoch_record { + ck_stack_entry_t record_next; struct ck_epoch *global; unsigned int state; unsigned int epoch; @@ -92,17 +93,16 @@ struct ck_epoch_record { } local CK_CC_CACHELINE; unsigned int n_pending; unsigned int n_peak; - unsigned long n_dispatch; + unsigned int n_dispatch; + void *ct; ck_stack_t pending[CK_EPOCH_LENGTH]; - ck_stack_entry_t record_next; } CK_CC_CACHELINE; typedef struct ck_epoch_record ck_epoch_record_t; struct ck_epoch { unsigned int epoch; - char pad[CK_MD_CACHELINE - sizeof(unsigned int)]; - ck_stack_t records; unsigned int n_free; + ck_stack_t records; }; typedef struct ck_epoch ck_epoch_t; @@ -110,7 +110,14 @@ typedef struct ck_epoch ck_epoch_t; * Internal functions. */ void _ck_epoch_addref(ck_epoch_record_t *, ck_epoch_section_t *); -void _ck_epoch_delref(ck_epoch_record_t *, ck_epoch_section_t *); +bool _ck_epoch_delref(ck_epoch_record_t *, ck_epoch_section_t *); + +CK_CC_FORCE_INLINE static void * +ck_epoch_record_ct(const ck_epoch_record_t *record) +{ + + return ck_pr_load_ptr(&record->ct); +} /* * Marks the beginning of an epoch-protected section. @@ -160,9 +167,10 @@ ck_epoch_begin(ck_epoch_record_t *record, ck_epoch_section_t *section) } /* - * Marks the end of an epoch-protected section. + * Marks the end of an epoch-protected section. Returns true if no more + * sections exist for the caller. 
*/ -CK_CC_FORCE_INLINE static void +CK_CC_FORCE_INLINE static bool ck_epoch_end(ck_epoch_record_t *record, ck_epoch_section_t *section) { @@ -170,15 +178,19 @@ ck_epoch_end(ck_epoch_record_t *record, ck_epoch_section_t *section) ck_pr_store_uint(&record->active, record->active - 1); if (section != NULL) - _ck_epoch_delref(record, section); + return _ck_epoch_delref(record, section); - return; + return record->active == 0; } /* * Defers the execution of the function pointed to by the "cb" * argument until an epoch counter loop. This allows for a * non-blocking deferral. + * + * We can get away without a fence here due to the monotonic nature + * of the epoch counter. Worst case, this will result in some delays + * before object destruction. */ CK_CC_FORCE_INLINE static void ck_epoch_call(ck_epoch_record_t *record, @@ -195,13 +207,74 @@ ck_epoch_call(ck_epoch_record_t *record, return; } +/* + * Same as ck_epoch_call, but allows for records to be shared and is reentrant. + */ +CK_CC_FORCE_INLINE static void +ck_epoch_call_strict(ck_epoch_record_t *record, + ck_epoch_entry_t *entry, + ck_epoch_cb_t *function) +{ + struct ck_epoch *epoch = record->global; + unsigned int e = ck_pr_load_uint(&epoch->epoch); + unsigned int offset = e & (CK_EPOCH_LENGTH - 1); + + ck_pr_inc_uint(&record->n_pending); + entry->function = function; + + /* Store fence is implied by push operation. */ + ck_stack_push_upmc(&record->pending[offset], &entry->stack_entry); + return; +} + +/* + * This callback is used for synchronize_wait to allow for custom blocking + * behavior. + */ +typedef void ck_epoch_wait_cb_t(ck_epoch_t *, ck_epoch_record_t *, + void *); + +/* + * Return latest epoch value. This operation provides load ordering. 
+ */ +CK_CC_FORCE_INLINE static unsigned int +ck_epoch_value(const ck_epoch_t *ep) +{ + + ck_pr_fence_load(); + return ck_pr_load_uint(&ep->epoch); +} + void ck_epoch_init(ck_epoch_t *); -ck_epoch_record_t *ck_epoch_recycle(ck_epoch_t *); -void ck_epoch_register(ck_epoch_t *, ck_epoch_record_t *); + +/* + * Attempts to recycle an unused epoch record. If one is successfully + * allocated, the record context pointer is also updated. + */ +ck_epoch_record_t *ck_epoch_recycle(ck_epoch_t *, void *); + +/* + * Registers an epoch record. An optional context pointer may be passed that + * is retrievable with ck_epoch_record_ct. + */ +void ck_epoch_register(ck_epoch_t *, ck_epoch_record_t *, void *); + +/* + * Marks a record as available for re-use by a subsequent recycle operation. + * Note that the record cannot be physically destroyed. + */ void ck_epoch_unregister(ck_epoch_record_t *); + bool ck_epoch_poll(ck_epoch_record_t *); void ck_epoch_synchronize(ck_epoch_record_t *); +void ck_epoch_synchronize_wait(ck_epoch_t *, ck_epoch_wait_cb_t *, void *); void ck_epoch_barrier(ck_epoch_record_t *); +void ck_epoch_barrier_wait(ck_epoch_record_t *, ck_epoch_wait_cb_t *, void *); + +/* + * Reclaim entries associated with a record. This is safe to call only on + * the caller's record or records that are using call_strict. 
+ */ void ck_epoch_reclaim(ck_epoch_record_t *); #endif /* CK_EPOCH_H */ diff --git a/src/ck_epoch.c b/src/ck_epoch.c index a0e9180a1fd..a3273b474b6 100644 --- a/src/ck_epoch.c +++ b/src/ck_epoch.c @@ -139,7 +139,7 @@ CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry, #define CK_EPOCH_SENSE_MASK (CK_EPOCH_SENSE - 1) -void +bool _ck_epoch_delref(struct ck_epoch_record *record, struct ck_epoch_section *section) { @@ -150,7 +150,7 @@ _ck_epoch_delref(struct ck_epoch_record *record, current->count--; if (current->count > 0) - return; + return false; /* * If the current bucket no longer has any references, then @@ -161,8 +161,7 @@ _ck_epoch_delref(struct ck_epoch_record *record, * If no other active bucket exists, then the record will go * inactive in order to allow for forward progress. */ - other = &record->local.bucket[(i + 1) & - CK_EPOCH_SENSE_MASK]; + other = &record->local.bucket[(i + 1) & CK_EPOCH_SENSE_MASK]; if (other->count > 0 && ((int)(current->epoch - other->epoch) < 0)) { /* @@ -172,7 +171,7 @@ _ck_epoch_delref(struct ck_epoch_record *record, ck_pr_store_uint(&record->epoch, other->epoch); } - return; + return true; } void @@ -230,7 +229,7 @@ ck_epoch_init(struct ck_epoch *global) } struct ck_epoch_record * -ck_epoch_recycle(struct ck_epoch *global) +ck_epoch_recycle(struct ck_epoch *global, void *ct) { struct ck_epoch_record *record; ck_stack_entry_t *cursor; @@ -249,6 +248,12 @@ ck_epoch_recycle(struct ck_epoch *global) CK_EPOCH_STATE_USED); if (state == CK_EPOCH_STATE_FREE) { ck_pr_dec_uint(&global->n_free); + ck_pr_store_ptr(&record->ct, ct); + + /* + * The context pointer is ordered by a + * subsequent protected section. 
+ */ return record; } } @@ -258,7 +263,8 @@ ck_epoch_recycle(struct ck_epoch *global) } void -ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record) +ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record, + void *ct) { size_t i; @@ -269,6 +275,7 @@ ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record) record->n_dispatch = 0; record->n_peak = 0; record->n_pending = 0; + record->ct = ct; memset(&record->local, 0, sizeof record->local); for (i = 0; i < CK_EPOCH_LENGTH; i++) @@ -295,6 +302,7 @@ ck_epoch_unregister(struct ck_epoch_record *record) for (i = 0; i < CK_EPOCH_LENGTH; i++) ck_stack_init(&record->pending[i]); + ck_pr_store_ptr(&record->ct, NULL); ck_pr_fence_store(); ck_pr_store_uint(&record->state, CK_EPOCH_STATE_FREE); ck_pr_inc_uint(&global->n_free); @@ -345,11 +353,10 @@ ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e) { unsigned int epoch = e & (CK_EPOCH_LENGTH - 1); ck_stack_entry_t *head, *next, *cursor; + unsigned int n_pending, n_peak; unsigned int i = 0; - head = CK_STACK_FIRST(&record->pending[epoch]); - ck_stack_init(&record->pending[epoch]); - + head = ck_stack_batch_pop_upmc(&record->pending[epoch]); for (cursor = head; cursor != NULL; cursor = next) { struct ck_epoch_entry *entry = ck_epoch_entry_container(cursor); @@ -359,11 +366,18 @@ ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e) i++; } - if (record->n_pending > record->n_peak) - record->n_peak = record->n_pending; + n_peak = ck_pr_load_uint(&record->n_peak); + n_pending = ck_pr_load_uint(&record->n_pending); + + /* We don't require accuracy around peak calculation. 
*/ + if (n_pending > n_peak) + ck_pr_store_uint(&record->n_peak, n_pending); + + if (i > 0) { + ck_pr_add_uint(&record->n_dispatch, i); + ck_pr_sub_uint(&record->n_pending, i); + } - record->n_dispatch += i; - record->n_pending -= i; return; } @@ -381,13 +395,24 @@ ck_epoch_reclaim(struct ck_epoch_record *record) return; } +CK_CC_FORCE_INLINE static void +epoch_block(struct ck_epoch *global, struct ck_epoch_record *cr, + ck_epoch_wait_cb_t *cb, void *ct) +{ + + if (cb != NULL) + cb(global, cr, ct); + + return; +} + /* * This function must not be called with-in read section. */ void -ck_epoch_synchronize(struct ck_epoch_record *record) +ck_epoch_synchronize_wait(struct ck_epoch *global, + ck_epoch_wait_cb_t *cb, void *ct) { - struct ck_epoch *global = record->global; struct ck_epoch_record *cr; unsigned int delta, epoch, goal, i; bool active; @@ -424,10 +449,27 @@ ck_epoch_synchronize(struct ck_epoch_record *record) * period. */ e_d = ck_pr_load_uint(&global->epoch); - if (e_d != delta) { - delta = e_d; - goto reload; + if (e_d == delta) { + epoch_block(global, cr, cb, ct); + continue; } + + /* + * If the epoch has been updated, we may have already + * met our goal. + */ + delta = e_d; + if ((goal > epoch) & (delta >= goal)) + goto leave; + + epoch_block(global, cr, cb, ct); + + /* + * If the epoch has been updated, then a grace period + * requires that all threads are observed idle at the + * same epoch. + */ + cr = NULL; } /* @@ -459,20 +501,6 @@ ck_epoch_synchronize(struct ck_epoch_record *record) * Otherwise, we have just acquired latest snapshot. */ delta = delta + r; - continue; - -reload: - if ((goal > epoch) & (delta >= goal)) { - /* - * Right now, epoch overflow is handled as an edge - * case. If we have already observed an epoch - * generation, then we can be sure no hazardous - * references exist to objects from this generation. We - * can actually avoid an addtional scan step at this - * point. 
- */ - break; - } } /* @@ -480,8 +508,16 @@ reload: * However, if non-temporal instructions are used, full barrier * semantics are necessary. */ +leave: ck_pr_fence_memory(); - record->epoch = delta; + return; +} + +void +ck_epoch_synchronize(struct ck_epoch_record *record) +{ + + ck_epoch_synchronize_wait(record->global, NULL, NULL); return; } @@ -494,6 +530,16 @@ ck_epoch_barrier(struct ck_epoch_record *record) return; } +void +ck_epoch_barrier_wait(struct ck_epoch_record *record, ck_epoch_wait_cb_t *cb, + void *ct) +{ + + ck_epoch_synchronize_wait(record->global, cb, ct); + ck_epoch_reclaim(record); + return; +} + /* * It may be worth it to actually apply these deferral semantics to an epoch * that was observed at ck_epoch_call time. The problem is that the latter @@ -509,7 +555,6 @@ ck_epoch_poll(struct ck_epoch_record *record) { bool active; unsigned int epoch; - unsigned int snapshot; struct ck_epoch_record *cr = NULL; struct ck_epoch *global = record->global; @@ -533,12 +578,7 @@ ck_epoch_poll(struct ck_epoch_record *record) } /* If an active thread exists, rely on epoch observation. */ - if (ck_pr_cas_uint_value(&global->epoch, epoch, epoch + 1, - &snapshot) == false) { - record->epoch = snapshot; - } else { - record->epoch = epoch + 1; - } + (void)ck_pr_cas_uint(&global->epoch, epoch, epoch + 1); ck_epoch_dispatch(record, epoch + 1); return true; From e8d27288c2439ee79c81e4684ea90e8ca1aab845 Mon Sep 17 00:00:00 2001 From: Olivier Houchard Date: Mon, 2 Apr 2018 23:35:32 +0000 Subject: [PATCH 2/2] Import CK as of commit b19ed4c6a56ec93215ab567ba18ba61bf1cfbac8 It should fix ck_pr_[load|store]_ptr on mips and riscv, make sure no *fence instructions are used on i386, as older cpus don't support it, and make sure we don't rely on gcc builtins that can lead to calls to libatomic when linked with -O0. 
--- include/ck_cc.h | 55 ++++++++++++++----------------- include/ck_hs.h | 4 ++- include/ck_pr.h | 2 ++ include/ck_queue.h | 2 +- include/ck_ring.h | 51 +++++++++++++++++++++++------ include/gcc/ck_cc.h | 19 ++++++----- include/gcc/ck_pr.h | 4 +-- include/gcc/sparcv9/ck_pr.h | 2 +- include/gcc/x86/ck_pr.h | 64 ++++++++++++++++++++++++------------- include/gcc/x86_64/ck_pr.h | 35 ++++++++++++++++---- include/spinlock/dec.h | 3 +- src/ck_hs.c | 45 ++++++++++++++++++-------- src/ck_ht.c | 2 +- src/ck_ht_hash.h | 18 +++++++++++ src/ck_internal.h | 37 --------------------- src/ck_rhs.c | 2 +- 16 files changed, 205 insertions(+), 140 deletions(-) diff --git a/include/ck_cc.h b/include/ck_cc.h index e17dc7b15f2..9a152a3cdda 100644 --- a/include/ck_cc.h +++ b/include/ck_cc.h @@ -104,41 +104,35 @@ #define CK_CC_TYPEOF(X, DEFAULT) (DEFAULT) #endif +#define CK_F_CC_FFS_G(L, T) \ +CK_CC_INLINE static int \ +ck_cc_##L(T v) \ +{ \ + unsigned int i; \ + \ + if (v == 0) \ + return 0; \ + \ + for (i = 1; (v & 1) == 0; i++, v >>= 1); \ + return i; \ +} + #ifndef CK_F_CC_FFS #define CK_F_CC_FFS -CK_CC_INLINE static int -ck_cc_ffs(unsigned int x) -{ - unsigned int i; +CK_F_CC_FFS_G(ffs, unsigned int) +#endif /* CK_F_CC_FFS */ - if (x == 0) - return 0; +#ifndef CK_F_CC_FFSL +#define CK_F_CC_FFSL +CK_F_CC_FFS_G(ffsl, unsigned long) +#endif /* CK_F_CC_FFSL */ - for (i = 1; (x & 1) == 0; i++, x >>= 1); +#ifndef CK_F_CC_FFSLL +#define CK_F_CC_FFSLL +CK_F_CC_FFS_G(ffsll, unsigned long long) +#endif /* CK_F_CC_FFSLL */ - return i; -} -#endif - -#ifndef CK_F_CC_CLZ -#define CK_F_CC_CLZ -#include - -CK_CC_INLINE static int -ck_cc_clz(unsigned int x) -{ - unsigned int count, i; - - for (count = 0, i = sizeof(unsigned int) * CHAR_BIT; i > 0; count++) { - unsigned int bit = 1U << --i; - - if (x & bit) - break; - } - - return count; -} -#endif +#undef CK_F_CC_FFS_G #ifndef CK_F_CC_CTZ #define CK_F_CC_CTZ @@ -151,7 +145,6 @@ ck_cc_ctz(unsigned int x) return 0; for (i = 0; (x & 1) == 0; i++, x >>= 
1); - return i; } #endif diff --git a/include/ck_hs.h b/include/ck_hs.h index b3eb04698f7..3c12b6e602a 100644 --- a/include/ck_hs.h +++ b/include/ck_hs.h @@ -100,10 +100,11 @@ struct ck_hs_stat { struct ck_hs_iterator { void **cursor; unsigned long offset; + struct ck_hs_map *map; }; typedef struct ck_hs_iterator ck_hs_iterator_t; -#define CK_HS_ITERATOR_INITIALIZER { NULL, 0 } +#define CK_HS_ITERATOR_INITIALIZER { NULL, 0, NULL } /* Convenience wrapper to table hash function. */ #define CK_HS_HASH(T, F, K) F((K), (T)->seed) @@ -112,6 +113,7 @@ typedef void *ck_hs_apply_fn_t(void *, void *); bool ck_hs_apply(ck_hs_t *, unsigned long, const void *, ck_hs_apply_fn_t *, void *); void ck_hs_iterator_init(ck_hs_iterator_t *); bool ck_hs_next(ck_hs_t *, ck_hs_iterator_t *, void **); +bool ck_hs_next_spmc(ck_hs_t *, ck_hs_iterator_t *, void **); bool ck_hs_move(ck_hs_t *, ck_hs_t *, ck_hs_hash_cb_t *, ck_hs_compare_cb_t *, struct ck_malloc *); bool ck_hs_init(ck_hs_t *, unsigned int, ck_hs_hash_cb_t *, diff --git a/include/ck_pr.h b/include/ck_pr.h index 9b7fc42e99b..4fdbdffa111 100644 --- a/include/ck_pr.h +++ b/include/ck_pr.h @@ -43,6 +43,8 @@ #include "gcc/sparcv9/ck_pr.h" #elif defined(__ppc64__) #include "gcc/ppc64/ck_pr.h" +#elif defined(__s390x__) +#include "gcc/s390x/ck_pr.h" #elif defined(__ppc__) #include "gcc/ppc/ck_pr.h" #elif defined(__arm__) diff --git a/include/ck_queue.h b/include/ck_queue.h index 0b7ce1aec70..28b298e6d9a 100644 --- a/include/ck_queue.h +++ b/include/ck_queue.h @@ -235,7 +235,7 @@ struct { \ * Singly-linked Tail queue functions. 
*/ #define CK_STAILQ_CONCAT(head1, head2) do { \ - if ((head2)->stqh_first == NULL) { \ + if ((head2)->stqh_first != NULL) { \ ck_pr_store_ptr((head1)->stqh_last, (head2)->stqh_first); \ ck_pr_fence_store(); \ (head1)->stqh_last = (head2)->stqh_last; \ diff --git a/include/ck_ring.h b/include/ck_ring.h index 8a2a7913e12..e5f0712ef7c 100644 --- a/include/ck_ring.h +++ b/include/ck_ring.h @@ -176,23 +176,54 @@ _ck_ring_enqueue_mp(struct ck_ring *ring, producer = ck_pr_load_uint(&ring->p_head); - do { + for (;;) { /* - * The snapshot of producer must be up to date with - * respect to consumer. + * The snapshot of producer must be up to date with respect to + * consumer. */ ck_pr_fence_load(); consumer = ck_pr_load_uint(&ring->c_head); delta = producer + 1; - if (CK_CC_UNLIKELY((delta & mask) == (consumer & mask))) { - r = false; - goto leave; + + /* + * Only try to CAS if the producer is not clearly stale (not + * less than consumer) and the buffer is definitely not full. + */ + if (CK_CC_LIKELY((producer - consumer) < mask)) { + if (ck_pr_cas_uint_value(&ring->p_head, + producer, delta, &producer) == true) { + break; + } + } else { + unsigned int new_producer; + + /* + * Slow path. Either the buffer is full or we have a + * stale snapshot of p_head. Execute a second read of + * p_read that must be ordered wrt the snapshot of + * c_head. + */ + ck_pr_fence_load(); + new_producer = ck_pr_load_uint(&ring->p_head); + + /* + * Only fail if we haven't made forward progress in + * production: the buffer must have been full when we + * read new_producer (or we wrapped around UINT_MAX + * during this iteration). + */ + if (producer == new_producer) { + r = false; + goto leave; + } + + /* + * p_head advanced during this iteration. Try again. 
+ */ + producer = new_producer; } - } while (ck_pr_cas_uint_value(&ring->p_head, - producer, - delta, - &producer) == false); + } buffer = (char *)buffer + ts * (producer & mask); memcpy(buffer, entry, ts); diff --git a/include/gcc/ck_cc.h b/include/gcc/ck_cc.h index a14a4b51aa8..6ebc59cb592 100644 --- a/include/gcc/ck_cc.h +++ b/include/gcc/ck_cc.h @@ -103,28 +103,26 @@ #define CK_CC_TYPEOF(X, DEFAULT) __typeof__(X) /* - * Portability wrappers for bitwise ops. + * Portability wrappers for bitwise operations. */ - +#ifndef CK_MD_CC_BUILTIN_DISABLE #define CK_F_CC_FFS -#define CK_F_CC_CLZ -#define CK_F_CC_CTZ -#define CK_F_CC_POPCOUNT - CK_CC_INLINE static int ck_cc_ffs(unsigned int x) { - return __builtin_ffs(x); + return __builtin_ffsl(x); } +#define CK_F_CC_FFSL CK_CC_INLINE static int -ck_cc_clz(unsigned int x) +ck_cc_ffsl(unsigned long x) { - return __builtin_clz(x); + return __builtin_ffsll(x); } +#define CK_F_CC_CTZ CK_CC_INLINE static int ck_cc_ctz(unsigned int x) { @@ -132,11 +130,12 @@ ck_cc_ctz(unsigned int x) return __builtin_ctz(x); } +#define CK_F_CC_POPCOUNT CK_CC_INLINE static int ck_cc_popcount(unsigned int x) { return __builtin_popcount(x); } - +#endif /* CK_MD_CC_BUILTIN_DISABLE */ #endif /* CK_GCC_CC_H */ diff --git a/include/gcc/ck_pr.h b/include/gcc/ck_pr.h index 084d4232d82..108e983a1e5 100644 --- a/include/gcc/ck_pr.h +++ b/include/gcc/ck_pr.h @@ -80,7 +80,7 @@ ck_pr_md_load_ptr(const void *target) void *r; ck_pr_barrier(); - r = CK_CC_DECONST_PTR(CK_PR_ACCESS(target)); + r = CK_CC_DECONST_PTR(*(volatile void *const*)(target)); ck_pr_barrier(); return r; @@ -91,7 +91,7 @@ ck_pr_md_store_ptr(void *target, const void *v) { ck_pr_barrier(); - CK_PR_ACCESS(target) = CK_CC_DECONST_PTR(v); + *(volatile void **)target = CK_CC_DECONST_PTR(v); ck_pr_barrier(); return; } diff --git a/include/gcc/sparcv9/ck_pr.h b/include/gcc/sparcv9/ck_pr.h index 767af6a0268..7dc7172557b 100644 --- a/include/gcc/sparcv9/ck_pr.h +++ b/include/gcc/sparcv9/ck_pr.h @@ 
-76,7 +76,7 @@ CK_PR_FENCE(store, "membar #StoreStore") CK_PR_FENCE(store_load, "membar #StoreLoad") CK_PR_FENCE(load, "membar #LoadLoad") CK_PR_FENCE(load_store, "membar #LoadStore") -CK_PR_FENCE(memory, "membar #LoadLoad | #LoadStore | #StoreStore | #StoreLoad") +CK_PR_FENCE(memory, "membar #MemIssue") CK_PR_FENCE(acquire, "membar #LoadLoad | #LoadStore") CK_PR_FENCE(release, "membar #LoadStore | #StoreStore") CK_PR_FENCE(acqrel, "membar #LoadLoad | #LoadStore | #StoreStore") diff --git a/include/gcc/x86/ck_pr.h b/include/gcc/x86/ck_pr.h index a04cebfd033..3e36376fdd9 100644 --- a/include/gcc/x86/ck_pr.h +++ b/include/gcc/x86/ck_pr.h @@ -45,15 +45,9 @@ /* Minimum requirements for the CK_PR interface are met. */ #define CK_F_PR -#ifdef CK_MD_UMP -#define CK_PR_LOCK_PREFIX -#else -#define CK_PR_LOCK_PREFIX "lock " -#endif - /* - * Prevent speculative execution in busy-wait loops (P4 <=) - * or "predefined delay". + * Prevent speculative execution in busy-wait loops (P4 <=) or "predefined + * delay". 
*/ CK_CC_INLINE static void ck_pr_stall(void) @@ -62,28 +56,52 @@ ck_pr_stall(void) return; } +#ifdef CK_MD_UMP +#define CK_PR_LOCK_PREFIX +#define CK_PR_FENCE(T, I) \ + CK_CC_INLINE static void \ + ck_pr_fence_strict_##T(void) \ + { \ + __asm__ __volatile__("" ::: "memory"); \ + return; \ + } +#else +#define CK_PR_LOCK_PREFIX "lock " #define CK_PR_FENCE(T, I) \ CK_CC_INLINE static void \ ck_pr_fence_strict_##T(void) \ { \ __asm__ __volatile__(I ::: "memory"); \ + return; \ } +#endif /* CK_MD_UMP */ -CK_PR_FENCE(atomic, "sfence") -CK_PR_FENCE(atomic_store, "sfence") -CK_PR_FENCE(atomic_load, "mfence") -CK_PR_FENCE(store_atomic, "sfence") -CK_PR_FENCE(load_atomic, "mfence") -CK_PR_FENCE(load, "lfence") -CK_PR_FENCE(load_store, "mfence") -CK_PR_FENCE(store, "sfence") -CK_PR_FENCE(store_load, "mfence") -CK_PR_FENCE(memory, "mfence") -CK_PR_FENCE(release, "mfence") -CK_PR_FENCE(acquire, "mfence") -CK_PR_FENCE(acqrel, "mfence") -CK_PR_FENCE(lock, "mfence") -CK_PR_FENCE(unlock, "mfence") +#if defined(CK_MD_SSE_DISABLE) +/* If SSE is disabled, then use atomic operations for serialization. 
*/ +#define CK_MD_X86_MFENCE "lock addl $0, (%%esp)" +#define CK_MD_X86_SFENCE CK_MD_X86_MFENCE +#define CK_MD_X86_LFENCE CK_MD_X86_MFENCE +#else +#define CK_MD_X86_SFENCE "sfence" +#define CK_MD_X86_LFENCE "lfence" +#define CK_MD_X86_MFENCE "mfence" +#endif /* !CK_MD_SSE_DISABLE */ + +CK_PR_FENCE(atomic, "") +CK_PR_FENCE(atomic_store, "") +CK_PR_FENCE(atomic_load, "") +CK_PR_FENCE(store_atomic, "") +CK_PR_FENCE(load_atomic, "") +CK_PR_FENCE(load, CK_MD_X86_LFENCE) +CK_PR_FENCE(load_store, CK_MD_X86_MFENCE) +CK_PR_FENCE(store, CK_MD_X86_SFENCE) +CK_PR_FENCE(store_load, CK_MD_X86_MFENCE) +CK_PR_FENCE(memory, CK_MD_X86_MFENCE) +CK_PR_FENCE(release, CK_MD_X86_MFENCE) +CK_PR_FENCE(acquire, CK_MD_X86_MFENCE) +CK_PR_FENCE(acqrel, CK_MD_X86_MFENCE) +CK_PR_FENCE(lock, CK_MD_X86_MFENCE) +CK_PR_FENCE(unlock, CK_MD_X86_MFENCE) #undef CK_PR_FENCE diff --git a/include/gcc/x86_64/ck_pr.h b/include/gcc/x86_64/ck_pr.h index 532d593f277..4de1332910c 100644 --- a/include/gcc/x86_64/ck_pr.h +++ b/include/gcc/x86_64/ck_pr.h @@ -58,8 +58,8 @@ #endif /* - * Prevent speculative execution in busy-wait loops (P4 <=) - * or "predefined delay". + * Prevent speculative execution in busy-wait loops (P4 <=) or "predefined + * delay". */ CK_CC_INLINE static void ck_pr_stall(void) @@ -75,18 +75,39 @@ ck_pr_stall(void) __asm__ __volatile__(I ::: "memory"); \ } -CK_PR_FENCE(atomic, "sfence") -CK_PR_FENCE(atomic_store, "sfence") -CK_PR_FENCE(atomic_load, "mfence") -CK_PR_FENCE(store_atomic, "sfence") -CK_PR_FENCE(load_atomic, "mfence") +/* Atomic operations are always serializing. */ +CK_PR_FENCE(atomic, "") +CK_PR_FENCE(atomic_store, "") +CK_PR_FENCE(atomic_load, "") +CK_PR_FENCE(store_atomic, "") +CK_PR_FENCE(load_atomic, "") + +/* Traditional fence interface. */ CK_PR_FENCE(load, "lfence") CK_PR_FENCE(load_store, "mfence") CK_PR_FENCE(store, "sfence") CK_PR_FENCE(store_load, "mfence") CK_PR_FENCE(memory, "mfence") + +/* Below are stdatomic-style fences. 
*/ + +/* + * Provides load-store and store-store ordering. However, Intel specifies that + * the WC memory model is relaxed. It is likely an sfence *is* sufficient (in + * particular, stores are not re-ordered with respect to prior loads and it is + * really just the stores that are subject to re-ordering). However, we take + * the conservative route as the manuals are too ambiguous for my taste. + */ CK_PR_FENCE(release, "mfence") + +/* + * Provides load-load and load-store ordering. The lfence instruction ensures + * all prior load operations are complete before any subsequent instructions + * actually begin execution. However, the manual also ends up going to describe + * WC memory as a relaxed model. + */ CK_PR_FENCE(acquire, "mfence") + CK_PR_FENCE(acqrel, "mfence") CK_PR_FENCE(lock, "mfence") CK_PR_FENCE(unlock, "mfence") diff --git a/include/spinlock/dec.h b/include/spinlock/dec.h index 11d36dded02..3e36bf7612a 100644 --- a/include/spinlock/dec.h +++ b/include/spinlock/dec.h @@ -111,7 +111,8 @@ ck_spinlock_dec_lock_eb(struct ck_spinlock_dec *lock) if (r == true) break; - ck_backoff_eb(&backoff); + while (ck_pr_load_uint(&lock->value) != 1) + ck_backoff_eb(&backoff); } ck_pr_fence_lock(); diff --git a/src/ck_hs.c b/src/ck_hs.c index 31510ec51dd..a7e15eaddbe 100644 --- a/src/ck_hs.c +++ b/src/ck_hs.c @@ -105,21 +105,10 @@ ck_hs_map_signal(struct ck_hs_map *map, unsigned long h) return; } -void -ck_hs_iterator_init(struct ck_hs_iterator *iterator) +static bool +_ck_hs_next(struct ck_hs *hs, struct ck_hs_map *map, struct ck_hs_iterator *i, void **key) { - - iterator->cursor = NULL; - iterator->offset = 0; - return; -} - -bool -ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key) -{ - struct ck_hs_map *map = hs->map; void *value; - if (i->offset >= map->capacity) return false; @@ -129,6 +118,8 @@ ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key) #ifdef CK_HS_PP if (hs->mode & CK_HS_MODE_OBJECT) value = CK_HS_VMA(value); +#else + 
(void)hs; /* Avoid unused parameter warning. */ #endif i->offset++; *key = value; @@ -139,6 +130,32 @@ ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key) return false; } +void +ck_hs_iterator_init(struct ck_hs_iterator *iterator) +{ + + iterator->cursor = NULL; + iterator->offset = 0; + iterator->map = NULL; + return; +} + +bool +ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key) +{ + return _ck_hs_next(hs, hs->map, i, key); +} + +bool +ck_hs_next_spmc(struct ck_hs *hs, struct ck_hs_iterator *i, void **key) +{ + struct ck_hs_map *m = i->map; + if (m == NULL) { + m = i->map = ck_pr_load_ptr(&hs->map); + } + return _ck_hs_next(hs, m, i, key); +} + void ck_hs_stat(struct ck_hs *hs, struct ck_hs_stat *st) { @@ -206,7 +223,7 @@ ck_hs_map_create(struct ck_hs *hs, unsigned long entries) map->probe_limit = (unsigned int)limit; map->probe_maximum = 0; map->capacity = n_entries; - map->step = ck_internal_bsf(n_entries); + map->step = ck_cc_ffsl(n_entries); map->mask = n_entries - 1; map->n_entries = 0; diff --git a/src/ck_ht.c b/src/ck_ht.c index 2c864c5714c..48b04c9678d 100644 --- a/src/ck_ht.c +++ b/src/ck_ht.c @@ -171,7 +171,7 @@ ck_ht_map_create(struct ck_ht *table, CK_HT_TYPE entries) map->deletions = 0; map->probe_maximum = 0; map->capacity = n_entries; - map->step = ck_internal_bsf_64(map->capacity); + map->step = ck_cc_ffsll(map->capacity); map->mask = map->capacity - 1; map->n_entries = 0; map->entries = (struct ck_ht_entry *)(((uintptr_t)&map[1] + prefix + diff --git a/src/ck_ht_hash.h b/src/ck_ht_hash.h index cd3d7a538bd..a47dc406249 100644 --- a/src/ck_ht_hash.h +++ b/src/ck_ht_hash.h @@ -88,7 +88,15 @@ static inline uint64_t rotl64 ( uint64_t x, int8_t r ) FORCE_INLINE static uint32_t getblock ( const uint32_t * p, int i ) { +#ifdef __s390x__ + uint32_t res; + + __asm__ (" lrv %0,%1\n" + : "=r" (res) : "Q" (p[i]) : "cc", "mem"); + return res; +#else return p[i]; +#endif /* !__s390x__ */ } 
//----------------------------------------------------------------------------- @@ -147,7 +155,9 @@ static inline void MurmurHash3_x86_32 ( const void * key, int len, switch(len & 3) { case 3: k1 ^= tail[2] << 16; + /* fall through */ case 2: k1 ^= tail[1] << 8; + /* fall through */ case 1: k1 ^= tail[0]; k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; }; @@ -196,11 +206,17 @@ static inline uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed switch(len & 7) { case 7: h ^= (uint64_t)(data2[6]) << 48; + /* fall through */ case 6: h ^= (uint64_t)(data2[5]) << 40; + /* fall through */ case 5: h ^= (uint64_t)(data2[4]) << 32; + /* fall through */ case 4: h ^= (uint64_t)(data2[3]) << 24; + /* fall through */ case 3: h ^= (uint64_t)(data2[2]) << 16; + /* fall through */ case 2: h ^= (uint64_t)(data2[1]) << 8; + /* fall through */ case 1: h ^= (uint64_t)(data2[0]); h *= m; }; @@ -249,7 +265,9 @@ static inline uint64_t MurmurHash64B ( const void * key, int len, uint64_t seed switch(len) { case 3: h2 ^= ((const unsigned char*)data)[2] << 16; + /* fall through */ case 2: h2 ^= ((const unsigned char*)data)[1] << 8; + /* fall through */ case 1: h2 ^= ((const unsigned char*)data)[0]; h2 *= m; }; diff --git a/src/ck_internal.h b/src/ck_internal.h index 7aad3d743ce..1bca36a13c7 100644 --- a/src/ck_internal.h +++ b/src/ck_internal.h @@ -80,40 +80,3 @@ ck_internal_max_32(uint32_t x, uint32_t y) return x ^ ((x ^ y) & -(x < y)); } - -CK_CC_INLINE static unsigned long -ck_internal_bsf(unsigned long v) -{ -#if defined(__GNUC__) - return __builtin_ffs(v); -#else - unsigned int i; - const unsigned int s = sizeof(unsigned long) * 8 - 1; - - for (i = 0; i < s; i++) { - if (v & (1UL << (s - i))) - return sizeof(unsigned long) * 8 - i; - } - - return 1; -#endif /* !__GNUC__ */ -} - -CK_CC_INLINE static uint64_t -ck_internal_bsf_64(uint64_t v) -{ -#if defined(__GNUC__) - return __builtin_ffs(v); -#else - unsigned int i; - const unsigned int s = sizeof(unsigned long) * 8 - 1; - 
- for (i = 0; i < s; i++) { - if (v & (1ULL << (63U - i))) - return i; - } -#endif /* !__GNUC__ */ - - return 1; -} - diff --git a/src/ck_rhs.c b/src/ck_rhs.c index f6dd2ee29e9..1d6b0f0d904 100644 --- a/src/ck_rhs.c +++ b/src/ck_rhs.c @@ -366,7 +366,7 @@ ck_rhs_map_create(struct ck_rhs *hs, unsigned long entries) map->probe_limit = (unsigned int)limit; map->probe_maximum = 0; map->capacity = n_entries; - map->step = ck_internal_bsf(n_entries); + map->step = ck_cc_ffsl(n_entries); map->mask = n_entries - 1; map->n_entries = 0;