/*
 * include/haproxy/stick_table.h
 * Functions for stick tables management.
 *
 * Copyright (C) 2009-2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr>
 * Copyright (C) 2010 Willy Tarreau <w@1wt.eu>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation, version 2.1
 * exclusively.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
2020-06-04 12:46:44 -04:00
|
|
|
#ifndef _HAPROXY_STICK_TABLE_H
|
|
|
|
|
#define _HAPROXY_STICK_TABLE_H
|
2010-01-04 09:23:48 -05:00
|
|
|
|
2020-06-04 12:46:44 -04:00
|
|
|
#include <haproxy/api.h>
|
2020-06-03 12:23:19 -04:00
|
|
|
#include <haproxy/dict-t.h>
|
2020-05-27 10:10:29 -04:00
|
|
|
#include <haproxy/errors.h>
|
2021-05-08 08:08:38 -04:00
|
|
|
#include <haproxy/freq_ctr.h>
|
2020-06-04 12:46:44 -04:00
|
|
|
#include <haproxy/sample-t.h>
|
|
|
|
|
#include <haproxy/stick_table-t.h>
|
2024-02-14 02:41:11 -05:00
|
|
|
#include <haproxy/thread.h>
|
2020-06-02 12:15:32 -04:00
|
|
|
#include <haproxy/ticks.h>
|
MAJOR: stktable: split the keys across multiple shards to reduce contention
In order to reduce the contention on the table when keys expire quickly,
we're spreading the load over multiple trees. That counts for keys and
expiration dates. The shard number is calculated from the key value
itself, both when looking up and when setting it.
The "show table" dump on the CLI iterates over all shards so that the
output is not fully sorted, it's only sorted within each shard. The Lua
table dump just does the same. It was verified with a Lua program to
count stick-table entries that it works as intended (the test case is
reproduced here as it's clearly not easy to automate as a vtc):
function dump_stk()
local dmp = core.proxies['tbl'].stktable:dump({});
local count = 0
for _, __ in pairs(dmp) do
count = count + 1
end
core.Info('Total entries: ' .. count)
end
core.register_action("dump_stk", {'tcp-req', 'http-req'}, dump_stk, 0);
##
global
tune.lua.log.stderr on
lua-load-per-thread lua-cnttbl.lua
listen front
bind :8001
http-request lua.dump_stk if { path_beg /stk }
http-request track-sc1 rand(),upper,hex table tbl
http-request redirect location /
backend tbl
stick-table size 100k type string len 12 store http_req_cnt
##
$ h2load -c 16 -n 10000 0:8001/
$ curl 0:8001/stk
## A count close to 100k appears on haproxy's stderr
## On the CLI, "show table tbl" | wc will show the same.
Some large parts were reindented only to add a top-level loop to iterate
over shards (e.g. process_table_expire()). Better check the diff using
git show -b.
The number of shards is decided just like for the pools, at build time
based on the max number of threads, so that we can keep a constant. Maybe
this should be done differently. For now CONFIG_HAP_TBL_BUCKETS is used,
and defaults to CONFIG_HAP_POOL_BUCKETS to keep the benefits of all the
measurements made for the pools. It turns out that this value seems to
be the most reasonable one without inflating the struct stktable too
much. By default for 1024 threads the value is 32 and delivers 980k RPS
in a test involving 80 threads, while adding 1kB to the struct stktable
(roughly doubling it). The same test at 64 gives 1008 kRPS and at 128
it gives 1040 kRPS for 8 times the initial size. 16 would be too low
however, with 675k RPS.
The stksess already have a shard number, it's the one used to decide which
peer connection to send the entry. Maybe we should also store the one
associated with the entry itself instead of recalculating it, though it
does not happen that often. The operation is done by hashing the key using
XXH32().
The peers also take and release the table's lock but the way it's used
it not very clear yet, so at this point it's sure this will not work.
At this point, this allowed to completely unlock the performance on a
80-thread setup:
before: 5.4 Gbps, 150k RPS, 80 cores
52.71% haproxy [.] stktable_lookup_key
36.90% haproxy [.] stktable_get_entry.part.0
0.86% haproxy [.] ebmb_lookup
0.18% haproxy [.] process_stream
0.12% haproxy [.] process_table_expire
0.11% haproxy [.] fwrr_get_next_server
0.10% haproxy [.] eb32_insert
0.10% haproxy [.] run_tasks_from_lists
after: 36 Gbps, 980k RPS, 80 cores
44.92% haproxy [.] stktable_get_entry
5.47% haproxy [.] ebmb_lookup
2.50% haproxy [.] fwrr_get_next_server
0.97% haproxy [.] eb32_insert
0.92% haproxy [.] process_stream
0.52% haproxy [.] run_tasks_from_lists
0.45% haproxy [.] conn_backend_get
0.44% haproxy [.] __pool_alloc
0.35% haproxy [.] process_table_expire
0.35% haproxy [.] connect_server
0.35% haproxy [.] h1_headers_to_hdr_list
0.34% haproxy [.] eb_delete
0.31% haproxy [.] srv_add_to_idle_list
0.30% haproxy [.] h1_snd_buf
WIP: uint64_t -> long
WIP: ulong -> uint
code is much smaller
2024-03-04 11:09:28 -05:00
|
|
|
#include <haproxy/xxhash.h>
|
2010-01-04 09:23:48 -05:00
|
|
|
|
2019-03-14 02:07:41 -04:00
|
|
|
extern struct stktable *stktables_list;
|
2023-01-06 10:09:58 -05:00
|
|
|
extern struct pool_head *pool_head_stk_ctr;
|
2020-06-04 12:46:44 -04:00
|
|
|
extern struct stktable_type stktable_types[];
|
2019-03-14 02:07:41 -04:00
|
|
|
|
2010-06-06 10:06:52 -04:00
|
|
|
#define stktable_data_size(type) (sizeof(((union stktable_data*)0)->type))
|
|
|
|
|
#define stktable_data_cast(ptr, type) ((union stktable_data*)(ptr))->type
|
|
|
|
|
|
2019-03-14 02:07:41 -04:00
|
|
|
void stktable_store_name(struct stktable *t);
|
|
|
|
|
struct stktable *stktable_find_by_name(const char *name);
|
2010-01-04 09:23:48 -05:00
|
|
|
struct stksess *stksess_new(struct stktable *t, struct stktable_key *key);
|
2010-06-06 06:11:37 -04:00
|
|
|
void stksess_setkey(struct stktable *t, struct stksess *ts, struct stktable_key *key);
|
2010-01-04 09:23:48 -05:00
|
|
|
void stksess_free(struct stktable *t, struct stksess *ts);
|
2017-06-13 13:37:32 -04:00
|
|
|
int stksess_kill(struct stktable *t, struct stksess *ts, int decrefcount);
|
2022-11-29 11:36:44 -05:00
|
|
|
int stktable_get_key_shard(struct stktable *t, const void *key, size_t len);
|
2010-01-04 09:23:48 -05:00
|
|
|
|
2023-11-02 13:34:51 -04:00
|
|
|
int stktable_init(struct stktable *t, char **err_msg);
|
2023-11-16 10:17:12 -05:00
|
|
|
void stktable_deinit(struct stktable *t);
|
2023-04-13 08:33:52 -04:00
|
|
|
int stktable_parse_type(char **args, int *idx, unsigned long *type, size_t *key_size, const char *file, int linenum);
|
2019-03-08 08:47:00 -05:00
|
|
|
int parse_stick_table(const char *file, int linenum, char **args,
|
2019-03-20 10:06:55 -04:00
|
|
|
struct stktable *t, char *id, char *nid, struct peers *peers);
|
2010-06-14 15:04:55 -04:00
|
|
|
struct stksess *stktable_get_entry(struct stktable *table, struct stktable_key *key);
|
2017-06-13 13:37:32 -04:00
|
|
|
struct stksess *stktable_set_entry(struct stktable *table, struct stksess *nts);
|
2022-10-12 06:00:50 -04:00
|
|
|
void stktable_requeue_exp(struct stktable *t, const struct stksess *ts);
|
2022-10-11 14:17:58 -04:00
|
|
|
void stktable_touch_with_exp(struct stktable *t, struct stksess *ts, int decrefcount, int expire, int decrefcnt);
|
2017-06-13 13:37:32 -04:00
|
|
|
void stktable_touch_remote(struct stktable *t, struct stksess *ts, int decrefcnt);
|
|
|
|
|
void stktable_touch_local(struct stktable *t, struct stksess *ts, int decrefccount);
|
2010-06-06 09:38:59 -04:00
|
|
|
struct stksess *stktable_lookup(struct stktable *t, struct stksess *ts);
|
|
|
|
|
struct stksess *stktable_lookup_key(struct stktable *t, struct stktable_key *key);
|
2010-06-20 06:27:21 -04:00
|
|
|
struct stksess *stktable_update_key(struct stktable *table, struct stktable_key *key);
|
2014-07-03 11:02:46 -04:00
|
|
|
struct stktable_key *smp_to_stkey(struct sample *smp, struct stktable *t);
|
2015-04-03 19:47:55 -04:00
|
|
|
struct stktable_key *stktable_fetch_key(struct stktable *t, struct proxy *px, struct session *sess,
|
2015-04-03 18:52:09 -04:00
|
|
|
struct stream *strm, unsigned int opt,
|
|
|
|
|
struct sample_expr *expr, struct sample *smp);
|
2017-06-13 13:37:32 -04:00
|
|
|
struct stkctr *smp_fetch_sc_stkctr(struct session *sess, struct stream *strm, const struct arg *args, const char *kw, struct stkctr *stkctr);
|
|
|
|
|
struct stkctr *smp_create_src_stkctr(struct session *sess, struct stream *strm, const struct arg *args, const char *kw, struct stkctr *stkctr);
|
2012-04-27 15:37:17 -04:00
|
|
|
int stktable_compatible_sample(struct sample_expr *expr, unsigned long table_type);
|
2014-07-15 10:44:27 -04:00
|
|
|
int stktable_register_data_store(int idx, const char *name, int std_type, int arg_type);
|
2010-06-06 07:34:54 -04:00
|
|
|
int stktable_get_data_type(char *name);
|
2013-09-04 11:54:01 -04:00
|
|
|
int stktable_trash_oldest(struct stktable *t, int to_batch);
|
2017-06-13 13:37:32 -04:00
|
|
|
int __stksess_kill(struct stktable *t, struct stksess *ts);
|
2010-01-04 09:23:48 -05:00
|
|
|
|
2021-05-08 08:12:47 -04:00
|
|
|
/************************* Composite address manipulation *********************
|
|
|
|
|
* Composite addresses are simply unsigned long data in which the higher bits
|
|
|
|
|
* represent a pointer, and the two lower bits are flags. There are several
|
|
|
|
|
* places where we just want to associate one or two flags to a pointer (eg,
|
|
|
|
|
* to type it), and these functions permit this. The pointer is necessarily a
|
|
|
|
|
* 32-bit aligned pointer, as its two lower bits will be cleared and replaced
|
|
|
|
|
* with the flags.
|
|
|
|
|
*****************************************************************************/
|
|
|
|
|
|
|
|
|
|
/* Strips the two low-order flag bits from composite address <caddr> and
 * returns the remaining value as the original (32-bit aligned) pointer.
 */
static inline void *caddr_to_ptr(unsigned long caddr)
{
	unsigned long addr_bits = caddr & ~3UL;

	return (void *)addr_bits;
}
|
|
|
|
|
|
|
|
|
|
/* Extracts only the two low-order flag bits carried by composite address
 * <caddr>, ignoring the pointer part. The result is in the range 0..3.
 */
static inline unsigned int caddr_to_data(unsigned long caddr)
{
	return (unsigned int)(caddr & 3UL);
}
|
|
|
|
|
|
|
|
|
|
/* Builds a composite address from 32-bit aligned pointer <ptr> and the two
 * low bits of <data>: the pointer's two low bits are cleared, then the flag
 * bits are merged in. The original pointer is recovered with caddr_to_ptr()
 * and the flags with caddr_to_data().
 */
static inline unsigned long caddr_from_ptr(void *ptr, unsigned int data)
{
	unsigned long base = (unsigned long)ptr & ~3UL;

	/* the masked base has its two low bits clear, so OR-ing the
	 * two flag bits is equivalent to adding them.
	 */
	return base | (data & 3);
}
|
|
|
|
|
|
|
|
|
|
/* Returns composite address <caddr> with the two low bits of <data> set. */
static inline unsigned long caddr_set_flags(unsigned long caddr, unsigned int data)
{
	return caddr | (unsigned long)(data & 3);
}
|
|
|
|
|
|
|
|
|
|
/* Returns composite address <caddr> with the two low bits of <data> cleared. */
static inline unsigned long caddr_clr_flags(unsigned long caddr, unsigned int data)
{
	unsigned long mask = data & 3;

	return caddr & ~mask;
}
|
|
|
|
|
|
|
|
|
|
|
2010-07-18 02:04:30 -04:00
|
|
|
/* return allocation size for standard data type <type> */
|
|
|
|
|
static inline int stktable_type_size(int type)
|
|
|
|
|
{
|
|
|
|
|
switch(type) {
|
|
|
|
|
case STD_T_SINT:
|
|
|
|
|
case STD_T_UINT:
|
|
|
|
|
return sizeof(int);
|
|
|
|
|
case STD_T_ULL:
|
|
|
|
|
return sizeof(unsigned long long);
|
|
|
|
|
case STD_T_FRQP:
|
2021-04-10 17:00:53 -04:00
|
|
|
return sizeof(struct freq_ctr);
|
2019-05-17 04:08:29 -04:00
|
|
|
case STD_T_DICT:
|
|
|
|
|
return sizeof(struct dict_entry *);
|
2010-07-18 02:04:30 -04:00
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2021-06-30 12:01:02 -04:00
|
|
|
int stktable_alloc_data_type(struct stktable *t, int type, const char *sa, const char *sa2);
|
2010-01-04 09:23:48 -05:00
|
|
|
|
2018-09-20 05:06:33 -04:00
|
|
|
/* return pointer for data type <type> in sticky session <ts> of table <t>, all
|
|
|
|
|
* of which must exist (otherwise use stktable_data_ptr() if unsure).
|
|
|
|
|
*/
|
|
|
|
|
static inline void *__stktable_data_ptr(struct stktable *t, struct stksess *ts, int type)
|
|
|
|
|
{
|
|
|
|
|
return (void *)ts + t->data_ofs[type];
|
|
|
|
|
}
|
|
|
|
|
|
2010-06-06 10:06:52 -04:00
|
|
|
/* return pointer for data type <type> in sticky session <ts> of table <t>, or
|
|
|
|
|
* NULL if either <ts> is NULL or the type is not stored.
|
|
|
|
|
*/
|
|
|
|
|
static inline void *stktable_data_ptr(struct stktable *t, struct stksess *ts, int type)
|
|
|
|
|
{
|
|
|
|
|
if (type >= STKTABLE_DATA_TYPES)
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
if (!t->data_ofs[type]) /* type not stored */
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
if (!ts)
|
|
|
|
|
return NULL;
|
|
|
|
|
|
2018-09-20 05:06:33 -04:00
|
|
|
return __stktable_data_ptr(t, ts, type);
|
2010-06-06 10:06:52 -04:00
|
|
|
}
|
|
|
|
|
|
2021-06-30 12:01:02 -04:00
|
|
|
/* return pointer on the element of index <idx> from the array data type <type>
|
|
|
|
|
* in sticky session <ts> of table <t>, or NULL if either <ts> is NULL
|
|
|
|
|
* or this element is not stored because this type is not stored or
|
|
|
|
|
* requested index is greater than the number of elements of the array.
|
|
|
|
|
* Note: this function is also usable on non array types, they are
|
|
|
|
|
* considered as array of size 1, so a call with <idx> at 0
|
|
|
|
|
* as the same behavior than 'stktable_data_ptr'.
|
|
|
|
|
*/
|
|
|
|
|
static inline void *stktable_data_ptr_idx(struct stktable *t, struct stksess *ts, int type, unsigned int idx)
|
|
|
|
|
{
|
|
|
|
|
if (type >= STKTABLE_DATA_TYPES)
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
if (!t->data_ofs[type]) /* type not stored */
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
if (!ts)
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
if (t->data_nbelem[type] <= idx)
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
return __stktable_data_ptr(t, ts, type) + idx*stktable_type_size(stktable_data_types[type].std_type);
|
|
|
|
|
}
|
|
|
|
|
|
MAJOR: stktable: split the keys across multiple shards to reduce contention
In order to reduce the contention on the table when keys expire quickly,
we're spreading the load over multiple trees. That counts for keys and
expiration dates. The shard number is calculated from the key value
itself, both when looking up and when setting it.
The "show table" dump on the CLI iterates over all shards so that the
output is not fully sorted, it's only sorted within each shard. The Lua
table dump just does the same. It was verified with a Lua program to
count stick-table entries that it works as intended (the test case is
reproduced here as it's clearly not easy to automate as a vtc):
function dump_stk()
local dmp = core.proxies['tbl'].stktable:dump({});
local count = 0
for _, __ in pairs(dmp) do
count = count + 1
end
core.Info('Total entries: ' .. count)
end
core.register_action("dump_stk", {'tcp-req', 'http-req'}, dump_stk, 0);
##
global
tune.lua.log.stderr on
lua-load-per-thread lua-cnttbl.lua
listen front
bind :8001
http-request lua.dump_stk if { path_beg /stk }
http-request track-sc1 rand(),upper,hex table tbl
http-request redirect location /
backend tbl
stick-table size 100k type string len 12 store http_req_cnt
##
$ h2load -c 16 -n 10000 0:8001/
$ curl 0:8001/stk
## A count close to 100k appears on haproxy's stderr
## On the CLI, "show table tbl" | wc will show the same.
Some large parts were reindented only to add a top-level loop to iterate
over shards (e.g. process_table_expire()). Better check the diff using
git show -b.
The number of shards is decided just like for the pools, at build time
based on the max number of threads, so that we can keep a constant. Maybe
this should be done differently. For now CONFIG_HAP_TBL_BUCKETS is used,
and defaults to CONFIG_HAP_POOL_BUCKETS to keep the benefits of all the
measurements made for the pools. It turns out that this value seems to
be the most reasonable one without inflating the struct stktable too
much. By default for 1024 threads the value is 32 and delivers 980k RPS
in a test involving 80 threads, while adding 1kB to the struct stktable
(roughly doubling it). The same test at 64 gives 1008 kRPS and at 128
it gives 1040 kRPS for 8 times the initial size. 16 would be too low
however, with 675k RPS.
The stksess already have a shard number, it's the one used to decide which
peer connection to send the entry. Maybe we should also store the one
associated with the entry itself instead of recalculating it, though it
does not happen that often. The operation is done by hashing the key using
XXH32().
The peers also take and release the table's lock but the way it's used
it not very clear yet, so at this point it's sure this will not work.
At this point, this allowed to completely unlock the performance on a
80-thread setup:
before: 5.4 Gbps, 150k RPS, 80 cores
52.71% haproxy [.] stktable_lookup_key
36.90% haproxy [.] stktable_get_entry.part.0
0.86% haproxy [.] ebmb_lookup
0.18% haproxy [.] process_stream
0.12% haproxy [.] process_table_expire
0.11% haproxy [.] fwrr_get_next_server
0.10% haproxy [.] eb32_insert
0.10% haproxy [.] run_tasks_from_lists
after: 36 Gbps, 980k RPS, 80 cores
44.92% haproxy [.] stktable_get_entry
5.47% haproxy [.] ebmb_lookup
2.50% haproxy [.] fwrr_get_next_server
0.97% haproxy [.] eb32_insert
0.92% haproxy [.] process_stream
0.52% haproxy [.] run_tasks_from_lists
0.45% haproxy [.] conn_backend_get
0.44% haproxy [.] __pool_alloc
0.35% haproxy [.] process_table_expire
0.35% haproxy [.] connect_server
0.35% haproxy [.] h1_headers_to_hdr_list
0.34% haproxy [.] eb_delete
0.31% haproxy [.] srv_add_to_idle_list
0.30% haproxy [.] h1_snd_buf
WIP: uint64_t -> long
WIP: ulong -> uint
code is much smaller
2024-03-04 11:09:28 -05:00
|
|
|
/* return a shard number for key <key> of len <len> present in table <t>, for
 * use with the tree indexing. The value will be from 0 to
 * CONFIG_HAP_TBL_BUCKETS-1.
 */
static inline uint stktable_calc_shard_num(const struct stktable *t, const void *key, size_t len)
{
#if CONFIG_HAP_TBL_BUCKETS > 1
	/* spread keys across buckets by hashing them with the per-table seed */
	return XXH32(key, len, t->hash_seed) % CONFIG_HAP_TBL_BUCKETS;
#else
	/* single bucket: every key maps to shard 0 */
	return 0;
#endif
}
|
|
|
|
|
|
2010-08-03 14:34:06 -04:00
|
|
|
/* kill an entry if it's expired and its ref_cnt is zero */
|
2017-06-13 13:37:32 -04:00
|
|
|
static inline int __stksess_kill_if_expired(struct stktable *t, struct stksess *ts)
|
2010-08-03 14:34:06 -04:00
|
|
|
{
|
2010-09-23 12:11:05 -04:00
|
|
|
if (t->expire != TICK_ETERNITY && tick_is_expired(ts->expire, now_ms))
|
2017-06-13 13:37:32 -04:00
|
|
|
return __stksess_kill(t, ts);
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2017-10-31 10:45:42 -04:00
|
|
|
static inline void stksess_kill_if_expired(struct stktable *t, struct stksess *ts, int decrefcnt)
|
2017-06-13 13:37:32 -04:00
|
|
|
{
|
MAJOR: stktable: split the keys across multiple shards to reduce contention
In order to reduce the contention on the table when keys expire quickly,
we're spreading the load over multiple trees. That counts for keys and
expiration dates. The shard number is calculated from the key value
itself, both when looking up and when setting it.
The "show table" dump on the CLI iterates over all shards so that the
output is not fully sorted, it's only sorted within each shard. The Lua
table dump just does the same. It was verified with a Lua program to
count stick-table entries that it works as intended (the test case is
reproduced here as it's clearly not easy to automate as a vtc):
function dump_stk()
local dmp = core.proxies['tbl'].stktable:dump({});
local count = 0
for _, __ in pairs(dmp) do
count = count + 1
end
core.Info('Total entries: ' .. count)
end
core.register_action("dump_stk", {'tcp-req', 'http-req'}, dump_stk, 0);
##
global
tune.lua.log.stderr on
lua-load-per-thread lua-cnttbl.lua
listen front
bind :8001
http-request lua.dump_stk if { path_beg /stk }
http-request track-sc1 rand(),upper,hex table tbl
http-request redirect location /
backend tbl
stick-table size 100k type string len 12 store http_req_cnt
##
$ h2load -c 16 -n 10000 0:8001/
$ curl 0:8001/stk
## A count close to 100k appears on haproxy's stderr
## On the CLI, "show table tbl" | wc will show the same.
Some large parts were reindented only to add a top-level loop to iterate
over shards (e.g. process_table_expire()). Better check the diff using
git show -b.
The number of shards is decided just like for the pools, at build time
based on the max number of threads, so that we can keep a constant. Maybe
this should be done differently. For now CONFIG_HAP_TBL_BUCKETS is used,
and defaults to CONFIG_HAP_POOL_BUCKETS to keep the benefits of all the
measurements made for the pools. It turns out that this value seems to
be the most reasonable one without inflating the struct stktable too
much. By default for 1024 threads the value is 32 and delivers 980k RPS
in a test involving 80 threads, while adding 1kB to the struct stktable
(roughly doubling it). The same test at 64 gives 1008 kRPS and at 128
it gives 1040 kRPS for 8 times the initial size. 16 would be too low
however, with 675k RPS.
The stksess already have a shard number, it's the one used to decide which
peer connection to send the entry. Maybe we should also store the one
associated with the entry itself instead of recalculating it, though it
does not happen that often. The operation is done by hashing the key using
XXH32().
The peers also take and release the table's lock but the way it's used
it not very clear yet, so at this point it's sure this will not work.
At this point, this allowed to completely unlock the performance on a
80-thread setup:
before: 5.4 Gbps, 150k RPS, 80 cores
52.71% haproxy [.] stktable_lookup_key
36.90% haproxy [.] stktable_get_entry.part.0
0.86% haproxy [.] ebmb_lookup
0.18% haproxy [.] process_stream
0.12% haproxy [.] process_table_expire
0.11% haproxy [.] fwrr_get_next_server
0.10% haproxy [.] eb32_insert
0.10% haproxy [.] run_tasks_from_lists
after: 36 Gbps, 980k RPS, 80 cores
44.92% haproxy [.] stktable_get_entry
5.47% haproxy [.] ebmb_lookup
2.50% haproxy [.] fwrr_get_next_server
0.97% haproxy [.] eb32_insert
0.92% haproxy [.] process_stream
0.52% haproxy [.] run_tasks_from_lists
0.45% haproxy [.] conn_backend_get
0.44% haproxy [.] __pool_alloc
0.35% haproxy [.] process_table_expire
0.35% haproxy [.] connect_server
0.35% haproxy [.] h1_headers_to_hdr_list
0.34% haproxy [.] eb_delete
0.31% haproxy [.] srv_add_to_idle_list
0.30% haproxy [.] h1_snd_buf
WIP: uint64_t -> long
WIP: ulong -> uint
code is much smaller
2024-03-04 11:09:28 -05:00
|
|
|
uint shard;
|
|
|
|
|
size_t len;
|
2017-06-13 13:37:32 -04:00
|
|
|
|
MEDIUM: stick-table: change the ref_cnt atomically
Due to the ts->ref_cnt being manipulated and checked inside wrlocks,
we continue to have it updated under plenty of read locks, which have
an important cost on many-thread machines.
This patch turns them all to atomic ops and carefully moves them outside
of locks every time this is possible:
- the ref_cnt is incremented before write-unlocking on creation otherwise
the element could vanish before we can do it
- the ref_cnt is decremented after write-locking on release
- for all other cases it's updated out of locks since it's guaranteed by
the sequence that it cannot vanish
- checks are done before locking every time it's used to decide
whether we're going to release the element (saves several write locks)
- expiration tests are just done using atomic loads, since there's no
particular ordering constraint there, we just want consistent values.
For Lua, the loop that is used to dump stick-tables could switch to read
locks only, but this was not done.
For peers, the loop that builds updates in peer_send_teachmsgs is extremely
expensive in write locks and it doesn't seem this is really needed since
the only updated variables are last_pushed and commitupdate, the first
one being on the shared table (thus not used by other threads) and the
commitupdate could likely be changed using a CAS. Thus all of this could
theoretically move under a read lock, but that was not done here.
On a 80-thread machine with a peers section enabled, the request rate
increased from 415 to 520k rps.
2023-05-27 12:55:48 -04:00
|
|
|
if (decrefcnt && HA_ATOMIC_SUB_FETCH(&ts->ref_cnt, 1) != 0)
|
|
|
|
|
return;
|
|
|
|
|
|
2022-10-11 14:50:22 -04:00
|
|
|
if (t->expire != TICK_ETERNITY && tick_is_expired(ts->expire, now_ms)) {
|
MAJOR: stktable: split the keys across multiple shards to reduce contention
In order to reduce the contention on the table when keys expire quickly,
we're spreading the load over multiple trees. That counts for keys and
expiration dates. The shard number is calculated from the key value
itself, both when looking up and when setting it.
The "show table" dump on the CLI iterates over all shards so that the
output is not fully sorted, it's only sorted within each shard. The Lua
table dump just does the same. It was verified with a Lua program to
count stick-table entries that it works as intended (the test case is
reproduced here as it's clearly not easy to automate as a vtc):
function dump_stk()
local dmp = core.proxies['tbl'].stktable:dump({});
local count = 0
for _, __ in pairs(dmp) do
count = count + 1
end
core.Info('Total entries: ' .. count)
end
core.register_action("dump_stk", {'tcp-req', 'http-req'}, dump_stk, 0);
##
global
tune.lua.log.stderr on
lua-load-per-thread lua-cnttbl.lua
listen front
bind :8001
http-request lua.dump_stk if { path_beg /stk }
http-request track-sc1 rand(),upper,hex table tbl
http-request redirect location /
backend tbl
stick-table size 100k type string len 12 store http_req_cnt
##
$ h2load -c 16 -n 10000 0:8001/
$ curl 0:8001/stk
## A count close to 100k appears on haproxy's stderr
## On the CLI, "show table tbl" | wc will show the same.
Some large parts were reindented only to add a top-level loop to iterate
over shards (e.g. process_table_expire()). Better check the diff using
git show -b.
The number of shards is decided just like for the pools, at build time
based on the max number of threads, so that we can keep a constant. Maybe
this should be done differently. For now CONFIG_HAP_TBL_BUCKETS is used,
and defaults to CONFIG_HAP_POOL_BUCKETS to keep the benefits of all the
measurements made for the pools. It turns out that this value seems to
be the most reasonable one without inflating the struct stktable too
much. By default for 1024 threads the value is 32 and delivers 980k RPS
in a test involving 80 threads, while adding 1kB to the struct stktable
(roughly doubling it). The same test at 64 gives 1008 kRPS and at 128
it gives 1040 kRPS for 8 times the initial size. 16 would be too low
however, with 675k RPS.
The stksess already have a shard number, it's the one used to decide which
peer connection to send the entry. Maybe we should also store the one
associated with the entry itself instead of recalculating it, though it
does not happen that often. The operation is done by hashing the key using
XXH32().
The peers also take and release the table's lock but the way it's used
is not very clear yet, so at this point it's sure this will not work.
At this point, this allowed to completely unlock the performance on a
80-thread setup:
before: 5.4 Gbps, 150k RPS, 80 cores
52.71% haproxy [.] stktable_lookup_key
36.90% haproxy [.] stktable_get_entry.part.0
0.86% haproxy [.] ebmb_lookup
0.18% haproxy [.] process_stream
0.12% haproxy [.] process_table_expire
0.11% haproxy [.] fwrr_get_next_server
0.10% haproxy [.] eb32_insert
0.10% haproxy [.] run_tasks_from_lists
after: 36 Gbps, 980k RPS, 80 cores
44.92% haproxy [.] stktable_get_entry
5.47% haproxy [.] ebmb_lookup
2.50% haproxy [.] fwrr_get_next_server
0.97% haproxy [.] eb32_insert
0.92% haproxy [.] process_stream
0.52% haproxy [.] run_tasks_from_lists
0.45% haproxy [.] conn_backend_get
0.44% haproxy [.] __pool_alloc
0.35% haproxy [.] process_table_expire
0.35% haproxy [.] connect_server
0.35% haproxy [.] h1_headers_to_hdr_list
0.34% haproxy [.] eb_delete
0.31% haproxy [.] srv_add_to_idle_list
0.30% haproxy [.] h1_snd_buf
WIP: uint64_t -> long
WIP: ulong -> uint
code is much smaller
2024-03-04 11:09:28 -05:00
|
|
|
if (t->type == SMP_T_STR)
|
|
|
|
|
len = strlen((const char *)ts->key.key);
|
|
|
|
|
else
|
|
|
|
|
len = t->key_size;
|
|
|
|
|
|
|
|
|
|
shard = stktable_calc_shard_num(t, ts->key.key, len);
|
|
|
|
|
|
2024-04-24 02:19:20 -04:00
|
|
|
/* make the compiler happy when shard is not used without threads */
|
|
|
|
|
ALREADY_CHECKED(shard);
|
|
|
|
|
|
MAJOR: stktable: split the keys across multiple shards to reduce contention
In order to reduce the contention on the table when keys expire quickly,
we're spreading the load over multiple trees. That counts for keys and
expiration dates. The shard number is calculated from the key value
itself, both when looking up and when setting it.
The "show table" dump on the CLI iterates over all shards so that the
output is not fully sorted, it's only sorted within each shard. The Lua
table dump just does the same. It was verified with a Lua program to
count stick-table entries that it works as intended (the test case is
reproduced here as it's clearly not easy to automate as a vtc):
function dump_stk()
local dmp = core.proxies['tbl'].stktable:dump({});
local count = 0
for _, __ in pairs(dmp) do
count = count + 1
end
core.Info('Total entries: ' .. count)
end
core.register_action("dump_stk", {'tcp-req', 'http-req'}, dump_stk, 0);
##
global
tune.lua.log.stderr on
lua-load-per-thread lua-cnttbl.lua
listen front
bind :8001
http-request lua.dump_stk if { path_beg /stk }
http-request track-sc1 rand(),upper,hex table tbl
http-request redirect location /
backend tbl
stick-table size 100k type string len 12 store http_req_cnt
##
$ h2load -c 16 -n 10000 0:8001/
$ curl 0:8001/stk
## A count close to 100k appears on haproxy's stderr
## On the CLI, "show table tbl" | wc will show the same.
Some large parts were reindented only to add a top-level loop to iterate
over shards (e.g. process_table_expire()). Better check the diff using
git show -b.
The number of shards is decided just like for the pools, at build time
based on the max number of threads, so that we can keep a constant. Maybe
this should be done differently. For now CONFIG_HAP_TBL_BUCKETS is used,
and defaults to CONFIG_HAP_POOL_BUCKETS to keep the benefits of all the
measurements made for the pools. It turns out that this value seems to
be the most reasonable one without inflating the struct stktable too
much. By default for 1024 threads the value is 32 and delivers 980k RPS
in a test involving 80 threads, while adding 1kB to the struct stktable
(roughly doubling it). The same test at 64 gives 1008 kRPS and at 128
it gives 1040 kRPS for 8 times the initial size. 16 would be too low
however, with 675k RPS.
The stksess already have a shard number, it's the one used to decide which
peer connection to send the entry. Maybe we should also store the one
associated with the entry itself instead of recalculating it, though it
does not happen that often. The operation is done by hashing the key using
XXH32().
The peers also take and release the table's lock but the way it's used
is not very clear yet, so at this point it's sure this will not work.
At this point, this allowed to completely unlock the performance on a
80-thread setup:
before: 5.4 Gbps, 150k RPS, 80 cores
52.71% haproxy [.] stktable_lookup_key
36.90% haproxy [.] stktable_get_entry.part.0
0.86% haproxy [.] ebmb_lookup
0.18% haproxy [.] process_stream
0.12% haproxy [.] process_table_expire
0.11% haproxy [.] fwrr_get_next_server
0.10% haproxy [.] eb32_insert
0.10% haproxy [.] run_tasks_from_lists
after: 36 Gbps, 980k RPS, 80 cores
44.92% haproxy [.] stktable_get_entry
5.47% haproxy [.] ebmb_lookup
2.50% haproxy [.] fwrr_get_next_server
0.97% haproxy [.] eb32_insert
0.92% haproxy [.] process_stream
0.52% haproxy [.] run_tasks_from_lists
0.45% haproxy [.] conn_backend_get
0.44% haproxy [.] __pool_alloc
0.35% haproxy [.] process_table_expire
0.35% haproxy [.] connect_server
0.35% haproxy [.] h1_headers_to_hdr_list
0.34% haproxy [.] eb_delete
0.31% haproxy [.] srv_add_to_idle_list
0.30% haproxy [.] h1_snd_buf
WIP: uint64_t -> long
WIP: ulong -> uint
code is much smaller
2024-03-04 11:09:28 -05:00
|
|
|
HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->shards[shard].sh_lock);
|
2017-10-31 10:45:42 -04:00
|
|
|
__stksess_kill_if_expired(t, ts);
|
MAJOR: stktable: split the keys across multiple shards to reduce contention
In order to reduce the contention on the table when keys expire quickly,
we're spreading the load over multiple trees. That counts for keys and
expiration dates. The shard number is calculated from the key value
itself, both when looking up and when setting it.
The "show table" dump on the CLI iterates over all shards so that the
output is not fully sorted, it's only sorted within each shard. The Lua
table dump just does the same. It was verified with a Lua program to
count stick-table entries that it works as intended (the test case is
reproduced here as it's clearly not easy to automate as a vtc):
function dump_stk()
local dmp = core.proxies['tbl'].stktable:dump({});
local count = 0
for _, __ in pairs(dmp) do
count = count + 1
end
core.Info('Total entries: ' .. count)
end
core.register_action("dump_stk", {'tcp-req', 'http-req'}, dump_stk, 0);
##
global
tune.lua.log.stderr on
lua-load-per-thread lua-cnttbl.lua
listen front
bind :8001
http-request lua.dump_stk if { path_beg /stk }
http-request track-sc1 rand(),upper,hex table tbl
http-request redirect location /
backend tbl
stick-table size 100k type string len 12 store http_req_cnt
##
$ h2load -c 16 -n 10000 0:8001/
$ curl 0:8001/stk
## A count close to 100k appears on haproxy's stderr
## On the CLI, "show table tbl" | wc will show the same.
Some large parts were reindented only to add a top-level loop to iterate
over shards (e.g. process_table_expire()). Better check the diff using
git show -b.
The number of shards is decided just like for the pools, at build time
based on the max number of threads, so that we can keep a constant. Maybe
this should be done differently. For now CONFIG_HAP_TBL_BUCKETS is used,
and defaults to CONFIG_HAP_POOL_BUCKETS to keep the benefits of all the
measurements made for the pools. It turns out that this value seems to
be the most reasonable one without inflating the struct stktable too
much. By default for 1024 threads the value is 32 and delivers 980k RPS
in a test involving 80 threads, while adding 1kB to the struct stktable
(roughly doubling it). The same test at 64 gives 1008 kRPS and at 128
it gives 1040 kRPS for 8 times the initial size. 16 would be too low
however, with 675k RPS.
The stksess already have a shard number, it's the one used to decide which
peer connection to send the entry. Maybe we should also store the one
associated with the entry itself instead of recalculating it, though it
does not happen that often. The operation is done by hashing the key using
XXH32().
The peers also take and release the table's lock but the way it's used
is not very clear yet, so at this point it's sure this will not work.
At this point, this allowed to completely unlock the performance on a
80-thread setup:
before: 5.4 Gbps, 150k RPS, 80 cores
52.71% haproxy [.] stktable_lookup_key
36.90% haproxy [.] stktable_get_entry.part.0
0.86% haproxy [.] ebmb_lookup
0.18% haproxy [.] process_stream
0.12% haproxy [.] process_table_expire
0.11% haproxy [.] fwrr_get_next_server
0.10% haproxy [.] eb32_insert
0.10% haproxy [.] run_tasks_from_lists
after: 36 Gbps, 980k RPS, 80 cores
44.92% haproxy [.] stktable_get_entry
5.47% haproxy [.] ebmb_lookup
2.50% haproxy [.] fwrr_get_next_server
0.97% haproxy [.] eb32_insert
0.92% haproxy [.] process_stream
0.52% haproxy [.] run_tasks_from_lists
0.45% haproxy [.] conn_backend_get
0.44% haproxy [.] __pool_alloc
0.35% haproxy [.] process_table_expire
0.35% haproxy [.] connect_server
0.35% haproxy [.] h1_headers_to_hdr_list
0.34% haproxy [.] eb_delete
0.31% haproxy [.] srv_add_to_idle_list
0.30% haproxy [.] h1_snd_buf
WIP: uint64_t -> long
WIP: ulong -> uint
code is much smaller
2024-03-04 11:09:28 -05:00
|
|
|
HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->shards[shard].sh_lock);
|
2022-10-11 14:50:22 -04:00
|
|
|
}
|
2010-08-03 14:34:06 -04:00
|
|
|
}
|
|
|
|
|
|
2015-04-04 10:24:42 -04:00
|
|
|
/* sets the stick counter's entry pointer */
|
|
|
|
|
static inline void stkctr_set_entry(struct stkctr *stkctr, struct stksess *entry)
|
|
|
|
|
{
|
|
|
|
|
stkctr->entry = caddr_from_ptr(entry, 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* returns the entry pointer from a stick counter */
|
|
|
|
|
static inline struct stksess *stkctr_entry(struct stkctr *stkctr)
|
|
|
|
|
{
|
|
|
|
|
return caddr_to_ptr(stkctr->entry);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* returns the two flags from a stick counter */
|
|
|
|
|
static inline unsigned int stkctr_flags(struct stkctr *stkctr)
|
|
|
|
|
{
|
|
|
|
|
return caddr_to_data(stkctr->entry);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* sets up to two flags at a time on a composite address */
|
|
|
|
|
static inline void stkctr_set_flags(struct stkctr *stkctr, unsigned int flags)
|
|
|
|
|
{
|
|
|
|
|
stkctr->entry = caddr_set_flags(stkctr->entry, flags);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* returns the two flags from a stick counter */
|
|
|
|
|
static inline void stkctr_clr_flags(struct stkctr *stkctr, unsigned int flags)
|
|
|
|
|
{
|
|
|
|
|
stkctr->entry = caddr_clr_flags(stkctr->entry, flags);
|
|
|
|
|
}
|
|
|
|
|
|
2020-10-06 07:52:40 -04:00
|
|
|
/* Increase the number of cumulated HTTP requests in the tracked counter
|
|
|
|
|
* <stkctr>. It returns 0 if the entry pointer does not exist and nothing is
|
|
|
|
|
* performed. Otherwise it returns 1.
|
|
|
|
|
*/
|
|
|
|
|
static inline int stkctr_inc_http_req_ctr(struct stkctr *stkctr)
|
|
|
|
|
{
|
|
|
|
|
struct stksess *ts;
|
|
|
|
|
void *ptr1, *ptr2;
|
|
|
|
|
|
|
|
|
|
ts = stkctr_entry(stkctr);
|
|
|
|
|
if (!ts)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock);
|
|
|
|
|
|
|
|
|
|
ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_HTTP_REQ_CNT);
|
|
|
|
|
if (ptr1)
|
2021-06-30 11:18:28 -04:00
|
|
|
stktable_data_cast(ptr1, std_t_uint)++;
|
2020-10-06 07:52:40 -04:00
|
|
|
|
|
|
|
|
ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_HTTP_REQ_RATE);
|
|
|
|
|
if (ptr2)
|
2021-06-30 11:18:28 -04:00
|
|
|
update_freq_ctr_period(&stktable_data_cast(ptr2, std_t_frqp),
|
2020-10-06 07:52:40 -04:00
|
|
|
stkctr->table->data_arg[STKTABLE_DT_HTTP_REQ_RATE].u, 1);
|
|
|
|
|
|
|
|
|
|
HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
|
|
|
|
|
|
|
|
|
|
/* If data was modified, we need to touch to re-schedule sync */
|
|
|
|
|
if (ptr1 || ptr2)
|
|
|
|
|
stktable_touch_local(stkctr->table, ts, 0);
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Increase the number of cumulated failed HTTP requests in the tracked counter
|
|
|
|
|
* <stkctr>. It returns 0 if the entry pointer does not exist and nothing is
|
|
|
|
|
* performed. Otherwise it returns 1.
|
|
|
|
|
*/
|
|
|
|
|
static inline int stkctr_inc_http_err_ctr(struct stkctr *stkctr)
|
|
|
|
|
{
|
|
|
|
|
struct stksess *ts;
|
|
|
|
|
void *ptr1, *ptr2;
|
|
|
|
|
|
|
|
|
|
ts = stkctr_entry(stkctr);
|
|
|
|
|
if (!ts)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock);
|
|
|
|
|
|
|
|
|
|
ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_HTTP_ERR_CNT);
|
|
|
|
|
if (ptr1)
|
2021-06-30 11:18:28 -04:00
|
|
|
stktable_data_cast(ptr1, std_t_uint)++;
|
2020-10-06 07:52:40 -04:00
|
|
|
|
|
|
|
|
ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_HTTP_ERR_RATE);
|
|
|
|
|
if (ptr2)
|
2021-06-30 11:18:28 -04:00
|
|
|
update_freq_ctr_period(&stktable_data_cast(ptr2, std_t_frqp),
|
2020-10-06 07:52:40 -04:00
|
|
|
stkctr->table->data_arg[STKTABLE_DT_HTTP_ERR_RATE].u, 1);
|
|
|
|
|
|
|
|
|
|
HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
|
|
|
|
|
|
|
|
|
|
/* If data was modified, we need to touch to re-schedule sync */
|
|
|
|
|
if (ptr1 || ptr2)
|
MINOR: stick-tables/counters: add http_fail_cnt and http_fail_rate data types
Historically we've been counting lots of client-triggered events in stick
tables to help detect misbehaving ones, but we've been missing the same on
the server side, and there's been repeated requests for being able to count
the server errors per URL in order to precisely monitor the quality of
service or even to avoid routing requests to certain dead services, which
is also called "circuit breaking" nowadays.
This commit introduces http_fail_cnt and http_fail_rate, which work like
http_err_cnt and http_err_rate in that they respectively count events and
their frequency, but they only consider server-side issues such as network
errors, unparsable and truncated responses, and 5xx status codes other
than 501 and 505 (since these ones are usually triggered by the client).
Note that retryable errors are purposely not accounted for, so that only
what the client really sees is considered.
With this it becomes very simple to put some protective measures in place
to perform a redirect or return an excuse page when the error rate goes
beyond a certain threshold for a given URL, and give more chances to the
server to recover from this condition. Typically it could look like this
to bypass a URL causing more than 10 requests per second:
stick-table type string len 80 size 4k expire 1m store http_fail_rate(1m)
http-request track-sc0 base # track host+path, ignore query string
http-request return status 503 content-type text/html \
lf-file excuse.html if { sc0_http_fail_rate gt 10 }
A more advanced mechanism using gpt0 could even implement high/low rates
to disable/enable the service.
Reg-test converteers_ref_cnt_never_dec.vtc was updated to test it.
2021-02-10 06:07:15 -05:00
|
|
|
stktable_touch_local(stkctr->table, ts, 0);
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Increase the number of cumulated failed HTTP responses in the tracked counter
|
|
|
|
|
* <stkctr>. It returns 0 if the entry pointer does not exist and nothing is
|
|
|
|
|
* performed. Otherwise it returns 1.
|
|
|
|
|
*/
|
|
|
|
|
static inline int stkctr_inc_http_fail_ctr(struct stkctr *stkctr)
|
|
|
|
|
{
|
|
|
|
|
struct stksess *ts;
|
|
|
|
|
void *ptr1, *ptr2;
|
|
|
|
|
|
|
|
|
|
ts = stkctr_entry(stkctr);
|
|
|
|
|
if (!ts)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock);
|
|
|
|
|
|
|
|
|
|
ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_HTTP_FAIL_CNT);
|
|
|
|
|
if (ptr1)
|
2021-06-30 11:18:28 -04:00
|
|
|
stktable_data_cast(ptr1, std_t_uint)++;
|
MINOR: stick-tables/counters: add http_fail_cnt and http_fail_rate data types
Historically we've been counting lots of client-triggered events in stick
tables to help detect misbehaving ones, but we've been missing the same on
the server side, and there's been repeated requests for being able to count
the server errors per URL in order to precisely monitor the quality of
service or even to avoid routing requests to certain dead services, which
is also called "circuit breaking" nowadays.
This commit introduces http_fail_cnt and http_fail_rate, which work like
http_err_cnt and http_err_rate in that they respectively count events and
their frequency, but they only consider server-side issues such as network
errors, unparsable and truncated responses, and 5xx status codes other
than 501 and 505 (since these ones are usually triggered by the client).
Note that retryable errors are purposely not accounted for, so that only
what the client really sees is considered.
With this it becomes very simple to put some protective measures in place
to perform a redirect or return an excuse page when the error rate goes
beyond a certain threshold for a given URL, and give more chances to the
server to recover from this condition. Typically it could look like this
to bypass a URL causing more than 10 requests per second:
stick-table type string len 80 size 4k expire 1m store http_fail_rate(1m)
http-request track-sc0 base # track host+path, ignore query string
http-request return status 503 content-type text/html \
lf-file excuse.html if { sc0_http_fail_rate gt 10 }
A more advanced mechanism using gpt0 could even implement high/low rates
to disable/enable the service.
Reg-test converteers_ref_cnt_never_dec.vtc was updated to test it.
2021-02-10 06:07:15 -05:00
|
|
|
|
|
|
|
|
ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_HTTP_FAIL_RATE);
|
|
|
|
|
if (ptr2)
|
2021-06-30 11:18:28 -04:00
|
|
|
update_freq_ctr_period(&stktable_data_cast(ptr2, std_t_frqp),
|
MINOR: stick-tables/counters: add http_fail_cnt and http_fail_rate data types
Historically we've been counting lots of client-triggered events in stick
tables to help detect misbehaving ones, but we've been missing the same on
the server side, and there's been repeated requests for being able to count
the server errors per URL in order to precisely monitor the quality of
service or even to avoid routing requests to certain dead services, which
is also called "circuit breaking" nowadays.
This commit introduces http_fail_cnt and http_fail_rate, which work like
http_err_cnt and http_err_rate in that they respectively count events and
their frequency, but they only consider server-side issues such as network
errors, unparsable and truncated responses, and 5xx status codes other
than 501 and 505 (since these ones are usually triggered by the client).
Note that retryable errors are purposely not accounted for, so that only
what the client really sees is considered.
With this it becomes very simple to put some protective measures in place
to perform a redirect or return an excuse page when the error rate goes
beyond a certain threshold for a given URL, and give more chances to the
server to recover from this condition. Typically it could look like this
to bypass a URL causing more than 10 requests per second:
stick-table type string len 80 size 4k expire 1m store http_fail_rate(1m)
http-request track-sc0 base # track host+path, ignore query string
http-request return status 503 content-type text/html \
lf-file excuse.html if { sc0_http_fail_rate gt 10 }
A more advanced mechanism using gpt0 could even implement high/low rates
to disable/enable the service.
Reg-test converteers_ref_cnt_never_dec.vtc was updated to test it.
2021-02-10 06:07:15 -05:00
|
|
|
stkctr->table->data_arg[STKTABLE_DT_HTTP_FAIL_RATE].u, 1);
|
|
|
|
|
|
|
|
|
|
HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
|
|
|
|
|
|
|
|
|
|
/* If data was modified, we need to touch to re-schedule sync */
|
|
|
|
|
if (ptr1 || ptr2)
|
2020-10-06 07:52:40 -04:00
|
|
|
stktable_touch_local(stkctr->table, ts, 0);
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Increase the number of bytes received in the tracked counter <stkctr>. It
|
|
|
|
|
* returns 0 if the entry pointer does not exist and nothing is
|
|
|
|
|
* performed. Otherwise it returns 1.
|
|
|
|
|
*/
|
|
|
|
|
static inline int stkctr_inc_bytes_in_ctr(struct stkctr *stkctr, unsigned long long bytes)
|
|
|
|
|
{
|
|
|
|
|
struct stksess *ts;
|
|
|
|
|
void *ptr1, *ptr2;
|
|
|
|
|
|
|
|
|
|
ts = stkctr_entry(stkctr);
|
|
|
|
|
if (!ts)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock);
|
|
|
|
|
ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_BYTES_IN_CNT);
|
|
|
|
|
if (ptr1)
|
2021-06-30 11:18:28 -04:00
|
|
|
stktable_data_cast(ptr1, std_t_ull) += bytes;
|
2020-10-06 07:52:40 -04:00
|
|
|
|
|
|
|
|
ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_BYTES_IN_RATE);
|
|
|
|
|
if (ptr2)
|
2021-06-30 11:18:28 -04:00
|
|
|
update_freq_ctr_period(&stktable_data_cast(ptr2, std_t_frqp),
|
2020-10-06 07:52:40 -04:00
|
|
|
stkctr->table->data_arg[STKTABLE_DT_BYTES_IN_RATE].u, bytes);
|
|
|
|
|
HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* If data was modified, we need to touch to re-schedule sync */
|
|
|
|
|
if (ptr1 || ptr2)
|
|
|
|
|
stktable_touch_local(stkctr->table, ts, 0);
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Increase the number of bytes sent in the tracked counter <stkctr>. It
|
|
|
|
|
* returns 0 if the entry pointer does not exist and nothing is
|
|
|
|
|
* performed. Otherwise it returns 1.
|
|
|
|
|
*/
|
|
|
|
|
static inline int stkctr_inc_bytes_out_ctr(struct stkctr *stkctr, unsigned long long bytes)
|
|
|
|
|
{
|
|
|
|
|
struct stksess *ts;
|
|
|
|
|
void *ptr1, *ptr2;
|
|
|
|
|
|
|
|
|
|
ts = stkctr_entry(stkctr);
|
|
|
|
|
if (!ts)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock);
|
|
|
|
|
ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_BYTES_OUT_CNT);
|
|
|
|
|
if (ptr1)
|
2021-06-30 11:18:28 -04:00
|
|
|
stktable_data_cast(ptr1, std_t_ull) += bytes;
|
2020-10-06 07:52:40 -04:00
|
|
|
|
|
|
|
|
ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_BYTES_OUT_RATE);
|
|
|
|
|
if (ptr2)
|
2021-06-30 11:18:28 -04:00
|
|
|
update_freq_ctr_period(&stktable_data_cast(ptr2, std_t_frqp),
|
2020-10-06 07:52:40 -04:00
|
|
|
stkctr->table->data_arg[STKTABLE_DT_BYTES_OUT_RATE].u, bytes);
|
|
|
|
|
HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* If data was modified, we need to touch to re-schedule sync */
|
|
|
|
|
if (ptr1 || ptr2)
|
|
|
|
|
stktable_touch_local(stkctr->table, ts, 0);
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-19 11:23:07 -05:00
|
|
|
/* Add <inc> to the number of cumulated front glitches in the tracked counter
|
|
|
|
|
* <stkctr>. It returns 0 if the entry pointer does not exist and nothing is
|
|
|
|
|
* performed. Otherwise it returns 1.
|
|
|
|
|
*/
|
|
|
|
|
static inline int stkctr_add_glitch_ctr(struct stkctr *stkctr, uint inc)
|
|
|
|
|
{
|
|
|
|
|
struct stksess *ts;
|
|
|
|
|
void *ptr1, *ptr2;
|
|
|
|
|
|
|
|
|
|
ts = stkctr_entry(stkctr);
|
|
|
|
|
if (!ts)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock);
|
|
|
|
|
|
|
|
|
|
ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_GLITCH_CNT);
|
|
|
|
|
if (ptr1)
|
|
|
|
|
stktable_data_cast(ptr1, std_t_uint) += inc;
|
|
|
|
|
|
|
|
|
|
ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_GLITCH_RATE);
|
|
|
|
|
if (ptr2)
|
|
|
|
|
update_freq_ctr_period(&stktable_data_cast(ptr2, std_t_frqp),
|
|
|
|
|
stkctr->table->data_arg[STKTABLE_DT_GLITCH_RATE].u, inc);
|
|
|
|
|
|
|
|
|
|
HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
|
|
|
|
|
|
|
|
|
|
/* If data was modified, we need to touch to re-schedule sync */
|
|
|
|
|
if (ptr1 || ptr2)
|
|
|
|
|
stktable_touch_local(stkctr->table, ts, 0);
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-04 12:46:44 -04:00
|
|
|
#endif /* _HAPROXY_STICK_TABLE_H */
|