dtrace: Avoid excessive pcpu allocations

We were previously allocating MAXCPU structures for several purposes,
but this is generally unnecessary and is quite excessive, especially
after MAXCPU was bumped to 1024 on amd64 and arm64.  We already are
careful to allocate only as many per-CPU tracing buffers as are needed;
extend this to other allocations.

For example, in a 2-vCPU VM, the size of a consumer state structure
drops from 64KB to 128B.  The size of the per-consumer `dts_buffer` and
`dts_aggbuffer` arrays shrink similarly.  Ditto for pre-allocations of
local and global D variable storage space.

MFC after:	2 weeks
Differential Revision:	https://reviews.freebsd.org/D47667
This commit is contained in:
Mark Johnston 2024-11-22 13:55:20 +00:00
parent 4efe531c9d
commit 5d12db2daf
3 changed files with 33 additions and 39 deletions

View file

@ -741,7 +741,7 @@ dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
return (0);
maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
maxlocalsize = maxglobalsize * NCPU;
maxlocalsize = maxglobalsize * (mp_maxid + 1);
for (i = 0; i < nsvars; i++) {
dtrace_statvar_t *svar = svars[i];
@ -1744,7 +1744,7 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate)
dtrace_dynvar_t **rinsep;
int i, j, work = 0;
for (i = 0; i < NCPU; i++) {
CPU_FOREACH(i) {
dcpu = &dstate->dtds_percpu[i];
rinsep = &dcpu->dtdsc_rinsing;
@ -1782,7 +1782,7 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate)
* rinsing list -- and then we borrow this CPU to
* rinse our dirty list.
*/
for (j = 0; j < NCPU; j++) {
CPU_FOREACH(j) {
dtrace_dstate_percpu_t *rinser;
rinser = &dstate->dtds_percpu[j];
@ -1800,7 +1800,7 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate)
break;
}
if (j == NCPU) {
if (j > mp_maxid) {
/*
* We were unable to find another CPU that
* could accept this dirty list -- we are
@ -1841,7 +1841,7 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate)
dtrace_sync();
for (i = 0; i < NCPU; i++) {
CPU_FOREACH(i) {
dcpu = &dstate->dtds_percpu[i];
if (dcpu->dtdsc_rinsing == NULL)
@ -2190,7 +2190,7 @@ retry:
case DTRACE_DSTATE_CLEAN: {
void *sp = &dstate->dtds_state;
if (++cpu >= NCPU)
if (++cpu > mp_maxid)
cpu = 0;
if (dcpu->dtdsc_dirty != NULL &&
@ -6504,7 +6504,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
size_t lim;
sz += sizeof (uint64_t);
ASSERT(svar->dtsv_size == NCPU * sz);
ASSERT(svar->dtsv_size == (mp_maxid + 1) * sz);
a += curcpu * sz;
if (*(uint8_t *)a == UINT8_MAX) {
@ -6521,7 +6521,8 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
break;
}
ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
ASSERT(svar->dtsv_size ==
(mp_maxid + 1) * sizeof (uint64_t));
tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
regs[rd] = tmp[curcpu];
break;
@ -6544,7 +6545,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
size_t lim;
sz += sizeof (uint64_t);
ASSERT(svar->dtsv_size == NCPU * sz);
ASSERT(svar->dtsv_size == (mp_maxid + 1) * sz);
a += curcpu * sz;
if (regs[rd] == 0) {
@ -6565,7 +6566,8 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
break;
}
ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
ASSERT(svar->dtsv_size ==
(mp_maxid + 1) * sizeof (uint64_t));
tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
tmp[curcpu] = regs[rd];
break;
@ -10678,10 +10680,11 @@ dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
svarp = &vstate->dtvs_locals;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
dsize = NCPU * (v->dtdv_type.dtdt_size +
dsize = (mp_maxid + 1) *
(v->dtdv_type.dtdt_size +
sizeof (uint64_t));
else
dsize = NCPU * sizeof (uint64_t);
dsize = (mp_maxid + 1) * sizeof (uint64_t);
break;
@ -12603,7 +12606,7 @@ dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
{
int i;
for (i = 0; i < NCPU; i++) {
CPU_FOREACH(i) {
dtrace_buffer_t *buf = &bufs[i];
if (buf->dtb_size == 0)
@ -12627,7 +12630,7 @@ dtrace_buffer_free(dtrace_buffer_t *bufs)
{
int i;
for (i = 0; i < NCPU; i++) {
CPU_FOREACH(i) {
dtrace_buffer_t *buf = &bufs[i];
if (buf->dtb_tomax == NULL) {
@ -14379,7 +14382,8 @@ dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
dstate->dtds_size = size;
dstate->dtds_base = base;
dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
bzero(dstate->dtds_percpu,
(mp_maxid + 1) * sizeof (dtrace_dstate_percpu_t));
hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
@ -14413,14 +14417,10 @@ dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
VERIFY((uintptr_t)start < limit);
VERIFY((uintptr_t)start >= (uintptr_t)base);
maxper = (limit - (uintptr_t)start) / NCPU;
maxper = (limit - (uintptr_t)start) / (mp_maxid + 1);
maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
#ifndef illumos
CPU_FOREACH(i) {
#else
for (i = 0; i < NCPU; i++) {
#endif
dstate->dtds_percpu[i].dtdsc_free = dvar = start;
/*
@ -14430,7 +14430,7 @@ dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
* whatever is left over. In either case, we set the limit to
* be the limit of the dynamic variable space.
*/
if (maxper == 0 || i == NCPU - 1) {
if (maxper == 0 || i == mp_maxid) {
limit = (uintptr_t)base + size;
start = NULL;
} else {
@ -14603,7 +14603,7 @@ dtrace_state_create(struct cdev *dev, struct ucred *cred __unused)
char c[30];
dtrace_state_t *state;
dtrace_optval_t *opt;
int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
int bufsize = (mp_maxid + 1) * sizeof (dtrace_buffer_t), i;
int cpu_it;
ASSERT(MUTEX_HELD(&dtrace_lock));
@ -14651,12 +14651,6 @@ dtrace_state_create(struct cdev *dev, struct ucred *cred __unused)
state->dts_dev = dev;
#endif
/*
* We allocate NCPU buffers. On the one hand, this can be quite
* a bit of memory per instance (nearly 36K on a Starcat). On the
* other hand, it saves an additional memory reference in the probe
* path.
*/
state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
@ -14666,12 +14660,12 @@ dtrace_state_create(struct cdev *dev, struct ucred *cred __unused)
* assumed to be seeded at this point (if from Fortuna seed file).
*/
arc4random_buf(&state->dts_rstate[0], 2 * sizeof(uint64_t));
for (cpu_it = 1; cpu_it < NCPU; cpu_it++) {
for (cpu_it = 1; cpu_it <= mp_maxid; cpu_it++) {
/*
* Each CPU is assigned a 2^64 period, non-overlapping
* subsequence.
*/
dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1],
dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it - 1],
state->dts_rstate[cpu_it]);
}
@ -14970,7 +14964,7 @@ dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
cyc_handler_t hdlr;
cyc_time_t when;
#endif
int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
int rval = 0, i, bufsize = (mp_maxid + 1) * sizeof (dtrace_buffer_t);
dtrace_icookie_t cookie;
mutex_enter(&cpu_lock);
@ -15224,10 +15218,10 @@ dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
* We enable anonymous tracing before APs are started, so we must
* activate buffers using the current CPU.
*/
if (state == dtrace_anon.dta_state)
for (int i = 0; i < NCPU; i++)
if (state == dtrace_anon.dta_state) {
CPU_FOREACH(i)
dtrace_buffer_activate_cpu(state, i);
else
} else
dtrace_xcall(DTRACE_CPUALL,
(dtrace_xcall_t)dtrace_buffer_activate, state);
#else
@ -15410,7 +15404,7 @@ dtrace_state_destroy(dtrace_state_t *state)
#ifdef illumos
minor_t minor = getminor(state->dts_dev);
#endif
int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
int i, bufsize = (mp_maxid + 1) * sizeof (dtrace_buffer_t);
dtrace_speculation_t *spec = state->dts_speculations;
int nspec = state->dts_nspeculations;
uint32_t match;
@ -15724,7 +15718,7 @@ dtrace_helper_trace(dtrace_helper_action_t *helper,
if ((svar = vstate->dtvs_locals[i]) == NULL)
continue;
ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
ASSERT(svar->dtsv_size >= (mp_maxid + 1) * sizeof (uint64_t));
ent->dtht_locals[i] =
((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
}

View file

@ -229,7 +229,7 @@ dtrace_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
"DTRACEIOC_AGGSNAP":"DTRACEIOC_BUFSNAP",
curcpu, desc.dtbd_cpu);
if (desc.dtbd_cpu >= MAXCPU || CPU_ABSENT(desc.dtbd_cpu))
if (desc.dtbd_cpu > mp_maxid || CPU_ABSENT(desc.dtbd_cpu))
return (ENOENT);
mutex_enter(&dtrace_lock);

View file

@ -100,8 +100,8 @@ dtrace_load(void *dummy)
mutex_enter(&dtrace_lock);
dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
NULL, NULL, NULL, NULL, NULL, 0);
sizeof (dtrace_dstate_percpu_t) * (mp_maxid + 1),
DTRACE_STATE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
ASSERT(MUTEX_HELD(&cpu_lock));
dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),