time: use precise callout for clock_nanosleep(2) and nanosleep(2)

Don't apply tc_precexp and TIMESEL() that uses sbt_timethreshold (both
derivatives of kern.timecounter.alloweddeviation) to sleep callout when
processing the default and precise clocks.  The default timer deviation of
5% is our internal optimization in the kernel, and we shouldn't leak that
into the POSIX APIs.  Note that an application has no way to cancel the
deviation; only a superuser can change the global tunable [with side
effects].

Leave the deviation for CLOCK_*_FAST and CLOCK_SECOND that are documented
as imprecise.

Provide a sysctl kern.timecounter.nanosleep_precise that allows restoring
the previous behavior.

Improve documentation.

Reviewed by:		ziaee, vangyzen, imp, kib
Differential Revision:	https://reviews.freebsd.org/D50075
This commit is contained in:
Gleb Smirnoff 2025-04-30 09:47:57 -07:00
parent b8b94f5ab1
commit 626ea75ed2
2 changed files with 72 additions and 16 deletions

View file

@ -27,7 +27,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE. .\" SUCH DAMAGE.
.\" .\"
.Dd April 3, 2022 .Dd April 29, 2025
.Dt NANOSLEEP 2 .Dt NANOSLEEP 2
.Os .Os
.Sh NAME .Sh NAME
@ -87,14 +87,6 @@ If, at the time of the call, the time value specified by
is less than or equal to the time value of the specified clock, then is less than or equal to the time value of the specified clock, then
.Fn clock_nanosleep .Fn clock_nanosleep
returns immediately and the calling thread is not suspended. returns immediately and the calling thread is not suspended.
.Pp
The suspension time may be longer than requested due to the
scheduling of other activity by the system.
It is also subject to the allowed time interval deviation
specified by the
.Va kern.timecounter.alloweddeviation
.Xr sysctl 8
variable.
An unmasked signal will terminate the sleep early, regardless of the An unmasked signal will terminate the sleep early, regardless of the
.Dv SA_RESTART .Dv SA_RESTART
value on the interrupting signal. value on the interrupting signal.
@ -131,6 +123,32 @@ CLOCK_UPTIME_FAST
CLOCK_UPTIME_PRECISE CLOCK_UPTIME_PRECISE
.El .El
.Pp .Pp
The suspension time may be longer than requested due to the
scheduling of other activity by the system.
The clocks with the
.Dv _FAST
suffix and the
.Dv CLOCK_SECOND
clock are subject to the allowed time interval deviation specified by the
.Va kern.timecounter.alloweddeviation
.Xr sysctl 8
variable.
The clocks with the
.Dv _PRECISE
suffix are always as precise as possible.
The
.Dv CLOCK_MONOTONIC ,
.Dv CLOCK_REALTIME
and
.Dv CLOCK_UPTIME
clocks are precise by default.
Setting the
.Va kern.timecounter.nanosleep_precise
.Xr sysctl 8
to a false value makes those clocks behave like the
.Dv _FAST
clocks.
.Pp
The The
.Fn nanosleep .Fn nanosleep
function behaves like function behaves like
@ -217,3 +235,19 @@ and was ported to
.Ox 2.1 .Ox 2.1
and and
.Fx 3.0 . .Fx 3.0 .
The
.Fn clock_nanosleep
system call has been available since
.Fx 11.1 .
.Pp
In
.Fx 15.0
the default behavior of
.Fn clock_nanosleep
with
.Dv CLOCK_MONOTONIC ,
.Dv CLOCK_REALTIME ,
.Dv CLOCK_UPTIME
clocks and
.Fn nanosleep
has been switched to use a precise clock.

View file

@ -494,6 +494,10 @@ kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt)
rmt)); rmt));
} }
/*
 * Read/write knob exported as kern.timecounter.nanosleep_precise; defaults
 * to true.  kern_clock_nanosleep() copies it into its local "precise" flag
 * for CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_UPTIME only.  The *_PRECISE
 * clock ids always set that flag to true, and the *_FAST ids and
 * CLOCK_SECOND always set it to false, so they are unaffected by this knob.
 */
static __read_mostly bool nanosleep_precise = true;
SYSCTL_BOOL(_kern_timecounter, OID_AUTO, nanosleep_precise, CTLFLAG_RW,
&nanosleep_precise, 0, "clock_nanosleep() with CLOCK_REALTIME, "
"CLOCK_MONOTONIC, CLOCK_UPTIME and nanosleep(2) use precise clock");
static uint8_t nanowait[MAXCPU]; static uint8_t nanowait[MAXCPU];
int int
@ -504,7 +508,7 @@ kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags,
sbintime_t sbt, sbtt, prec, tmp; sbintime_t sbt, sbtt, prec, tmp;
time_t over; time_t over;
int error; int error;
bool is_abs_real; bool is_abs_real, precise;
if (rqt->tv_nsec < 0 || rqt->tv_nsec >= NS_PER_SEC) if (rqt->tv_nsec < 0 || rqt->tv_nsec >= NS_PER_SEC)
return (EINVAL); return (EINVAL);
@ -512,17 +516,31 @@ kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags,
return (EINVAL); return (EINVAL);
switch (clock_id) { switch (clock_id) {
case CLOCK_REALTIME: case CLOCK_REALTIME:
precise = nanosleep_precise;
is_abs_real = (flags & TIMER_ABSTIME) != 0;
break;
case CLOCK_REALTIME_PRECISE: case CLOCK_REALTIME_PRECISE:
precise = true;
is_abs_real = (flags & TIMER_ABSTIME) != 0;
break;
case CLOCK_REALTIME_FAST: case CLOCK_REALTIME_FAST:
case CLOCK_SECOND: case CLOCK_SECOND:
precise = false;
is_abs_real = (flags & TIMER_ABSTIME) != 0; is_abs_real = (flags & TIMER_ABSTIME) != 0;
break; break;
case CLOCK_MONOTONIC: case CLOCK_MONOTONIC:
case CLOCK_MONOTONIC_PRECISE:
case CLOCK_MONOTONIC_FAST:
case CLOCK_UPTIME: case CLOCK_UPTIME:
precise = nanosleep_precise;
is_abs_real = false;
break;
case CLOCK_MONOTONIC_PRECISE:
case CLOCK_UPTIME_PRECISE: case CLOCK_UPTIME_PRECISE:
precise = true;
is_abs_real = false;
break;
case CLOCK_MONOTONIC_FAST:
case CLOCK_UPTIME_FAST: case CLOCK_UPTIME_FAST:
precise = false;
is_abs_real = false; is_abs_real = false;
break; break;
case CLOCK_VIRTUAL: case CLOCK_VIRTUAL:
@ -553,10 +571,14 @@ kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags,
} else } else
over = 0; over = 0;
tmp = tstosbt(ts); tmp = tstosbt(ts);
prec = tmp; if (precise) {
prec >>= tc_precexp; prec = 0;
if (TIMESEL(&sbt, tmp)) sbt = sbinuptime();
sbt += tc_tick_sbt; } else {
prec = tmp >> tc_precexp;
if (TIMESEL(&sbt, tmp))
sbt += tc_tick_sbt;
}
sbt += tmp; sbt += tmp;
error = tsleep_sbt(&nanowait[curcpu], PWAIT | PCATCH, "nanslp", error = tsleep_sbt(&nanowait[curcpu], PWAIT | PCATCH, "nanslp",
sbt, prec, C_ABSOLUTE); sbt, prec, C_ABSOLUTE);