diff --git a/sys/kern/sched_core.c b/sys/kern/sched_core.c index 494e04fd05f..6ddf8b81128 100644 --- a/sys/kern/sched_core.c +++ b/sys/kern/sched_core.c @@ -185,24 +185,26 @@ struct krunq { * have several of these. */ struct kse { + struct thread *ke_thread; /* (*) Active associated thread. */ TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */ int ke_flags; /* (j) KEF_* flags. */ - struct thread *ke_thread; /* (*) Active associated thread. */ fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */ u_char ke_rqindex; /* (j) Run queue index. */ enum { KES_THREAD = 0x0, /* slaved to thread state */ KES_ONRUNQ } ke_state; /* (j) thread sched specific status. */ - int ke_slice; - struct krunq *ke_runq; - int ke_cpu; /* CPU that we have affinity for. */ - int ke_activated; - uint64_t ke_timestamp; - uint64_t ke_lastran; + int ke_slice; /* Time slice in ticks */ + struct kseq *ke_kseq; /* Kseq the thread belongs to */ + struct krunq *ke_runq; /* Assiociated runqueue */ #ifdef SMP - int ke_tocpu; + int ke_cpu; /* CPU that we have affinity for. */ + int ke_wakeup_cpu; /* CPU that has activated us. */ #endif + int ke_activated; /* How is the thread activated. */ + uint64_t ke_timestamp; /* Last timestamp dependent on state.*/ + unsigned ke_lastran; /* Last timestamp the thread ran. */ + /* The following variables are only used for pctcpu calculation */ int ke_ltick; /* Last tick that we were running on */ int ke_ftick; /* First tick that we were running on */ @@ -214,19 +216,14 @@ struct kse { #define ke_ksegrp ke_thread->td_ksegrp /* flags kept in ke_flags */ -#define KEF_ASSIGNED 0x0001 /* Thread is being migrated. */ -#define KEF_BOUND 0x0002 /* Thread can not migrate. */ -#define KEF_XFERABLE 0x0004 /* Thread was added as transferable. */ -#define KEF_HOLD 0x0008 /* Thread is temporarily bound. */ -#define KEF_REMOVED 0x0010 /* Thread was removed while ASSIGNED */ -#define KEF_INTERNAL 0x0020 /* Thread added due to migration. */ -#define KEF_PREEMPTED 0x0040 /* Thread was preempted. */ -#define KEF_MIGRATING 0x0080 /* Thread is migrating. */ -#define KEF_SLEEP 0x0100 /* Thread did sleep. */ -#define KEF_DIDRUN 0x2000 /* Thread actually ran. */ -#define KEF_EXIT 0x4000 /* Thread is being killed. */ -#define KEF_NEXTRQ 0x8000 /* Thread should be in next queue. */ -#define KEF_FIRST_SLICE 0x10000 /* Thread has first time slice left. */ +#define KEF_BOUND 0x0001 /* Thread can not migrate. */ +#define KEF_PREEMPTED 0x0002 /* Thread was preempted. */ +#define KEF_MIGRATING 0x0004 /* Thread is migrating. */ +#define KEF_SLEEP 0x0008 /* Thread did sleep. */ +#define KEF_DIDRUN 0x0010 /* Thread actually ran. */ +#define KEF_EXIT 0x0020 /* Thread is being killed. */ +#define KEF_NEXTRQ 0x0400 /* Thread should be in next queue. */ +#define KEF_FIRST_SLICE 0x0800 /* Thread has first time slice left. */ struct kg_sched { struct thread *skg_last_assigned; /* (j) Last thread assigned to */ @@ -259,47 +256,16 @@ struct kg_sched { * kseq - per processor runqs and statistics. */ struct kseq { - struct krunq ksq_idle; /* Queue of IDLE threads. */ - struct krunq ksq_timeshare[2]; /* Run queues for !IDLE. */ - struct krunq *ksq_next; /* Next timeshare queue. */ struct krunq *ksq_curr; /* Current queue. */ - int ksq_load_timeshare; /* Load for timeshare. */ - int ksq_load_idle; - int ksq_load; /* Aggregate load. */ - int ksq_sysload; /* For loadavg, !P_NOLOAD */ - uint64_t ksq_expired_timestamp; - uint64_t ksq_last_timestamp; - signed char ksq_best_expired_nice; -#ifdef SMP - int ksq_transferable; - LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */ - struct kseq_group *ksq_group; /* Our processor group. */ - struct thread *ksq_migrated; - TAILQ_HEAD(,kse) ksq_migrateq; - int ksq_avgload; -#endif + struct krunq *ksq_next; /* Next timeshare queue. */ + struct krunq ksq_timeshare[2]; /* Run queues for !IDLE. */ + struct krunq ksq_idle; /* Queue of IDLE threads. */ + int ksq_load; + uint64_t ksq_last_timestamp; /* Per-cpu last clock tick */ + unsigned ksq_expired_tick; /* First expired tick */ + signed char ksq_expired_nice; /* Lowest nice in nextq */ }; -#ifdef SMP -/* - * kseq groups are groups of processors which can cheaply share threads. When - * one processor in the group goes idle it will check the runqs of the other - * processors in its group prior to halting and waiting for an interrupt. - * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. - * In a NUMA environment we'd want an idle bitmap per group and a two tiered - * load balancer. - */ -struct kseq_group { - int ksg_cpus; /* Count of CPUs in this kseq group. */ - cpumask_t ksg_cpumask; /* Mask of cpus in this group. */ - cpumask_t ksg_idlemask; /* Idle cpus in this group. */ - cpumask_t ksg_mask; /* Bit mask for first cpu. */ - int ksg_transferable; /* Transferable load of this group. */ - LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */ - int ksg_balance_tick; -}; -#endif - static struct kse kse0; static struct kg_sched kg_sched0; @@ -307,33 +273,25 @@ static int min_timeslice = 5; static int def_timeslice = 100; static int granularity = 10; static int realstathz; +static int sched_tdcnt; +static struct kseq kseq_global; /* * One kse queue per processor. */ #ifdef SMP -static cpumask_t kseq_idle; -static int ksg_maxid; -static struct kseq kseq_cpu[MAXCPU]; -static struct kseq_group kseq_groups[MAXCPU]; -static int balance_tick; -static int balance_interval = 1; -static int balance_interval_max = 32; -static int balance_interval_min = 8; -static int balance_busy_factor = 32; -static int imbalance_pct = 25; -static int imbalance_pct2 = 50; -static int ignore_topology = 1; +static struct kseq kseq_cpu[MAXCPU]; #define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) #define KSEQ_CPU(x) (&kseq_cpu[(x)]) #define KSEQ_ID(x) ((x) - kseq_cpu) -#define KSEQ_GROUP(x) (&kseq_groups[(x)]) -#else /* !SMP */ -static struct kseq kseq_cpu; -#define KSEQ_SELF() (&kseq_cpu) -#define KSEQ_CPU(x) (&kseq_cpu) +static cpumask_t cpu_sibling[MAXCPU]; + +#else /* !SMP */ + +#define KSEQ_SELF() (&kseq_global) +#define KSEQ_CPU(x) (&kseq_global) #endif /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ @@ -348,46 +306,72 @@ SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL) static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); -SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "core", 0, +SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "CORE", 0, "Scheduler name"); #ifdef SMP -SYSCTL_INT(_kern_sched, OID_AUTO, imbalance_pct, CTLFLAG_RW, - &imbalance_pct, 0, ""); +/* Enable forwarding of wakeups to all other cpus */ +SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP"); -SYSCTL_INT(_kern_sched, OID_AUTO, imbalance_pct2, CTLFLAG_RW, - &imbalance_pct2, 0, ""); +static int runq_fuzz = 0; +SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, ""); -SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval_min, CTLFLAG_RW, - &balance_interval_min, 0, ""); +static int forward_wakeup_enabled = 1; +SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW, + &forward_wakeup_enabled, 0, + "Forwarding of wakeup to idle CPUs"); -SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval_max, CTLFLAG_RW, - &balance_interval_max, 0, ""); +static int forward_wakeups_requested = 0; +SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD, + &forward_wakeups_requested, 0, + "Requests for Forwarding of wakeup to idle CPUs"); + +static int forward_wakeups_delivered = 0; +SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD, + &forward_wakeups_delivered, 0, + "Completed Forwarding of wakeup to idle CPUs"); + +static int forward_wakeup_use_mask = 1; +SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW, + &forward_wakeup_use_mask, 0, + "Use the mask of idle cpus"); + +static int forward_wakeup_use_loop = 0; +SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW, + &forward_wakeup_use_loop, 0, + "Use a loop to find idle cpus"); + +static int forward_wakeup_use_single = 0; +SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW, + &forward_wakeup_use_single, 0, + "Only signal one idle cpu"); + +static int forward_wakeup_use_htt = 0; +SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW, + &forward_wakeup_use_htt, 0, + "account for htt"); #endif static void slot_fill(struct ksegrp *); -static void krunq_add(struct krunq *, struct kse *, int flags); +static void krunq_add(struct krunq *, struct kse *); static struct kse *krunq_choose(struct krunq *); static void krunq_clrbit(struct krunq *rq, int pri); static int krunq_findbit(struct krunq *rq); static void krunq_init(struct krunq *); static void krunq_remove(struct krunq *, struct kse *); -#ifdef SMP -static struct kse *krunq_steal(struct krunq *rq, int my_cpu); -#endif static struct kse * kseq_choose(struct kseq *); static void kseq_load_add(struct kseq *, struct kse *); static void kseq_load_rem(struct kseq *, struct kse *); -static void kseq_runq_add(struct kseq *, struct kse *, int); +static void kseq_runq_add(struct kseq *, struct kse *); static void kseq_runq_rem(struct kseq *, struct kse *); static void kseq_setup(struct kseq *); static int sched_is_timeshare(struct ksegrp *kg); static struct kse *sched_choose(void); static int sched_calc_pri(struct ksegrp *kg); -static int sched_starving(struct kseq *, uint64_t, struct kse *); +static int sched_starving(struct kseq *, unsigned, struct kse *); static void sched_pctcpu_update(struct kse *); static void sched_thread_priority(struct thread *, u_char); static uint64_t sched_timestamp(void); @@ -396,49 +380,6 @@ static int sched_timeslice(struct kse *ke); static void sched_update_runtime(struct kse *ke, uint64_t now); static void sched_commit_runtime(struct kse *ke); -#ifdef SMP -static void sched_balance_tick(int my_cpu, int idle); -static int sched_balance_idle(int my_cpu, int idle); -static int sched_balance(int my_cpu, int idle); -struct kseq_group *sched_find_busiest_group(int my_cpu, int idle, - int *imbalance); -static struct kseq *sched_find_busiest_queue(struct kseq_group *ksg); -static int sched_find_idlest_cpu(struct kse *ke, int cpu); -static int sched_pull_threads(struct kseq *high, struct kseq *myksq, - int max_move, int idle); -static int sched_pull_one(struct kseq *from, struct kseq *myksq, int idle); -static struct kse *sched_steal(struct kseq *, int my_cpu, int stealidle); -static int sched_idled(struct kseq *, int idle); -static int sched_find_idle_cpu(int defcpu); -static void migrated_setup(void *dummy); -static void migrated(void *dummy); -SYSINIT(migrated_setup, SI_SUB_KTHREAD_IDLE, SI_ORDER_MIDDLE, migrated_setup, - NULL); - -#endif /* SMP */ - -static inline int -kse_pinned(struct kse *ke) -{ - if (ke->ke_thread->td_pinned) - return (1); - - if (ke->ke_flags & KEF_BOUND) - return (1); - - return (0); -} - -#ifdef SMP -static inline int -kse_can_migrate(struct kse *ke) -{ - if (kse_pinned(ke)) - return (0); - return (1); -} -#endif - /* * Initialize a run structure. */ @@ -486,6 +427,20 @@ krunq_findbit(struct krunq *rq) return (-1); } +static int +krunq_check(struct krunq *rq) +{ + struct krqbits *rqb; + int i; + + rqb = &rq->rq_status; + for (i = 0; i < KQB_LEN; i++) { + if (rqb->rqb_bits[i]) + return (1); + } + return (0); +} + /* * Set the status bit of the queue corresponding to priority level pri, * indicating that it is non-empty. @@ -504,7 +459,7 @@ krunq_setbit(struct krunq *rq, int pri) * corresponding status bit. */ static void -krunq_add(struct krunq *rq, struct kse *ke, int flags) +krunq_add(struct krunq *rq, struct kse *ke) { struct krqhead *rqh; int pri; @@ -513,7 +468,7 @@ krunq_add(struct krunq *rq, struct kse *ke, int flags) ke->ke_rqindex = pri; krunq_setbit(rq, pri); rqh = &rq->rq_queues[pri]; - if (flags & SRQ_PREEMPTED) + if (ke->ke_flags & KEF_PREEMPTED) TAILQ_INSERT_HEAD(rqh, ke, ke_procq); else TAILQ_INSERT_TAIL(rqh, ke, ke_procq); @@ -533,7 +488,29 @@ krunq_choose(struct krunq *rq) if ((pri = krunq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; ke = TAILQ_FIRST(rqh); - KASSERT(ke != NULL, ("runq_choose: no proc on busy queue")); + KASSERT(ke != NULL, ("krunq_choose: no thread on busy queue")); +#ifdef SMP + if (pri <= PRI_MAX_ITHD || runq_fuzz <= 0) + return (ke); + + /* + * In the first couple of entries, check if + * there is one for our CPU as a preference. + */ + struct kse *ke2 = ke; + const int mycpu = PCPU_GET(cpuid); + const int mymask = 1 << mycpu; + int count = runq_fuzz; + + while (count-- && ke2) { + const int cpu = ke2->ke_wakeup_cpu; + if (cpu_sibling[cpu] & mymask) { + ke = ke2; + break; + } + ke2 = TAILQ_NEXT(ke2, ke_procq); + } +#endif return (ke); } @@ -561,110 +538,40 @@ krunq_remove(struct krunq *rq, struct kse *ke) krunq_clrbit(rq, pri); } -#ifdef SMP -static struct kse * -krunq_steal(struct krunq *rq, int my_cpu) -{ - struct krqhead *rqh; - struct krqbits *rqb; - struct kse *ke; - kqb_word_t word; - int i, bit; - - (void)my_cpu; - - mtx_assert(&sched_lock, MA_OWNED); - rqb = &rq->rq_status; - for (i = 0; i < KQB_LEN; i++) { - if ((word = rqb->rqb_bits[i]) == 0) - continue; - do { - bit = KQB_FFS(word); - rqh = &rq->rq_queues[bit + (i << KQB_L2BPW)]; - TAILQ_FOREACH(ke, rqh, ke_procq) { - if (kse_can_migrate(ke)) - return (ke); - } - word &= ~((kqb_word_t)1 << bit); - } while (word != 0); - } - return (NULL); -} -#endif - static inline void -kseq_runq_add(struct kseq *kseq, struct kse *ke, int flags) +kseq_runq_add(struct kseq *kseq, struct kse *ke) { -#ifdef SMP - if (kse_pinned(ke) == 0) { - kseq->ksq_transferable++; - kseq->ksq_group->ksg_transferable++; - ke->ke_flags |= KEF_XFERABLE; - } -#endif - if (ke->ke_flags & KEF_PREEMPTED) - flags |= SRQ_PREEMPTED; - krunq_add(ke->ke_runq, ke, flags); + krunq_add(ke->ke_runq, ke); + ke->ke_kseq = kseq; } static inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke) { -#ifdef SMP - if (ke->ke_flags & KEF_XFERABLE) { - kseq->ksq_transferable--; - kseq->ksq_group->ksg_transferable--; - ke->ke_flags &= ~KEF_XFERABLE; - } -#endif krunq_remove(ke->ke_runq, ke); + ke->ke_kseq = NULL; ke->ke_runq = NULL; } -static void +static inline void kseq_load_add(struct kseq *kseq, struct kse *ke) { - int class; - - mtx_assert(&sched_lock, MA_OWNED); -#ifdef SMP - if (__predict_false(ke->ke_thread == kseq->ksq_migrated)) - return; -#endif - class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); - if (class == PRI_TIMESHARE) - kseq->ksq_load_timeshare++; - else if (class == PRI_IDLE) - kseq->ksq_load_idle++; kseq->ksq_load++; if ((ke->ke_proc->p_flag & P_NOLOAD) == 0) - kseq->ksq_sysload++; + sched_tdcnt++; } -static void +static inline void kseq_load_rem(struct kseq *kseq, struct kse *ke) { - int class; - - mtx_assert(&sched_lock, MA_OWNED); -#ifdef SMP - if (__predict_false(ke->ke_thread == kseq->ksq_migrated)) - return; -#endif - class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); - if (class == PRI_TIMESHARE) - kseq->ksq_load_timeshare--; - else if (class == PRI_IDLE) - kseq->ksq_load_idle--; kseq->ksq_load--; if ((ke->ke_proc->p_flag & P_NOLOAD) == 0) - kseq->ksq_sysload--; + sched_tdcnt++; } /* * Pick the highest priority task we have and return it. */ - static struct kse * kseq_choose(struct kseq *kseq) { @@ -672,13 +579,12 @@ kseq_choose(struct kseq *kseq) struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); - ke = krunq_choose(kseq->ksq_curr); if (ke != NULL) return (ke); - kseq->ksq_best_expired_nice = 21; - kseq->ksq_expired_timestamp = 0; + kseq->ksq_expired_nice = PRIO_MAX + 1; + kseq->ksq_expired_tick = 0; swap = kseq->ksq_curr; kseq->ksq_curr = kseq->ksq_next; kseq->ksq_next = swap; @@ -689,6 +595,8 @@ kseq_choose(struct kseq *kseq) return krunq_choose(&kseq->ksq_idle); } +extern unsigned long long cycles_2_ns(unsigned long long cyc); + static inline uint64_t sched_timestamp(void) { @@ -704,17 +612,12 @@ sched_timeslice(struct kse *ke) if (ke->ke_proc->p_nice < 0) return SCALE_USER_PRI(def_timeslice*4, PROC_USER_PRI(p)); else - return SCALE_USER_PRI(def_timeslice, PROC_USER_PRI(p)); + return SCALE_USER_PRI(def_timeslice, PROC_USER_PRI(p)); } static inline int sched_is_timeshare(struct ksegrp *kg) { - /* - * XXX P_KTHREAD should be checked, but unfortunately, the - * readonly flag resides in a volatile member p_flag, reading - * it could cause lots of cache line sharing and invalidating. - */ return (kg->kg_pri_class == PRI_TIMESHARE); } @@ -723,15 +626,16 @@ sched_calc_pri(struct ksegrp *kg) { int score, pri; - if (__predict_false(!sched_is_timeshare(kg))) - return (kg->kg_user_pri); - score = CURRENT_SCORE(kg) - MAX_SCORE / 2; - pri = PROC_PRI(kg->kg_proc) - score; - if (pri < PUSER) - pri = PUSER; - if (pri > PUSER_MAX) - pri = PUSER_MAX; - return (pri); + if (sched_is_timeshare(kg)) { + score = CURRENT_SCORE(kg) - MAX_SCORE / 2; + pri = PROC_PRI(kg->kg_proc) - score; + if (pri < PUSER) + pri = PUSER; + if (pri > PUSER_MAX) + pri = PUSER_MAX; + return (pri); + } + return (kg->kg_user_pri); } static int @@ -820,459 +724,6 @@ sched_commit_runtime(struct kse *ke) kg->kg_runtime = 0; } -#ifdef SMP - -/* staged balancing operations between CPUs */ -#define CPU_OFFSET(cpu) (hz * cpu / MAXCPU) - -static void -sched_balance_tick(int my_cpu, int idle) -{ - struct kseq *kseq = KSEQ_CPU(my_cpu); - unsigned t = ticks + CPU_OFFSET(my_cpu); - int old_load, cur_load; - int interval; - - old_load = kseq->ksq_avgload; - cur_load = kseq->ksq_load * SCHED_LOAD_SCALE; - if (cur_load > old_load) - old_load++; - kseq->ksq_avgload = (old_load + cur_load) / 2; - - interval = balance_interval; - if (idle == NOT_IDLE) - interval *= balance_busy_factor; - interval = MS_TO_HZ(interval); - if (interval == 0) - interval = 1; - if (t - balance_tick >= interval) { - sched_balance(my_cpu, idle); - balance_tick += interval; - } -} - -static int -sched_balance(int my_cpu, int idle) -{ - struct kseq_group *high_group; - struct kseq *high_queue; - int imbalance, pulled; - - mtx_assert(&sched_lock, MA_OWNED); - high_group = sched_find_busiest_group(my_cpu, idle, &imbalance); - if (high_group == NULL) - goto out; - high_queue = sched_find_busiest_queue(high_group); - if (high_queue == NULL) - goto out; - pulled = sched_pull_threads(high_queue, KSEQ_CPU(my_cpu), imbalance, - idle); - if (pulled == 0) { - if (balance_interval < balance_interval_max) - balance_interval++; - } else { - balance_interval = balance_interval_min; - } - return (pulled); -out: - if (balance_interval < balance_interval_max) - balance_interval *= 2; - return (0); -} - -static int -sched_balance_idle(int my_cpu, int idle) -{ - struct kseq_group *high_group; - struct kseq *high_queue; - int imbalance, pulled; - - mtx_assert(&sched_lock, MA_OWNED); - high_group = sched_find_busiest_group(my_cpu, idle, &imbalance); - if (high_group == NULL) - return (0); - high_queue = sched_find_busiest_queue(high_group); - if (high_queue == NULL) - return (0); - pulled = sched_pull_threads(high_queue, KSEQ_CPU(my_cpu), imbalance, - idle); - return (pulled); -} - -static inline int -kseq_source_load(struct kseq *ksq) -{ - int load = ksq->ksq_load * SCHED_LOAD_SCALE; - return (MIN(ksq->ksq_avgload, load)); -} - -static inline int -kseq_dest_load(struct kseq *ksq) -{ - int load = ksq->ksq_load * SCHED_LOAD_SCALE; - return (MAX(ksq->ksq_avgload, load)); -} - -struct kseq_group * -sched_find_busiest_group(int my_cpu, int idle, int *imbalance) -{ - static unsigned stage_cpu; - struct kseq_group *high; - struct kseq_group *ksg; - struct kseq *my_ksq, *ksq; - int my_load, high_load, avg_load, total_load, load; - int diff, cnt, i; - - *imbalance = 0; - if (__predict_false(smp_started == 0)) - return (NULL); - - my_ksq = KSEQ_CPU(my_cpu); - high = NULL; - high_load = total_load = my_load = 0; - i = (stage_cpu++) % (ksg_maxid + 1); - for (cnt = 0; cnt <= ksg_maxid; cnt++) { - ksg = KSEQ_GROUP(i); - /* - * Find the CPU with the highest load that has some - * threads to transfer. - */ - load = 0; - LIST_FOREACH(ksq, &ksg->ksg_members, ksq_siblings) { - if (ksg == my_ksq->ksq_group) - load += kseq_dest_load(ksq); - else - load += kseq_source_load(ksq); - } - if (ksg == my_ksq->ksq_group) { - my_load = load; - } else if (load > high_load && ksg->ksg_transferable) { - high = ksg; - high_load = load; - } - total_load += load; - if (++i > ksg_maxid) - i = 0; - } - - avg_load = total_load / (ksg_maxid + 1); - - if (high == NULL) - return (NULL); - - if (my_load >= avg_load || - (high_load - my_load) * 100 < imbalance_pct * my_load) { - if (idle == IDLE_IDLE || - (idle == IDLE && high_load > SCHED_LOAD_SCALE)) { - *imbalance = 1; - return (high); - } else { - return (NULL); - } - } - - /* - * Pick a minimum imbalance value, avoid raising our load - * higher than average and pushing busiest load under average. - */ - diff = MIN(high_load - avg_load, avg_load - my_load); - if (diff < SCHED_LOAD_SCALE) { - if (high_load - my_load >= SCHED_LOAD_SCALE * 2) { - *imbalance = 1; - return (high); - } - } - - *imbalance = diff / SCHED_LOAD_SCALE; - return (high); -} - -static struct kseq * -sched_find_busiest_queue(struct kseq_group *ksg) -{ - struct kseq *kseq, *high = NULL; - int load, high_load = 0; - - LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { - load = kseq_source_load(kseq); - if (load > high_load) { - high_load = load; - high = kseq; - } - } - - return (high); -} - -static int -sched_pull_threads(struct kseq *high, struct kseq *myksq, int max_pull, - int idle) -{ - int pulled, i; - - mtx_assert(&sched_lock, MA_OWNED); - pulled = 0; - for (i = 0; i < max_pull; i++) { - if (sched_pull_one(high, myksq, idle)) - pulled++; - else - break; - } - return (pulled); -} - -static int -sched_pull_one(struct kseq *from, struct kseq *myksq, int idle) -{ - struct kseq *kseq; - struct kse *ke; - struct krunq *destq; - int class; - - mtx_assert(&sched_lock, MA_OWNED); - kseq = from; - ke = sched_steal(kseq, KSEQ_ID(myksq), idle); - if (ke == NULL) { - /* doing balance in same group */ - if (from->ksq_group == myksq->ksq_group) - return (0); - - struct kseq_group *ksg; - - ksg = kseq->ksq_group; - LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { - if (kseq == from || kseq == myksq || - kseq->ksq_transferable == 0) - continue; - ke = sched_steal(kseq, KSEQ_ID(myksq), idle); - break; - } - if (ke == NULL) - return (0); - } - ke->ke_timestamp = ke->ke_timestamp + myksq->ksq_last_timestamp - - kseq->ksq_last_timestamp; - ke->ke_lastran = 0; - if (ke->ke_runq == from->ksq_curr) - destq = myksq->ksq_curr; - else if (ke->ke_runq == from->ksq_next) - destq = myksq->ksq_next; - else - destq = &myksq->ksq_idle; - kseq_runq_rem(kseq, ke); - kseq_load_rem(kseq, ke); - ke->ke_cpu = KSEQ_ID(myksq); - ke->ke_runq = destq; - ke->ke_state = KES_ONRUNQ; - kseq_runq_add(myksq, ke, 0); - kseq_load_add(myksq, ke); - class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); - if (class != PRI_IDLE) { - if (kseq_idle & myksq->ksq_group->ksg_mask) - kseq_idle &= ~myksq->ksq_group->ksg_mask; - if (myksq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) - myksq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask); - } - if (ke->ke_thread->td_priority < curthread->td_priority) - curthread->td_flags |= TDF_NEEDRESCHED; - return (1); -} - -static struct kse * -sched_steal(struct kseq *kseq, int my_cpu, int idle) -{ - struct kse *ke; - - /* - * Steal from expired queue first to try to get a non-interactive - * task that may not have run for a while. - */ - if ((ke = krunq_steal(kseq->ksq_next, my_cpu)) != NULL) - return (ke); - if ((ke = krunq_steal(kseq->ksq_curr, my_cpu)) != NULL) - return (ke); - if (idle == IDLE_IDLE) - return (krunq_steal(&kseq->ksq_idle, my_cpu)); - return (NULL); -} - -static int -sched_idled(struct kseq *kseq, int idle) -{ - struct kseq_group *ksg; - struct kseq *steal; - - mtx_assert(&sched_lock, MA_OWNED); - ksg = kseq->ksq_group; - /* - * If we're in a cpu group, try and steal kses from another cpu in - * the group before idling. - */ - if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) { - LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) { - if (steal == kseq || steal->ksq_transferable == 0) - continue; - if (sched_pull_one(steal, kseq, idle)) - return (0); - } - } - - if (sched_balance_idle(PCPU_GET(cpuid), idle)) - return (0); - - /* - * We only set the idled bit when all of the cpus in the group are - * idle. Otherwise we could get into a situation where a KSE bounces - * back and forth between two idle cores on seperate physical CPUs. - */ - ksg->ksg_idlemask |= PCPU_GET(cpumask); - if (ksg->ksg_idlemask != ksg->ksg_cpumask) - return (1); - kseq_idle |= ksg->ksg_mask; - return (1); -} - -static int -sched_find_idle_cpu(int defcpu) -{ - struct pcpu *pcpu; - struct kseq_group *ksg; - struct kseq *ksq; - int cpu; - - mtx_assert(&sched_lock, MA_OWNED); - ksq = KSEQ_CPU(defcpu); - ksg = ksq->ksq_group; - pcpu = pcpu_find(defcpu); - if (ksg->ksg_idlemask & pcpu->pc_cpumask) - return (defcpu); - - /* Try to find a fully idled cpu. */ - if (kseq_idle) { - cpu = ffs(kseq_idle); - if (cpu) - goto migrate; - } - - /* - * If another cpu in this group has idled, assign a thread over - * to them after checking to see if there are idled groups. - */ - if (ksg->ksg_idlemask) { - cpu = ffs(ksg->ksg_idlemask); - if (cpu) - goto migrate; - } - return (defcpu); - -migrate: - /* - * Now that we've found an idle CPU, migrate the thread. - */ - cpu--; - return (cpu); -} - -static int -sched_find_idlest_cpu(struct kse *ke, int cpu) -{ - static unsigned stage_cpu; - - struct kseq_group *ksg; - struct kseq *ksq; - int load, min_load = INT_MAX; - int first = 1; - int idlest = -1; - int i, cnt; - - (void)ke; - - if (__predict_false(smp_started == 0)) - return (cpu); - - first = 1; - i = (stage_cpu++) % (ksg_maxid + 1); - for (cnt = 0; cnt <= ksg_maxid; cnt++) { - ksg = KSEQ_GROUP(i); - LIST_FOREACH(ksq, &ksg->ksg_members, ksq_siblings) { - load = kseq_source_load(ksq); - if (first || load < min_load) { - first = 0; - load = min_load; - idlest = KSEQ_ID(ksq); - } - } - if (++i > ksg_maxid) - i = 0; - } - return (idlest); -} - -static void -migrated_setup(void *dummy) -{ - struct kseq *kseq; - struct proc *p; - struct thread *td; - int i, error; - - for (i = 0; i < MAXCPU; i++) { - if (CPU_ABSENT(i)) - continue; - kseq = &kseq_cpu[i]; - error = kthread_create(migrated, kseq, &p, RFSTOPPED, 0, - "migrated%d", i); - if (error) - panic("can not create migration thread"); - PROC_LOCK(p); - p->p_flag |= P_NOLOAD; - mtx_lock_spin(&sched_lock); - td = FIRST_THREAD_IN_PROC(p); - td->td_kse->ke_flags |= KEF_BOUND; - td->td_kse->ke_cpu = i; - kseq->ksq_migrated = td; - sched_class(td->td_ksegrp, PRI_ITHD); - td->td_kse->ke_runq = kseq->ksq_curr; - sched_prio(td, PRI_MIN); - SLOT_USE(td->td_ksegrp); - kseq_runq_add(kseq, td->td_kse, 0); - td->td_kse->ke_state = KES_ONRUNQ; - mtx_unlock_spin(&sched_lock); - PROC_UNLOCK(p); - } -} - -static void -migrated(void *dummy) -{ - struct thread *td = curthread; - struct kseq *kseq = KSEQ_SELF(); - struct kse *ke; - - mtx_lock_spin(&sched_lock); - for (;;) { - while ((ke = TAILQ_FIRST(&kseq->ksq_migrateq)) != NULL) { - TAILQ_REMOVE(&kseq->ksq_migrateq, ke, ke_procq); - kseq_load_rem(kseq, ke); - ke->ke_flags &= ~KEF_MIGRATING; - ke->ke_cpu = ke->ke_tocpu; - setrunqueue(ke->ke_thread, SRQ_BORING); - } - TD_SET_IWAIT(td); - mi_switch(SW_VOL, NULL); - } - mtx_unlock_spin(&sched_lock); -} -#else - -static inline void -sched_balance_tick(int my_cpu, int idle) -{ -} - -#endif /* SMP */ - - static void kseq_setup(struct kseq *kseq) { @@ -1281,10 +732,8 @@ kseq_setup(struct kseq *kseq) krunq_init(&kseq->ksq_idle); kseq->ksq_curr = &kseq->ksq_timeshare[0]; kseq->ksq_next = &kseq->ksq_timeshare[1]; - kseq->ksq_best_expired_nice = 21; -#ifdef SMP - TAILQ_INIT(&kseq->ksq_migrateq); -#endif + kseq->ksq_expired_nice = PRIO_MAX + 1; + kseq->ksq_expired_tick = 0; } static void @@ -1292,7 +741,6 @@ sched_setup(void *dummy) { #ifdef SMP int i; - int t; #endif /* @@ -1304,9 +752,9 @@ sched_setup(void *dummy) def_timeslice = MAX(100 * hz / 1000, 1); granularity = MAX(10 * hz / 1000, 1); + kseq_setup(&kseq_global); #ifdef SMP - t = ticks; - balance_tick = t; + runq_fuzz = MIN(mp_ncpus * 2, 8); /* * Initialize the kseqs. */ @@ -1315,64 +763,29 @@ sched_setup(void *dummy) ksq = &kseq_cpu[i]; kseq_setup(&kseq_cpu[i]); + cpu_sibling[i] = 1 << i; } - if (smp_topology == NULL || ignore_topology) { - struct kseq_group *ksg; - struct kseq *ksq; - int cpus; - - for (cpus = 0, i = 0; i < MAXCPU; i++) { - if (CPU_ABSENT(i)) - continue; - ksq = &kseq_cpu[i]; - ksg = &kseq_groups[cpus]; - /* - * Setup a kseq group with one member. - */ - ksq->ksq_group = ksg; - ksg->ksg_cpus = 1; - ksg->ksg_idlemask = 0; - ksg->ksg_cpumask = ksg->ksg_mask = 1 << i; - ksg->ksg_balance_tick = t; - LIST_INIT(&ksg->ksg_members); - LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings); - cpus++; - } - ksg_maxid = cpus - 1; - } else { - struct kseq_group *ksg; + if (smp_topology != NULL) { + int i, j; + cpumask_t visited; struct cpu_group *cg; - int j; + visited = 0; for (i = 0; i < smp_topology->ct_count; i++) { cg = &smp_topology->ct_group[i]; - ksg = &kseq_groups[i]; - /* - * Initialize the group. - */ - ksg->ksg_idlemask = 0; - ksg->ksg_cpus = cg->cg_count; - ksg->ksg_cpumask = cg->cg_mask; - LIST_INIT(&ksg->ksg_members); - /* - * Find all of the group members and add them. - */ + if (cg->cg_mask & visited) + panic("duplicated cpumask in ct_group."); + if (cg->cg_mask == 0) + continue; + visited |= cg->cg_mask; for (j = 0; j < MAXCPU; j++) { - if ((cg->cg_mask & (1 << j)) != 0) { - if (ksg->ksg_mask == 0) - ksg->ksg_mask = 1 << j; - kseq_cpu[j].ksq_group = ksg; - LIST_INSERT_HEAD(&ksg->ksg_members, - &kseq_cpu[j], ksq_siblings); - } + if ((cg->cg_mask & (1 << j)) != 0) + cpu_sibling[j] |= cg->cg_mask; } - ksg->ksg_balance_tick = t; } - ksg_maxid = smp_topology->ct_count - 1; } -#else - kseq_setup(KSEQ_SELF()); #endif + mtx_lock_spin(&sched_lock); kseq_load_add(KSEQ_SELF(), &kse0); mtx_unlock_spin(&sched_lock); @@ -1441,7 +854,7 @@ sched_pctcpu_update(struct kse *ke) ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; } -void +static void sched_thread_priority(struct thread *td, u_char prio) { struct kse *ke; @@ -1459,19 +872,17 @@ sched_thread_priority(struct thread *td, u_char prio) * needs to fix things up. */ if (prio < td->td_priority && ke->ke_runq != NULL && - ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) { + ke->ke_runq != ke->ke_kseq->ksq_curr) { krunq_remove(ke->ke_runq, ke); - ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr; - krunq_add(ke->ke_runq, ke, 0); + ke->ke_runq = ke->ke_kseq->ksq_curr; + krunq_add(ke->ke_runq, ke); } /* * Hold this kse on this cpu so that sched_prio() doesn't * cause excessive migration. We only want migration to * happen as the result of a wakeup. */ - ke->ke_flags |= KEF_HOLD; adjustrunqueue(td, prio); - ke->ke_flags &= ~KEF_HOLD; } else td->td_priority = prio; } @@ -1518,6 +929,9 @@ sched_prio(struct thread *td, u_char prio) { u_char oldprio; + if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE) + prio = MIN(prio, PUSER_MAX); + /* First, update the base priority. */ td->td_base_pri = prio; @@ -1550,7 +964,6 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) mtx_assert(&sched_lock, MA_OWNED); - now = sched_timestamp(); ke = td->td_kse; kg = td->td_ksegrp; ksq = KSEQ_SELF(); @@ -1560,37 +973,30 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) td->td_flags &= ~TDF_NEEDRESCHED; td->td_owepreempt = 0; - /* - * If the KSE has been assigned it may be in the process of switching - * to the new cpu. This is the case in sched_bind(). - */ - if (__predict_false(td == PCPU_GET(idlethread))) { + if (td == PCPU_GET(idlethread)) { TD_SET_CAN_RUN(td); - } else if (__predict_false((ke->ke_flags & KEF_MIGRATING) != 0)) { - SLOT_RELEASE(td->td_ksegrp); } else { /* We are ending our run so make our slot available again */ SLOT_RELEASE(td->td_ksegrp); kseq_load_rem(ksq, ke); if (TD_IS_RUNNING(td)) { - /* - * Don't allow the thread to migrate - * from a preemption. - */ - ke->ke_flags |= KEF_HOLD; setrunqueue(td, (flags & SW_PREEMPT) ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING); - ke->ke_flags &= ~KEF_HOLD; - } else if ((td->td_proc->p_flag & P_HADTHREADS) && - (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp)) - /* - * We will not be on the run queue. - * So we must be sleeping or similar. - * Don't use the slot if we will need it - * for newtd. - */ - slot_fill(td->td_ksegrp); + } else { + if ((td->td_proc->p_flag & P_HADTHREADS) && + (newtd == NULL || + newtd->td_ksegrp != td->td_ksegrp)) { + /* + * We will not be on the run queue. + * So we must be sleeping or similar. + * Don't use the slot if we will need it + * for newtd. + */ + slot_fill(td->td_ksegrp); + } + ke->ke_flags &= ~KEF_NEXTRQ; + } } if (newtd != NULL) { @@ -1598,24 +1004,19 @@ sched_switch(struct thread *td, struct thread *newtd, int flags) * If we bring in a thread account for it as if it had been * added to the run queue and then chosen. */ + SLOT_USE(newtd->td_ksegrp); newtd->td_kse->ke_flags |= KEF_DIDRUN; TD_SET_RUNNING(newtd); - kseq_load_add(KSEQ_SELF(), newtd->td_kse); - /* - * XXX When we preempt, we've already consumed a slot because - * we got here through sched_add(). However, newtd can come - * from thread_switchout() which can't SLOT_USE() because - * the SLOT code is scheduler dependent. We must use the - * slot here otherwise. - */ - if ((flags & SW_PREEMPT) == 0) - SLOT_USE(newtd->td_ksegrp); - newtd->td_kse->ke_timestamp = now; - } else + kseq_load_add(ksq, newtd->td_kse); + now = newtd->td_kse->ke_timestamp = sched_timestamp(); + } else { newtd = choosethread(); + /* sched_choose sets ke_timestamp, just reuse it */ + now = newtd->td_kse->ke_timestamp; + } if (td != newtd) { sched_update_runtime(ke, now); - ke->ke_lastran = now; + ke->ke_lastran = tick; #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) @@ -1676,11 +1077,11 @@ sched_wakeup(struct thread *td) mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; kg = td->td_ksegrp; - kseq = KSEQ_CPU(ke->ke_cpu); mykseq = KSEQ_SELF(); if (ke->ke_flags & KEF_SLEEP) { ke->ke_flags &= ~KEF_SLEEP; if (sched_is_timeshare(kg)) { + kseq = KSEQ_CPU(td->td_lastcpu); now = sched_timestamp(); sched_commit_runtime(ke); #ifdef SMP @@ -1691,7 +1092,6 @@ sched_wakeup(struct thread *td) kg->kg_user_pri = sched_recalc_pri(ke, now); } } - ke->ke_flags &= ~KEF_NEXTRQ; setrunqueue(td, SRQ_BORING); } @@ -1730,15 +1130,9 @@ sched_fork_thread(struct thread *td, struct thread *child) ke = td->td_kse; ke2 = child->td_kse; -#ifdef SMP - ke2->ke_cpu = sched_find_idlest_cpu(ke, PCPU_GET(cpuid)); -#else - ke2->ke_cpu = ke->ke_cpu; -#endif ke2->ke_slice = (ke->ke_slice + 1) >> 1; - ke2->ke_flags |= KEF_FIRST_SLICE; + ke2->ke_flags |= KEF_FIRST_SLICE | (ke->ke_flags & KEF_NEXTRQ); ke2->ke_activated = 0; - ke2->ke_timestamp = sched_timestamp(); ke->ke_slice >>= 1; if (ke->ke_slice == 0) { ke->ke_slice = 1; @@ -1754,37 +1148,7 @@ sched_fork_thread(struct thread *td, struct thread *child) void sched_class(struct ksegrp *kg, int class) { - struct kseq *kseq; - struct kse *ke; - struct thread *td; - int nclass; - int oclass; - mtx_assert(&sched_lock, MA_OWNED); - if (kg->kg_pri_class == class) - return; - - nclass = PRI_BASE(class); - oclass = PRI_BASE(kg->kg_pri_class); - FOREACH_THREAD_IN_GROUP(kg, td) { - ke = td->td_kse; - - /* New thread does not have runq assigned */ - if (ke->ke_runq == NULL) - continue; - - kseq = KSEQ_CPU(ke->ke_cpu); - if (oclass == PRI_TIMESHARE) - kseq->ksq_load_timeshare--; - else if (oclass == PRI_IDLE) - kseq->ksq_load_idle--; - - if (nclass == PRI_TIMESHARE) - kseq->ksq_load_timeshare++; - else if (nclass == PRI_IDLE) - kseq->ksq_load_idle++; - } - kg->kg_pri_class = class; } @@ -1815,7 +1179,7 @@ sched_exit_thread(struct thread *td, struct thread *childtd) struct kse *childke = childtd->td_kse; struct kse *parentke = td->td_kse; - kseq_load_rem(KSEQ_CPU(childke->ke_cpu), childke); + kseq_load_rem(KSEQ_SELF(), childke); sched_update_runtime(childke, sched_timestamp()); sched_commit_runtime(childke); if ((childke->ke_flags & KEF_FIRST_SLICE) && @@ -1827,16 +1191,16 @@ sched_exit_thread(struct thread *td, struct thread *childtd) } static int -sched_starving(struct kseq *ksq, uint64_t now, struct kse *ke) +sched_starving(struct kseq *ksq, unsigned now, struct kse *ke) { uint64_t delta; - if (PROC_NICE(ke->ke_proc) > ksq->ksq_best_expired_nice) + if (ke->ke_proc->p_nice > ksq->ksq_expired_nice) return (1); - if (ksq->ksq_expired_timestamp == 0) + if (ksq->ksq_expired_tick == 0) return (0); - delta = now - ksq->ksq_expired_timestamp; - if (delta > STARVATION_TIME * (ksq->ksq_load - ksq->ksq_load_idle)) + delta = HZ_TO_NS((uint64_t)now - ksq->ksq_expired_tick); + if (delta > STARVATION_TIME * ksq->ksq_load) return (1); return (0); } @@ -1877,8 +1241,8 @@ sched_tick(void) td = curthread; ke = td->td_kse; - kg = ke->ke_ksegrp; - p = ke->ke_proc; + kg = td->td_ksegrp; + p = td->td_proc; class = PRI_BASE(kg->kg_pri_class); now = sched_timestamp(); cpuid = PCPU_GET(cpuid); @@ -1886,24 +1250,16 @@ sched_tick(void) kseq->ksq_last_timestamp = now; if (class == PRI_IDLE) { - int idle_td = (curthread == PCPU_GET(idlethread)); /* * Processes of equal idle priority are run round-robin. */ - if (!idle_td && --ke->ke_slice <= 0) { + if (td != PCPU_GET(idlethread) && --ke->ke_slice <= 0) { ke->ke_slice = def_timeslice; td->td_flags |= TDF_NEEDRESCHED; } - sched_balance_tick(cpuid, idle_td ? IDLE_IDLE : IDLE); return; } - if (ke->ke_flags & KEF_NEXTRQ) { - /* The thread was already scheduled off. */ - curthread->td_flags |= TDF_NEEDRESCHED; - goto out; - } - if (class == PRI_REALTIME) { /* * Realtime scheduling, do round robin for RR class, FIFO @@ -1911,33 +1267,44 @@ sched_tick(void) */ if (PRI_NEED_RR(kg->kg_pri_class) && --ke->ke_slice <= 0) { ke->ke_slice = def_timeslice; - curthread->td_flags |= TDF_NEEDRESCHED; + td->td_flags |= TDF_NEEDRESCHED; } - goto out; + return; } /* - * Current, we skip kernel thread, though it may be classified as - * TIMESHARE. + * We skip kernel thread, though it may be classified as TIMESHARE. */ if (class != PRI_TIMESHARE || (p->p_flag & P_KTHREAD) != 0) - goto out; + return; if (--ke->ke_slice <= 0) { - curthread->td_flags |= TDF_NEEDRESCHED; + td->td_flags |= TDF_NEEDRESCHED; sched_update_runtime(ke, now); sched_commit_runtime(ke); kg->kg_user_pri = sched_calc_pri(kg); ke->ke_slice = sched_timeslice(ke); ke->ke_flags &= ~KEF_FIRST_SLICE; - if (!kseq->ksq_expired_timestamp) - kseq->ksq_expired_timestamp = now; + if (ke->ke_flags & KEF_BOUND || td->td_pinned) { + if (kseq->ksq_expired_tick == 0) + kseq->ksq_expired_tick = tick; + } else { + if (kseq_global.ksq_expired_tick == 0) + kseq_global.ksq_expired_tick = tick; + } if (!THREAD_IS_INTERACTIVE(ke) || - sched_starving(kseq, now, ke)) { + sched_starving(kseq, tick, ke) || + sched_starving(&kseq_global, tick, ke)) { /* The thead becomes cpu hog, schedule it off. */ ke->ke_flags |= KEF_NEXTRQ; - if (PROC_NICE(p) < kseq->ksq_best_expired_nice) - kseq->ksq_best_expired_nice = PROC_NICE(p); + if (ke->ke_flags & KEF_BOUND || td->td_pinned) { + if (p->p_nice < kseq->ksq_expired_nice) + kseq->ksq_expired_nice = p->p_nice; + } else { + if (p->p_nice < kseq_global.ksq_expired_nice) + kseq_global.ksq_expired_nice = + p->p_nice; + } } } else { /* @@ -1947,11 +1314,8 @@ sched_tick(void) * interactive threads. */ if (THREAD_IS_INTERACTIVE(ke) && sched_timeslice_split(ke)) - curthread->td_flags |= TDF_NEEDRESCHED; + td->td_flags |= TDF_NEEDRESCHED; } - -out: - sched_balance_tick(cpuid, NOT_IDLE); } void @@ -1973,17 +1337,22 @@ sched_clock(struct thread *td) sched_pctcpu_update(ke); } +static int +kseq_runnable(struct kseq *kseq) +{ + return (krunq_check(kseq->ksq_curr) || + krunq_check(kseq->ksq_next) || + krunq_check(&kseq->ksq_idle)); +} + int sched_runnable(void) { - struct kseq *kseq; - - kseq = KSEQ_SELF(); - if (krunq_findbit(kseq->ksq_curr) != -1 || - krunq_findbit(kseq->ksq_next) != -1 || - krunq_findbit(&kseq->ksq_idle) != -1) - return (1); - return (0); +#ifdef SMP + return (kseq_runnable(&kseq_global) || kseq_runnable(KSEQ_SELF())); +#else + return (kseq_runnable(&kseq_global)); +#endif } void @@ -2005,53 +1374,142 @@ sched_userret(struct thread *td) struct kse * sched_choose(void) { + struct kse *ke; struct kseq *kseq; - struct kse *ke; + +#ifdef SMP + struct kse *kecpu; mtx_assert(&sched_lock, MA_OWNED); - kseq = KSEQ_SELF(); -#ifdef SMP -restart: -#endif + kseq = &kseq_global; + ke = kseq_choose(&kseq_global); + kecpu = kseq_choose(KSEQ_SELF()); + + if (ke == NULL || + (kecpu != NULL && + kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) { + ke = kecpu; + kseq = KSEQ_SELF(); + } +#else + kseq = &kseq_global; ke = kseq_choose(kseq); - if (ke) { -#ifdef SMP - if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) - if (sched_idled(kseq, IDLE) == 0) - goto restart; #endif + + if (ke != NULL) { kseq_runq_rem(kseq, ke); ke->ke_state = KES_THREAD; ke->ke_flags &= ~KEF_PREEMPTED; ke->ke_timestamp = sched_timestamp(); - return (ke); } -#ifdef SMP - if (sched_idled(kseq, IDLE_IDLE) == 0) - goto restart; -#endif - return (NULL); + + return (ke); } +#ifdef SMP +static int +forward_wakeup(int cpunum, cpumask_t me) +{ + cpumask_t map, dontuse; + cpumask_t map2; + struct pcpu *pc; + cpumask_t id, map3; + + mtx_assert(&sched_lock, MA_OWNED); + + CTR0(KTR_RUNQ, "forward_wakeup()"); + + if ((!forward_wakeup_enabled) || + (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0)) + return (0); + if (!smp_started || cold || panicstr) + return (0); + + forward_wakeups_requested++; + + /* + * check the idle mask we received against what we calculated before + * in the old version. + */ + /* + * don't bother if we should be doing it ourself.. + */ + if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum))) + return (0); + + dontuse = me | stopped_cpus | hlt_cpus_mask; + map3 = 0; + if (forward_wakeup_use_loop) { + SLIST_FOREACH(pc, &cpuhead, pc_allcpu) { + id = pc->pc_cpumask; + if ( (id & dontuse) == 0 && + pc->pc_curthread == pc->pc_idlethread) { + map3 |= id; + } + } + } + + if (forward_wakeup_use_mask) { + map = 0; + map = idle_cpus_mask & ~dontuse; + + /* If they are both on, compare and use loop if different */ + if (forward_wakeup_use_loop) { + if (map != map3) { + printf("map (%02X) != map3 (%02X)\n", + map, map3); + map = map3; + } + } + } else { + map = map3; + } + /* If we only allow a specific CPU, then mask off all the others */ + if (cpunum != NOCPU) { + KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum.")); + map &= (1 << cpunum); + } else { + /* Try choose an idle die. */ + if (forward_wakeup_use_htt) { + map2 = (map & (map >> 1)) & 0x5555; + if (map2) { + map = map2; + } + } + + /* set only one bit */ + if (forward_wakeup_use_single) { + map = map & ((~map) + 1); + } + } + if (map) { + forward_wakeups_delivered++; + ipi_selected(map, IPI_AST); + return (1); + } + return (0); +} +#endif + void sched_add(struct thread *td, int flags) { - struct kseq *ksq, *my_ksq; + struct kseq *ksq; struct ksegrp *kg; struct kse *ke; - int preemptive; - int canmigrate; + struct thread *mytd; int class; - int my_cpu; int nextrq; + int need_resched = 0; #ifdef SMP - struct thread *td2; - struct pcpu *pcpu; - int cpu, new_cpu; - int load, my_load; + int cpu; + int mycpu; + int pinned; + struct kseq *myksq; #endif mtx_assert(&sched_lock, MA_OWNED); + mytd = curthread; ke = td->td_kse; kg = td->td_ksegrp; KASSERT(ke->ke_state != KES_ONRUNQ, @@ -2062,66 +1520,31 @@ sched_add(struct thread *td, int flags) KASSERT(ke->ke_runq == NULL, ("sched_add: KSE %p is still assigned to a run queue", ke)); - canmigrate = 1; - preemptive = !(flags & SRQ_YIELDING); class = PRI_BASE(kg->kg_pri_class); - my_cpu = PCPU_GET(cpuid); - my_ksq = KSEQ_CPU(my_cpu); +#ifdef SMP + mycpu = PCPU_GET(cpuid); + myksq = KSEQ_CPU(mycpu); + ke->ke_wakeup_cpu = mycpu; +#endif + nextrq = (ke->ke_flags & KEF_NEXTRQ); + ke->ke_flags &= ~KEF_NEXTRQ; if (flags & SRQ_PREEMPTED) ke->ke_flags |= KEF_PREEMPTED; - if ((ke->ke_flags & KEF_INTERNAL) == 0) - SLOT_USE(td->td_ksegrp); - nextrq = (ke->ke_flags & KEF_NEXTRQ); - ke->ke_flags &= ~(KEF_NEXTRQ | KEF_INTERNAL); - + ksq = &kseq_global; #ifdef SMP - cpu = ke->ke_cpu; - canmigrate = kse_can_migrate(ke); - /* - * Don't migrate running threads here. Force the long term balancer - * to do it. - */ - if (ke->ke_flags & KEF_HOLD) { - ke->ke_flags &= ~KEF_HOLD; - canmigrate = 0; + if (td->td_pinned != 0) { + cpu = td->td_lastcpu; + ksq = KSEQ_CPU(cpu); + pinned = 1; + } else if ((ke)->ke_flags & KEF_BOUND) { + cpu = ke->ke_cpu; + ksq = KSEQ_CPU(cpu); + pinned = 1; + } else { + pinned = 0; + cpu = NOCPU; } - - /* - * If this thread is pinned or bound, notify the target cpu. - */ - if (!canmigrate) - goto activate_it; - - if (class == PRI_ITHD) { - ke->ke_cpu = my_cpu; - goto activate_it; - } - - if (ke->ke_cpu == my_cpu) - goto activate_it; - - if (my_ksq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) { - ke->ke_cpu = my_cpu; - goto activate_it; - } - - new_cpu = my_cpu; - - load = kseq_source_load(KSEQ_CPU(cpu)); - my_load = kseq_dest_load(my_ksq); - if ((my_load - load) * 100 < my_load * imbalance_pct2) - goto try_idle_cpu; - new_cpu = cpu; - -try_idle_cpu: - new_cpu = sched_find_idle_cpu(new_cpu); - ke->ke_cpu = new_cpu; - -activate_it: - if (ke->ke_cpu != cpu) - ke->ke_lastran = 0; #endif - ksq = KSEQ_CPU(ke->ke_cpu); switch (class) { case PRI_ITHD: case PRI_REALTIME: @@ -2147,42 +1570,57 @@ activate_it: break; } - if (ke->ke_runq == my_ksq->ksq_curr && - td->td_priority < curthread->td_priority) { - curthread->td_flags |= TDF_NEEDRESCHED; - ke->ke_runq = NULL; - if (preemptive && maybe_preempt(td)) - return; - ke->ke_runq = my_ksq->ksq_curr; - if (curthread->td_ksegrp->kg_pri_class == PRI_IDLE) - td->td_owepreempt = 1; - } - ke->ke_state = KES_ONRUNQ; - kseq_runq_add(ksq, ke, flags); - kseq_load_add(ksq, ke); #ifdef SMP - pcpu = pcpu_find(ke->ke_cpu); - if (class != PRI_IDLE) { - if (kseq_idle & ksq->ksq_group->ksg_mask) - kseq_idle &= ~ksq->ksq_group->ksg_mask; - if (ksq->ksq_group->ksg_idlemask & pcpu->pc_cpumask) - ksq->ksq_group->ksg_idlemask &= ~pcpu->pc_cpumask; + if ((ke->ke_runq == kseq_global.ksq_curr || + ke->ke_runq == myksq->ksq_curr) && + td->td_priority < mytd->td_priority) { +#else + if (ke->ke_runq == kseq_global.ksq_curr && + td->td_priority < mytd->td_priority) { +#endif + struct krunq *rq; + + rq = ke->ke_runq; + ke->ke_runq = NULL; + if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td)) + return; + ke->ke_runq = rq; + need_resched = TDF_NEEDRESCHED; } - if (ke->ke_cpu != my_cpu) { - td2 = pcpu->pc_curthread; - if (__predict_false(td2 == pcpu->pc_idlethread)) { - td2->td_flags |= TDF_NEEDRESCHED; - ipi_selected(pcpu->pc_cpumask, IPI_AST); - } else if (td->td_priority < td2->td_priority) { - if (class == PRI_ITHD || class == PRI_REALTIME || - td2->td_ksegrp->kg_pri_class == PRI_IDLE) - ipi_selected(pcpu->pc_cpumask, IPI_PREEMPT); - else if ((td2->td_flags & TDF_NEEDRESCHED) == 0) { - td2->td_flags |= TDF_NEEDRESCHED; - ipi_selected(pcpu->pc_cpumask, IPI_AST); + + SLOT_USE(kg); + ke->ke_state = KES_ONRUNQ; + kseq_runq_add(ksq, ke); + kseq_load_add(ksq, ke); + +#ifdef SMP + if (pinned) { + if (cpu != mycpu) { + struct thread *running = pcpu_find(cpu)->pc_curthread; + if (ksq->ksq_curr == ke->ke_runq && + running->td_priority < td->td_priority) { + if (td->td_priority < PRI_MAX_ITHD) + ipi_selected(1 << cpu, IPI_PREEMPT); + else { + running->td_flags |= TDF_NEEDRESCHED; + ipi_selected(1 << cpu, IPI_AST); + } } - } + } else + curthread->td_flags |= need_resched; + } else { + cpumask_t me = 1 << mycpu; + cpumask_t idle = idle_cpus_mask & me; + int forwarded = 0; + + if (!idle && ((flags & SRQ_INTR) == 0) && + (idle_cpus_mask & ~(hlt_cpus_mask | me))) + forwarded = forward_wakeup(cpu, me); + if (forwarded == 0) + curthread->td_flags |= need_resched; } +#else + mytd->td_flags |= need_resched; #endif } @@ -2194,26 +1632,13 @@ sched_rem(struct thread *td) mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; - ke->ke_flags &= ~KEF_PREEMPTED; KASSERT((ke->ke_state == KES_ONRUNQ), ("sched_rem: KSE not on run queue")); - kseq = KSEQ_CPU(ke->ke_cpu); -#ifdef SMP - if (ke->ke_flags & KEF_MIGRATING) { - ke->ke_flags &= ~KEF_MIGRATING; - kseq_load_rem(kseq, ke); - TAILQ_REMOVE(&kseq->ksq_migrateq, ke, ke_procq); - ke->ke_cpu = ke->ke_tocpu; - } else -#endif - { - KASSERT((ke->ke_state == KES_ONRUNQ), - ("sched_rem: KSE not on run queue")); - SLOT_RELEASE(td->td_ksegrp); - kseq_runq_rem(kseq, ke); - kseq_load_rem(kseq, ke); - } + kseq = ke->ke_kseq; + SLOT_RELEASE(td->td_ksegrp); + kseq_runq_rem(kseq, ke); + kseq_load_rem(kseq, ke); ke->ke_state = KES_THREAD; } @@ -2254,29 +1679,16 @@ sched_pctcpu(struct thread *td) void sched_bind(struct thread *td, int cpu) { - struct kseq *kseq; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; ke->ke_flags |= KEF_BOUND; #ifdef SMP + ke->ke_cpu = cpu; if (PCPU_GET(cpuid) == cpu) return; - kseq = KSEQ_SELF(); - ke->ke_flags |= KEF_MIGRATING; - ke->ke_tocpu = cpu; - TAILQ_INSERT_TAIL(&kseq->ksq_migrateq, ke, ke_procq); - if (kseq->ksq_migrated) { - if (TD_AWAITING_INTR(kseq->ksq_migrated)) { - TD_CLR_IWAIT(kseq->ksq_migrated); - setrunqueue(kseq->ksq_migrated, SRQ_YIELDING); - } - } - /* When we return from mi_switch we'll be on the correct cpu. */ mi_switch(SW_VOL, NULL); -#else - (void)kseq; #endif } @@ -2297,17 +1709,7 @@ sched_is_bound(struct thread *td) int sched_load(void) { -#ifdef SMP - int total; - int i; - - total = 0; - for (i = 0; i < MAXCPU; i++) - total += KSEQ_CPU(i)->ksq_sysload; - return (total); -#else - return (KSEQ_SELF()->ksq_sysload); -#endif + return (sched_tdcnt); } void