File: kern/sched_bsd.c

/*	$OpenBSD: sched_bsd.c,v 1.89 2023/10/17 00:04:02 cheloha Exp $	*/
/*	$NetBSD: kern_synch.c,v 1.37 1996/04/22 01:38:37 christos Exp $	*/

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.6 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/clockintr.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <uvm/uvm_extern.h>
#include <sys/sched.h>
#include <sys/timeout.h>
#include <sys/smr.h>
#include <sys/tracepoint.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif

uint64_t roundrobin_period;	/* [I] roundrobin period (ns) */
int	lbolt;			/* once a second sleep address */

#ifdef MULTIPROCESSOR
struct __mp_lock sched_lock;
#endif

void			update_loadavg(void *);
void			schedcpu(void *);
uint32_t		decay_aftersleep(uint32_t, uint32_t);

extern struct cpuset sched_idle_cpus;

/*
 * constants for averages over 1, 5, and 15 minutes when sampling at
 * 5 second intervals.
 */
static const fixpt_t cexp[3] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};

struct loadavg averunnable;

/*
 * Force switch among equal priority processes every 100ms.
 */
void
roundrobin(struct clockrequest *cr, void *cf, void *arg)
{
	uint64_t count;
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	count = clockrequest_advance(cr, roundrobin_period);

	if (ci->ci_curproc != NULL) {
		if (spc->spc_schedflags & SPCF_SEENRR || count >= 2) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			atomic_setbits_int(&spc->spc_schedflags,
			    SPCF_SEENRR | SPCF_SHOULDYIELD);
		} else {
			atomic_setbits_int(&spc->spc_schedflags,
			    SPCF_SEENRR);
		}
	}

	if (spc->spc_nrun || spc->spc_schedflags & SPCF_SHOULDYIELD)
		need_resched(ci);
}
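
As an illustration of the two-tick heuristic above, here is a minimal standalone sketch (plain userland C, not kernel code; the flag names are reused from above and the count >= 2 multi-tick case is omitted): the first roundrobin tick only marks SPCF_SEENRR, and a process still on the CPU at the following tick is additionally marked SPCF_SHOULDYIELD.

#include <stdio.h>

#define SPCF_SEENRR		0x0001
#define SPCF_SHOULDYIELD	0x0002

int
main(void)
{
	unsigned flags = 0;	/* models spc_schedflags between switches */
	int tick;

	for (tick = 1; tick <= 3; tick++) {
		if (flags & SPCF_SEENRR)
			flags |= SPCF_SEENRR | SPCF_SHOULDYIELD;
		else
			flags |= SPCF_SEENRR;
		printf("tick %d: seenrr=%d shouldyield=%d\n", tick,
		    !!(flags & SPCF_SEENRR), !!(flags & SPCF_SHOULDYIELD));
	}
	return 0;
}
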
/*
 * update_loadavg: compute a tenex style load average of a quantity on
 * 1, 5, and 15 minute intervals.
 */
void
update_loadavg(void *unused)
{
	static struct timeout to = TIMEOUT_INITIALIZER(update_loadavg, NULL);
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	u_int i, nrun = 0;

	CPU_INFO_FOREACH(cii, ci) {
		if (!cpuset_isset(&sched_idle_cpus, ci))
			nrun++;
		nrun += ci->ci_schedstate.spc_nrun;
	}

	for (i = 0; i < 3; i++) {
		averunnable.ldavg[i] = (cexp[i] * averunnable.ldavg[i] +
		    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
	}

	timeout_add_sec(&to, 5);
}
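
The update above is a fixed-point exponentially weighted moving average. A standalone sketch (userland C, assuming FSHIFT = 11 and FSCALE = (1 << FSHIFT) as in sys/param.h): with nrun pinned at 2, the 1-minute average climbs toward 2.00 over successive 5-second samples.

#include <stdio.h>
#include <stdint.h>

#define FSHIFT	11
#define FSCALE	(1 << FSHIFT)

int
main(void)
{
	uint32_t ldavg = 0;	/* fixed point: FSCALE represents 1.00 */
	const uint32_t cexp1 = 0.9200444146293232 * FSCALE; /* exp(-1/12) */
	unsigned nrun = 2, i;

	for (i = 0; i < 60; i++)	/* 60 samples = 5 minutes */
		ldavg = ((uint64_t)cexp1 * ldavg +
		    nrun * FSCALE * (FSCALE - cexp1)) >> FSHIFT;
	printf("ldavg ~= %.2f\n", (double)ldavg / FSCALE);
	return 0;
}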

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 * Note that, as ps(1) mentions, this can let percentages
 * total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ (-2.30/5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */
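
A quick numeric check of this derivation (standalone userland C, link with -lm; not part of the kernel): solving decay^power = 0.1 for power, with decay = b/(b+1) and b = 2*loadav, reproduces the power table above.

#include <math.h>
#include <stdio.h>

int
main(void)
{
	int loadav;

	for (loadav = 1; loadav <= 4; loadav++) {
		double b = 2.0 * loadav;
		/* decay^power = 0.1  =>  power = ln(0.1) / ln(b/(b+1)) */
		double power = log(0.1) / log(b / (b + 1.0));

		printf("loadav %d: power %.2f (5*loadav = %d)\n",
		    loadav, power, 5 * loadav);
	}
	return 0;
}
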
/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, every second.
 */
void
schedcpu(void *unused)
{
	static struct timeout to = TIMEOUT_INITIALIZER(schedcpu, NULL);
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct proc *p;
	int s;
	unsigned int newcpu;

	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Idle threads are never placed on the runqueue,
		 * therefore computing their priority is pointless.
		 */
		if (p->p_cpu != NULL &&
		    p->p_cpu->ci_schedstate.spc_idleproc == p)
			continue;
		/*
		 * Increment sleep time (if sleeping). We ignore overflow.
		 */
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		SCHED_LOCK(s);
		/*
		 * p_pctcpu is only for diagnostic tools such as ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (stathz == 100)?
		    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
		    100 * (((fixpt_t) p->p_cpticks)
		    << (FSHIFT - CCPU_SHIFT)) / stathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
		    (p->p_cpticks * FSCALE / stathz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
		setpriority(p, newcpu, p->p_p->ps_nice);

		if (p->p_stat == SRUN &&
		    (p->p_runpri / SCHED_PPQ) != (p->p_usrpri / SCHED_PPQ)) {
			remrunqueue(p);
			setrunqueue(p->p_cpu, p, p->p_usrpri);
		}
		SCHED_UNLOCK(s);
	}
	wakeup(&lbolt);
	timeout_add_sec(&to, 1);
}

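The per-second step p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT above is what implements the "decay 95% of p_pctcpu in 60 seconds" rule. A standalone check (userland C, with ccpu = exp(-1/20) and FSHIFT/FSCALE assumed from sys/param.h): after sixty steps, roughly 5% of the starting value remains.

#include <stdio.h>
#include <stdint.h>

#define FSHIFT	11
#define FSCALE	(1 << FSHIFT)

int
main(void)
{
	uint32_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
	uint32_t pctcpu = 100 * FSCALE;		/* start at "100%" */
	int sec;

	for (sec = 0; sec < 60; sec++)
		pctcpu = (pctcpu * ccpu) >> FSHIFT;
	printf("after 60 steps: %.1f%% of the original remains\n",
	    (double)pctcpu / FSCALE);
	return 0;
}
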
/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
uint32_t
decay_aftersleep(uint32_t estcpu, uint32_t slptime)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	uint32_t newcpu;

	if (slptime > 5 * loadfac)
		newcpu = 0;
	else {
		newcpu = estcpu;
		slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --slptime)
			newcpu = decay_cpu(loadfac, newcpu);
	}

	return (newcpu);
}

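A standalone sketch of the decay loop (userland C, assuming a load average of exactly 1.00, so loadfac = 2 * FSCALE and each decay_cpu() step multiplies by 2/3 with integer truncation): an estcpu of 255 steps down through 255, 170, 113, ... and reaches zero after 13 iterations.

#include <stdio.h>
#include <stdint.h>

#define FSHIFT	11
#define FSCALE	(1 << FSHIFT)
#define decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

int
main(void)
{
	uint32_t loadfac = 2 * FSCALE;	/* loadfactor() of a 1.00 loadav */
	uint32_t estcpu = 255;
	int steps = 0;

	while (estcpu) {
		estcpu = decay_cpu(loadfac, estcpu);
		steps++;
	}
	printf("decayed to zero in %d steps\n", steps);
	return 0;
}
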
/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.
 */
void
yield(void)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	setrunqueue(p->p_cpu, p, p->p_usrpri);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.  The next process to run is
 * picked by the normal process selection criteria.
 */
void
preempt(void)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	setrunqueue(p->p_cpu, p, p->p_usrpri);
	p->p_ru.ru_nivcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

void
mi_switch(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p = curproc;
	struct proc *nextproc;
	struct process *pr = p->p_p;
	struct timespec ts;
#ifdef MULTIPROCESSOR
	int hold_count;
	int sched_count;
#endif

	assertwaitok();
	KASSERT(p->p_stat != SONPROC);

	SCHED_ASSERT_LOCKED();

#ifdef MULTIPROCESSOR
	/*
	 * Release the kernel_lock, as we are about to yield the CPU.
	 */
	sched_count = __mp_release_all_but_one(&sched_lock);
	if (_kernel_lock_held())
		hold_count = __mp_release_all(&kernel_lock);
	else
		hold_count = 0;
#endif

	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 */
	nanouptime(&ts);
	if (timespeccmp(&ts, &spc->spc_runtime, <)) {
#if 0
		printf("uptime is not monotonic! "
		    "ts=%lld.%09lu, runtime=%lld.%09lu\n",
		    (long long)ts.tv_sec, ts.tv_nsec,
		    (long long)spc->spc_runtime.tv_sec,
		    spc->spc_runtime.tv_nsec);
#endif
		timespecclear(&ts);
	} else {
		timespecsub(&ts, &spc->spc_runtime, &ts);
	}

	/* add the time counts for this thread to the process's total */
	tuagg_locked(pr, p, &ts);

	/* Stop any optional clock interrupts. */
	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
		clockintr_cancel(spc->spc_itimer);
	}
	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
		clockintr_cancel(spc->spc_profclock);
	}

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);

	nextproc = sched_chooseproc();

	if (p != nextproc) {
		uvmexp.swtch++;
		TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET,
		    nextproc->p_p->ps_pid);
		cpu_switchto(p, nextproc);
		TRACEPOINT(sched, on__cpu, NULL);
	} else {
		TRACEPOINT(sched, remain__cpu, NULL);
		p->p_stat = SONPROC;
	}

	clear_resched(curcpu());

	SCHED_ASSERT_LOCKED();

	/*
	 * To preserve lock ordering, we need to release the sched lock
	 * and grab it after we grab the big lock.
	 * In the future, when the sched lock isn't recursive, we'll
	 * just release it here.
	 */
#ifdef MULTIPROCESSOR
	__mp_unlock(&sched_lock);
#endif

	SCHED_ASSERT_UNLOCKED();

	smr_idle();

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so refetch the schedstate_percpu
	 * pointer.
	 */
	KASSERT(p->p_cpu == curcpu());
	spc = &p->p_cpu->ci_schedstate;

	/* Start any optional clock interrupts needed by the thread. */
	if (ISSET(p->p_p->ps_flags, PS_ITIMER)) {
		atomic_setbits_int(&spc->spc_schedflags, SPCF_ITIMER);
		clockintr_advance(spc->spc_itimer, hardclock_period);
	}
	if (ISSET(p->p_p->ps_flags, PS_PROFIL)) {
		atomic_setbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
		clockintr_advance(spc->spc_profclock, profclock_period);
	}

	nanouptime(&spc->spc_runtime);

#ifdef MULTIPROCESSOR
	/*
	 * Reacquire the kernel_lock now.  We do this after we've
	 * released the scheduler lock to avoid deadlock, and before
	 * we reacquire the interlock and the scheduler lock.
	 */
	if (hold_count)
		__mp_acquire_count(&kernel_lock, hold_count);
	__mp_acquire_count(&sched_lock, sched_count + 1);
#endif
}

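The hold_count/sched_count dance above saves the recursion depth of each lock, drops the lock completely across the switch, and restores the same depth afterwards. A toy userland model of that pattern (this is not the kernel __mp_lock API, just the shape of it):

#include <stdio.h>

struct toy_lock {
	int depth;		/* recursion count of the single holder */
};

static int
release_all(struct toy_lock *l)
{
	int count = l->depth;

	l->depth = 0;		/* fully released across the "switch" */
	return count;
}

static void
acquire_count(struct toy_lock *l, int count)
{
	l->depth = count;	/* restore the saved recursion depth */
}

int
main(void)
{
	struct toy_lock kernel_lock = { .depth = 3 };
	int hold_count;

	hold_count = release_all(&kernel_lock);
	/* ... context switch happens here, lock is free ... */
	acquire_count(&kernel_lock, hold_count);
	printf("restored depth %d\n", kernel_lock.depth);
	return 0;
}
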
/*
 * Change process state to be runnable,
 * placing it on the run queue.
 */
void
setrunnable(struct proc *p)
{
	struct process *pr = p->p_p;
	u_char prio;

	SCHED_ASSERT_LOCKED();

	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SONPROC:
	case SDEAD:
	case SIDL:
	default:
		panic("setrunnable");
	case SSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((pr->ps_flags & PS_TRACED) != 0 && pr->ps_xsig != 0)
			atomic_setbits_int(&p->p_siglist, sigmask(pr->ps_xsig));
		prio = p->p_usrpri;
		unsleep(p);
		setrunqueue(NULL, p, prio);
		break;
	case SSLEEP:
		prio = p->p_slppri;
		unsleep(p);		/* e.g. when sending signals */

		/* if not yet asleep, don't add to runqueue */
		if (ISSET(p->p_flag, P_WSLEEP))
			return;
		setrunqueue(NULL, p, prio);
		TRACEPOINT(sched, wakeup, p->p_tid + THREAD_PID_OFFSET,
		    p->p_p->ps_pid, CPU_INFO_UNIT(p->p_cpu));
		break;
	}
	if (p->p_slptime > 1) {
		uint32_t newcpu;

		newcpu = decay_aftersleep(p->p_estcpu, p->p_slptime);
		setpriority(p, newcpu, pr->ps_nice);
	}
	p->p_slptime = 0;
}

/*
 * Compute the priority of a process.
 */
void
setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
{
	unsigned int newprio;

	newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);

	SCHED_ASSERT_LOCKED();
	p->p_estcpu = newcpu;
	p->p_usrpri = newprio;
}

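A standalone sketch of the formula above, using the constant values from sys/param.h (PUSER 50, NICE_WEIGHT 2, NZERO 20, MAXPRI 127): a process with estcpu 12 at nice 25 gets priority 50 + 12 + 2 * (25 - 20) = 72, and a very large estcpu is clamped at MAXPRI.

#include <stdio.h>

#define PUSER		50
#define NICE_WEIGHT	2
#define NZERO		20
#define MAXPRI		127

static unsigned int
toy_setpriority(unsigned int newcpu, int nice)
{
	unsigned int newprio = PUSER + newcpu + NICE_WEIGHT * (nice - NZERO);

	return newprio < MAXPRI ? newprio : MAXPRI;
}

int
main(void)
{
	printf("estcpu 12, nice 25 -> prio %u\n", toy_setpriority(12, 25));
	printf("estcpu 200, nice 20 -> prio %u\n", toy_setpriority(200, 20));
	return 0;
}
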
/*
 * We adjust the priority of the current process.  The priority of a process
 * gets worse as it accumulates CPU time.  The cpu usage estimator (p_estcpu)
 * is increased here.  The formula for computing priorities (setpriority()
 * above) will compute a different value each time p_estcpu increases.  This
 * can cause a switch, but unless the priority crosses a PPQ boundary the
 * actual queue will not change.  The cpu usage estimator ramps up quite
 * quickly when the process is running (linearly), and decays away
 * exponentially, at a rate which is proportionally slower when the system is
 * busy.  The basic principle is that the system will 90% forget that the
 * process used a lot of CPU time in 5 * loadav seconds.  This causes the
 * system to favor processes which haven't run much recently, and to
 * round-robin among other processes.
 */
void
schedclock(struct proc *p)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	uint32_t newcpu;
	int s;

	if (p == spc->spc_idleproc || spc->spc_spinning)
		return;

	SCHED_LOCK(s);
	newcpu = ESTCPULIM(p->p_estcpu + 1);
	setpriority(p, newcpu, p->p_p->ps_nice);
	SCHED_UNLOCK(s);
}

void (*cpu_setperf)(int);

#define PERFPOL_MANUAL 0
#define PERFPOL_AUTO 1
#define PERFPOL_HIGH 2
int perflevel = 100;
int perfpolicy = PERFPOL_AUTO;

#ifndef SMALL_KERNEL
/*
 * The code below handles CPU throttling.
 */
#include <sys/sysctl.h>

void setperf_auto(void *);
struct timeout setperf_to = TIMEOUT_INITIALIZER(setperf_auto, NULL);
extern int hw_power;

void
setperf_auto(void *v)
{
	static uint64_t *idleticks, *totalticks;
	static int downbeats;
	int i, j = 0;
	int speedup = 0;
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	uint64_t idle, total, allidle = 0, alltotal = 0;

	if (perfpolicy != PERFPOL_AUTO)
		return;

	if (cpu_setperf == NULL)
		return;

	if (hw_power) {
		/* on AC power, always run at full speed */
		goto faster;
	}

	if (!idleticks)
		if (!(idleticks = mallocarray(ncpusfound, sizeof(*idleticks),
		    M_DEVBUF, M_NOWAIT | M_ZERO)))
			return;
	if (!totalticks)
		if (!(totalticks = mallocarray(ncpusfound, sizeof(*totalticks),
		    M_DEVBUF, M_NOWAIT | M_ZERO))) {
			free(idleticks, M_DEVBUF,
			    sizeof(*idleticks) * ncpusfound);
			return;
		}
	CPU_INFO_FOREACH(cii, ci) {
		if (!cpu_is_online(ci))
			continue;
		total = 0;
		for (i = 0; i < CPUSTATES; i++) {
			total += ci->ci_schedstate.spc_cp_time[i];
		}
		total -= totalticks[j];
		idle = ci->ci_schedstate.spc_cp_time[CP_IDLE] - idleticks[j];
		if (idle < total / 3)
			speedup = 1;
		alltotal += total;
		allidle += idle;
		idleticks[j] += idle;
		totalticks[j] += total;
		j++;
	}
	if (allidle < alltotal / 2)
		speedup = 1;
	if (speedup && downbeats < 5)
		downbeats++;

	if (speedup && perflevel != 100) {
faster:
		perflevel = 100;
		cpu_setperf(perflevel);
	} else if (!speedup && perflevel != 0 && --downbeats <= 0) {
		perflevel = 0;
		cpu_setperf(perflevel);
	}

	timeout_add_msec(&setperf_to, 100);
}

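The decision logic above boils down to: speed up if any single CPU was less than 1/3 idle over the sampling window, or if the machine as a whole was less than 1/2 idle. A standalone sketch of just that predicate (want_speedup is a hypothetical helper, not kernel code):

#include <stdio.h>
#include <stdint.h>

static int
want_speedup(const uint64_t *idle, const uint64_t *total, int ncpu)
{
	uint64_t allidle = 0, alltotal = 0;
	int i, speedup = 0;

	for (i = 0; i < ncpu; i++) {
		if (idle[i] < total[i] / 3)	/* one busy CPU suffices */
			speedup = 1;
		allidle += idle[i];
		alltotal += total[i];
	}
	if (allidle < alltotal / 2)		/* or a busy machine overall */
		speedup = 1;
	return speedup;
}

int
main(void)
{
	uint64_t idle[2] = { 20, 90 }, total[2] = { 100, 100 };

	printf("speedup = %d\n", want_speedup(idle, total, 2));
	return 0;
}
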
int
sysctl_hwsetperf(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	int err;

	if (!cpu_setperf)
		return EOPNOTSUPP;

	if (perfpolicy != PERFPOL_MANUAL)
		return sysctl_rdint(oldp, oldlenp, newp, perflevel);

	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
	    &perflevel, 0, 100);
	if (err)
		return err;

	if (newp != NULL)
		cpu_setperf(perflevel);

	return 0;
}

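From userland, the knob this handler implements is hw.setperf. A minimal reader using the standard sysctl(2) interface (a sketch; error handling kept short):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int mib[2] = { CTL_HW, HW_SETPERF };
	int level;
	size_t len = sizeof(level);

	if (sysctl(mib, 2, &level, &len, NULL, 0) == -1) {
		perror("sysctl");
		return 1;
	}
	printf("hw.setperf = %d\n", level);
	return 0;
}
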
int
sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	char policy[32];
	int err;

	if (!cpu_setperf)
		return EOPNOTSUPP;

	switch (perfpolicy) {
	case PERFPOL_MANUAL:
		strlcpy(policy, "manual", sizeof(policy));
		break;
	case PERFPOL_AUTO:
		strlcpy(policy, "auto", sizeof(policy));
		break;
	case PERFPOL_HIGH:
		strlcpy(policy, "high", sizeof(policy));
		break;
	default:
		strlcpy(policy, "unknown", sizeof(policy));
		break;
	}

	if (newp == NULL)
		return sysctl_rdstring(oldp, oldlenp, newp, policy);

	err = sysctl_string(oldp, oldlenp, newp, newlen, policy, sizeof(policy));
	if (err)
		return err;
	if (strcmp(policy, "manual") == 0)
		perfpolicy = PERFPOL_MANUAL;
	else if (strcmp(policy, "auto") == 0)
		perfpolicy = PERFPOL_AUTO;
	else if (strcmp(policy, "high") == 0)
		perfpolicy = PERFPOL_HIGH;
	else
		return EINVAL;

	if (perfpolicy == PERFPOL_AUTO) {
		timeout_add_msec(&setperf_to, 200);
	} else if (perfpolicy == PERFPOL_HIGH) {
		perflevel = 100;
		cpu_setperf(perflevel);
	}
	return 0;
}
#endif

/*
 * Start the scheduler's periodic timeouts.
 */
void
scheduler_start(void)
{
	schedcpu(NULL);
	update_loadavg(NULL);

#ifndef SMALL_KERNEL
	if (perfpolicy == PERFPOL_AUTO)
		timeout_add_msec(&setperf_to, 200);
#endif
}