File: | kern/sched_bsd.c |
Warning: | line 548, column 3: Value stored to 'speedup' is never read |
1 | /* $OpenBSD: sched_bsd.c,v 1.70 2021/10/30 23:24:48 deraadt Exp $ */ |
2 | /* $NetBSD: kern_synch.c,v 1.37 1996/04/22 01:38:37 christos Exp $ */ |
3 | |
4 | /*- |
5 | * Copyright (c) 1982, 1986, 1990, 1991, 1993 |
6 | * The Regents of the University of California. All rights reserved. |
7 | * (c) UNIX System Laboratories, Inc. |
8 | * All or some portions of this file are derived from material licensed |
9 | * to the University of California by American Telephone and Telegraph |
10 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with |
11 | * the permission of UNIX System Laboratories, Inc. |
12 | * |
13 | * Redistribution and use in source and binary forms, with or without |
14 | * modification, are permitted provided that the following conditions |
15 | * are met: |
16 | * 1. Redistributions of source code must retain the above copyright |
17 | * notice, this list of conditions and the following disclaimer. |
18 | * 2. Redistributions in binary form must reproduce the above copyright |
19 | * notice, this list of conditions and the following disclaimer in the |
20 | * documentation and/or other materials provided with the distribution. |
21 | * 3. Neither the name of the University nor the names of its contributors |
22 | * may be used to endorse or promote products derived from this software |
23 | * without specific prior written permission. |
24 | * |
25 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
26 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
27 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
28 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
29 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
30 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
31 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
32 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
33 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
34 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
35 | * SUCH DAMAGE. |
36 | * |
37 | * @(#)kern_synch.c 8.6 (Berkeley) 1/21/94 |
38 | */ |
39 | |
40 | #include <sys/param.h> |
41 | #include <sys/systm.h> |
42 | #include <sys/proc.h> |
43 | #include <sys/kernel.h> |
44 | #include <sys/malloc.h> |
45 | #include <sys/signalvar.h> |
46 | #include <sys/resourcevar.h> |
47 | #include <uvm/uvm_extern.h> |
48 | #include <sys/sched.h> |
49 | #include <sys/timeout.h> |
50 | #include <sys/smr.h> |
51 | #include <sys/tracepoint.h> |
52 | |
53 | #ifdef KTRACE |
54 | #include <sys/ktrace.h> |
55 | #endif |
56 | |
57 | |
58 | int lbolt; /* once a second sleep address */ |
59 | int rrticks_init; /* # of hardclock ticks per roundrobin() */ |
60 | |
61 | #ifdef MULTIPROCESSOR |
62 | struct __mp_lock sched_lock; |
63 | #endif |
64 | |
65 | void schedcpu(void *); |
66 | uint32_t decay_aftersleep(uint32_t, uint32_t); |
67 | |
68 | /* |
69 | * Force switch among equal priority processes every 100ms. |
70 | */ |
71 | void |
72 | roundrobin(struct cpu_info *ci) |
73 | { |
74 | struct schedstate_percpu *spc = &ci->ci_schedstate; |
75 | |
76 | spc->spc_rrticks = rrticks_init; |
77 | |
78 | if (ci->ci_curproc != NULL) { |
79 | if (spc->spc_schedflags & SPCF_SEENRR) { |
80 | /* |
81 | * The process has already been through a roundrobin |
82 | * without switching and may be hogging the CPU. |
83 | * Indicate that the process should yield. |
84 | */ |
85 | atomic_setbits_int(&spc->spc_schedflags, |
86 | SPCF_SHOULDYIELD); |
87 | } else { |
88 | atomic_setbits_int(&spc->spc_schedflags, |
89 | SPCF_SEENRR); |
90 | } |
91 | } |
92 | |
93 | if (spc->spc_nrun) |
94 | need_resched(ci); |
95 | } |
96 | |
97 | /* |
98 | * Constants for digital decay and forget: |
99 | * 90% of (p_estcpu) usage in 5 * loadav time |
100 | * 95% of (p_pctcpu) usage in 60 seconds (load insensitive) |
101 | * Note that, as ps(1) mentions, this can let percentages |
102 | * total over 100% (I've seen 137.9% for 3 processes). |
103 | * |
104 | * Note that hardclock updates p_estcpu and p_cpticks independently. |
105 | * |
106 | * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds. |
107 | * That is, the system wants to compute a value of decay such |
108 | * that the following for loop: |
109 | * for (i = 0; i < (5 * loadavg); i++) |
110 | * p_estcpu *= decay; |
111 | * will compute |
112 | * p_estcpu *= 0.1; |
113 | * for all values of loadavg: |
114 | * |
115 | * Mathematically this loop can be expressed by saying: |
116 | * decay ** (5 * loadavg) ~= .1 |
117 | * |
118 | * The system computes decay as: |
119 | * decay = (2 * loadavg) / (2 * loadavg + 1) |
120 | * |
121 | * We wish to prove that the system's computation of decay |
122 | * will always fulfill the equation: |
123 | * decay ** (5 * loadavg) ~= .1 |
124 | * |
125 | * If we compute b as: |
126 | * b = 2 * loadavg |
127 | * then |
128 | * decay = b / (b + 1) |
129 | * |
130 | * We now need to prove two things: |
131 | * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) |
132 | * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) |
133 | * |
134 | * Facts: |
135 | * For x close to zero, exp(x) =~ 1 + x, since |
136 | * exp(x) = 0! + x**1/1! + x**2/2! + ... . |
137 | * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. |
138 | * For x close to zero, ln(1+x) =~ x, since |
139 | * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 |
140 | * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). |
141 | * ln(.1) =~ -2.30 |
142 | * |
143 | * Proof of (1): |
144 | * Solve (factor)**(power) =~ .1 given power (5*loadav): |
145 | * solving for factor, |
146 | * ln(factor) =~ (-2.30/5*loadav), or |
147 | * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = |
148 | * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED |
149 | * |
150 | * Proof of (2): |
151 | * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): |
152 | * solving for power, |
153 | * power*ln(b/(b+1)) =~ -2.30, or |
154 | * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED |
155 | * |
156 | * Actual power values for the implemented algorithm are as follows: |
157 | * loadav: 1 2 3 4 |
158 | * power: 5.68 10.32 14.94 19.55 |
159 | */ |
160 | |
161 | /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ |
162 | #define loadfactor(loadav) (2 * (loadav)) |
163 | #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) |
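A quick way to check the comment block above against these macros is to compute the decay factor b/(b+1) directly. The standalone user-space sketch below is hedged: it is not kernel code, FSCALE simply mirrors the 1<<11 scale visible in this listing, and the load averages are illustrative. It reproduces the "power" row of the table above (5.68, 10.32, 14.94, 19.55 steps to forget 90% of p_estcpu at load averages 1 through 4).

/*
 * Hedged, standalone sketch (not kernel code): compute the decay factor
 * b/(b+1) implied by the macros above and the number of per-second decay
 * steps needed to forget 90% of p_estcpu.  FSCALE mirrors the 1<<11
 * fixed-point scale seen in this listing; the load averages are examples.
 */
#include <math.h>
#include <stdio.h>

#define FSCALE (1 << 11)
#define loadfactor(loadav) (2 * (loadav))

int
main(void)
{
	double loadavg;

	for (loadavg = 1.0; loadavg <= 4.0; loadavg += 1.0) {
		double loadfac = loadfactor(loadavg * FSCALE);
		double decay = loadfac / (loadfac + FSCALE); /* b/(b+1) */
		double power = log(0.1) / log(decay); /* steps to reach 10% */

		printf("loadav %.0f: decay %.3f, power %.2f\n",
		    loadavg, decay, power);
	}
	return 0;
}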
164 | |
165 | /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ |
166 | fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ |
167 | |
168 | /* |
169 | * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the |
170 | * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below |
171 | * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). |
172 | * |
173 | * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: |
174 | * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). |
175 | * |
176 | * If you don't want to bother with the faster/more-accurate formula, you |
177 | * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate |
178 | * (more general) method of calculating the %age of CPU used by a process. |
179 | */ |
180 | #define CCPU_SHIFT 11 |
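As a sanity check on the "decay 95% of p_pctcpu in 60 seconds" claim, the hedged user-space sketch below applies the per-second update p_pctcpu = (p_pctcpu * ccpu) >> FSHIFT sixty times; with ccpu = exp(-1/20) * FSCALE roughly exp(-3), about 5%, of the original value remains.

/*
 * Hedged sketch (user-space, not kernel code): apply the per-second update
 * p_pctcpu = (p_pctcpu * ccpu) >> FSHIFT sixty times.  With ccpu equal to
 * exp(-1/20) * FSCALE, about exp(-3) ~= 5% of the original value remains,
 * i.e. 95% of p_pctcpu is forgotten in 60 seconds.
 */
#include <stdio.h>

#define FSHIFT 11
#define FSCALE (1 << FSHIFT)

int
main(void)
{
	unsigned int ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
	unsigned int pctcpu = FSCALE;	/* start at "100%" in fixed point */
	int sec;

	for (sec = 0; sec < 60; sec++)
		pctcpu = (pctcpu * ccpu) >> FSHIFT;
	printf("after 60 seconds: %.1f%% remains\n", 100.0 * pctcpu / FSCALE);
	return 0;
}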
181 | |
182 | /* |
183 | * Recompute process priorities, every second. |
184 | */ |
185 | void |
186 | schedcpu(void *arg) |
187 | { |
188 | struct timeout *to = (struct timeout *)arg; |
189 | fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); |
190 | struct proc *p; |
191 | int s; |
192 | unsigned int newcpu; |
193 | int phz; |
194 | |
195 | /* |
196 | * If we have a statistics clock, use that to calculate CPU |
197 | * time, otherwise revert to using the profiling clock (which, |
198 | * in turn, defaults to hz if there is no separate profiling |
199 | * clock available) |
200 | */ |
201 | phz = stathz ? stathz : profhz; |
202 | KASSERT(phz); |
203 | |
204 | LIST_FOREACH(p, &allproc, p_list) { |
205 | /* |
206 | * Idle threads are never placed on the runqueue, |
207 | * therefore computing their priority is pointless. |
208 | */ |
209 | if (p->p_cpu != NULL && |
210 | p->p_cpu->ci_schedstate.spc_idleproc == p) |
211 | continue; |
212 | /* |
213 | * Increment sleep time (if sleeping). We ignore overflow. |
214 | */ |
215 | if (p->p_stat == SSLEEP || p->p_stat == SSTOP) |
216 | p->p_slptime++; |
217 | p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT; |
218 | /* |
219 | * If the process has slept the entire second, |
220 | * stop recalculating its priority until it wakes up. |
221 | */ |
222 | if (p->p_slptime > 1) |
223 | continue; |
224 | SCHED_LOCK(s); |
225 | /* |
226 | * p_pctcpu is only for diagnostic tools such as ps. |
227 | */ |
228 | #if (FSHIFT >= CCPU_SHIFT) |
229 | p->p_pctcpu += (phz == 100)? |
230 | ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT): |
231 | 100 * (((fixpt_t) p->p_cpticks) |
232 | << (FSHIFT - CCPU_SHIFT)) / phz; |
233 | #else |
234 | p->p_pctcpu += ((FSCALE - ccpu) * |
235 | (p->p_cpticks * FSCALE / phz)) >> FSHIFT; |
236 | #endif |
237 | p->p_cpticks = 0; |
238 | newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu); |
239 | setpriority(p, newcpu, p->p_p->ps_nice); |
240 | |
241 | if (p->p_stat == SRUN && |
242 | (p->p_runpri / SCHED_PPQ) != (p->p_usrpri / SCHED_PPQ)) { |
243 | remrunqueue(p); |
244 | setrunqueue(p->p_cpu, p, p->p_usrpri); |
245 | } |
246 | SCHED_UNLOCK(s); |
247 | } |
248 | uvm_meter(); |
249 | wakeup(&lbolt); |
250 | timeout_add_sec(to, 1); |
251 | } |
252 | |
253 | /* |
254 | * Recalculate the priority of a process after it has slept for a while. |
255 | * For all load averages >= 1 and max p_estcpu of 255, sleeping for at |
256 | * least six times the loadfactor will decay p_estcpu to zero. |
257 | */ |
258 | uint32_t |
259 | decay_aftersleep(uint32_t estcpu, uint32_t slptime) |
260 | { |
261 | fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); |
262 | uint32_t newcpu; |
263 | |
264 | if (slptime > 5 * loadfac) |
265 | newcpu = 0; |
266 | else { |
267 | newcpu = estcpu; |
268 | slptime--; /* the first time was done in schedcpu */ |
269 | while (newcpu && --slptime) |
270 | newcpu = decay_cpu(loadfac, newcpu); |
271 | |
272 | } |
273 | |
274 | return (newcpu); |
275 | } |
276 | |
277 | /* |
278 | * General yield call. Puts the current process back on its run queue and |
279 | * performs a voluntary context switch. |
280 | */ |
281 | void |
282 | yield(void) |
283 | { |
284 | struct proc *p = curproc; |
285 | int s; |
286 | |
287 | SCHED_LOCK(s); |
288 | setrunqueue(p->p_cpu, p, p->p_usrpri); |
289 | p->p_ru.ru_nvcsw++; |
290 | mi_switch(); |
291 | SCHED_UNLOCK(s); |
292 | } |
293 | |
294 | /* |
295 | * General preemption call. Puts the current process back on its run queue |
296 | * and performs an involuntary context switch. If a process is supplied, |
297 | * we switch to that process. Otherwise, we use the normal process selection |
298 | * criteria. |
299 | */ |
300 | void |
301 | preempt(void) |
302 | { |
303 | struct proc *p = curproc; |
304 | int s; |
305 | |
306 | SCHED_LOCK(s); |
307 | setrunqueue(p->p_cpu, p, p->p_usrpri); |
308 | p->p_ru.ru_nivcsw++; |
309 | mi_switch(); |
310 | SCHED_UNLOCK(s); |
311 | } |
312 | |
313 | void |
314 | mi_switch(void) |
315 | { |
316 | struct schedstate_percpu *spc = &curcpu()->ci_schedstate; |
317 | struct proc *p = curproc; |
318 | struct proc *nextproc; |
319 | struct process *pr = p->p_p; |
320 | struct timespec ts; |
321 | #ifdef MULTIPROCESSOR |
322 | int hold_count; |
323 | int sched_count; |
324 | #endif |
325 | |
326 | assertwaitok(); |
327 | KASSERT(p->p_stat != SONPROC); |
328 | |
329 | SCHED_ASSERT_LOCKED(); |
330 | |
331 | #ifdef MULTIPROCESSOR |
332 | /* |
333 | * Release the kernel_lock, as we are about to yield the CPU. |
334 | */ |
335 | sched_count = __mp_release_all_but_one(&sched_lock); |
336 | if (_kernel_lock_held()) |
337 | hold_count = __mp_release_all(&kernel_lock); |
338 | else |
339 | hold_count = 0; |
340 | #endif |
341 | |
342 | /* |
343 | * Compute the amount of time during which the current |
344 | * process was running, and add that to its total so far. |
345 | */ |
346 | nanouptime(&ts); |
347 | if (timespeccmp(&ts, &spc->spc_runtime, <)) { |
348 | #if 0 |
349 | printf("uptime is not monotonic! " |
350 | "ts=%lld.%09lu, runtime=%lld.%09lu\n", |
351 | (long long)tv.tv_sec, tv.tv_nsec, |
352 | (long long)spc->spc_runtime.tv_sec, |
353 | spc->spc_runtime.tv_nsec); |
354 | #endif |
355 | } else { |
356 | timespecsub(&ts, &spc->spc_runtime, &ts); |
357 | timespecadd(&p->p_rtime, &ts, &p->p_rtime); |
358 | } |
359 | |
360 | /* add the time counts for this thread to the process's total */ |
361 | tuagg_unlocked(pr, p); |
362 | |
363 | /* |
364 | * Process is about to yield the CPU; clear the appropriate |
365 | * scheduling flags. |
366 | */ |
367 | atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR); |
368 | |
369 | nextproc = sched_chooseproc(); |
370 | |
371 | if (p != nextproc) { |
372 | uvmexp.swtch++; |
373 | TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET, |
374 | nextproc->p_p->ps_pid); |
375 | cpu_switchto(p, nextproc); |
376 | TRACEPOINT(sched, on__cpu, NULL); |
377 | } else { |
378 | TRACEPOINT(sched, remain__cpu, NULL); |
379 | p->p_stat = SONPROC; |
380 | } |
381 | |
382 | clear_resched(curcpu()); |
383 | |
384 | SCHED_ASSERT_LOCKED(); |
385 | |
386 | /* |
387 | * To preserve lock ordering, we need to release the sched lock |
388 | * and grab it after we grab the big lock. |
389 | * In the future, when the sched lock isn't recursive, we'll |
390 | * just release it here. |
391 | */ |
392 | #ifdef MULTIPROCESSOR |
393 | __mp_unlock(&sched_lock); |
394 | #endif |
395 | |
396 | SCHED_ASSERT_UNLOCKED(); |
397 | |
398 | smr_idle(); |
399 | |
400 | /* |
401 | * We're running again; record our new start time. We might |
402 | * be running on a new CPU now, so don't use the cache'd |
403 | * schedstate_percpu pointer. |
404 | */ |
405 | KASSERT(p->p_cpu == curcpu()); |
406 | |
407 | nanouptime(&p->p_cpu->ci_schedstate.spc_runtime); |
408 | |
409 | #ifdef MULTIPROCESSOR |
410 | /* |
411 | * Reacquire the kernel_lock now. We do this after we've |
412 | * released the scheduler lock to avoid deadlock, and before |
413 | * we reacquire the interlock and the scheduler lock. |
414 | */ |
415 | if (hold_count) |
416 | __mp_acquire_count(&kernel_lock, hold_count); |
417 | __mp_acquire_count(&sched_lock, sched_count + 1); |
418 | #endif |
419 | } |
420 | |
421 | /* |
422 | * Change process state to be runnable, |
423 | * placing it on the run queue. |
424 | */ |
425 | void |
426 | setrunnable(struct proc *p) |
427 | { |
428 | struct process *pr = p->p_p; |
429 | u_char prio; |
430 | |
431 | SCHED_ASSERT_LOCKED(); |
432 | |
433 | switch (p->p_stat) { |
434 | case 0: |
435 | case SRUN: |
436 | case SONPROC: |
437 | case SDEAD: |
438 | case SIDL: |
439 | default: |
440 | panic("setrunnable"); |
441 | case SSTOP: |
442 | /* |
443 | * If we're being traced (possibly because someone attached us |
444 | * while we were stopped), check for a signal from the debugger. |
445 | */ |
446 | if ((pr->ps_flags & PS_TRACED) != 0 && pr->ps_xsig != 0) |
447 | atomic_setbits_int(&p->p_siglist, sigmask(pr->ps_xsig)); |
448 | prio = p->p_usrpri; |
449 | unsleep(p); |
450 | break; |
451 | case SSLEEP: |
452 | prio = p->p_slppri; |
453 | unsleep(p); /* e.g. when sending signals */ |
454 | break; |
455 | } |
456 | setrunqueue(NULL, p, prio); |
457 | if (p->p_slptime > 1) { |
458 | uint32_t newcpu; |
459 | |
460 | newcpu = decay_aftersleep(p->p_estcpu, p->p_slptime); |
461 | setpriority(p, newcpu, pr->ps_nice); |
462 | } |
463 | p->p_slptime = 0; |
464 | } |
465 | |
466 | /* |
467 | * Compute the priority of a process. |
468 | */ |
469 | void |
470 | setpriority(struct proc *p, uint32_t newcpu, uint8_t nice) |
471 | { |
472 | unsigned int newprio; |
473 | |
474 | newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI); |
475 | |
476 | SCHED_ASSERT_LOCKED(); |
477 | p->p_estcpu = newcpu; |
478 | p->p_usrpri = newprio; |
479 | } |
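A worked instance of the formula above, using the numeric values visible in this listing (PUSER 50, NICE_WEIGHT 2, NZERO 20, MAXPRI 127, and the estcpu cap of 36 that ESTCPULIM applies in schedclock() below). This is a hedged, standalone sketch of the arithmetic only, not kernel code.

/*
 * Hedged sketch of the arithmetic in setpriority() above, using the values
 * visible in this listing: PUSER 50, NICE_WEIGHT 2, NZERO 20, MAXPRI 127.
 * The estcpu value 36 is the ESTCPULIM cap applied by schedclock() below.
 * Standalone illustration only, not kernel code.
 */
#include <stdio.h>

#define PUSER		50
#define NICE_WEIGHT	2
#define NZERO		20
#define MAXPRI		127

static int
prio(int estcpu, int nice)
{
	int p = PUSER + estcpu + NICE_WEIGHT * (nice - NZERO);

	return p < MAXPRI ? p : MAXPRI;
}

int
main(void)
{
	printf("estcpu 0,  nice 20 -> %d\n", prio(0, 20));	/* 50: PUSER */
	printf("estcpu 36, nice 20 -> %d\n", prio(36, 20));	/* 86 */
	printf("estcpu 36, nice 39 -> %d\n", prio(36, 39));	/* 124 */
	return 0;
}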
480 | |
481 | /* |
482 | * We adjust the priority of the current process. The priority of a process |
483 | * gets worse as it accumulates CPU time. The cpu usage estimator (p_estcpu) |
484 | * is increased here. The formula for computing priorities (in kern_synch.c) |
485 | * will compute a different value each time p_estcpu increases. This can |
486 | * cause a switch, but unless the priority crosses a PPQ boundary the actual |
487 | * queue will not change. The cpu usage estimator ramps up quite quickly |
488 | * when the process is running (linearly), and decays away exponentially, at |
489 | * a rate which is proportionally slower when the system is busy. The basic |
490 | * principle is that the system will 90% forget that the process used a lot |
491 | * of CPU time in 5 * loadav seconds. This causes the system to favor |
492 | * processes which haven't run much recently, and to round-robin among other |
493 | * processes. |
494 | */ |
495 | void |
496 | schedclock(struct proc *p) |
497 | { |
498 | struct cpu_info *ci = curcpu(); |
499 | struct schedstate_percpu *spc = &ci->ci_schedstate; |
500 | uint32_t newcpu; |
501 | int s; |
502 | |
503 | if (p == spc->spc_idleproc || spc->spc_spinning) |
504 | return; |
505 | |
506 | SCHED_LOCK(s); |
507 | newcpu = ESTCPULIM(p->p_estcpu + 1); |
508 | setpriority(p, newcpu, p->p_p->ps_nice); |
509 | SCHED_UNLOCK(s); |
510 | } |
511 | |
512 | void (*cpu_setperf)(int); |
513 | |
514 | #define PERFPOL_MANUAL 0 |
515 | #define PERFPOL_AUTO 1 |
516 | #define PERFPOL_HIGH 2 |
517 | int perflevel = 100; |
518 | int perfpolicy = PERFPOL_AUTO; |
519 | |
520 | #ifndef SMALL_KERNEL |
521 | /* |
522 | * The code below handles CPU throttling. |
523 | */ |
524 | #include <sys/sysctl.h> |
525 | |
526 | void setperf_auto(void *); |
527 | struct timeout setperf_to = TIMEOUT_INITIALIZER(setperf_auto, NULL); |
528 | extern int hw_power; |
529 | |
530 | void |
531 | setperf_auto(void *v) |
532 | { |
533 | static uint64_t *idleticks, *totalticks; |
534 | static int downbeats; |
535 | int i, j = 0; |
536 | int speedup = 0; |
537 | CPU_INFO_ITERATOR cii; |
538 | struct cpu_info *ci; |
539 | uint64_t idle, total, allidle = 0, alltotal = 0; |
540 | |
541 | if (perfpolicy != PERFPOL_AUTO) |
542 | return; |
543 | |
544 | if (cpu_setperf == NULL) |
545 | return; |
546 | |
547 | if (hw_power) { |
548 | speedup = 1; |
Value stored to 'speedup' is never read | |
549 | goto faster; |
550 | } |
551 | |
552 | if (!idleticks) |
553 | if (!(idleticks = mallocarray(ncpusfound, sizeof(*idleticks), |
554 | M_DEVBUF, M_NOWAIT | M_ZERO))) |
555 | return; |
556 | if (!totalticks) |
557 | if (!(totalticks = mallocarray(ncpusfound, sizeof(*totalticks), |
558 | M_DEVBUF, M_NOWAIT | M_ZERO))) { |
559 | free(idleticks, M_DEVBUF, |
560 | sizeof(*idleticks) * ncpusfound); |
561 | return; |
562 | } |
563 | CPU_INFO_FOREACH(cii, ci) { |
564 | if (!cpu_is_online(ci)) |
565 | continue; |
566 | total = 0; |
567 | for (i = 0; i < CPUSTATES; i++) { |
568 | total += ci->ci_schedstate.spc_cp_time[i]; |
569 | } |
570 | total -= totalticks[j]; |
571 | idle = ci->ci_schedstate.spc_cp_time[CP_IDLE] - idleticks[j]; |
572 | if (idle < total / 3) |
573 | speedup = 1; |
574 | alltotal += total; |
575 | allidle += idle; |
576 | idleticks[j] += idle; |
577 | totalticks[j] += total; |
578 | j++; |
579 | } |
580 | if (allidle < alltotal / 2) |
581 | speedup = 1; |
582 | if (speedup) |
583 | downbeats = 5; |
584 | |
585 | if (speedup && perflevel != 100) { |
586 | faster: |
587 | perflevel = 100; |
588 | cpu_setperf(perflevel); |
589 | } else if (!speedup && perflevel != 0 && --downbeats <= 0) { |
590 | perflevel = 0; |
591 | cpu_setperf(perflevel); |
592 | } |
593 | |
594 | timeout_add_msec(&setperf_to, 100); |
595 | } |
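The throttling decision above has a simple hysteresis: any busy 100ms beat raises perflevel to 100 immediately and re-arms a five-beat countdown (downbeats), and the level drops to 0 only after five consecutive quiet beats. The standalone sketch below is hedged: it is not kernel code, and the busy[] pattern is made up purely to trace that behaviour.

/*
 * Hedged sketch (not kernel code) of the hysteresis in setperf_auto():
 * a busy beat raises perflevel to 100 immediately and re-arms a five-beat
 * countdown; the level drops to 0 only after five consecutive quiet beats.
 * The busy[] pattern is made up for illustration.
 */
#include <stdio.h>

int
main(void)
{
	int busy[] = { 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 };
	int nbeats = sizeof(busy) / sizeof(busy[0]);
	int perflevel = 0, downbeats = 0, i;

	for (i = 0; i < nbeats; i++) {
		int speedup = busy[i];	/* stands in for the idle checks */

		if (speedup)
			downbeats = 5;
		if (speedup && perflevel != 100)
			perflevel = 100;
		else if (!speedup && perflevel != 0 && --downbeats <= 0)
			perflevel = 0;
		printf("beat %2d: busy=%d perflevel=%d\n", i, busy[i],
		    perflevel);
	}
	return 0;
}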
596 | |
597 | int |
598 | sysctl_hwsetperf(void *oldp, size_t *oldlenp, void *newp, size_t newlen) |
599 | { |
600 | int err; |
601 | |
602 | if (!cpu_setperf) |
603 | return EOPNOTSUPP; |
604 | |
605 | if (perfpolicy != PERFPOL_MANUAL) |
606 | return sysctl_rdint(oldp, oldlenp, newp, perflevel); |
607 | |
608 | err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, |
609 | &perflevel, 0, 100); |
610 | if (err) |
611 | return err; |
612 | |
613 | if (newp != NULL) |
614 | cpu_setperf(perflevel); |
615 | |
616 | return 0; |
617 | } |
618 | |
619 | int |
620 | sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen) |
621 | { |
622 | char policy[32]; |
623 | int err; |
624 | |
625 | if (!cpu_setperf) |
626 | return EOPNOTSUPP; |
627 | |
628 | switch (perfpolicy) { |
629 | case PERFPOL_MANUAL: |
630 | strlcpy(policy, "manual", sizeof(policy)); |
631 | break; |
632 | case PERFPOL_AUTO: |
633 | strlcpy(policy, "auto", sizeof(policy)); |
634 | break; |
635 | case PERFPOL_HIGH: |
636 | strlcpy(policy, "high", sizeof(policy)); |
637 | break; |
638 | default: |
639 | strlcpy(policy, "unknown", sizeof(policy)); |
640 | break; |
641 | } |
642 | |
643 | if (newp == NULL) |
644 | return sysctl_rdstring(oldp, oldlenp, newp, policy); |
645 | |
646 | err = sysctl_string(oldp, oldlenp, newp, newlen, policy, sizeof(policy)); |
647 | if (err) |
648 | return err; |
649 | if (strcmp(policy, "manual") == 0) |
650 | perfpolicy = PERFPOL_MANUAL; |
651 | else if (strcmp(policy, "auto") == 0) |
652 | perfpolicy = PERFPOL_AUTO; |
653 | else if (strcmp(policy, "high") == 0) |
654 | perfpolicy = PERFPOL_HIGH; |
655 | else |
656 | return EINVAL; |
657 | |
658 | if (perfpolicy == PERFPOL_AUTO) { |
659 | timeout_add_msec(&setperf_to, 200); |
660 | } else if (perfpolicy == PERFPOL_HIGH) { |
661 | perflevel = 100; |
662 | cpu_setperf(perflevel); |
663 | } |
664 | return 0; |
665 | } |
666 | #endif |
667 | |
668 | void |
669 | scheduler_start(void) |
670 | { |
671 | static struct timeout schedcpu_to; |
672 | |
673 | /* |
674 | * We avoid polluting the global namespace by keeping the scheduler |
675 | * timeouts static in this function. |
676 | * We setup the timeout here and kick schedcpu once to make it do |
677 | * its job. |
678 | */ |
679 | timeout_set(&schedcpu_to, schedcpu, &schedcpu_to); |
680 | |
681 | rrticks_init = hz / 10; |
682 | schedcpu(&schedcpu_to); |
683 | |
684 | #ifndef SMALL_KERNEL |
685 | if (perfpolicy == PERFPOL_AUTO) |
686 | timeout_add_msec(&setperf_to, 200); |
687 | #endif |
688 | } |
689 |