#include <linux/export.h>
#include <linux/sched.h>
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include <linux/cputime.h>
#include "sched.h"

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in vtime_account, on corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in other CPU reading this CPU's irq time and can
 * race with irq/vtime_account on this CPU. We would either get old
 * or new value with a side effect of accounting a slice of irq time to wrong
 * task when irq is in progress while we read rq->clock. That is a worthy
 * compromise in place of having locks on each irq in account_system_time.
 */
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);

static int sched_clock_irqtime;

void enable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 1;
}

void disable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 0;
}

/*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr)
{
	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	s64 delta;
	int cpu;

	if (!sched_clock_irqtime)
		return;

	cpu = smp_processor_id();
	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
	irqtime->irq_start_time += delta;

	u64_stats_update_begin(&irqtime->sync);
	/*
	 * We do not account for softirq time from ksoftirqd here.
	 * We want to continue accounting softirq time to ksoftirqd thread
	 * in that case, so as not to confuse scheduler with a special task
	 * that does not consume any time, but still wants to run.
	 */
	if (hardirq_count()) {
		cpustat[CPUTIME_IRQ] += delta;
		irqtime->tick_delta += delta;
	} else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
		cpustat[CPUTIME_SOFTIRQ] += delta;
		irqtime->tick_delta += delta;
	}

	u64_stats_update_end(&irqtime->sync);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
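
/*
 * Illustration only, not part of this file: the reader side of the lockless
 * scheme described above lives in sched.h (irq_time_read()). The sketch below
 * shows its general shape under the assumption that struct irqtime exposes a
 * u64_stats_sync member named 'sync'; field names and the exact fetch helper
 * may differ between kernel versions.
 */
#if 0
static inline u64 irq_time_read_sketch(int cpu)
{
	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
	u64 *cpustat = kcpustat_cpu(cpu).cpustat;
	unsigned int seq;
	u64 total;

	do {
		/* Pairs with u64_stats_update_begin/end() in irqtime_account_irq() */
		seq = __u64_stats_fetch_begin(&irqtime->sync);
		total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
	} while (__u64_stats_fetch_retry(&irqtime->sync, seq));

	return total;
}
#endif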

static u64 irqtime_tick_accounted(u64 maxtime)
{
	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
	u64 delta;

	delta = min(irqtime->tick_delta, maxtime);
	irqtime->tick_delta -= delta;

	return delta;
}

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

#define sched_clock_irqtime	(0)

static u64 irqtime_tick_accounted(u64 dummy)
{
	return 0;
}

#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

static inline void task_group_account_field(struct task_struct *p, int index,
					    u64 tmp)
{
	/*
	 * Since all updates are sure to touch the root cgroup, we
	 * get ourselves ahead and touch it first. If the root cgroup
	 * is the only cgroup, then nothing else should be necessary.
	 *
	 */
	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);

	cpuacct_account_field(p, index, tmp);
}

/*
 * Account user cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in user space since the last update
 */
void account_user_time(struct task_struct *p, u64 cputime)
{
	int index;

	/* Add user time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);

	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

	/* Add user time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for user time used */
	acct_account_cputime(p);
}

/*
 * Account guest cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in virtual machine since the last update
 */
void account_guest_time(struct task_struct *p, u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Add guest time to cpustat. */
	if (task_nice(p) > 0) {
		cpustat[CPUTIME_NICE] += cputime;
		cpustat[CPUTIME_GUEST_NICE] += cputime;
	} else {
		cpustat[CPUTIME_USER] += cputime;
		cpustat[CPUTIME_GUEST] += cputime;
	}
}
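
/*
 * Worked example: a nice-0 vCPU thread that spent one tick (TICK_NSEC ns) in
 * guest mode gets p->utime and p->gtime bumped by one tick each, and both
 * cpustat[CPUTIME_USER] and cpustat[CPUTIME_GUEST] are incremented. Guest time
 * is therefore reported as a subset of user time (e.g. in /proc/stat), not in
 * addition to it.
 */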

/*
 * Account system cpu time to a process and desired cpustat field
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in kernel space since the last update
 * @index: the cpustat field that has to be updated
 */
void account_system_index_time(struct task_struct *p,
			       u64 cputime, enum cpu_usage_stat index)
{
	/* Add system time to process. */
	p->stime += cputime;
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for system time used */
	acct_account_cputime(p);
}

/*
 * Account system cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the cpu time spent in kernel space since the last update
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
	int index;

	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
		account_guest_time(p, cputime);
		return;
	}

	if (hardirq_count() - hardirq_offset)
		index = CPUTIME_IRQ;
	else if (in_serving_softirq())
		index = CPUTIME_SOFTIRQ;
	else
		index = CPUTIME_SYSTEM;

	account_system_index_time(p, cputime, index);
}

/*
 * Account for involuntary wait time.
 * @cputime: the cpu time spent in involuntary wait
 */
void account_steal_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	cpustat[CPUTIME_STEAL] += cputime;
}

/*
 * Account for idle time.
 * @cputime: the cpu time spent in idle wait
 */
void account_idle_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	struct rq *rq = this_rq();

	if (atomic_read(&rq->nr_iowait) > 0)
		cpustat[CPUTIME_IOWAIT] += cputime;
	else
		cpustat[CPUTIME_IDLE] += cputime;
}

/*
 * When a guest is interrupted for a longer amount of time, missed clock
 * ticks are not redelivered later. Due to that, this function may on
 * occasion account more time than the calling functions think elapsed.
 */
static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal;

		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;
		steal = min(steal, maxtime);
		account_steal_time(steal);
		this_rq()->prev_steal_time += steal;

		return steal;
	}
#endif
	return 0;
}
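
/*
 * Worked example (made-up numbers): paravirt_steal_clock() returns a
 * monotonically increasing count of nanoseconds stolen from this vCPU. If it
 * now reads 5,000,000 while rq->prev_steal_time is 3,800,000, then 1,200,000 ns
 * of steal happened since the last call. With maxtime of one tick (1,000,000 ns
 * at HZ=1000) only 1,000,000 ns is accounted now; prev_steal_time advances by
 * the accounted amount only, so the remaining 200,000 ns is picked up by a
 * later call.
 */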

/*
 * Account how much elapsed time was spent in steal, irq, or softirq time.
 */
static inline u64 account_other_time(u64 max)
{
	u64 accounted;

	/* Shall be converted to a lockdep-enabled lightweight check */
	WARN_ON_ONCE(!irqs_disabled());

	accounted = steal_account_process_time(max);

	if (accounted < max)
		accounted += irqtime_tick_accounted(max - accounted);

	return accounted;
}

#ifdef CONFIG_64BIT
static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
	return t->se.sum_exec_runtime;
}
#else
static u64 read_sum_exec_runtime(struct task_struct *t)
{
	u64 ns;
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(t, &rf);
	ns = t->se.sum_exec_runtime;
	task_rq_unlock(rq, t, &rf);

	return ns;
}
#endif

/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	u64 utime, stime;
	struct task_struct *t;
	unsigned int seq, nextseq;
	unsigned long flags;

	/*
	 * Update current task runtime to account pending time since last
	 * scheduler action or thread_group_cputime() call. This thread group
	 * might have other running tasks on different CPUs, but updating
	 * their runtime can affect syscall performance, so we skip accounting
	 * those pending times and rely only on values updated on tick or
	 * other scheduler action.
	 */
	if (same_thread_group(current, tsk))
		(void) task_sched_runtime(current);

	rcu_read_lock();
	/* Attempt a lockless read on the first round. */
	nextseq = 0;
	do {
		seq = nextseq;
		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
		times->utime = sig->utime;
		times->stime = sig->stime;
		times->sum_exec_runtime = sig->sum_sched_runtime;

		for_each_thread(tsk, t) {
			task_cputime(t, &utime, &stime);
			times->utime += utime;
			times->stime += stime;
			times->sum_exec_runtime += read_sum_exec_runtime(t);
		}
		/* If lockless access failed, take the lock. */
		nextseq = 1;
	} while (need_seqretry(&sig->stats_lock, seq));
	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
	rcu_read_unlock();
}

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the cpu time gets accounted to
 * @user_tick: is the tick from userspace
 * @rq: the pointer to rq
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * Check for hardirq is done both for system and user time as there is
 * no timer going off while we are on hardirq and hence we may never get an
 * opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time and not on irq
 * softirq as those do not count in task exec_runtime any more.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
					 struct rq *rq, int ticks)
{
	u64 other, cputime = TICK_NSEC * ticks;

	/*
	 * When returning from idle, many ticks can get accounted at
	 * once, including some ticks of steal, irq, and softirq time.
	 * Subtract those ticks from the amount of time accounted to
	 * idle, or potentially user or system time. Due to rounding,
	 * other time can exceed ticks occasionally.
	 */
	other = account_other_time(ULONG_MAX);
	if (other >= cputime)
		return;

	cputime -= other;

	if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time does not get accounted in cpu_softirq_time.
		 * So, we have to handle it separately here.
		 * Also, p->stime needs to be updated for ksoftirqd.
		 */
		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
	} else if (user_tick) {
		account_user_time(p, cputime);
	} else if (p == rq->idle) {
		account_idle_time(cputime);
	} else if (p->flags & PF_VCPU) { /* System time or guest time */
		account_guest_time(p, cputime);
	} else {
		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
	}
}
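
/*
 * Worked example (made-up numbers): when the CPU leaves a long nohz-idle
 * period the tick code may pass ticks = 5, i.e. cputime = 5,000,000 ns at
 * HZ=1000. If account_other_time() reports 1,200,000 ns of pending
 * steal/irq/softirq time, only the remaining 3,800,000 ns is charged to the
 * demultiplexed bucket (idle here, since the idle task is current). Had
 * 'other' covered the whole 5 ticks, nothing extra would be accounted.
 */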

static void irqtime_account_idle_ticks(int ticks)
{
	struct rq *rq = this_rq();

	irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
						struct rq *rq, int nr_ticks) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING

#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_common_task_switch(struct task_struct *prev)
{
	if (is_idle_task(prev))
		vtime_account_idle(prev);
	else
		vtime_account_system(prev);

	vtime_flush(prev);
	arch_vtime_task_switch(prev);
}
#endif

#endif /* CONFIG_VIRT_CPU_ACCOUNTING */


#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
 * Archs that account the whole time spent in the idle task
 * (outside irq) as idle time can rely on this and just implement
 * vtime_account_system() and vtime_account_idle(). Archs that
 * have other meaning of the idle time (s390 only includes the
 * time spent by the CPU when it's in low power mode) must override
 * vtime_account().
 */
#ifndef __ARCH_HAS_VTIME_ACCOUNT
void vtime_account_irq_enter(struct task_struct *tsk)
{
	if (!in_interrupt() && is_idle_task(tsk))
		vtime_account_idle(tsk);
	else
		vtime_account_system(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	*ut = p->utime;
	*st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);

	*ut = cputime.utime;
	*st = cputime.stime;
}
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
/*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	u64 cputime, steal;
	struct rq *rq = this_rq();

	if (vtime_accounting_cpu_enabled())
		return;

	if (sched_clock_irqtime) {
		irqtime_account_process_tick(p, user_tick, rq, 1);
		return;
	}

	cputime = TICK_NSEC;
	steal = steal_account_process_time(ULONG_MAX);

	if (steal >= cputime)
		return;

	cputime -= steal;

	if (user_tick)
		account_user_time(p, cputime);
	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime);
	else
		account_idle_time(cputime);
}

/*
 * Account multiple ticks of idle time.
 * @ticks: number of stolen ticks
 */
void account_idle_ticks(unsigned long ticks)
{
	u64 cputime, steal;

	if (sched_clock_irqtime) {
		irqtime_account_idle_ticks(ticks);
		return;
	}

	cputime = ticks * TICK_NSEC;
	steal = steal_account_process_time(ULONG_MAX);

	if (steal >= cputime)
		return;

	cputime -= steal;
	account_idle_time(cputime);
}

/*
 * Perform (stime * rtime) / total, but avoid multiplication overflow by
 * losing precision when the numbers are big.
 */
static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return scaled;
}
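
/*
 * Illustration only, userspace rather than kernel code: a standalone harness
 * that exercises the same precision-dropping loop as scale_stime() and
 * compares it against an exact 128-bit reference. All names below are local
 * to this sketch.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static uint64_t scale_stime_demo(uint64_t stime, uint64_t rtime, uint64_t total)
{
	for (;;) {
		if (stime > rtime) {		/* keep rtime the bigger one */
			uint64_t tmp = rtime;
			rtime = stime;
			stime = tmp;
		}
		if (total >> 32)		/* total must fit in 32 bits */
			goto drop_precision;
		if (!(rtime >> 32))		/* rtime (and stime) fit: done */
			break;
		if (stime >> 31)		/* cannot rebalance, drop bits */
			goto drop_precision;
		stime <<= 1;			/* rebalance stime against rtime */
		rtime >>= 1;
		continue;
drop_precision:
		rtime >>= 1;			/* rtime has more bits to spare */
		total >>= 1;
	}
	return (uint64_t)(uint32_t)stime * (uint32_t)rtime / (uint32_t)total;
}

int main(void)
{
	/* 2h of stime and 3h of utime (in ns) against 10h of runtime */
	uint64_t stime = 2ULL * 3600 * 1000000000ULL;
	uint64_t utime = 3ULL * 3600 * 1000000000ULL;
	uint64_t rtime = 10ULL * 3600 * 1000000000ULL;
	uint64_t approx = scale_stime_demo(stime, rtime, stime + utime);
	uint64_t exact = (uint64_t)((unsigned __int128)stime * rtime / (stime + utime));

	/* The shifted result stays close to the exact quotient. */
	printf("approx=%llu exact=%llu\n",
	       (unsigned long long)approx, (unsigned long long)exact);
	return 0;
}
#endif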

/*
 * Adjust tick based cputime random precision against scheduler runtime
 * accounting.
 *
 * Tick based cputime accounting depends on random scheduling timeslices of a
 * task to be interrupted or not by the timer. Depending on these
 * circumstances, the number of these interrupts may be over or
 * under-optimistic, matching the real user and system cputime with a variable
 * precision.
 *
 * Fix this by scaling these tick based values against the total runtime
 * accounted by the CFS scheduler.
 *
 * This code provides the following guarantees:
 *
 *   stime + utime == rtime
 *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 *
 * Assuming that rtime_i+1 >= rtime_i.
 */
static void cputime_adjust(struct task_cputime *curr,
			   struct prev_cputime *prev,
			   u64 *ut, u64 *st)
{
	u64 rtime, stime, utime;
	unsigned long flags;

	/* Serialize concurrent callers such that we can honour our guarantees */
	raw_spin_lock_irqsave(&prev->lock, flags);
	rtime = curr->sum_exec_runtime;

	/*
	 * This is possible under two circumstances:
	 *  - rtime isn't monotonic after all (a bug);
	 *  - we got reordered by the lock.
	 *
	 * In both cases this acts as a filter such that the rest of the code
	 * can assume it is monotonic regardless of anything else.
	 */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	/*
	 * If either stime or both stime and utime are 0, assume all runtime is
	 * userspace. Once a task gets some ticks, the monotonicity code at
	 * 'update' will ensure things converge to the observed ratio.
	 */
	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	if (utime == 0) {
		stime = rtime;
		goto update;
	}

	stime = scale_stime(stime, rtime, stime + utime);

update:
	/*
	 * Make sure stime doesn't go backwards; this preserves monotonicity
	 * for utime because rtime is monotonic.
	 *
	 *  utime_i+1 = rtime_i+1 - stime_i
	 *            = rtime_i+1 - (rtime_i - utime_i)
	 *            = (rtime_i+1 - rtime_i) + utime_i
	 *            >= utime_i
	 */
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	/*
	 * Make sure utime doesn't go backwards; this still preserves
	 * monotonicity for stime, analogous argument to above.
	 */
	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
	raw_spin_unlock_irqrestore(&prev->lock, flags);
}
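
/*
 * Worked example (made-up numbers) of the guarantees above. First sample:
 * rtime = 10ms and the tick-based stime:utime ratio is 6:4, so scale_stime()
 * yields stime = 6ms, utime = 4ms and prev becomes {6, 4}. Second sample:
 * rtime = 12ms but the tick ratio drifted to 5:7, which would scale to
 * stime = 5ms. Since 5ms < prev->stime, the 'update' path clamps stime back to
 * 6ms and sets utime = 12 - 6 = 6ms: both values stay monotonic and
 * stime + utime still equals rtime.
 */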

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime = {
		.sum_exec_runtime = p->se.sum_exec_runtime,
	};

	task_cputime(p, &cputime.utime, &cputime.stime);
	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);
	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static u64 vtime_delta(struct task_struct *tsk)
{
	unsigned long now = READ_ONCE(jiffies);

	if (time_before(now, (unsigned long)tsk->vtime_snap))
		return 0;

	return jiffies_to_nsecs(now - tsk->vtime_snap);
}

static u64 get_vtime_delta(struct task_struct *tsk)
{
	unsigned long now = READ_ONCE(jiffies);
	u64 delta, other;

	/*
	 * Unlike tick based timing, vtime based timing never has lost
	 * ticks, and no need for steal time accounting to make up for
	 * lost ticks. Vtime accounts a rounded version of actual
	 * elapsed time. Limit account_other_time to prevent rounding
	 * errors from causing elapsed vtime to go negative.
	 */
	delta = jiffies_to_nsecs(now - tsk->vtime_snap);
	other = account_other_time(delta);
	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
	tsk->vtime_snap = now;

	return delta - other;
}
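
/*
 * Worked example: with HZ=250 one jiffy is 4,000,000 ns (jiffies_to_nsecs(1)).
 * If three jiffies elapsed since tsk->vtime_snap, delta is 12,000,000 ns; if
 * account_other_time() attributes 1,000,000 ns of that to steal/irq/softirq,
 * get_vtime_delta() returns the remaining 11,000,000 ns for the task itself.
 */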

static void __vtime_account_system(struct task_struct *tsk)
{
	account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
}

void vtime_account_system(struct task_struct *tsk)
{
	if (!vtime_delta(tsk))
		return;

	write_seqcount_begin(&tsk->vtime_seqcount);
	__vtime_account_system(tsk);
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_account_user(struct task_struct *tsk)
{
	write_seqcount_begin(&tsk->vtime_seqcount);
	tsk->vtime_snap_whence = VTIME_SYS;
	if (vtime_delta(tsk))
		account_user_time(tsk, get_vtime_delta(tsk));
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_user_enter(struct task_struct *tsk)
{
	write_seqcount_begin(&tsk->vtime_seqcount);
	if (vtime_delta(tsk))
		__vtime_account_system(tsk);
	tsk->vtime_snap_whence = VTIME_USER;
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_guest_enter(struct task_struct *tsk)
{
	/*
	 * The flags must be updated under the lock with
	 * the vtime_snap flush and update.
	 * That enforces a right ordering and update sequence
	 * synchronization against the reader (task_gtime())
	 * that can thus safely catch up with a tickless delta.
	 */
	write_seqcount_begin(&tsk->vtime_seqcount);
	if (vtime_delta(tsk))
		__vtime_account_system(tsk);
	current->flags |= PF_VCPU;
	write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);

void vtime_guest_exit(struct task_struct *tsk)
{
	write_seqcount_begin(&tsk->vtime_seqcount);
	__vtime_account_system(tsk);
	current->flags &= ~PF_VCPU;
	write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);

void vtime_account_idle(struct task_struct *tsk)
{
	account_idle_time(get_vtime_delta(tsk));
}

void arch_vtime_task_switch(struct task_struct *prev)
{
	write_seqcount_begin(&prev->vtime_seqcount);
	prev->vtime_snap_whence = VTIME_INACTIVE;
	write_seqcount_end(&prev->vtime_seqcount);

	write_seqcount_begin(&current->vtime_seqcount);
	current->vtime_snap_whence = VTIME_SYS;
	current->vtime_snap = jiffies;
	write_seqcount_end(&current->vtime_seqcount);
}

void vtime_init_idle(struct task_struct *t, int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	write_seqcount_begin(&t->vtime_seqcount);
	t->vtime_snap_whence = VTIME_SYS;
	t->vtime_snap = jiffies;
	write_seqcount_end(&t->vtime_seqcount);
	local_irq_restore(flags);
}

u64 task_gtime(struct task_struct *t)
{
	unsigned int seq;
	u64 gtime;

	if (!vtime_accounting_enabled())
		return t->gtime;

	do {
		seq = read_seqcount_begin(&t->vtime_seqcount);

		gtime = t->gtime;
		if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
			gtime += vtime_delta(t);

	} while (read_seqcount_retry(&t->vtime_seqcount, seq));

	return gtime;
}

/*
 * Fetch cputime raw values from fields of task_struct and
 * add up the pending nohz execution time since the last
 * cputime snapshot.
 */
void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
	u64 delta;
	unsigned int seq;

	if (!vtime_accounting_enabled()) {
		*utime = t->utime;
		*stime = t->stime;
		return;
	}

	do {
		seq = read_seqcount_begin(&t->vtime_seqcount);

		*utime = t->utime;
		*stime = t->stime;

		/* Task is sleeping, nothing to add */
		if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
			continue;

		delta = vtime_delta(t);

		/*
		 * Task runs either in user or kernel space, add pending nohz time to
		 * the right place.
		 */
		if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
			*utime += delta;
		else if (t->vtime_snap_whence == VTIME_SYS)
			*stime += delta;
	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */