#include <linux/export.h>
#include <linux/sched.h>
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
#include "sched.h"
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif


#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
 * There are no locks covering the percpu hardirq/softirq time.
 * They are only modified in vtime_account, on the corresponding CPU
 * with interrupts disabled, so writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This means another CPU may read this CPU's irq time and race with
 * irq/vtime_account on this CPU: the reader then sees either the old
 * or the new value, with the side effect that a slice of irq time may
 * be accounted to the wrong task when an irq is in progress while we
 * read rq->clock. That is a worthy compromise in place of taking locks
 * on each irq in account_system_time.
 */
DEFINE_PER_CPU(u64, cpu_hardirq_time);
DEFINE_PER_CPU(u64, cpu_softirq_time);

static DEFINE_PER_CPU(u64, irq_start_time);
static int sched_clock_irqtime;

void enable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 1;
}

void disable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 0;
}

#ifndef CONFIG_64BIT
DEFINE_PER_CPU(seqcount_t, irq_time_seq);
#endif /* !CONFIG_64BIT */
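
/*
 * The irq_time_write_begin()/irq_time_write_end() pair used below, and
 * the matching reader, live in sched.h. On 32-bit, a u64 cannot be read
 * atomically, so the reader retries around the seqcount; roughly
 * (illustrative sketch, see sched.h for the real helpers):
 *
 *	static inline u64 irq_time_read(int cpu)
 *	{
 *		u64 irq_time;
 *		unsigned seq;
 *
 *		do {
 *			seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
 *			irq_time = per_cpu(cpu_softirq_time, cpu) +
 *				   per_cpu(cpu_hardirq_time, cpu);
 *		} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
 *
 *		return irq_time;
 *	}
 *
 * On 64-bit the begin/end helpers compile out and plain u64 loads are
 * atomic.
 */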

/*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr)
{
	s64 delta;
	int cpu;

	if (!sched_clock_irqtime)
		return;

	cpu = smp_processor_id();
	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
	__this_cpu_add(irq_start_time, delta);

	irq_time_write_begin();
	/*
	 * We do not account for softirq time from ksoftirqd here.
	 * We want to continue accounting softirq time to the ksoftirqd thread
	 * in that case, so as not to confuse the scheduler with a special task
	 * that does not consume any time, but still wants to run.
	 */
	if (hardirq_count())
		__this_cpu_add(cpu_hardirq_time, delta);
	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
		__this_cpu_add(cpu_softirq_time, delta);

	irq_time_write_end();
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
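
/*
 * irqtime_account_irq() is not called by the irq code directly; it is
 * wrapped together with the vtime hook, roughly as in vtime.h
 * (illustrative sketch):
 *
 *	static inline void account_irq_enter_time(struct task_struct *tsk)
 *	{
 *		vtime_account_irq_enter(tsk);
 *		irqtime_account_irq(tsk);
 *	}
 *
 * with a matching account_irq_exit_time() on the way out, invoked from
 * irq_enter()/irq_exit() and __do_softirq().
 */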

static cputime_t irqtime_account_hi_update(cputime_t maxtime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	unsigned long flags;
	cputime_t irq_cputime;

	local_irq_save(flags);
	irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
		      cpustat[CPUTIME_IRQ];
	irq_cputime = min(irq_cputime, maxtime);
	cpustat[CPUTIME_IRQ] += irq_cputime;
	local_irq_restore(flags);
	return irq_cputime;
}

static cputime_t irqtime_account_si_update(cputime_t maxtime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	unsigned long flags;
	cputime_t softirq_cputime;

	local_irq_save(flags);
	softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
			  cpustat[CPUTIME_SOFTIRQ];
	softirq_cputime = min(softirq_cputime, maxtime);
	cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
	local_irq_restore(flags);
	return softirq_cputime;
}

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

#define sched_clock_irqtime	(0)

static cputime_t irqtime_account_hi_update(cputime_t dummy)
{
	return 0;
}

static cputime_t irqtime_account_si_update(cputime_t dummy)
{
	return 0;
}

#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

static inline void task_group_account_field(struct task_struct *p, int index,
					    u64 tmp)
{
	/*
	 * Since all updates are sure to touch the root cgroup, we
	 * get ourselves ahead and touch it first. If the root cgroup
	 * is the only cgroup, then nothing else should be necessary.
	 */
	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);

	cpuacct_account_field(p, index, tmp);
}
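
/*
 * The per-cpu kernel_cpustat updated above is what /proc/stat reports;
 * a consumer sums it over CPUs, roughly as fs/proc/stat.c does
 * (illustrative sketch):
 *
 *	u64 user = 0;
 *	int i;
 *
 *	for_each_possible_cpu(i)
 *		user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
 */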

/*
 * Account user cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in user space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
void account_user_time(struct task_struct *p, cputime_t cputime,
		       cputime_t cputime_scaled)
{
	int index;

	/* Add user time to process. */
	p->utime += cputime;
	p->utimescaled += cputime_scaled;
	account_group_user_time(p, cputime);

	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

	/* Add user time to cpustat. */
	task_group_account_field(p, index, (__force u64) cputime);

	/* Account for user time used */
	acct_account_cputime(p);
}

/*
 * Account guest cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in virtual machine since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
static void account_guest_time(struct task_struct *p, cputime_t cputime,
			       cputime_t cputime_scaled)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += cputime;
	p->utimescaled += cputime_scaled;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Add guest time to cpustat. */
	if (task_nice(p) > 0) {
		cpustat[CPUTIME_NICE] += (__force u64) cputime;
		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
	} else {
		cpustat[CPUTIME_USER] += (__force u64) cputime;
		cpustat[CPUTIME_GUEST] += (__force u64) cputime;
	}
}
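
/*
 * Note that guest time is deliberately accounted twice above: once in
 * the USER/NICE bucket and once in the GUEST/GUEST_NICE bucket. Guest
 * time is thus a subset of user time, and consumers of /proc/stat must
 * not add the two fields together when computing total CPU usage.
 */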

/*
 * Account system cpu time to a process and desired cpustat field
 * @p: the process that the cpu time gets accounted to
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 * @index: index of the cpustat field that has to be updated
 */
static inline
void __account_system_time(struct task_struct *p, cputime_t cputime,
			   cputime_t cputime_scaled, int index)
{
	/* Add system time to process. */
	p->stime += cputime;
	p->stimescaled += cputime_scaled;
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
	task_group_account_field(p, index, (__force u64) cputime);

	/* Account for system time used */
	acct_account_cputime(p);
}

/*
 * Account system cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the cpu time spent in kernel space since the last update
 * @cputime_scaled: cputime scaled by cpu frequency
 */
void account_system_time(struct task_struct *p, int hardirq_offset,
			 cputime_t cputime, cputime_t cputime_scaled)
{
	int index;

	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
		account_guest_time(p, cputime, cputime_scaled);
		return;
	}

	if (hardirq_count() - hardirq_offset)
		index = CPUTIME_IRQ;
	else if (in_serving_softirq())
		index = CPUTIME_SOFTIRQ;
	else
		index = CPUTIME_SYSTEM;

	__account_system_time(p, cputime, cputime_scaled, index);
}
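
/*
 * Example of the classification above: the timer tick itself runs in
 * hardirq context, so account_process_tick() passes HARDIRQ_OFFSET to
 * discount that one level. If the tick interrupted another hardirq,
 * hardirq_count() - HARDIRQ_OFFSET is still non-zero and the time is
 * charged to CPUTIME_IRQ; if it interrupted softirq processing, it goes
 * to CPUTIME_SOFTIRQ; otherwise it is plain CPUTIME_SYSTEM.
 */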

/*
 * Account for involuntary wait time.
 * @cputime: the cpu time spent in involuntary wait
 */
void account_steal_time(cputime_t cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	cpustat[CPUTIME_STEAL] += (__force u64) cputime;
}

/*
 * Account for idle time.
 * @cputime: the cpu time spent in idle wait
 */
void account_idle_time(cputime_t cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	struct rq *rq = this_rq();

	if (atomic_read(&rq->nr_iowait) > 0)
		cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
	else
		cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}

static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		cputime_t steal_cputime;
		u64 steal;

		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;

		steal_cputime = min(nsecs_to_cputime(steal), maxtime);
		account_steal_time(steal_cputime);
		this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);

		return steal_cputime;
	}
#endif
	return 0;
}
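
/*
 * paravirt_steal_clock() is an arch/hypervisor hook returning the total
 * time this vCPU spent involuntarily waiting, in nanoseconds, as
 * reported by the host (e.g. KVM exposes it through a shared per-cpu
 * steal-time area; Xen has an equivalent). The code above only accounts
 * the delta since the previous call, capped at @maxtime; any excess is
 * picked up later because prev_steal_time only advances by the amount
 * actually accounted.
 */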

/*
 * Account how much of the elapsed time was spent in steal, irq, or softirq.
 */
static inline cputime_t account_other_time(cputime_t max)
{
	cputime_t accounted;

	accounted = steal_account_process_time(max);

	if (accounted < max)
		accounted += irqtime_account_hi_update(max - accounted);

	if (accounted < max)
		accounted += irqtime_account_si_update(max - accounted);

	return accounted;
}

/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	cputime_t utime, stime;
	struct task_struct *t;
	unsigned int seq, nextseq;
	unsigned long flags;

	rcu_read_lock();
	/* Attempt a lockless read on the first round. */
	nextseq = 0;
	do {
		seq = nextseq;
		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
		times->utime = sig->utime;
		times->stime = sig->stime;
		times->sum_exec_runtime = sig->sum_sched_runtime;

		for_each_thread(tsk, t) {
			task_cputime(t, &utime, &stime);
			times->utime += utime;
			times->stime += stime;
			times->sum_exec_runtime += task_sched_runtime(t);
		}
		/* If lockless access failed, take the lock. */
		nextseq = 1;
	} while (need_seqretry(&sig->stats_lock, seq));
	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
	rcu_read_unlock();
}
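
/*
 * thread_group_cputime() is the slow path behind process-wide clocks
 * such as CLOCK_PROCESS_CPUTIME_ID and the getrusage()/times() sums:
 * it walks every live thread, which is why the loop above first tries
 * a lockless seqcount pass and only takes sig->stats_lock on retry.
 */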

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the cpu time gets accounted to
 * @user_tick: is the tick from userspace
 * @rq: the pointer to rq
 * @ticks: number of ticks to account
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * The check for hardirq is done both for system and user time as there is
 * no timer going off while we are in hardirq, and hence we may never get an
 * opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time and not on irq
 * or softirq, as those do not count in task exec_runtime any more.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
					 struct rq *rq, int ticks)
{
	u64 cputime = (__force u64) cputime_one_jiffy * ticks;
	cputime_t scaled, other;

	/*
	 * When returning from idle, many ticks can get accounted at
	 * once, including some ticks of steal, irq, and softirq time.
	 * Subtract those ticks from the amount of time accounted to
	 * idle, or potentially user or system time. Due to rounding,
	 * other time can exceed ticks occasionally.
	 */
	other = account_other_time(cputime);
	if (other >= cputime)
		return;
	cputime -= other;
	scaled = cputime_to_scaled(cputime);

	if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time does not get accounted in cpu_softirq_time.
		 * So, we have to handle it separately here.
		 * Also, p->stime needs to be updated for ksoftirqd.
		 */
		__account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
	} else if (user_tick) {
		account_user_time(p, cputime, scaled);
	} else if (p == rq->idle) {
		account_idle_time(cputime);
	} else if (p->flags & PF_VCPU) { /* System time or guest time */
		account_guest_time(p, cputime, scaled);
	} else {
		__account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
	}
}

static void irqtime_account_idle_ticks(int ticks)
{
	struct rq *rq = this_rq();

	irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
						struct rq *rq, int nr_ticks) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING

#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_common_task_switch(struct task_struct *prev)
{
	if (is_idle_task(prev))
		vtime_account_idle(prev);
	else
		vtime_account_system(prev);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
	vtime_account_user(prev);
#endif
	arch_vtime_task_switch(prev);
}
#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */

#endif /* CONFIG_VIRT_CPU_ACCOUNTING */


#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
 * Archs that account the whole time spent in the idle task
 * (outside irq) as idle time can rely on this and just implement
 * vtime_account_system() and vtime_account_idle(). Archs that
 * have a different meaning of idle time (s390 only includes the
 * time spent by the CPU when it's in low power mode) must override
 * vtime_account().
 */
#ifndef __ARCH_HAS_VTIME_ACCOUNT
void vtime_account_irq_enter(struct task_struct *tsk)
{
	if (!in_interrupt() && is_idle_task(tsk))
		vtime_account_idle(tsk);
	else
		vtime_account_system(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */

void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	*ut = p->utime;
	*st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);

	*ut = cputime.utime;
	*st = cputime.stime;
}
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
/*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	cputime_t cputime, scaled, steal;
	struct rq *rq = this_rq();

	if (vtime_accounting_cpu_enabled())
		return;

	if (sched_clock_irqtime) {
		irqtime_account_process_tick(p, user_tick, rq, 1);
		return;
	}

	cputime = cputime_one_jiffy;
	steal = steal_account_process_time(cputime);

	if (steal >= cputime)
		return;

	cputime -= steal;
	scaled = cputime_to_scaled(cputime);

	if (user_tick)
		account_user_time(p, cputime, scaled);
	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
	else
		account_idle_time(cputime);
}

/*
 * Account multiple ticks of idle time.
 * @ticks: number of ticks elapsed while the CPU was idle
 */
void account_idle_ticks(unsigned long ticks)
{
	cputime_t cputime, steal;

	if (sched_clock_irqtime) {
		irqtime_account_idle_ticks(ticks);
		return;
	}

	cputime = jiffies_to_cputime(ticks);
	steal = steal_account_process_time(cputime);

	if (steal >= cputime)
		return;

	cputime -= steal;
	account_idle_time(cputime);
}

/*
 * Perform (stime * rtime) / total, but avoid multiplication overflow by
 * losing precision when the numbers are big.
 */
static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return (__force cputime_t) scaled;
}
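
/*
 * Worked example: with stime = 100, utime = 50 (total = 150) and
 * rtime = 300, everything fits in 32 bits, so the loop exits at once
 * and the result is 100 * 300 / 150 = 200: stime keeps its 2/3 share
 * of the precise runtime. Bits are only shifted off when the
 * 32x32->64 multiply would otherwise overflow.
 */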

/*
 * Adjust tick based cputime random precision against scheduler runtime
 * accounting.
 *
 * Tick based cputime accounting depends on random scheduling timeslices
 * of a task being interrupted or not by the timer. Depending on these
 * circumstances, the number of these interrupts may over- or
 * under-estimate the real user and system cputime, matching it only
 * with a variable precision.
 *
 * Fix this by scaling these tick based values against the total runtime
 * accounted by the CFS scheduler.
 *
 * This code provides the following guarantees:
 *
 *   stime + utime == rtime
 *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 *
 * Assuming that rtime_i+1 >= rtime_i.
 */
static void cputime_adjust(struct task_cputime *curr,
			   struct prev_cputime *prev,
			   cputime_t *ut, cputime_t *st)
{
	cputime_t rtime, stime, utime;
	unsigned long flags;

	/* Serialize concurrent callers such that we can honour our guarantees */
	raw_spin_lock_irqsave(&prev->lock, flags);
	rtime = nsecs_to_cputime(curr->sum_exec_runtime);

	/*
	 * This is possible under two circumstances:
	 *  - rtime isn't monotonic after all (a bug);
	 *  - we got reordered by the lock.
	 *
	 * In both cases this acts as a filter such that the rest of the code
	 * can assume it is monotonic regardless of anything else.
	 */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	if (utime == 0) {
		stime = rtime;
		goto update;
	}

	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	stime = scale_stime((__force u64)stime, (__force u64)rtime,
			    (__force u64)(stime + utime));

	/*
	 * Make sure stime doesn't go backwards; this preserves monotonicity
	 * for utime because rtime is monotonic.
	 *
	 *  utime_i+1 = rtime_i+1 - stime_i
	 *            = rtime_i+1 - (rtime_i - utime_i)
	 *            = (rtime_i+1 - rtime_i) + utime_i
	 *            >= utime_i
	 */
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	/*
	 * Make sure utime doesn't go backwards; this still preserves
	 * monotonicity for stime, analogous argument to above.
	 */
	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

update:
	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
	raw_spin_unlock_irqrestore(&prev->lock, flags);
}
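
/*
 * Worked example of the clamping above: if a previous call reported
 * prev->stime = 10 and the new scaled stime comes out at 8 (the task
 * has since run mostly in userspace), stime is held at 10 and
 * utime = rtime - 10, so both values still move only forward while
 * stime + utime == rtime is preserved.
 */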

void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime cputime = {
		.sum_exec_runtime = p->se.sum_exec_runtime,
	};

	task_cputime(p, &cputime.utime, &cputime.stime);
	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);
	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static cputime_t vtime_delta(struct task_struct *tsk)
{
	unsigned long now = READ_ONCE(jiffies);

	if (time_before(now, (unsigned long)tsk->vtime_snap))
		return 0;

	return jiffies_to_cputime(now - tsk->vtime_snap);
}

static cputime_t get_vtime_delta(struct task_struct *tsk)
{
	unsigned long now = READ_ONCE(jiffies);
	cputime_t delta, other;

	delta = jiffies_to_cputime(now - tsk->vtime_snap);
	other = account_other_time(delta);
	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
	tsk->vtime_snap = now;

	return delta - other;
}

static void __vtime_account_system(struct task_struct *tsk)
{
	cputime_t delta_cpu = get_vtime_delta(tsk);

	account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
}

void vtime_account_system(struct task_struct *tsk)
{
	if (!vtime_delta(tsk))
		return;

	write_seqcount_begin(&tsk->vtime_seqcount);
	__vtime_account_system(tsk);
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_account_user(struct task_struct *tsk)
{
	cputime_t delta_cpu;

	write_seqcount_begin(&tsk->vtime_seqcount);
	tsk->vtime_snap_whence = VTIME_SYS;
	if (vtime_delta(tsk)) {
		delta_cpu = get_vtime_delta(tsk);
		account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
	}
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_user_enter(struct task_struct *tsk)
{
	write_seqcount_begin(&tsk->vtime_seqcount);
	if (vtime_delta(tsk))
		__vtime_account_system(tsk);
	tsk->vtime_snap_whence = VTIME_USER;
	write_seqcount_end(&tsk->vtime_seqcount);
}

void vtime_guest_enter(struct task_struct *tsk)
{
	/*
	 * The flags must be updated under the seqcount, together with
	 * the vtime_snap flush and update. That enforces the right
	 * ordering and update sequence synchronization against the
	 * reader (task_gtime()), which can thus safely catch up with
	 * a tickless delta.
	 */
	write_seqcount_begin(&tsk->vtime_seqcount);
	if (vtime_delta(tsk))
		__vtime_account_system(tsk);
	current->flags |= PF_VCPU;
	write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);

void vtime_guest_exit(struct task_struct *tsk)
{
	write_seqcount_begin(&tsk->vtime_seqcount);
	__vtime_account_system(tsk);
	current->flags &= ~PF_VCPU;
	write_seqcount_end(&tsk->vtime_seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
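
/*
 * vtime_guest_enter()/vtime_guest_exit() are reached from the KVM side
 * through the guest_enter()/guest_exit() helpers, roughly (sketch of
 * the context_tracking.h wrapper, context tracking calls omitted):
 *
 *	static inline void guest_enter(void)
 *	{
 *		if (vtime_accounting_cpu_enabled())
 *			vtime_guest_enter(current);
 *		else
 *			current->flags |= PF_VCPU;
 *	}
 */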

void vtime_account_idle(struct task_struct *tsk)
{
	cputime_t delta_cpu = get_vtime_delta(tsk);

	account_idle_time(delta_cpu);
}

void arch_vtime_task_switch(struct task_struct *prev)
{
	write_seqcount_begin(&prev->vtime_seqcount);
	prev->vtime_snap_whence = VTIME_INACTIVE;
	write_seqcount_end(&prev->vtime_seqcount);

	write_seqcount_begin(&current->vtime_seqcount);
	current->vtime_snap_whence = VTIME_SYS;
	current->vtime_snap = jiffies;
	write_seqcount_end(&current->vtime_seqcount);
}

void vtime_init_idle(struct task_struct *t, int cpu)
{
	unsigned long flags;

	local_irq_save(flags);
	write_seqcount_begin(&t->vtime_seqcount);
	t->vtime_snap_whence = VTIME_SYS;
	t->vtime_snap = jiffies;
	write_seqcount_end(&t->vtime_seqcount);
	local_irq_restore(flags);
}

cputime_t task_gtime(struct task_struct *t)
{
	unsigned int seq;
	cputime_t gtime;

	if (!vtime_accounting_enabled())
		return t->gtime;

	do {
		seq = read_seqcount_begin(&t->vtime_seqcount);

		gtime = t->gtime;
		if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
			gtime += vtime_delta(t);

	} while (read_seqcount_retry(&t->vtime_seqcount, seq));

	return gtime;
}

/*
 * Fetch cputime raw values from fields of task_struct and
 * add up the pending nohz execution time since the last
 * cputime snapshot.
 */
static void
fetch_task_cputime(struct task_struct *t,
		   cputime_t *u_dst, cputime_t *s_dst,
		   cputime_t *u_src, cputime_t *s_src,
		   cputime_t *udelta, cputime_t *sdelta)
{
	unsigned int seq;
	unsigned long long delta;

	do {
		*udelta = 0;
		*sdelta = 0;

		seq = read_seqcount_begin(&t->vtime_seqcount);

		if (u_dst)
			*u_dst = *u_src;
		if (s_dst)
			*s_dst = *s_src;

		/* Task is sleeping, nothing to add */
		if (t->vtime_snap_whence == VTIME_INACTIVE ||
		    is_idle_task(t))
			continue;

		delta = vtime_delta(t);

		/*
		 * Task runs either in user or kernel space, add pending nohz
		 * time to the right place.
		 */
		if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
			*udelta = delta;
		} else {
			if (t->vtime_snap_whence == VTIME_SYS)
				*sdelta = delta;
		}
	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
}


void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
{
	cputime_t udelta, sdelta;

	if (!vtime_accounting_enabled()) {
		if (utime)
			*utime = t->utime;
		if (stime)
			*stime = t->stime;
		return;
	}

	fetch_task_cputime(t, utime, stime, &t->utime,
			   &t->stime, &udelta, &sdelta);
	if (utime)
		*utime += udelta;
	if (stime)
		*stime += sdelta;
}

void task_cputime_scaled(struct task_struct *t,
			 cputime_t *utimescaled, cputime_t *stimescaled)
{
	cputime_t udelta, sdelta;

	if (!vtime_accounting_enabled()) {
		if (utimescaled)
			*utimescaled = t->utimescaled;
		if (stimescaled)
			*stimescaled = t->stimescaled;
		return;
	}

	fetch_task_cputime(t, utimescaled, stimescaled,
			   &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
	if (utimescaled)
		*utimescaled += cputime_to_scaled(udelta);
	if (stimescaled)
		*stimescaled += cputime_to_scaled(sdelta);
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */