kernel/sched/cputime.c

   1 #include <linux/export.h>
   2 #include <linux/sched.h>
   3 #include <linux/tsacct_kern.h>
   4 #include <linux/kernel_stat.h>
   5 #include <linux/static_key.h>
   6 #include <linux/context_tracking.h>
   7 #include "sched.h"
   8 #ifdef CONFIG_PARAVIRT
   9 #include <asm/paravirt.h>
  10 #endif
  11
  12
  13 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
  14
  15 /*
  16  * There are no locks covering percpu hardirq/softirq time.
  17  * They are only modified in vtime_account, on corresponding CPU
  18  * with interrupts disabled. So, writes are safe.
  19  * They are read and saved off onto struct rq in update_rq_clock().
  20  * This may result in other CPU reading this CPU's irq time and can
  21  * race with irq/vtime_account on this CPU. We would either get old
  22  * or new value with a side effect of accounting a slice of irq time to wrong
  23  * task when irq is in progress while we read rq->clock. That is a worthy
  24  * compromise in place of having locks on each irq in account_system_time.
  25  */
  26 DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
  27
  28 static int sched_clock_irqtime;
  29
  30 void enable_sched_clock_irqtime(void)
  31 {
  32         sched_clock_irqtime = 1;
  33 }
  34
  35 void disable_sched_clock_irqtime(void)
  36 {
  37         sched_clock_irqtime = 0;
  38 }
  39
  40 /*
  41  * Called before incrementing preempt_count on {soft,}irq_enter
  42  * and before decrementing preempt_count on {soft,}irq_exit.
  43  */
  44 void irqtime_account_irq(struct task_struct *curr)
  45 {
  46         struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
  47         s64 delta;
  48         int cpu;
  49
  50         if (!sched_clock_irqtime)
  51                 return;
  52
  53         cpu = smp_processor_id();
  54         delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
  55         irqtime->irq_start_time += delta;
  56
  57         u64_stats_update_begin(&irqtime->sync);
  58         /*
  59          * We do not account for softirq time from ksoftirqd here.
  60          * We want to continue accounting softirq time to ksoftirqd thread
  61          * in that case, so as not to confuse scheduler with a special task
  62          * that do not consume any time, but still wants to run.
  63          */
  64         if (hardirq_count())
  65                 irqtime->hardirq_time += delta;
  66         else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
  67                 irqtime->softirq_time += delta;
  68
  69         u64_stats_update_end(&irqtime->sync);
  70 }
  71 EXPORT_SYMBOL_GPL(irqtime_account_irq);
  72
  73 static cputime_t irqtime_account_hi_update(cputime_t maxtime)
  74 {
  75         u64 *cpustat = kcpustat_this_cpu->cpustat;
  76         cputime_t irq_cputime;
  77         u64 nsecs;
  78
  79         nsecs = __this_cpu_read(cpu_irqtime.hardirq_time);
  80         irq_cputime = nsecs_to_cputime64(nsecs) - cpustat[CPUTIME_IRQ];
  81         irq_cputime = min(irq_cputime, maxtime);
  82         cpustat[CPUTIME_IRQ] += irq_cputime;
  83
  84         return irq_cputime;
  85 }
  86
  87 static cputime_t irqtime_account_si_update(cputime_t maxtime)
  88 {
  89         u64 *cpustat = kcpustat_this_cpu->cpustat;
  90         cputime_t softirq_cputime;
  91         u64 nsecs;
  92
  93         nsecs = __this_cpu_read(cpu_irqtime.softirq_time);
  94         softirq_cputime = nsecs_to_cputime64(nsecs) - cpustat[CPUTIME_SOFTIRQ];
  95         softirq_cputime = min(softirq_cputime, maxtime);
  96         cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
  97
  98         return softirq_cputime;
  99 }
 100
 101 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 102
 103 #define sched_clock_irqtime     (0)
 104
 105 static cputime_t irqtime_account_hi_update(cputime_t dummy)
 106 {
 107         return 0;
 108 }
 109
 110 static cputime_t irqtime_account_si_update(cputime_t dummy)
 111 {
 112         return 0;
 113 }
 114
 115 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
 116
 117 static inline void task_group_account_field(struct task_struct *p, int index,
 118                                             u64 tmp)
 119 {
 120         /*
 121          * Since all updates are sure to touch the root cgroup, we
 122          * get ourselves ahead and touch it first. If the root cgroup
 123          * is the only cgroup, then nothing else should be necessary.
 124          *
 125          */
 126         __this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 127
 128         cpuacct_account_field(p, index, tmp);
 129 }
 130
 131 /*
 132  * Account user cpu time to a process.
 133  * @p: the process that the cpu time gets accounted to
 134  * @cputime: the cpu time spent in user space since the last update
 135  * @cputime_scaled: cputime scaled by cpu frequency
 136  */
 137 void account_user_time(struct task_struct *p, cputime_t cputime,
 138                        cputime_t cputime_scaled)
 139 {
 140         int index;
 141
 142         /* Add user time to process. */
 143         p->utime += cputime;
 144         p->utimescaled += cputime_scaled;
 145         account_group_user_time(p, cputime);
 146
 147         index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 148
 149         /* Add user time to cpustat. */
 150         task_group_account_field(p, index, (__force u64) cputime);
 151
 152         /* Account for user time used */
 153         acct_account_cputime(p);
 154 }
 155
 156 /*
 157  * Account guest cpu time to a process.
 158  * @p: the process that the cpu time gets accounted to
 159  * @cputime: the cpu time spent in virtual machine since the last update
 160  * @cputime_scaled: cputime scaled by cpu frequency
 161  */
 162 static void account_guest_time(struct task_struct *p, cputime_t cputime,
 163                                cputime_t cputime_scaled)
 164 {
 165         u64 *cpustat = kcpustat_this_cpu->cpustat;
 166
 167         /* Add guest time to process. */
 168         p->utime += cputime;
 169         p->utimescaled += cputime_scaled;
 170         account_group_user_time(p, cputime);
 171         p->gtime += cputime;
 172
 173         /* Add guest time to cpustat. */
 174         if (task_nice(p) > 0) {
 175                 cpustat[CPUTIME_NICE] += (__force u64) cputime;
 176                 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
 177         } else {
 178                 cpustat[CPUTIME_USER] += (__force u64) cputime;
 179                 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
 180         }
 181 }
 182
 183 /*
 184  * Account system cpu time to a process and desired cpustat field
 185  * @p: the process that the cpu time gets accounted to
 186  * @cputime: the cpu time spent in kernel space since the last update
 187  * @cputime_scaled: cputime scaled by cpu frequency
 188  * @target_cputime64: pointer to cpustat field that has to be updated
 189  */
 190 static inline
 191 void __account_system_time(struct task_struct *p, cputime_t cputime,
 192                         cputime_t cputime_scaled, int index)
 193 {
 194         /* Add system time to process. */
 195         p->stime += cputime;
 196         p->stimescaled += cputime_scaled;
 197         account_group_system_time(p, cputime);
 198
 199         /* Add system time to cpustat. */
 200         task_group_account_field(p, index, (__force u64) cputime);
 201
 202         /* Account for system time used */
 203         acct_account_cputime(p);
 204 }
 205
 206 /*
 207  * Account system cpu time to a process.
 208  * @p: the process that the cpu time gets accounted to
 209  * @hardirq_offset: the offset to subtract from hardirq_count()
 210  * @cputime: the cpu time spent in kernel space since the last update
 211  * @cputime_scaled: cputime scaled by cpu frequency
 212  */
 213 void account_system_time(struct task_struct *p, int hardirq_offset,
 214                          cputime_t cputime, cputime_t cputime_scaled)
 215 {
 216         int index;
 217
 218         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 219                 account_guest_time(p, cputime, cputime_scaled);
 220                 return;
 221         }
 222
 223         if (hardirq_count() - hardirq_offset)
 224                 index = CPUTIME_IRQ;
 225         else if (in_serving_softirq())
 226                 index = CPUTIME_SOFTIRQ;
 227         else
 228                 index = CPUTIME_SYSTEM;
 229
 230         __account_system_time(p, cputime, cputime_scaled, index);
 231 }
 232
 233 /*
 234  * Account for involuntary wait time.
 235  * @cputime: the cpu time spent in involuntary wait
 236  */
 237 void account_steal_time(cputime_t cputime)
 238 {
 239         u64 *cpustat = kcpustat_this_cpu->cpustat;
 240
 241         cpustat[CPUTIME_STEAL] += (__force u64) cputime;
 242 }
 243
 244 /*
 245  * Account for idle time.
 246  * @cputime: the cpu time spent in idle wait
 247  */
 248 void account_idle_time(cputime_t cputime)
 249 {
 250         u64 *cpustat = kcpustat_this_cpu->cpustat;
 251         struct rq *rq = this_rq();
 252
 253         if (atomic_read(&rq->nr_iowait) > 0)
 254                 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
 255         else
 256                 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 257 }
 258
 259 /*
 260  * When a guest is interrupted for a longer amount of time, missed clock
 261  * ticks are not redelivered later. Due to that, this function may on
 262  * occasion account more time than the calling functions think elapsed.
 263  */
 264 static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 265 {
 266 #ifdef CONFIG_PARAVIRT
 267         if (static_key_false(&paravirt_steal_enabled)) {
 268                 cputime_t steal_cputime;
 269                 u64 steal;
 270
 271                 steal = paravirt_steal_clock(smp_processor_id());
 272                 steal -= this_rq()->prev_steal_time;
 273
 274                 steal_cputime = min(nsecs_to_cputime(steal), maxtime);
 275                 account_steal_time(steal_cputime);
 276                 this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
 277
 278                 return steal_cputime;
 279         }
 280 #endif
 281         return 0;
 282 }
 283
 284 /*
 285  * Account how much elapsed time was spent in steal, irq, or softirq time.
 286  */
 287 static inline cputime_t account_other_time(cputime_t max)
 288 {
 289         cputime_t accounted;
 290
 291         /* Shall be converted to a lockdep-enabled lightweight check */
 292         WARN_ON_ONCE(!irqs_disabled());
 293
 294         accounted = steal_account_process_time(max);
 295
 296         if (accounted < max)
 297                 accounted += irqtime_account_hi_update(max - accounted);
 298
 299         if (accounted < max)
 300                 accounted += irqtime_account_si_update(max - accounted);
 301
 302         return accounted;
 303 }
 304
 305 #ifdef CONFIG_64BIT
 306 static inline u64 read_sum_exec_runtime(struct task_struct *t)
 307 {
 308         return t->se.sum_exec_runtime;
 309 }
 310 #else
 311 static u64 read_sum_exec_runtime(struct task_struct *t)
 312 {
 313         u64 ns;
 314         struct rq_flags rf;
 315         struct rq *rq;
 316
 317         rq = task_rq_lock(t, &rf);
 318         ns = t->se.sum_exec_runtime;
 319         task_rq_unlock(rq, t, &rf);
 320
 321         return ns;
 322 }
 323 #endif
 324
 325 /*
 326  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 327  * tasks (sum on group iteration) belonging to @tsk's group.
 328  */
 329 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 330 {
 331         struct signal_struct *sig = tsk->signal;
 332         cputime_t utime, stime;
 333         struct task_struct *t;
 334         unsigned int seq, nextseq;
 335         unsigned long flags;
 336
 337         /*
 338          * Update current task runtime to account pending time since last
 339          * scheduler action or thread_group_cputime() call. This thread group
 340          * might have other running tasks on different CPUs, but updating
 341          * their runtime can affect syscall performance, so we skip account
 342          * those pending times and rely only on values updated on tick or
 343          * other scheduler action.
 344          */
 345         if (same_thread_group(current, tsk))
 346                 (void) task_sched_runtime(current);
 347
 348         rcu_read_lock();
 349         /* Attempt a lockless read on the first round. */
 350         nextseq = 0;
 351         do {
 352                 seq = nextseq;
 353                 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
 354                 times->utime = sig->utime;
 355                 times->stime = sig->stime;
 356                 times->sum_exec_runtime = sig->sum_sched_runtime;
 357
 358                 for_each_thread(tsk, t) {
 359                         task_cputime(t, &utime, &stime);
 360                         times->utime += utime;
 361                         times->stime += stime;
 362                         times->sum_exec_runtime += read_sum_exec_runtime(t);
 363                 }
 364                 /* If lockless access failed, take the lock. */
 365                 nextseq = 1;
 366         } while (need_seqretry(&sig->stats_lock, seq));
 367         done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
 368         rcu_read_unlock();
 369 }
 370
 371 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 372 /*
 373  * Account a tick to a process and cpustat
 374  * @p: the process that the cpu time gets accounted to
 375  * @user_tick: is the tick from userspace
 376  * @rq: the pointer to rq
 377  *
 378  * Tick demultiplexing follows the order
 379  * - pending hardirq update
 380  * - pending softirq update
 381  * - user_time
 382  * - idle_time
 383  * - system time
 384  *   - check for guest_time
 385  *   - else account as system_time
 386  *
 387  * Check for hardirq is done both for system and user time as there is
 388  * no timer going off while we are on hardirq and hence we may never get an
 389  * opportunity to update it solely in system time.
 390  * p->stime and friends are only updated on system time and not on irq
 391  * softirq as those do not count in task exec_runtime any more.
 392  */
 393 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 394                                          struct rq *rq, int ticks)
 395 {
 396         u64 cputime = (__force u64) cputime_one_jiffy * ticks;
 397         cputime_t scaled, other;
 398
 399         /*
 400          * When returning from idle, many ticks can get accounted at
 401          * once, including some ticks of steal, irq, and softirq time.
 402          * Subtract those ticks from the amount of time accounted to
 403          * idle, or potentially user or system time. Due to rounding,
 404          * other time can exceed ticks occasionally.
 405          */
 406         other = account_other_time(ULONG_MAX);
 407         if (other >= cputime)
 408                 return;
 409         cputime -= other;
 410         scaled = cputime_to_scaled(cputime);
 411
 412         if (this_cpu_ksoftirqd() == p) {
 413                 /*
 414                  * ksoftirqd time do not get accounted in cpu_softirq_time.
 415                  * So, we have to handle it separately here.
 416                  * Also, p->stime needs to be updated for ksoftirqd.
 417                  */
 418                 __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
 419         } else if (user_tick) {
 420                 account_user_time(p, cputime, scaled);
 421         } else if (p == rq->idle) {
 422                 account_idle_time(cputime);
 423         } else if (p->flags & PF_VCPU) { /* System time or guest time */
 424                 account_guest_time(p, cputime, scaled);
 425         } else {
 426                 __account_system_time(p, cputime, scaled,       CPUTIME_SYSTEM);
 427         }
 428 }
 429
 430 static void irqtime_account_idle_ticks(int ticks)
 431 {
 432         struct rq *rq = this_rq();
 433
 434         irqtime_account_process_tick(current, 0, rq, ticks);
 435 }
 436 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 437 static inline void irqtime_account_idle_ticks(int ticks) {}
 438 static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 439                                                 struct rq *rq, int nr_ticks) {}
 440 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 441
 442 /*
 443  * Use precise platform statistics if available:
 444  */
 445 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 446
 447 #ifndef __ARCH_HAS_VTIME_TASK_SWITCH
 448 void vtime_common_task_switch(struct task_struct *prev)
 449 {
 450         if (is_idle_task(prev))
 451                 vtime_account_idle(prev);
 452         else
 453                 vtime_account_system(prev);
 454
 455 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 456         vtime_account_user(prev);
 457 #endif
 458         arch_vtime_task_switch(prev);
 459 }
 460 #endif
 461
 462 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 463
 464
 465 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 466 /*
 467  * Archs that account the whole time spent in the idle task
 468  * (outside irq) as idle time can rely on this and just implement
 469  * vtime_account_system() and vtime_account_idle(). Archs that
 470  * have other meaning of the idle time (s390 only includes the
 471  * time spent by the CPU when it's in low power mode) must override
 472  * vtime_account().
 473  */
 474 #ifndef __ARCH_HAS_VTIME_ACCOUNT
 475 void vtime_account_irq_enter(struct task_struct *tsk)
 476 {
 477         if (!in_interrupt() && is_idle_task(tsk))
 478                 vtime_account_idle(tsk);
 479         else
 480                 vtime_account_system(tsk);
 481 }
 482 EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 483 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 484
 485 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 486 {
 487         *ut = p->utime;
 488         *st = p->stime;
 489 }
 490 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 491
 492 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 493 {
 494         struct task_cputime cputime;
 495
 496         thread_group_cputime(p, &cputime);
 497
 498         *ut = cputime.utime;
 499         *st = cputime.stime;
 500 }
 501 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 502 /*
 503  * Account a single tick of cpu time.
 504  * @p: the process that the cpu time gets accounted to
 505  * @user_tick: indicates if the tick is a user or a system tick
 506  */
 507 void account_process_tick(struct task_struct *p, int user_tick)
 508 {
 509         cputime_t cputime, scaled, steal;
 510         struct rq *rq = this_rq();
 511
 512         if (vtime_accounting_cpu_enabled())
 513                 return;
 514
 515         if (sched_clock_irqtime) {
 516                 irqtime_account_process_tick(p, user_tick, rq, 1);
 517                 return;
 518         }
 519
 520         cputime = cputime_one_jiffy;
 521         steal = steal_account_process_time(ULONG_MAX);
 522
 523         if (steal >= cputime)
 524                 return;
 525
 526         cputime -= steal;
 527         scaled = cputime_to_scaled(cputime);
 528
 529         if (user_tick)
 530                 account_user_time(p, cputime, scaled);
 531         else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
 532                 account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
 533         else
 534                 account_idle_time(cputime);
 535 }
 536
 537 /*
 538  * Account multiple ticks of idle time.
 539  * @ticks: number of stolen ticks
 540  */
 541 void account_idle_ticks(unsigned long ticks)
 542 {
 543         cputime_t cputime, steal;
 544
 545         if (sched_clock_irqtime) {
 546                 irqtime_account_idle_ticks(ticks);
 547                 return;
 548         }
 549
 550         cputime = jiffies_to_cputime(ticks);
 551         steal = steal_account_process_time(ULONG_MAX);
 552
 553         if (steal >= cputime)
 554                 return;
 555
 556         cputime -= steal;
 557         account_idle_time(cputime);
 558 }
 559
 560 /*
 561  * Perform (stime * rtime) / total, but avoid multiplication overflow by
 562  * loosing precision when the numbers are big.
 563  */
 564 static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
 565 {
 566         u64 scaled;
 567
 568         for (;;) {
 569                 /* Make sure "rtime" is the bigger of stime/rtime */
 570                 if (stime > rtime)
 571                         swap(rtime, stime);
 572
 573                 /* Make sure 'total' fits in 32 bits */
 574                 if (total >> 32)
 575                         goto drop_precision;
 576
 577                 /* Does rtime (and thus stime) fit in 32 bits? */
 578                 if (!(rtime >> 32))
 579                         break;
 580
 581                 /* Can we just balance rtime/stime rather than dropping bits? */
 582                 if (stime >> 31)
 583                         goto drop_precision;
 584
 585                 /* We can grow stime and shrink rtime and try to make them both fit */
 586                 stime <<= 1;
 587                 rtime >>= 1;
 588                 continue;
 589
 590 drop_precision:
 591                 /* We drop from rtime, it has more bits than stime */
 592                 rtime >>= 1;
 593                 total >>= 1;
 594         }
 595
 596         /*
 597          * Make sure gcc understands that this is a 32x32->64 multiply,
 598          * followed by a 64/32->64 divide.
 599          */
 600         scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
 601         return (__force cputime_t) scaled;
 602 }
 603
 604 /*
 605  * Adjust tick based cputime random precision against scheduler runtime
 606  * accounting.
 607  *
 608  * Tick based cputime accounting depend on random scheduling timeslices of a
 609  * task to be interrupted or not by the timer.  Depending on these
 610  * circumstances, the number of these interrupts may be over or
 611  * under-optimistic, matching the real user and system cputime with a variable
 612  * precision.
 613  *
 614  * Fix this by scaling these tick based values against the total runtime
 615  * accounted by the CFS scheduler.
 616  *
 617  * This code provides the following guarantees:
 618  *
 619  *   stime + utime == rtime
 620  *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 621  *
 622  * Assuming that rtime_i+1 >= rtime_i.
 623  */
 624 static void cputime_adjust(struct task_cputime *curr,
 625                            struct prev_cputime *prev,
 626                            cputime_t *ut, cputime_t *st)
 627 {
 628         cputime_t rtime, stime, utime;
 629         unsigned long flags;
 630
 631         /* Serialize concurrent callers such that we can honour our guarantees */
 632         raw_spin_lock_irqsave(&prev->lock, flags);
 633         rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 634
 635         /*
 636          * This is possible under two circumstances:
 637          *  - rtime isn't monotonic after all (a bug);
 638          *  - we got reordered by the lock.
 639          *
 640          * In both cases this acts as a filter such that the rest of the code
 641          * can assume it is monotonic regardless of anything else.
 642          */
 643         if (prev->stime + prev->utime >= rtime)
 644                 goto out;
 645
 646         stime = curr->stime;
 647         utime = curr->utime;
 648
 649         /*
 650          * If either stime or both stime and utime are 0, assume all runtime is
 651          * userspace. Once a task gets some ticks, the monotonicy code at
 652          * 'update' will ensure things converge to the observed ratio.
 653          */
 654         if (stime == 0) {
 655                 utime = rtime;
 656                 goto update;
 657         }
 658
 659         if (utime == 0) {
 660                 stime = rtime;
 661                 goto update;
 662         }
 663
 664         stime = scale_stime((__force u64)stime, (__force u64)rtime,
 665                             (__force u64)(stime + utime));
 666
 667 update:
 668         /*
 669          * Make sure stime doesn't go backwards; this preserves monotonicity
 670          * for utime because rtime is monotonic.
 671          *
 672          *  utime_i+1 = rtime_i+1 - stime_i
 673          *            = rtime_i+1 - (rtime_i - utime_i)
 674          *            = (rtime_i+1 - rtime_i) + utime_i
 675          *            >= utime_i
 676          */
 677         if (stime < prev->stime)
 678                 stime = prev->stime;
 679         utime = rtime - stime;
 680
 681         /*
 682          * Make sure utime doesn't go backwards; this still preserves
 683          * monotonicity for stime, analogous argument to above.
 684          */
 685         if (utime < prev->utime) {
 686                 utime = prev->utime;
 687                 stime = rtime - utime;
 688         }
 689
 690         prev->stime = stime;
 691         prev->utime = utime;
 692 out:
 693         *ut = prev->utime;
 694         *st = prev->stime;
 695         raw_spin_unlock_irqrestore(&prev->lock, flags);
 696 }
 697
 698 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 699 {
 700         struct task_cputime cputime = {
 701                 .sum_exec_runtime = p->se.sum_exec_runtime,
 702         };
 703
 704         task_cputime(p, &cputime.utime, &cputime.stime);
 705         cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 706 }
 707 EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 708
 709 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 710 {
 711         struct task_cputime cputime;
 712
 713         thread_group_cputime(p, &cputime);
 714         cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
 715 }
 716 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 717
 718 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 719 static cputime_t vtime_delta(struct task_struct *tsk)
 720 {
 721         unsigned long now = READ_ONCE(jiffies);
 722
 723         if (time_before(now, (unsigned long)tsk->vtime_snap))
 724                 return 0;
 725
 726         return jiffies_to_cputime(now - tsk->vtime_snap);
 727 }
 728
 729 static cputime_t get_vtime_delta(struct task_struct *tsk)
 730 {
 731         unsigned long now = READ_ONCE(jiffies);
 732         cputime_t delta, other;
 733
 734         /*
 735          * Unlike tick based timing, vtime based timing never has lost
 736          * ticks, and no need for steal time accounting to make up for
 737          * lost ticks. Vtime accounts a rounded version of actual
 738          * elapsed time. Limit account_other_time to prevent rounding
 739          * errors from causing elapsed vtime to go negative.
 740          */
 741         delta = jiffies_to_cputime(now - tsk->vtime_snap);
 742         other = account_other_time(delta);
 743         WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
 744         tsk->vtime_snap = now;
 745
 746         return delta - other;
 747 }
 748
 749 static void __vtime_account_system(struct task_struct *tsk)
 750 {
 751         cputime_t delta_cpu = get_vtime_delta(tsk);
 752
 753         account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
 754 }
 755
 756 void vtime_account_system(struct task_struct *tsk)
 757 {
 758         if (!vtime_delta(tsk))
 759                 return;
 760
 761         write_seqcount_begin(&tsk->vtime_seqcount);
 762         __vtime_account_system(tsk);
 763         write_seqcount_end(&tsk->vtime_seqcount);
 764 }
 765
 766 void vtime_account_user(struct task_struct *tsk)
 767 {
 768         cputime_t delta_cpu;
 769
 770         write_seqcount_begin(&tsk->vtime_seqcount);
 771         tsk->vtime_snap_whence = VTIME_SYS;
 772         if (vtime_delta(tsk)) {
 773                 delta_cpu = get_vtime_delta(tsk);
 774                 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
 775         }
 776         write_seqcount_end(&tsk->vtime_seqcount);
 777 }
 778
 779 void vtime_user_enter(struct task_struct *tsk)
 780 {
 781         write_seqcount_begin(&tsk->vtime_seqcount);
 782         if (vtime_delta(tsk))
 783                 __vtime_account_system(tsk);
 784         tsk->vtime_snap_whence = VTIME_USER;
 785         write_seqcount_end(&tsk->vtime_seqcount);
 786 }
 787
 788 void vtime_guest_enter(struct task_struct *tsk)
 789 {
 790         /*
 791          * The flags must be updated under the lock with
 792          * the vtime_snap flush and update.
 793          * That enforces a right ordering and update sequence
 794          * synchronization against the reader (task_gtime())
 795          * that can thus safely catch up with a tickless delta.
 796          */
 797         write_seqcount_begin(&tsk->vtime_seqcount);
 798         if (vtime_delta(tsk))
 799                 __vtime_account_system(tsk);
 800         current->flags |= PF_VCPU;
 801         write_seqcount_end(&tsk->vtime_seqcount);
 802 }
 803 EXPORT_SYMBOL_GPL(vtime_guest_enter);
 804
 805 void vtime_guest_exit(struct task_struct *tsk)
 806 {
 807         write_seqcount_begin(&tsk->vtime_seqcount);
 808         __vtime_account_system(tsk);
 809         current->flags &= ~PF_VCPU;
 810         write_seqcount_end(&tsk->vtime_seqcount);
 811 }
 812 EXPORT_SYMBOL_GPL(vtime_guest_exit);
 813
 814 void vtime_account_idle(struct task_struct *tsk)
 815 {
 816         cputime_t delta_cpu = get_vtime_delta(tsk);
 817
 818         account_idle_time(delta_cpu);
 819 }
 820
 821 void arch_vtime_task_switch(struct task_struct *prev)
 822 {
 823         write_seqcount_begin(&prev->vtime_seqcount);
 824         prev->vtime_snap_whence = VTIME_INACTIVE;
 825         write_seqcount_end(&prev->vtime_seqcount);
 826
 827         write_seqcount_begin(&current->vtime_seqcount);
 828         current->vtime_snap_whence = VTIME_SYS;
 829         current->vtime_snap = jiffies;
 830         write_seqcount_end(&current->vtime_seqcount);
 831 }
 832
 833 void vtime_init_idle(struct task_struct *t, int cpu)
 834 {
 835         unsigned long flags;
 836
 837         local_irq_save(flags);
 838         write_seqcount_begin(&t->vtime_seqcount);
 839         t->vtime_snap_whence = VTIME_SYS;
 840         t->vtime_snap = jiffies;
 841         write_seqcount_end(&t->vtime_seqcount);
 842         local_irq_restore(flags);
 843 }
 844
 845 cputime_t task_gtime(struct task_struct *t)
 846 {
 847         unsigned int seq;
 848         cputime_t gtime;
 849
 850         if (!vtime_accounting_enabled())
 851                 return t->gtime;
 852
 853         do {
 854                 seq = read_seqcount_begin(&t->vtime_seqcount);
 855
 856                 gtime = t->gtime;
 857                 if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
 858                         gtime += vtime_delta(t);
 859
 860         } while (read_seqcount_retry(&t->vtime_seqcount, seq));
 861
 862         return gtime;
 863 }
 864
 865 /*
 866  * Fetch cputime raw values from fields of task_struct and
 867  * add up the pending nohz execution time since the last
 868  * cputime snapshot.
 869  */
 870 static void
 871 fetch_task_cputime(struct task_struct *t,
 872                    cputime_t *u_dst, cputime_t *s_dst,
 873                    cputime_t *u_src, cputime_t *s_src,
 874                    cputime_t *udelta, cputime_t *sdelta)
 875 {
 876         unsigned int seq;
 877         unsigned long long delta;
 878
 879         do {
 880                 *udelta = 0;
 881                 *sdelta = 0;
 882
 883                 seq = read_seqcount_begin(&t->vtime_seqcount);
 884
 885                 if (u_dst)
 886                         *u_dst = *u_src;
 887                 if (s_dst)
 888                         *s_dst = *s_src;
 889
 890                 /* Task is sleeping, nothing to add */
 891                 if (t->vtime_snap_whence == VTIME_INACTIVE ||
 892                     is_idle_task(t))
 893                         continue;
 894
 895                 delta = vtime_delta(t);
 896
 897                 /*
 898                  * Task runs either in user or kernel space, add pending nohz time to
 899                  * the right place.
 900                  */
 901                 if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
 902                         *udelta = delta;
 903                 } else {
 904                         if (t->vtime_snap_whence == VTIME_SYS)
 905                                 *sdelta = delta;
 906                 }
 907         } while (read_seqcount_retry(&t->vtime_seqcount, seq));
 908 }
 909
 910
 911 void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
 912 {
 913         cputime_t udelta, sdelta;
 914
 915         if (!vtime_accounting_enabled()) {
 916                 if (utime)
 917                         *utime = t->utime;
 918                 if (stime)
 919                         *stime = t->stime;
 920                 return;
 921         }
 922
 923         fetch_task_cputime(t, utime, stime, &t->utime,
 924                            &t->stime, &udelta, &sdelta);
 925         if (utime)
 926                 *utime += udelta;
 927         if (stime)
 928                 *stime += sdelta;
 929 }
 930
 931 void task_cputime_scaled(struct task_struct *t,
 932                          cputime_t *utimescaled, cputime_t *stimescaled)
 933 {
 934         cputime_t udelta, sdelta;
 935
 936         if (!vtime_accounting_enabled()) {
 937                 if (utimescaled)
 938                         *utimescaled = t->utimescaled;
 939                 if (stimescaled)
 940                         *stimescaled = t->stimescaled;
 941                 return;
 942         }
 943
 944         fetch_task_cputime(t, utimescaled, stimescaled,
 945                            &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
 946         if (utimescaled)
 947                 *utimescaled += cputime_to_scaled(udelta);
 948         if (stimescaled)
 949                 *stimescaled += cputime_to_scaled(sdelta);
 950 }
 951 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */