arch/x86/kernel/tsc_sync.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * check TSC synchronization.
   4  *
   5  * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
   6  *
   7  * We check whether all boot CPUs have their TSC's synchronized,
   8  * print a warning if not and turn off the TSC clock-source.
   9  *
  10  * The warp-check is point-to-point between two CPUs, the CPU
  11  * initiating the bootup is the 'source CPU', the freshly booting
  12  * CPU is the 'target CPU'.
  13  *
  14  * Only two CPUs may participate - they can enter in any order.
  15  * ( The serial nature of the boot logic and the CPU hotplug lock
  16  *   protects against more than 2 CPUs entering this code. )
  17  */
  18 #include <linux/topology.h>
  19 #include <linux/spinlock.h>
  20 #include <linux/kernel.h>
  21 #include <linux/smp.h>
  22 #include <linux/nmi.h>
  23 #include <asm/tsc.h>
  24
/*
 * Per-CPU bookkeeping for the IA32_TSC_ADJUST MSR.
 */
struct tsc_adjust {
        /* MSR value read when the CPU went through the store-and-check path */
        s64             bootval;
        /* Value the MSR is expected to hold; written back on mismatch */
        s64             adjusted;
        /* jiffies value before which tsc_verify_tsc_adjust() skips the MSR read */
        unsigned long   nextcheck;
        /* Set once a mismatch has been reported for this CPU */
        bool            warned;
};

/* One tsc_adjust record per CPU */
static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
/* Timer that periodically runs tsc_verify_tsc_adjust(), hopping between CPUs */
static struct timer_list tsc_sync_check_timer;
  34
/*
 * TSCs on different sockets may be reset asynchronously.
 * This may cause the TSC ADJUST value on socket 0 to be non-zero, so
 * when this flag is set a non-zero boot value is not forced to 0.
 */
bool __read_mostly tsc_async_resets;
  40
  41 void mark_tsc_async_resets(char *reason)
  42 {
  43         if (tsc_async_resets)
  44                 return;
  45         tsc_async_resets = true;
  46         pr_info("tsc: Marking TSC async resets true due to %s\n", reason);
  47 }
  48
  49 void tsc_verify_tsc_adjust(bool resume)
  50 {
  51         struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
  52         s64 curval;
  53
  54         if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
  55                 return;
  56
  57         /* Skip unnecessary error messages if TSC already unstable */
  58         if (check_tsc_unstable())
  59                 return;
  60
  61         /* Rate limit the MSR check */
  62         if (!resume && time_before(jiffies, adj->nextcheck))
  63                 return;
  64
  65         adj->nextcheck = jiffies + HZ;
  66
  67         rdmsrl(MSR_IA32_TSC_ADJUST, curval);
  68         if (adj->adjusted == curval)
  69                 return;
  70
  71         /* Restore the original value */
  72         wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted);
  73
  74         if (!adj->warned || resume) {
  75                 pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
  76                         smp_processor_id(), adj->adjusted, curval);
  77                 adj->warned = true;
  78         }
  79 }
  80
/*
 * Normally the TSC_ADJUST value is checked every time the system enters
 * the idle state, but there is still a caveat: a system may never enter
 * idle, either because it is too busy or because it is purposely
 * configured not to.
 *
 * So set up a periodic timer (every 10 minutes) to make sure the check
 * always runs.
 */
  90
  91 #define SYNC_CHECK_INTERVAL             (HZ * 600)
  92
  93 static void tsc_sync_check_timer_fn(struct timer_list *unused)
  94 {
  95         int next_cpu;
  96
  97         tsc_verify_tsc_adjust(false);
  98
  99         /* Run the check for all onlined CPUs in turn */
 100         next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
 101         if (next_cpu >= nr_cpu_ids)
 102                 next_cpu = cpumask_first(cpu_online_mask);
 103
 104         tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
 105         add_timer_on(&tsc_sync_check_timer, next_cpu);
 106 }
 107
 108 static int __init start_sync_check_timer(void)
 109 {
 110         if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
 111                 return 0;
 112
 113         timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
 114         tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
 115         add_timer(&tsc_sync_check_timer);
 116
 117         return 0;
 118 }
 119 late_initcall(start_sync_check_timer);
 120
 121 static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
 122                                    unsigned int cpu, bool bootcpu)
 123 {
 124         /*
 125          * First online CPU in a package stores the boot value in the
 126          * adjustment value. This value might change later via the sync
 127          * mechanism. If that fails we still can yell about boot values not
 128          * being consistent.
 129          *
 130          * On the boot cpu we just force set the ADJUST value to 0 if it's
 131          * non zero. We don't do that on non boot cpus because physical
 132          * hotplug should have set the ADJUST register to a value > 0 so
 133          * the TSC is in sync with the already running cpus.
 134          *
 135          * Also don't force the ADJUST value to zero if that is a valid value
 136          * for socket 0 as determined by the system arch.  This is required
 137          * when multiple sockets are reset asynchronously with each other
 138          * and socket 0 may not have an TSC ADJUST value of 0.
 139          */
 140         if (bootcpu && bootval != 0) {
 141                 if (likely(!tsc_async_resets)) {
 142                         pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n",
 143                                 cpu, bootval);
 144                         wrmsrl(MSR_IA32_TSC_ADJUST, 0);
 145                         bootval = 0;
 146                 } else {
 147                         pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n",
 148                                 cpu, bootval);
 149                 }
 150         }
 151         cur->adjusted = bootval;
 152 }
 153
 154 #ifndef CONFIG_SMP
 155 bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
 156 {
 157         struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
 158         s64 bootval;
 159
 160         if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
 161                 return false;
 162
 163         /* Skip unnecessary error messages if TSC already unstable */
 164         if (check_tsc_unstable())
 165                 return false;
 166
 167         rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
 168         cur->bootval = bootval;
 169         cur->nextcheck = jiffies + HZ;
 170         tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
 171         return false;
 172 }
 173
 174 #else /* !CONFIG_SMP */
 175
 176 /*
 177  * Store and check the TSC ADJUST MSR if available
 178  */
 179 bool tsc_store_and_check_tsc_adjust(bool bootcpu)
 180 {
 181         struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust);
 182         unsigned int refcpu, cpu = smp_processor_id();
 183         struct cpumask *mask;
 184         s64 bootval;
 185
 186         if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
 187                 return false;
 188
 189         rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
 190         cur->bootval = bootval;
 191         cur->nextcheck = jiffies + HZ;
 192         cur->warned = false;
 193
 194         /*
 195          * If a non-zero TSC value for socket 0 may be valid then the default
 196          * adjusted value cannot assumed to be zero either.
 197          */
 198         if (tsc_async_resets)
 199                 cur->adjusted = bootval;
 200
 201         /*
 202          * Check whether this CPU is the first in a package to come up. In
 203          * this case do not check the boot value against another package
 204          * because the new package might have been physically hotplugged,
 205          * where TSC_ADJUST is expected to be different. When called on the
 206          * boot CPU topology_core_cpumask() might not be available yet.
 207          */
 208         mask = topology_core_cpumask(cpu);
 209         refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;
 210
 211         if (refcpu >= nr_cpu_ids) {
 212                 tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(),
 213                                        bootcpu);
 214                 return false;
 215         }
 216
 217         ref = per_cpu_ptr(&tsc_adjust, refcpu);
 218         /*
 219          * Compare the boot value and complain if it differs in the
 220          * package.
 221          */
 222         if (bootval != ref->bootval)
 223                 printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n");
 224
 225         /*
 226          * The TSC_ADJUST values in a package must be the same. If the boot
 227          * value on this newly upcoming CPU differs from the adjustment
 228          * value of the already online CPU in this package, set it to that
 229          * adjusted value.
 230          */
 231         if (bootval != ref->adjusted) {
 232                 cur->adjusted = ref->adjusted;
 233                 wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
 234         }
 235         /*
 236          * We have the TSCs forced to be in sync on this package. Skip sync
 237          * test:
 238          */
 239         return true;
 240 }
 241
/*
 * Entry/exit counters that make sure that both CPUs
 * run the measurement code at once:
 */
/* Bumped by the target on entry and by the source to release it; reset by the source */
static atomic_t start_count;
/* Bumped by the target after measuring and by the source when done; reset by the target */
static atomic_t stop_count;
/* Incremented by the target when it skips the test; cleared by the source */
static atomic_t skip_test;
/* Remaining measurement runs; set by the source, 0 means no retry */
static atomic_t test_runs;

/*
 * We use a raw spinlock in this exceptional case, because
 * we want to have the fastest, inlined, non-debug version
 * of a critical section, to be able to prove TSC time-warps:
 */
static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;

/* Most recent TSC value read under sync_lock, by either CPU */
static cycles_t last_tsc;
/* Largest backwards step (prev - now) observed during the current test */
static cycles_t max_warp;
/* Total number of warps observed by both CPUs during the current test */
static int nr_warps;
/* Warps seen by a CPU whose last-known nr_warps differed from the global count */
static int random_warps;
 262
/*
 * TSC-warp measurement loop running on both CPUs.  This is not called
 * if there is no TSC.
 *
 * @timeout: duration of the measurement in msecs.
 *
 * Returns the value of the global max_warp as of the last warp that the
 * calling CPU itself observed, or 0 if it observed none.
 */
static cycles_t check_tsc_warp(unsigned int timeout)
{
        cycles_t start, now, prev, end, cur_max_warp = 0;
        int i, cur_warps = 0;

        start = rdtsc_ordered();
        /*
         * The measurement runs for 'timeout' msecs:
         */
        end = start + (cycles_t) tsc_khz * timeout;

        for (i = 0; ; i++) {
                /*
                 * We take the global lock, measure TSC, save the
                 * previous TSC that was measured (possibly on
                 * another CPU) and update the previous TSC timestamp.
                 */
                arch_spin_lock(&sync_lock);
                prev = last_tsc;
                now = rdtsc_ordered();
                last_tsc = now;
                arch_spin_unlock(&sync_lock);

                /*
                 * Be nice every now and then (and also check whether
                 * measurement is done [we also insert a 10 million
                 * loops safety exit, so we don't lock up in case the
                 * TSC readout is totally broken]). This runs on every
                 * 8th iteration only:
                 */
                if (unlikely(!(i & 7))) {
                        if (now > end || i > 10000000)
                                break;
                        cpu_relax();
                        touch_nmi_watchdog();
                }
                /*
                 * Outside the critical section we can now see whether
                 * we saw a time-warp of the TSC going backwards:
                 */
                if (unlikely(prev > now)) {
                        arch_spin_lock(&sync_lock);
                        max_warp = max(max_warp, prev - now);
                        cur_max_warp = max_warp;
                        /*
                         * Check whether this bounces back and forth. Only
                         * one CPU should observe time going backwards.
                         * cur_warps is this CPU's snapshot of nr_warps from
                         * its own previous warp; a difference means nr_warps
                         * changed in between.
                         */
                        if (cur_warps != nr_warps)
                                random_warps++;
                        nr_warps++;
                        cur_warps = nr_warps;
                        arch_spin_unlock(&sync_lock);
                }
        }
        /* A zero delta means the TSC did not advance during the whole loop */
        WARN(!(now-start),
                "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
                        now-start, end-start);
        return cur_max_warp;
}
 326
 327 /*
 328  * If the target CPU coming online doesn't have any of its core-siblings
 329  * online, a timeout of 20msec will be used for the TSC-warp measurement
 330  * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
 331  * information about this socket already (and this information grows as we
 332  * have more and more logical-siblings in that socket).
 333  *
 334  * Ideally we should be able to skip the TSC sync check on the other
 335  * core-siblings, if the first logical CPU in a socket passed the sync test.
 336  * But as the TSC is per-logical CPU and can potentially be modified wrongly
 337  * by the bios, TSC sync test for smaller duration should be able
 338  * to catch such errors. Also this will catch the condition where all the
 339  * cores in the socket don't get reset at the same time.
 340  */
 341 static inline unsigned int loop_timeout(int cpu)
 342 {
 343         return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
 344 }
 345
/*
 * Source CPU calls into this - it waits for the freshly booted
 * target CPU to arrive and then starts the measurement.
 *
 * @cpu: number of the target CPU; used to pick the measurement timeout
 *       and in the log messages.
 *
 * Marks the TSC unstable when warps remain after the last permitted run
 * or when random warps were observed.
 */
void check_tsc_sync_source(int cpu)
{
        int cpus = 2;

        /*
         * No need to check if we already know that the TSC is not
         * synchronized or if we have no TSC.
         */
        if (unsynchronized_tsc())
                return;

        /*
         * Set the maximum number of test runs to
         *  1 if the CPU does not provide the TSC_ADJUST MSR
         *  3 if the MSR is available, so the target can try to adjust
         */
        if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
                atomic_set(&test_runs, 1);
        else
                atomic_set(&test_runs, 3);
retry:
        /*
         * Wait for the target to start or to skip the test:
         */
        while (atomic_read(&start_count) != cpus - 1) {
                if (atomic_read(&skip_test) > 0) {
                        /* Consume the skip request so the next test starts clean */
                        atomic_set(&skip_test, 0);
                        return;
                }
                cpu_relax();
        }

        /*
         * Trigger the target to continue into the measurement too:
         */
        atomic_inc(&start_count);

        check_tsc_warp(loop_timeout(cpu));

        /* Wait until the target has finished its measurement loop */
        while (atomic_read(&stop_count) != cpus-1)
                cpu_relax();

        /*
         * If the test was successful set the number of runs to zero and
         * stop. If not, decrement the number of runs and check if we can
         * retry. In case of random warps no retry is attempted.
         */
        if (!nr_warps) {
                atomic_set(&test_runs, 0);

                pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
                        smp_processor_id(), cpu);

        } else if (atomic_dec_and_test(&test_runs) || random_warps) {
                /* Force it to 0 if random warps brought us here */
                atomic_set(&test_runs, 0);

                pr_warn("TSC synchronization [CPU#%d -> CPU#%d]:\n",
                        smp_processor_id(), cpu);
                pr_warn("Measured %Ld cycles TSC warp between CPUs, "
                        "turning off TSC clock.\n", max_warp);
                if (random_warps)
                        pr_warn("TSC warped randomly between CPUs\n");
                mark_tsc_unstable("check_tsc_sync_source failed");
        }

        /*
         * Reset it - just in case we boot another CPU later:
         */
        atomic_set(&start_count, 0);
        random_warps = 0;
        nr_warps = 0;
        max_warp = 0;
        last_tsc = 0;

        /*
         * Let the target continue with the bootup:
         */
        atomic_inc(&stop_count);

        /*
         * Retry, if there is a chance to do so.
         */
        if (atomic_read(&test_runs) > 0)
                goto retry;
}
 436
/*
 * Freshly booted CPUs call into this. It runs the warp measurement
 * together with the source CPU and, while test runs remain after a
 * failed run, adjusts this CPU's TSC_ADJUST MSR by the observed warp
 * and retries.
 */
void check_tsc_sync_target(void)
{
        struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
        unsigned int cpu = smp_processor_id();
        cycles_t cur_max_warp, gbl_max_warp;
        int cpus = 2;

        /* Also aborts if there is no TSC. */
        if (unsynchronized_tsc())
                return;

        /*
         * Store, verify and sanitize the TSC adjust register. If
         * successful skip the test.
         *
         * The test is also skipped when the TSC is marked reliable. This
         * is true for SoCs which have no fallback clocksource. On these
         * SoCs the TSC is frequency synchronized, but still the TSC ADJUST
         * register might have been wrecked by the BIOS.
         */
        if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) {
                /* Tell the waiting source CPU that no measurement will happen */
                atomic_inc(&skip_test);
                return;
        }

retry:
        /*
         * Register this CPU's participation and wait for the
         * source CPU to start the measurement:
         */
        atomic_inc(&start_count);
        while (atomic_read(&start_count) != cpus)
                cpu_relax();

        cur_max_warp = check_tsc_warp(loop_timeout(cpu));

        /*
         * Store the maximum observed warp value for a potential retry.
         * This must happen before stop_count is incremented, because the
         * source CPU resets max_warp once it sees that increment:
         */
        gbl_max_warp = max_warp;

        /*
         * Ok, we are done:
         */
        atomic_inc(&stop_count);

        /*
         * Wait for the source CPU to print stuff:
         */
        while (atomic_read(&stop_count) != cpus)
                cpu_relax();

        /*
         * Reset it for the next sync test:
         */
        atomic_set(&stop_count, 0);

        /*
         * Check the number of remaining test runs. If not zero, the test
         * failed and a retry with adjusted TSC is possible. If zero the
         * test was either successful or failed terminally.
         */
        if (!atomic_read(&test_runs))
                return;

        /*
         * If the warp value of this CPU is 0, then the other CPU
         * observed time going backwards so this TSC was ahead and
         * needs to move backwards.
         */
        if (!cur_max_warp)
                cur_max_warp = -gbl_max_warp;

        /*
         * Add the result to the previous adjustment value.
         *
         * The adjustment value is slightly off by the overhead of the
         * sync mechanism (observed values are ~200 TSC cycles), but this
         * really depends on CPU, node distance and frequency. So
         * compensating for this is hard to get right. Experiments show
         * that the warp is no longer detectable when the observed warp
         * value is used. In the worst case the adjustment needs to go
         * through a 3rd run for fine tuning.
         */
        cur->adjusted += cur_max_warp;

        pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
                cpu, cur_max_warp, cur->adjusted);

        wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
        goto retry;

}
 533
 534 #endif /* CONFIG_SMP */