arch/x86/events/intel/cqm.c

   1 /*
   2  * Intel Cache Quality-of-Service Monitoring (CQM) support.
   3  *
   4  * Based very, very heavily on work by Peter Zijlstra.
   5  */
   6
   7 #include <linux/perf_event.h>
   8 #include <linux/slab.h>
   9 #include <asm/cpu_device_id.h>
  10 #include <asm/intel_rdt_common.h>
  11 #include "../perf_event.h"
  12
/*
 * Monitoring MSRs: an event ID and an RMID are written to IA32_QM_EVTSEL,
 * after which the selected counter value is read from IA32_QM_CTR.
 */
#define MSR_IA32_QM_CTR         0x0c8e
#define MSR_IA32_QM_EVTSEL      0x0c8d

/* Width in bits of an MBM counter value; used when computing deltas. */
#define MBM_CNTR_WIDTH          24
/*
 * Guaranteed time in ms as per SDM where MBM counters will not overflow.
 */
#define MBM_CTR_OVERFLOW_TIME   1000

/* Highest usable RMID; RMIDs 0..cqm_max_rmid each get a cqm_rmid_entry. */
static u32 cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */
/* Feature flags; cqm_enabled is cleared again by cqm_cleanup(). */
static bool cqm_enabled, mbm_enabled;
/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * number of sockets the MBM state is sized for - confirm against the users.
 */
unsigned int mbm_socket_max;
  26
/*
 * The cached intel_pqr_state is strictly per CPU and can never be
 * updated from a remote CPU. Both functions which modify the state
 * (intel_cqm_event_start and intel_cqm_event_stop) are called with
 * interrupts disabled, which is sufficient for the protection.
 */
DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
/*
 * NOTE(review): allocation and use are outside this part of the file;
 * presumably the timers that poll the MBM counters - confirm.
 */
static struct hrtimer *mbm_timers;
/**
 * struct sample - mbm event's (local or total) data
 * @total_bytes: #bytes since we began monitoring
 * @prev_msr:    previous value of MSR
 */
struct sample {
        u64     total_bytes;
        u64     prev_msr;
};
  44
/*
 * Samples profiled for total memory bandwidth type events.
 * Indexed with rmid_2_index().
 */
static struct sample *mbm_total;
/*
 * Samples profiled for local memory bandwidth type events.
 * Indexed with rmid_2_index().
 */
static struct sample *mbm_local;
  53
  54 #define pkg_id  topology_physical_package_id(smp_processor_id())
  55 /*
  56  * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array.
  57  * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of
  58  * rmids per socket, an example is given below
  59  * RMID1 of Socket0:  vrmid =  1
  60  * RMID1 of Socket1:  vrmid =  1 * (cqm_max_rmid + 1) + 1
  61  * RMID1 of Socket2:  vrmid =  2 * (cqm_max_rmid + 1) + 1
  62  */
  63 #define rmid_2_index(rmid)  ((pkg_id * (cqm_max_rmid + 1)) + rmid)
/*
 * Protects cache_groups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
 * Also protects event->hw.cqm_rmid
 *
 * Hold either for stability, both for modification of ->hw.cqm_rmid.
 */
static DEFINE_MUTEX(cache_mutex);
static DEFINE_RAW_SPINLOCK(cache_lock);

/*
 * Groups of events that have the same target(s), one RMID per group.
 */
static LIST_HEAD(cache_groups);

/*
 * Mask of CPUs for reading CQM values. We only need one per-socket.
 */
static cpumask_t cqm_cpumask;
  82
/*
 * Status bits in the value read from IA32_QM_CTR; when either is set the
 * value does not carry a usable counter reading.
 */
#define RMID_VAL_ERROR          (1ULL << 63)
#define RMID_VAL_UNAVAIL        (1ULL << 62)

/*
 * Event IDs are used to program IA32_QM_EVTSEL before reading event
 * counter from IA32_QM_CTR
 */
#define QOS_L3_OCCUP_EVENT_ID   0x01
#define QOS_MBM_TOTAL_EVENT_ID  0x02
#define QOS_MBM_LOCAL_EVENT_ID  0x03
  93
/*
 * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
 *
 * This rmid is always free and is guaranteed to have an associated
 * near-zero occupancy value, i.e. no cachelines are tagged with this
 * RMID, once __intel_cqm_rmid_rotate() returns.
 */
static u32 intel_cqm_rotation_rmid;

/*
 * "No RMID assigned" sentinel. RMIDs are stored as u32, so comparisons
 * against this value see it as the all-ones 32-bit pattern.
 */
#define INVALID_RMID            (-1)
 104
 105 /*
 106  * Is @rmid valid for programming the hardware?
 107  *
 108  * rmid 0 is reserved by the hardware for all non-monitored tasks, which
 109  * means that we should never come across an rmid with that value.
 110  * Likewise, an rmid value of -1 is used to indicate "no rmid currently
 111  * assigned" and is used as part of the rotation code.
 112  */
 113 static inline bool __rmid_valid(u32 rmid)
 114 {
 115         if (!rmid || rmid == INVALID_RMID)
 116                 return false;
 117
 118         return true;
 119 }
 120
 121 static u64 __rmid_read(u32 rmid)
 122 {
 123         u64 val;
 124
 125         /*
 126          * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
 127          * it just says that to increase confusion.
 128          */
 129         wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
 130         rdmsrl(MSR_IA32_QM_CTR, val);
 131
 132         /*
 133          * Aside from the ERROR and UNAVAIL bits, assume this thing returns
 134          * the number of cachelines tagged with @rmid.
 135          */
 136         return val;
 137 }
 138
/*
 * Recycling state of an RMID sitting on the limbo list.
 *
 * RMID_YOUNG:     set by __put_rmid() when the RMID enters limbo.
 * RMID_AVAILABLE: set by intel_cqm_rmid_stabilize() once the RMID has been
 *                 queued for the minimum queue time.
 * RMID_DIRTY:     set by intel_cqm_stable() when the occupancy read for the
 *                 RMID is above __intel_cqm_threshold.
 */
enum rmid_recycle_state {
        RMID_YOUNG = 0,
        RMID_AVAILABLE,
        RMID_DIRTY,
};

/*
 * Bookkeeping for one RMID.
 *
 * rmid:       the RMID value; equals the entry's index in cqm_rmid_ptrs[].
 * state:      recycling state, see enum rmid_recycle_state.
 * list:       links the entry on cqm_rmid_free_lru or cqm_rmid_limbo_lru.
 * queue_time: jiffies value recorded when the entry was put into limbo.
 */
struct cqm_rmid_entry {
        u32 rmid;
        enum rmid_recycle_state state;
        struct list_head list;
        unsigned long queue_time;
};
 151
/*
 * cqm_rmid_free_lru - A least recently used list of RMIDs.
 *
 * Oldest entry at the head, newest (most recently used) entry at the
 * tail. This list is never traversed, it's only used to keep track of
 * the lru order. That is, we only pick entries off the head or insert
 * them on the tail.
 *
 * All entries on the list are 'free', and their RMIDs are not currently
 * in use. To mark an RMID as in use, remove its entry from the lru
 * list.
 *
 *
 * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
 *
 * This list contains RMIDs that no one is currently using but that
 * may have a non-zero occupancy value associated with them. The
 * rotation worker moves RMIDs from the limbo list to the free list once
 * the occupancy value drops below __intel_cqm_threshold.
 *
 * Both lists are protected by cache_mutex.
 */
static LIST_HEAD(cqm_rmid_free_lru);
static LIST_HEAD(cqm_rmid_limbo_lru);
 176
/*
 * We use a simple array of pointers so that we can lookup a struct
 * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
 * and __put_rmid() from having to worry about dealing with struct
 * cqm_rmid_entry - they just deal with rmids, i.e. integers.
 *
 * Once this array is initialized it is read-only. No locks are required
 * to access it.
 *
 * All entries for all RMIDs can be looked up in this array at all
 * times.
 */
static struct cqm_rmid_entry **cqm_rmid_ptrs;
 190
 191 static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
 192 {
 193         struct cqm_rmid_entry *entry;
 194
 195         entry = cqm_rmid_ptrs[rmid];
 196         WARN_ON(entry->rmid != rmid);
 197
 198         return entry;
 199 }
 200
 201 /*
 202  * Returns < 0 on fail.
 203  *
 204  * We expect to be called with cache_mutex held.
 205  */
 206 static u32 __get_rmid(void)
 207 {
 208         struct cqm_rmid_entry *entry;
 209
 210         lockdep_assert_held(&cache_mutex);
 211
 212         if (list_empty(&cqm_rmid_free_lru))
 213                 return INVALID_RMID;
 214
 215         entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
 216         list_del(&entry->list);
 217
 218         return entry->rmid;
 219 }
 220
 221 static void __put_rmid(u32 rmid)
 222 {
 223         struct cqm_rmid_entry *entry;
 224
 225         lockdep_assert_held(&cache_mutex);
 226
 227         WARN_ON(!__rmid_valid(rmid));
 228         entry = __rmid_entry(rmid);
 229
 230         entry->queue_time = jiffies;
 231         entry->state = RMID_YOUNG;
 232
 233         list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
 234 }
 235
 236 static void cqm_cleanup(void)
 237 {
 238         int i;
 239
 240         if (!cqm_rmid_ptrs)
 241                 return;
 242
 243         for (i = 0; i < cqm_max_rmid; i++)
 244                 kfree(cqm_rmid_ptrs[i]);
 245
 246         kfree(cqm_rmid_ptrs);
 247         cqm_rmid_ptrs = NULL;
 248         cqm_enabled = false;
 249 }
 250
 251 static int intel_cqm_setup_rmid_cache(void)
 252 {
 253         struct cqm_rmid_entry *entry;
 254         unsigned int nr_rmids;
 255         int r = 0;
 256
 257         nr_rmids = cqm_max_rmid + 1;
 258         cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) *
 259                                 nr_rmids, GFP_KERNEL);
 260         if (!cqm_rmid_ptrs)
 261                 return -ENOMEM;
 262
 263         for (; r <= cqm_max_rmid; r++) {
 264                 struct cqm_rmid_entry *entry;
 265
 266                 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 267                 if (!entry)
 268                         goto fail;
 269
 270                 INIT_LIST_HEAD(&entry->list);
 271                 entry->rmid = r;
 272                 cqm_rmid_ptrs[r] = entry;
 273
 274                 list_add_tail(&entry->list, &cqm_rmid_free_lru);
 275         }
 276
 277         /*
 278          * RMID 0 is special and is always allocated. It's used for all
 279          * tasks that are not monitored.
 280          */
 281         entry = __rmid_entry(0);
 282         list_del(&entry->list);
 283
 284         mutex_lock(&cache_mutex);
 285         intel_cqm_rotation_rmid = __get_rmid();
 286         mutex_unlock(&cache_mutex);
 287
 288         return 0;
 289
 290 fail:
 291         cqm_cleanup();
 292         return -ENOMEM;
 293 }
 294
 295 /*
 296  * Determine if @a and @b measure the same set of tasks.
 297  *
 298  * If @a and @b measure the same set of tasks then we want to share a
 299  * single RMID.
 300  */
 301 static bool __match_event(struct perf_event *a, struct perf_event *b)
 302 {
 303         /* Per-cpu and task events don't mix */
 304         if ((a->attach_state & PERF_ATTACH_TASK) !=
 305             (b->attach_state & PERF_ATTACH_TASK))
 306                 return false;
 307
 308 #ifdef CONFIG_CGROUP_PERF
 309         if (a->cgrp != b->cgrp)
 310                 return false;
 311 #endif
 312
 313         /* If not task event, we're machine wide */
 314         if (!(b->attach_state & PERF_ATTACH_TASK))
 315                 return true;
 316
 317         /*
 318          * Events that target same task are placed into the same cache group.
 319          * Mark it as a multi event group, so that we update ->count
 320          * for every event rather than just the group leader later.
 321          */
 322         if (a->hw.target == b->hw.target) {
 323                 b->hw.is_group_event = true;
 324                 return true;
 325         }
 326
 327         /*
 328          * Are we an inherited event?
 329          */
 330         if (b->parent == a)
 331                 return true;
 332
 333         return false;
 334 }
 335
 336 #ifdef CONFIG_CGROUP_PERF
 337 static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
 338 {
 339         if (event->attach_state & PERF_ATTACH_TASK)
 340                 return perf_cgroup_from_task(event->hw.target, event->ctx);
 341
 342         return event->cgrp;
 343 }
 344 #endif
 345
 346 /*
 347  * Determine if @a's tasks intersect with @b's tasks
 348  *
 349  * There are combinations of events that we explicitly prohibit,
 350  *
 351  *                 PROHIBITS
 352  *     system-wide    ->        cgroup and task
 353  *     cgroup         ->        system-wide
 354  *                    ->        task in cgroup
 355  *     task           ->        system-wide
 356  *                    ->        task in cgroup
 357  *
 358  * Call this function before allocating an RMID.
 359  */
 360 static bool __conflict_event(struct perf_event *a, struct perf_event *b)
 361 {
 362 #ifdef CONFIG_CGROUP_PERF
 363         /*
 364          * We can have any number of cgroups but only one system-wide
 365          * event at a time.
 366          */
 367         if (a->cgrp && b->cgrp) {
 368                 struct perf_cgroup *ac = a->cgrp;
 369                 struct perf_cgroup *bc = b->cgrp;
 370
 371                 /*
 372                  * This condition should have been caught in
 373                  * __match_event() and we should be sharing an RMID.
 374                  */
 375                 WARN_ON_ONCE(ac == bc);
 376
 377                 if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
 378                     cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
 379                         return true;
 380
 381                 return false;
 382         }
 383
 384         if (a->cgrp || b->cgrp) {
 385                 struct perf_cgroup *ac, *bc;
 386
 387                 /*
 388                  * cgroup and system-wide events are mutually exclusive
 389                  */
 390                 if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
 391                     (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
 392                         return true;
 393
 394                 /*
 395                  * Ensure neither event is part of the other's cgroup
 396                  */
 397                 ac = event_to_cgroup(a);
 398                 bc = event_to_cgroup(b);
 399                 if (ac == bc)
 400                         return true;
 401
 402                 /*
 403                  * Must have cgroup and non-intersecting task events.
 404                  */
 405                 if (!ac || !bc)
 406                         return false;
 407
 408                 /*
 409                  * We have cgroup and task events, and the task belongs
 410                  * to a cgroup. Check for for overlap.
 411                  */
 412                 if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
 413                     cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
 414                         return true;
 415
 416                 return false;
 417         }
 418 #endif
 419         /*
 420          * If one of them is not a task, same story as above with cgroups.
 421          */
 422         if (!(a->attach_state & PERF_ATTACH_TASK) ||
 423             !(b->attach_state & PERF_ATTACH_TASK))
 424                 return true;
 425
 426         /*
 427          * Must be non-overlapping.
 428          */
 429         return false;
 430 }
 431
/*
 * Argument block passed to the cross-CPU counter readers invoked through
 * cqm_mask_call().
 *
 * rmid:     RMID to read.
 * evt_type: event ID; selects the MBM or the CQM reader.
 * value:    result; initialised to 0 by the caller and read back once the
 *           cross-CPU call has completed.
 */
struct rmid_read {
        u32 rmid;
        u32 evt_type;
        atomic64_t value;
};

/* Defined outside this part of the file. */
static void __intel_cqm_event_count(void *info);
static void init_mbm_sample(u32 rmid, u32 evt_type);
static void __intel_mbm_event_count(void *info);
 441
 442 static bool is_cqm_event(int e)
 443 {
 444         return (e == QOS_L3_OCCUP_EVENT_ID);
 445 }
 446
 447 static bool is_mbm_event(int e)
 448 {
 449         return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID);
 450 }
 451
 452 static void cqm_mask_call(struct rmid_read *rr)
 453 {
 454         if (is_mbm_event(rr->evt_type))
 455                 on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1);
 456         else
 457                 on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1);
 458 }
 459
 460 /*
 461  * Exchange the RMID of a group of events.
 462  */
 463 static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
 464 {
 465         struct perf_event *event;
 466         struct list_head *head = &group->hw.cqm_group_entry;
 467         u32 old_rmid = group->hw.cqm_rmid;
 468
 469         lockdep_assert_held(&cache_mutex);
 470
 471         /*
 472          * If our RMID is being deallocated, perform a read now.
 473          */
 474         if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
 475                 struct rmid_read rr = {
 476                         .rmid = old_rmid,
 477                         .evt_type = group->attr.config,
 478                         .value = ATOMIC64_INIT(0),
 479                 };
 480
 481                 cqm_mask_call(&rr);
 482                 local64_set(&group->count, atomic64_read(&rr.value));
 483         }
 484
 485         raw_spin_lock_irq(&cache_lock);
 486
 487         group->hw.cqm_rmid = rmid;
 488         list_for_each_entry(event, head, hw.cqm_group_entry)
 489                 event->hw.cqm_rmid = rmid;
 490
 491         raw_spin_unlock_irq(&cache_lock);
 492
 493         /*
 494          * If the allocation is for mbm, init the mbm stats.
 495          * Need to check if each event in the group is mbm event
 496          * because there could be multiple type of events in the same group.
 497          */
 498         if (__rmid_valid(rmid)) {
 499                 event = group;
 500                 if (is_mbm_event(event->attr.config))
 501                         init_mbm_sample(rmid, event->attr.config);
 502
 503                 list_for_each_entry(event, head, hw.cqm_group_entry) {
 504                         if (is_mbm_event(event->attr.config))
 505                                 init_mbm_sample(rmid, event->attr.config);
 506                 }
 507         }
 508
 509         return old_rmid;
 510 }
 511
/*
 * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
 * cachelines are still tagged with RMIDs in limbo, we progressively
 * increment the threshold until we find an RMID in limbo with <=
 * __intel_cqm_threshold lines tagged. This is designed to mitigate the
 * problem where cachelines tagged with an RMID are not steadily being
 * evicted.
 *
 * On successful rotations we decrease the threshold back towards zero.
 *
 * __intel_cqm_max_threshold provides an upper bound on the threshold,
 * and is measured in bytes because it's exposed to userland.
 */
/* Compared directly against the raw value returned by __rmid_read(). */
static unsigned int __intel_cqm_threshold;
/* Divided by cqm_l3_scale before being compared with __intel_cqm_threshold. */
static unsigned int __intel_cqm_max_threshold;
 527
 528 /*
 529  * Test whether an RMID has a zero occupancy value on this cpu.
 530  */
 531 static void intel_cqm_stable(void *arg)
 532 {
 533         struct cqm_rmid_entry *entry;
 534
 535         list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
 536                 if (entry->state != RMID_AVAILABLE)
 537                         break;
 538
 539                 if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
 540                         entry->state = RMID_DIRTY;
 541         }
 542 }
 543
 544 /*
 545  * If we have group events waiting for an RMID that don't conflict with
 546  * events already running, assign @rmid.
 547  */
 548 static bool intel_cqm_sched_in_event(u32 rmid)
 549 {
 550         struct perf_event *leader, *event;
 551
 552         lockdep_assert_held(&cache_mutex);
 553
 554         leader = list_first_entry(&cache_groups, struct perf_event,
 555                                   hw.cqm_groups_entry);
 556         event = leader;
 557
 558         list_for_each_entry_continue(event, &cache_groups,
 559                                      hw.cqm_groups_entry) {
 560                 if (__rmid_valid(event->hw.cqm_rmid))
 561                         continue;
 562
 563                 if (__conflict_event(event, leader))
 564                         continue;
 565
 566                 intel_cqm_xchg_rmid(event, rmid);
 567                 return true;
 568         }
 569
 570         return false;
 571 }
 572
/*
 * Initially use this constant for both the limbo queue time and the
 * rotation timer interval, pmu::hrtimer_interval_ms.
 *
 * They don't need to be the same, but the two are related since if you
 * rotate faster than you recycle RMIDs, you may run out of available
 * RMIDs.
 */
#define RMID_DEFAULT_QUEUE_TIME 250     /* ms */

/*
 * Minimum time, in ms, an RMID must have spent on the limbo list before
 * intel_cqm_rmid_stabilize() considers recycling it.
 */
static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
 584
/*
 * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
 * @available: set to the number of RMIDs on the limbo list that have been
 *             queued for at least the minimum queue time
 *
 * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
 * cachelines are tagged with those RMIDs. After this we can reuse them
 * and know that the current set of active RMIDs is stable.
 *
 * Every limbo RMID that has waited long enough is marked RMID_AVAILABLE,
 * intel_cqm_stable() is then run on each CPU in cqm_cpumask to flag the
 * ones that are still dirty, and the remaining clean RMIDs are handed out,
 * in order of preference, to: intel_cqm_rotation_rmid (if it is currently
 * invalid), a group waiting for an RMID, or the free list.
 *
 * Must be called with cache_mutex held.
 *
 * Returns %false if no limbo RMID has been queued for the minimum queue
 * time. Otherwise returns whether intel_cqm_rotation_rmid is valid once
 * the pass is complete.
 *
 * NOTE(review): the caller loops while this returns %true, i.e. while the
 * rotation RMID is valid, which reads as the opposite of "stabilization
 * needs to be reattempted" - confirm the intended meaning.
 */
static bool intel_cqm_rmid_stabilize(unsigned int *available)
{
        struct cqm_rmid_entry *entry, *tmp;

        lockdep_assert_held(&cache_mutex);

        *available = 0;
        list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
                unsigned long min_queue_time;
                unsigned long now = jiffies;

                /*
                 * We hold RMIDs placed into limbo for a minimum queue
                 * time. Before the minimum queue time has elapsed we do
                 * not recycle RMIDs.
                 *
                 * The reasoning is that until a sufficient time has
                 * passed since we stopped using an RMID, any RMID
                 * placed onto the limbo list will likely still have
                 * data tagged in the cache, which means we'll probably
                 * fail to recycle it anyway.
                 *
                 * We can save ourselves an expensive IPI by skipping
                 * any RMIDs that have not been queued for the minimum
                 * time.
                 */
                min_queue_time = entry->queue_time +
                        msecs_to_jiffies(__rmid_queue_time_ms);

                /*
                 * Entries are appended in queue order, so everything
                 * after the first too-young entry is too young as well.
                 */
                if (time_after(min_queue_time, now))
                        break;

                entry->state = RMID_AVAILABLE;
                (*available)++;
        }

        /*
         * Fast return if none of the RMIDs on the limbo list have been
         * sitting on the queue for the minimum queue time.
         */
        if (!*available)
                return false;

        /*
         * Test whether an RMID is free for each package.
         */
        on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);

        list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
                /*
                 * Exhausted all RMIDs that have waited min queue time.
                 */
                if (entry->state == RMID_YOUNG)
                        break;

                /* Still above the threshold somewhere; leave it in limbo. */
                if (entry->state == RMID_DIRTY)
                        continue;

                list_del(&entry->list); /* remove from limbo */

                /*
                 * The rotation RMID gets priority if it's
                 * currently invalid. In which case, skip adding
                 * the RMID to the free lru.
                 */
                if (!__rmid_valid(intel_cqm_rotation_rmid)) {
                        intel_cqm_rotation_rmid = entry->rmid;
                        continue;
                }

                /*
                 * If we have groups waiting for RMIDs, hand
                 * them one now provided they don't conflict.
                 */
                if (intel_cqm_sched_in_event(entry->rmid))
                        continue;

                /*
                 * Otherwise place it onto the free list.
                 */
                list_add_tail(&entry->list, &cqm_rmid_free_lru);
        }


        return __rmid_valid(intel_cqm_rotation_rmid);
}
 687
 688 /*
 689  * Pick a victim group and move it to the tail of the group list.
 690  * @next: The first group without an RMID
 691  */
 692 static void __intel_cqm_pick_and_rotate(struct perf_event *next)
 693 {
 694         struct perf_event *rotor;
 695         u32 rmid;
 696
 697         lockdep_assert_held(&cache_mutex);
 698
 699         rotor = list_first_entry(&cache_groups, struct perf_event,
 700                                  hw.cqm_groups_entry);
 701
 702         /*
 703          * The group at the front of the list should always have a valid
 704          * RMID. If it doesn't then no groups have RMIDs assigned and we
 705          * don't need to rotate the list.
 706          */
 707         if (next == rotor)
 708                 return;
 709
 710         rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
 711         __put_rmid(rmid);
 712
 713         list_rotate_left(&cache_groups);
 714 }
 715
 716 /*
 717  * Deallocate the RMIDs from any events that conflict with @event, and
 718  * place them on the back of the group list.
 719  */
 720 static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
 721 {
 722         struct perf_event *group, *g;
 723         u32 rmid;
 724
 725         lockdep_assert_held(&cache_mutex);
 726
 727         list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
 728                 if (group == event)
 729                         continue;
 730
 731                 rmid = group->hw.cqm_rmid;
 732
 733                 /*
 734                  * Skip events that don't have a valid RMID.
 735                  */
 736                 if (!__rmid_valid(rmid))
 737                         continue;
 738
 739                 /*
 740                  * No conflict? No problem! Leave the event alone.
 741                  */
 742                 if (!__conflict_event(group, event))
 743                         continue;
 744
 745                 intel_cqm_xchg_rmid(group, INVALID_RMID);
 746                 __put_rmid(rmid);
 747         }
 748 }
 749
 750 /*
 751  * Attempt to rotate the groups and assign new RMIDs.
 752  *
 753  * We rotate for two reasons,
 754  *   1. To handle the scheduling of conflicting events
 755  *   2. To recycle RMIDs
 756  *
 757  * Rotating RMIDs is complicated because the hardware doesn't give us
 758  * any clues.
 759  *
 760  * There's problems with the hardware interface; when you change the
 761  * task:RMID map cachelines retain their 'old' tags, giving a skewed
 762  * picture. In order to work around this, we must always keep one free
 763  * RMID - intel_cqm_rotation_rmid.
 764  *
 765  * Rotation works by taking away an RMID from a group (the old RMID),
 766  * and assigning the free RMID to another group (the new RMID). We must
 767  * then wait for the old RMID to not be used (no cachelines tagged).
 768  * This ensure that all cachelines are tagged with 'active' RMIDs. At
 769  * this point we can start reading values for the new RMID and treat the
 770  * old RMID as the free RMID for the next rotation.
 771  *
 772  * Return %true or %false depending on whether we did any rotating.
 773  */
 774 static bool __intel_cqm_rmid_rotate(void)
 775 {
 776         struct perf_event *group, *start = NULL;
 777         unsigned int threshold_limit;
 778         unsigned int nr_needed = 0;
 779         unsigned int nr_available;
 780         bool rotated = false;
 781
 782         mutex_lock(&cache_mutex);
 783
 784 again:
 785         /*
 786          * Fast path through this function if there are no groups and no
 787          * RMIDs that need cleaning.
 788          */
 789         if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
 790                 goto out;
 791
 792         list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
 793                 if (!__rmid_valid(group->hw.cqm_rmid)) {
 794                         if (!start)
 795                                 start = group;
 796                         nr_needed++;
 797                 }
 798         }
 799
 800         /*
 801          * We have some event groups, but they all have RMIDs assigned
 802          * and no RMIDs need cleaning.
 803          */
 804         if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
 805                 goto out;
 806
 807         if (!nr_needed)
 808                 goto stabilize;
 809
 810         /*
 811          * We have more event groups without RMIDs than available RMIDs,
 812          * or we have event groups that conflict with the ones currently
 813          * scheduled.
 814          *
 815          * We force deallocate the rmid of the group at the head of
 816          * cache_groups. The first event group without an RMID then gets
 817          * assigned intel_cqm_rotation_rmid. This ensures we always make
 818          * forward progress.
 819          *
 820          * Rotate the cache_groups list so the previous head is now the
 821          * tail.
 822          */
 823         __intel_cqm_pick_and_rotate(start);
 824
 825         /*
 826          * If the rotation is going to succeed, reduce the threshold so
 827          * that we don't needlessly reuse dirty RMIDs.
 828          */
 829         if (__rmid_valid(intel_cqm_rotation_rmid)) {
 830                 intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
 831                 intel_cqm_rotation_rmid = __get_rmid();
 832
 833                 intel_cqm_sched_out_conflicting_events(start);
 834
 835                 if (__intel_cqm_threshold)
 836                         __intel_cqm_threshold--;
 837         }
 838
 839         rotated = true;
 840
 841 stabilize:
 842         /*
 843          * We now need to stablize the RMID we freed above (if any) to
 844          * ensure that the next time we rotate we have an RMID with zero
 845          * occupancy value.
 846          *
 847          * Alternatively, if we didn't need to perform any rotation,
 848          * we'll have a bunch of RMIDs in limbo that need stabilizing.
 849          */
 850         threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
 851
 852         while (intel_cqm_rmid_stabilize(&nr_available) &&
 853                __intel_cqm_threshold < threshold_limit) {
 854                 unsigned int steal_limit;
 855
 856                 /*
 857                  * Don't spin if nobody is actively waiting for an RMID,
 858                  * the rotation worker will be kicked as soon as an
 859                  * event needs an RMID anyway.
 860                  */
 861                 if (!nr_needed)
 862                         break;
 863
 864                 /* Allow max 25% of RMIDs to be in limbo. */
 865                 steal_limit = (cqm_max_rmid + 1) / 4;
 866
 867                 /*
 868                  * We failed to stabilize any RMIDs so our rotation
 869                  * logic is now stuck. In order to make forward progress
 870                  * we have a few options:
 871                  *
 872                  *   1. rotate ("steal") another RMID
 873                  *   2. increase the threshold
 874                  *   3. do nothing
 875                  *
 876                  * We do both of 1. and 2. until we hit the steal limit.
 877                  *
 878                  * The steal limit prevents all RMIDs ending up on the
 879                  * limbo list. This can happen if every RMID has a
 880                  * non-zero occupancy above threshold_limit, and the
 881                  * occupancy values aren't dropping fast enough.
 882                  *
 883                  * Note that there is prioritisation at work here - we'd
 884                  * rather increase the number of RMIDs on the limbo list
 885                  * than increase the threshold, because increasing the
 886                  * threshold skews the event data (because we reuse
 887                  * dirty RMIDs) - threshold bumps are a last resort.
 888                  */
 889                 if (nr_available < steal_limit)
 890                         goto again;
 891
 892                 __intel_cqm_threshold++;
 893         }
 894
 895 out:
 896         mutex_unlock(&cache_mutex);
 897         return rotated;
 898 }
 899
 900 static void intel_cqm_rmid_rotate(struct work_struct *work);
 901
 902 static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
 903
 904 static struct pmu intel_cqm_pmu;
 905
 906 static void intel_cqm_rmid_rotate(struct work_struct *work)
 907 {
 908         unsigned long delay;
 909
 910         __intel_cqm_rmid_rotate();
 911
 912         delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
 913         schedule_delayed_work(&intel_cqm_rmid_work, delay);
 914 }
 915
 916 static u64 update_sample(unsigned int rmid, u32 evt_type, int first)
 917 {
 918         struct sample *mbm_current;
 919         u32 vrmid = rmid_2_index(rmid);
 920         u64 val, bytes, shift;
 921         u32 eventid;
 922
 923         if (evt_type == QOS_MBM_LOCAL_EVENT_ID) {
 924                 mbm_current = &mbm_local[vrmid];
 925                 eventid     = QOS_MBM_LOCAL_EVENT_ID;
 926         } else {
 927                 mbm_current = &mbm_total[vrmid];
 928                 eventid     = QOS_MBM_TOTAL_EVENT_ID;
 929         }
 930
 931         wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
 932         rdmsrl(MSR_IA32_QM_CTR, val);
 933         if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
 934                 return mbm_current->total_bytes;
 935
 936         if (first) {
 937                 mbm_current->prev_msr = val;
 938                 mbm_current->total_bytes = 0;
 939                 return mbm_current->total_bytes;
 940         }
 941
 942         /*
 943          * The h/w guarantees that counters will not overflow
 944          * so long as we poll them at least once per second.
 945          */
 946         shift = 64 - MBM_CNTR_WIDTH;
 947         bytes = (val << shift) - (mbm_current->prev_msr << shift);
 948         bytes >>= shift;
 949
 950         bytes *= cqm_l3_scale;
 951
 952         mbm_current->total_bytes += bytes;
 953         mbm_current->prev_msr = val;
 954
 955         return mbm_current->total_bytes;
 956 }
 957
 958 static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type)
 959 {
 960         return update_sample(rmid, evt_type, 0);
 961 }
 962
 963 static void __intel_mbm_event_init(void *info)
 964 {
 965         struct rmid_read *rr = info;
 966
 967         update_sample(rr->rmid, rr->evt_type, 1);
 968 }
 969
 970 static void init_mbm_sample(u32 rmid, u32 evt_type)
 971 {
 972         struct rmid_read rr = {
 973                 .rmid = rmid,
 974                 .evt_type = evt_type,
 975                 .value = ATOMIC64_INIT(0),
 976         };
 977
 978         /* on each socket, init sample */
 979         on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
 980 }
 981
 982 /*
 983  * Find a group and setup RMID.
 984  *
 985  * If we're part of a group, we use the group's RMID.
 986  */
 987 static void intel_cqm_setup_event(struct perf_event *event,
 988                                   struct perf_event **group)
 989 {
 990         struct perf_event *iter;
 991         bool conflict = false;
 992         u32 rmid;
 993
 994         event->hw.is_group_event = false;
 995         list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
 996                 rmid = iter->hw.cqm_rmid;
 997
 998                 if (__match_event(iter, event)) {
 999                         /* All tasks in a group share an RMID */
1000                         event->hw.cqm_rmid = rmid;
1001                         *group = iter;
1002                         if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
1003                                 init_mbm_sample(rmid, event->attr.config);
1004                         return;
1005                 }
1006
1007                 /*
1008                  * We only care about conflicts for events that are
1009                  * actually scheduled in (and hence have a valid RMID).
1010                  */
1011                 if (__conflict_event(iter, event) && __rmid_valid(rmid))
1012                         conflict = true;
1013         }
1014
1015         if (conflict)
1016                 rmid = INVALID_RMID;
1017         else
1018                 rmid = __get_rmid();
1019
1020         if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
1021                 init_mbm_sample(rmid, event->attr.config);
1022
1023         event->hw.cqm_rmid = rmid;
1024 }
1025
1026 static void intel_cqm_event_read(struct perf_event *event)
1027 {
1028         unsigned long flags;
1029         u32 rmid;
1030         u64 val;
1031
1032         /*
1033          * Task events are handled by intel_cqm_event_count().
1034          */
1035         if (event->cpu == -1)
1036                 return;
1037
1038         raw_spin_lock_irqsave(&cache_lock, flags);
1039         rmid = event->hw.cqm_rmid;
1040
1041         if (!__rmid_valid(rmid))
1042                 goto out;
1043
1044         if (is_mbm_event(event->attr.config))
1045                 val = rmid_read_mbm(rmid, event->attr.config);
1046         else
1047                 val = __rmid_read(rmid);
1048
1049         /*
1050          * Ignore this reading on error states and do not update the value.
1051          */
1052         if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
1053                 goto out;
1054
1055         local64_set(&event->count, val);
1056 out:
1057         raw_spin_unlock_irqrestore(&cache_lock, flags);
1058 }
1059
1060 static void __intel_cqm_event_count(void *info)
1061 {
1062         struct rmid_read *rr = info;
1063         u64 val;
1064
1065         val = __rmid_read(rr->rmid);
1066
1067         if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
1068                 return;
1069
1070         atomic64_add(val, &rr->value);
1071 }
1072
1073 static inline bool cqm_group_leader(struct perf_event *event)
1074 {
1075         return !list_empty(&event->hw.cqm_groups_entry);
1076 }
1077
1078 static void __intel_mbm_event_count(void *info)
1079 {
1080         struct rmid_read *rr = info;
1081         u64 val;
1082
1083         val = rmid_read_mbm(rr->rmid, rr->evt_type);
1084         if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
1085                 return;
1086         atomic64_add(val, &rr->value);
1087 }
1088
1089 static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
1090 {
1091         struct perf_event *iter, *iter1;
1092         int ret = HRTIMER_RESTART;
1093         struct list_head *head;
1094         unsigned long flags;
1095         u32 grp_rmid;
1096
1097         /*
1098          * Need to cache_lock as the timer Event Select MSR reads
1099          * can race with the mbm/cqm count() and mbm_init() reads.
1100          */
1101         raw_spin_lock_irqsave(&cache_lock, flags);
1102
1103         if (list_empty(&cache_groups)) {
1104                 ret = HRTIMER_NORESTART;
1105                 goto out;
1106         }
1107
1108         list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
1109                 grp_rmid = iter->hw.cqm_rmid;
1110                 if (!__rmid_valid(grp_rmid))
1111                         continue;
1112                 if (is_mbm_event(iter->attr.config))
1113                         update_sample(grp_rmid, iter->attr.config, 0);
1114
1115                 head = &iter->hw.cqm_group_entry;
1116                 if (list_empty(head))
1117                         continue;
1118                 list_for_each_entry(iter1, head, hw.cqm_group_entry) {
1119                         if (!iter1->hw.is_group_event)
1120                                 break;
1121                         if (is_mbm_event(iter1->attr.config))
1122                                 update_sample(iter1->hw.cqm_rmid,
1123                                               iter1->attr.config, 0);
1124                 }
1125         }
1126
1127         hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME));
1128 out:
1129         raw_spin_unlock_irqrestore(&cache_lock, flags);
1130
1131         return ret;
1132 }
1133
1134 static void __mbm_start_timer(void *info)
1135 {
1136         hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME),
1137                              HRTIMER_MODE_REL_PINNED);
1138 }
1139
1140 static void __mbm_stop_timer(void *info)
1141 {
1142         hrtimer_cancel(&mbm_timers[pkg_id]);
1143 }
1144
1145 static void mbm_start_timers(void)
1146 {
1147         on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1);
1148 }
1149
1150 static void mbm_stop_timers(void)
1151 {
1152         on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1);
1153 }
1154
1155 static void mbm_hrtimer_init(void)
1156 {
1157         struct hrtimer *hr;
1158         int i;
1159
1160         for (i = 0; i < mbm_socket_max; i++) {
1161                 hr = &mbm_timers[i];
1162                 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1163                 hr->function = mbm_hrtimer_handle;
1164         }
1165 }
1166
1167 static u64 intel_cqm_event_count(struct perf_event *event)
1168 {
1169         unsigned long flags;
1170         struct rmid_read rr = {
1171                 .evt_type = event->attr.config,
1172                 .value = ATOMIC64_INIT(0),
1173         };
1174
1175         /*
1176          * We only need to worry about task events. System-wide events
1177          * are handled like usual, i.e. entirely with
1178          * intel_cqm_event_read().
1179          */
1180         if (event->cpu != -1)
1181                 return __perf_event_count(event);
1182
1183         /*
1184          * Only the group leader gets to report values except in case of
1185          * multiple events in the same group, we still need to read the
1186          * other events.This stops us
1187          * reporting duplicate values to userspace, and gives us a clear
1188          * rule for which task gets to report the values.
1189          *
1190          * Note that it is impossible to attribute these values to
1191          * specific packages - we forfeit that ability when we create
1192          * task events.
1193          */
1194         if (!cqm_group_leader(event) && !event->hw.is_group_event)
1195                 return 0;
1196
1197         /*
1198          * Getting up-to-date values requires an SMP IPI which is not
1199          * possible if we're being called in interrupt context. Return
1200          * the cached values instead.
1201          */
1202         if (unlikely(in_interrupt()))
1203                 goto out;
1204
1205         /*
1206          * Notice that we don't perform the reading of an RMID
1207          * atomically, because we can't hold a spin lock across the
1208          * IPIs.
1209          *
1210          * Speculatively perform the read, since @event might be
1211          * assigned a different (possibly invalid) RMID while we're
1212          * busying performing the IPI calls. It's therefore necessary to
1213          * check @event's RMID afterwards, and if it has changed,
1214          * discard the result of the read.
1215          */
1216         rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
1217
1218         if (!__rmid_valid(rr.rmid))
1219                 goto out;
1220
1221         cqm_mask_call(&rr);
1222
1223         raw_spin_lock_irqsave(&cache_lock, flags);
1224         if (event->hw.cqm_rmid == rr.rmid)
1225                 local64_set(&event->count, atomic64_read(&rr.value));
1226         raw_spin_unlock_irqrestore(&cache_lock, flags);
1227 out:
1228         return __perf_event_count(event);
1229 }
1230
1231 static void intel_cqm_event_start(struct perf_event *event, int mode)
1232 {
1233         struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
1234         u32 rmid = event->hw.cqm_rmid;
1235
1236         if (!(event->hw.cqm_state & PERF_HES_STOPPED))
1237                 return;
1238
1239         event->hw.cqm_state &= ~PERF_HES_STOPPED;
1240
1241         if (state->rmid_usecnt++) {
1242                 if (!WARN_ON_ONCE(state->rmid != rmid))
1243                         return;
1244         } else {
1245                 WARN_ON_ONCE(state->rmid);
1246         }
1247
1248         state->rmid = rmid;
1249         wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
1250 }
1251
1252 static void intel_cqm_event_stop(struct perf_event *event, int mode)
1253 {
1254         struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
1255
1256         if (event->hw.cqm_state & PERF_HES_STOPPED)
1257                 return;
1258
1259         event->hw.cqm_state |= PERF_HES_STOPPED;
1260
1261         intel_cqm_event_read(event);
1262
1263         if (!--state->rmid_usecnt) {
1264                 state->rmid = 0;
1265                 wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
1266         } else {
1267                 WARN_ON_ONCE(!state->rmid);
1268         }
1269 }
1270
1271 static int intel_cqm_event_add(struct perf_event *event, int mode)
1272 {
1273         unsigned long flags;
1274         u32 rmid;
1275
1276         raw_spin_lock_irqsave(&cache_lock, flags);
1277
1278         event->hw.cqm_state = PERF_HES_STOPPED;
1279         rmid = event->hw.cqm_rmid;
1280
1281         if (__rmid_valid(rmid) && (mode & PERF_EF_START))
1282                 intel_cqm_event_start(event, mode);
1283
1284         raw_spin_unlock_irqrestore(&cache_lock, flags);
1285
1286         return 0;
1287 }
1288
1289 static void intel_cqm_event_destroy(struct perf_event *event)
1290 {
1291         struct perf_event *group_other = NULL;
1292         unsigned long flags;
1293
1294         mutex_lock(&cache_mutex);
1295         /*
1296         * Hold the cache_lock as mbm timer handlers could be
1297         * scanning the list of events.
1298         */
1299         raw_spin_lock_irqsave(&cache_lock, flags);
1300
1301         /*
1302          * If there's another event in this group...
1303          */
1304         if (!list_empty(&event->hw.cqm_group_entry)) {
1305                 group_other = list_first_entry(&event->hw.cqm_group_entry,
1306                                                struct perf_event,
1307                                                hw.cqm_group_entry);
1308                 list_del(&event->hw.cqm_group_entry);
1309         }
1310
1311         /*
1312          * And we're the group leader..
1313          */
1314         if (cqm_group_leader(event)) {
1315                 /*
1316                  * If there was a group_other, make that leader, otherwise
1317                  * destroy the group and return the RMID.
1318                  */
1319                 if (group_other) {
1320                         list_replace(&event->hw.cqm_groups_entry,
1321                                      &group_other->hw.cqm_groups_entry);
1322                 } else {
1323                         u32 rmid = event->hw.cqm_rmid;
1324
1325                         if (__rmid_valid(rmid))
1326                                 __put_rmid(rmid);
1327                         list_del(&event->hw.cqm_groups_entry);
1328                 }
1329         }
1330
1331         raw_spin_unlock_irqrestore(&cache_lock, flags);
1332
1333         /*
1334          * Stop the mbm overflow timers when the last event is destroyed.
1335         */
1336         if (mbm_enabled && list_empty(&cache_groups))
1337                 mbm_stop_timers();
1338
1339         mutex_unlock(&cache_mutex);
1340 }
1341
1342 static int intel_cqm_event_init(struct perf_event *event)
1343 {
1344         struct perf_event *group = NULL;
1345         bool rotate = false;
1346         unsigned long flags;
1347
1348         if (event->attr.type != intel_cqm_pmu.type)
1349                 return -ENOENT;
1350
1351         if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) ||
1352              (event->attr.config > QOS_MBM_LOCAL_EVENT_ID))
1353                 return -EINVAL;
1354
1355         if ((is_cqm_event(event->attr.config) && !cqm_enabled) ||
1356             (is_mbm_event(event->attr.config) && !mbm_enabled))
1357                 return -EINVAL;
1358
1359         /* unsupported modes and filters */
1360         if (event->attr.exclude_user   ||
1361             event->attr.exclude_kernel ||
1362             event->attr.exclude_hv     ||
1363             event->attr.exclude_idle   ||
1364             event->attr.exclude_host   ||
1365             event->attr.exclude_guest  ||
1366             event->attr.sample_period) /* no sampling */
1367                 return -EINVAL;
1368
1369         INIT_LIST_HEAD(&event->hw.cqm_group_entry);
1370         INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
1371
1372         event->destroy = intel_cqm_event_destroy;
1373
1374         mutex_lock(&cache_mutex);
1375
1376         /*
1377          * Start the mbm overflow timers when the first event is created.
1378         */
1379         if (mbm_enabled && list_empty(&cache_groups))
1380                 mbm_start_timers();
1381
1382         /* Will also set rmid */
1383         intel_cqm_setup_event(event, &group);
1384
1385         /*
1386         * Hold the cache_lock as mbm timer handlers be
1387         * scanning the list of events.
1388         */
1389         raw_spin_lock_irqsave(&cache_lock, flags);
1390
1391         if (group) {
1392                 list_add_tail(&event->hw.cqm_group_entry,
1393                               &group->hw.cqm_group_entry);
1394         } else {
1395                 list_add_tail(&event->hw.cqm_groups_entry,
1396                               &cache_groups);
1397
1398                 /*
1399                  * All RMIDs are either in use or have recently been
1400                  * used. Kick the rotation worker to clean/free some.
1401                  *
1402                  * We only do this for the group leader, rather than for
1403                  * every event in a group to save on needless work.
1404                  */
1405                 if (!__rmid_valid(event->hw.cqm_rmid))
1406                         rotate = true;
1407         }
1408
1409         raw_spin_unlock_irqrestore(&cache_lock, flags);
1410         mutex_unlock(&cache_mutex);
1411
1412         if (rotate)
1413                 schedule_delayed_work(&intel_cqm_rmid_work, 0);
1414
1415         return 0;
1416 }
1417
/*
 * Event description strings for the LLC occupancy event (event=0x01).
 * The scale string is left NULL here; intel_cqm_init() assigns it.
 */
EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");

/* Event description strings for the total memory bandwidth event (event=0x02). */
EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02");
EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1");
EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB");
EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6");

/* Event description strings for the local memory bandwidth event (event=0x03). */
EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03");
EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1");
EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB");
EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6");
1433
/* Event attributes used when only CQM is enabled (chosen in intel_cqm_init()). */
static struct attribute *intel_cqm_events_attr[] = {
        EVENT_PTR(intel_cqm_llc),
        EVENT_PTR(intel_cqm_llc_pkg),
        EVENT_PTR(intel_cqm_llc_unit),
        EVENT_PTR(intel_cqm_llc_scale),
        EVENT_PTR(intel_cqm_llc_snapshot),
        NULL,
};
1442
/* Event attributes used when only MBM is enabled (chosen in intel_cqm_init()). */
static struct attribute *intel_mbm_events_attr[] = {
        EVENT_PTR(intel_cqm_total_bytes),
        EVENT_PTR(intel_cqm_local_bytes),
        EVENT_PTR(intel_cqm_total_bytes_pkg),
        EVENT_PTR(intel_cqm_local_bytes_pkg),
        EVENT_PTR(intel_cqm_total_bytes_unit),
        EVENT_PTR(intel_cqm_local_bytes_unit),
        EVENT_PTR(intel_cqm_total_bytes_scale),
        EVENT_PTR(intel_cqm_local_bytes_scale),
        NULL,
};
1454
/*
 * Event attributes used when both CQM and MBM are enabled (chosen in
 * intel_cqm_init()): the union of the two lists above.
 */
static struct attribute *intel_cmt_mbm_events_attr[] = {
        EVENT_PTR(intel_cqm_llc),
        EVENT_PTR(intel_cqm_total_bytes),
        EVENT_PTR(intel_cqm_local_bytes),
        EVENT_PTR(intel_cqm_llc_pkg),
        EVENT_PTR(intel_cqm_total_bytes_pkg),
        EVENT_PTR(intel_cqm_local_bytes_pkg),
        EVENT_PTR(intel_cqm_llc_unit),
        EVENT_PTR(intel_cqm_total_bytes_unit),
        EVENT_PTR(intel_cqm_local_bytes_unit),
        EVENT_PTR(intel_cqm_llc_scale),
        EVENT_PTR(intel_cqm_total_bytes_scale),
        EVENT_PTR(intel_cqm_local_bytes_scale),
        EVENT_PTR(intel_cqm_llc_snapshot),
        NULL,
};
1471
/*
 * "events" attribute group. .attrs starts out NULL and is pointed at one
 * of the three event attribute arrays by intel_cqm_init(), depending on
 * which of CQM and MBM are enabled.
 */
static struct attribute_group intel_cqm_events_group = {
        .name = "events",
        .attrs = NULL,
};
1476
/* The event id occupies bits 0-7 of perf_event_attr::config. */
PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *intel_cqm_formats_attr[] = {
        &format_attr_event.attr,
        NULL,
};

/* "format" attribute group describing the config layout above. */
static struct attribute_group intel_cqm_format_group = {
        .name = "format",
        .attrs = intel_cqm_formats_attr,
};
1487
1488 static ssize_t
1489 max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
1490                            char *page)
1491 {
1492         ssize_t rv;
1493
1494         mutex_lock(&cache_mutex);
1495         rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
1496         mutex_unlock(&cache_mutex);
1497
1498         return rv;
1499 }
1500
1501 static ssize_t
1502 max_recycle_threshold_store(struct device *dev,
1503                             struct device_attribute *attr,
1504                             const char *buf, size_t count)
1505 {
1506         unsigned int bytes, cachelines;
1507         int ret;
1508
1509         ret = kstrtouint(buf, 0, &bytes);
1510         if (ret)
1511                 return ret;
1512
1513         mutex_lock(&cache_mutex);
1514
1515         __intel_cqm_max_threshold = bytes;
1516         cachelines = bytes / cqm_l3_scale;
1517
1518         /*
1519          * The new maximum takes effect immediately.
1520          */
1521         if (__intel_cqm_threshold > cachelines)
1522                 __intel_cqm_threshold = cachelines;
1523
1524         mutex_unlock(&cache_mutex);
1525
1526         return count;
1527 }
1528
/* Read/write device attribute backed by max_recycle_threshold_{show,store}(). */
static DEVICE_ATTR_RW(max_recycle_threshold);

static struct attribute *intel_cqm_attrs[] = {
        &dev_attr_max_recycle_threshold.attr,
        NULL,
};

/* Unnamed group holding the PMU's own tunables. */
static const struct attribute_group intel_cqm_group = {
        .attrs = intel_cqm_attrs,
};

/* All attribute groups attached to intel_cqm_pmu via .attr_groups. */
static const struct attribute_group *intel_cqm_attr_groups[] = {
        &intel_cqm_events_group,
        &intel_cqm_format_group,
        &intel_cqm_group,
        NULL,
};
1546
/*
 * PMU descriptor for the CQM/MBM events; intel_cqm_init() registers it
 * under the name "intel_cqm".
 *
 * hrtimer_interval_ms doubles as the RMID rotation period used by
 * intel_cqm_rmid_rotate(). .del shares intel_cqm_event_stop() with .stop.
 */
static struct pmu intel_cqm_pmu = {
        .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
        .attr_groups         = intel_cqm_attr_groups,
        .task_ctx_nr         = perf_sw_context,
        .event_init          = intel_cqm_event_init,
        .add                 = intel_cqm_event_add,
        .del                 = intel_cqm_event_stop,
        .start               = intel_cqm_event_start,
        .stop                = intel_cqm_event_stop,
        .read                = intel_cqm_event_read,
        .count               = intel_cqm_event_count,
};
1559
1560 static inline void cqm_pick_event_reader(int cpu)
1561 {
1562         int reader;
1563
1564         /* First online cpu in package becomes the reader */
1565         reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu));
1566         if (reader >= nr_cpu_ids)
1567                 cpumask_set_cpu(cpu, &cqm_cpumask);
1568 }
1569
1570 static int intel_cqm_cpu_starting(unsigned int cpu)
1571 {
1572         struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
1573         struct cpuinfo_x86 *c = &cpu_data(cpu);
1574
1575         state->rmid = 0;
1576         state->closid = 0;
1577         state->rmid_usecnt = 0;
1578
1579         WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
1580         WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
1581
1582         cqm_pick_event_reader(cpu);
1583         return 0;
1584 }
1585
1586 static int intel_cqm_cpu_exit(unsigned int cpu)
1587 {
1588         int target;
1589
1590         /* Is @cpu the current cqm reader for this package ? */
1591         if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
1592                 return 0;
1593
1594         /* Find another online reader in this package */
1595         target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
1596
1597         if (target < nr_cpu_ids)
1598                 cpumask_set_cpu(target, &cqm_cpumask);
1599
1600         return 0;
1601 }
1602
/* Matches Intel CPUs that advertise X86_FEATURE_CQM_OCCUP_LLC. */
static const struct x86_cpu_id intel_cqm_match[] = {
        { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
        {}
};
1607
1608 static void mbm_cleanup(void)
1609 {
1610         if (!mbm_enabled)
1611                 return;
1612
1613         kfree(mbm_local);
1614         kfree(mbm_total);
1615         mbm_enabled = false;
1616 }
1617
/* Matches Intel CPUs that advertise X86_FEATURE_CQM_MBM_LOCAL. */
static const struct x86_cpu_id intel_mbm_local_match[] = {
        { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
        {}
};

/* Matches Intel CPUs that advertise X86_FEATURE_CQM_MBM_TOTAL. */
static const struct x86_cpu_id intel_mbm_total_match[] = {
        { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL },
        {}
};
1627
1628 static int intel_mbm_init(void)
1629 {
1630         int ret = 0, array_size, maxid = cqm_max_rmid + 1;
1631
1632         mbm_socket_max = topology_max_packages();
1633         array_size = sizeof(struct sample) * maxid * mbm_socket_max;
1634         mbm_local = kmalloc(array_size, GFP_KERNEL);
1635         if (!mbm_local)
1636                 return -ENOMEM;
1637
1638         mbm_total = kmalloc(array_size, GFP_KERNEL);
1639         if (!mbm_total) {
1640                 ret = -ENOMEM;
1641                 goto out;
1642         }
1643
1644         array_size = sizeof(struct hrtimer) * mbm_socket_max;
1645         mbm_timers = kmalloc(array_size, GFP_KERNEL);
1646         if (!mbm_timers) {
1647                 ret = -ENOMEM;
1648                 goto out;
1649         }
1650         mbm_hrtimer_init();
1651
1652 out:
1653         if (ret)
1654                 mbm_cleanup();
1655
1656         return ret;
1657 }
1658
/*
 * Driver entry point (device_initcall).
 *
 * Detects CQM and MBM support, derives the global RMID limit and occupancy
 * scale, sets up the RMID cache and (if available) the MBM state, registers
 * the "intel_cqm" PMU and finally installs the CPU hotplug callbacks.
 *
 * Returns 0 on success, -ENODEV if neither feature is present, or a
 * negative error code from one of the setup steps. On failure the scale
 * string, the RMID cache and the MBM state are released again.
 */
static int __init intel_cqm_init(void)
{
        char *str = NULL, scale[20];
        int cpu, ret;

        if (x86_match_cpu(intel_cqm_match))
                cqm_enabled = true;

        /* MBM is only enabled when both the local and total events exist. */
        if (x86_match_cpu(intel_mbm_local_match) &&
             x86_match_cpu(intel_mbm_total_match))
                mbm_enabled = true;

        if (!cqm_enabled && !mbm_enabled)
                return -ENODEV;

        cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;

        /*
         * It's possible that not all resources support the same number
         * of RMIDs. Instead of making scheduling much more complicated
         * (where we have to match a task's RMID to a cpu that supports
         * that many RMIDs) just find the minimum RMIDs supported across
         * all cpus.
         *
         * Also, check that the scales match on all cpus.
         */
        get_online_cpus();
        for_each_online_cpu(cpu) {
                struct cpuinfo_x86 *c = &cpu_data(cpu);

                if (c->x86_cache_max_rmid < cqm_max_rmid)
                        cqm_max_rmid = c->x86_cache_max_rmid;

                if (c->x86_cache_occ_scale != cqm_l3_scale) {
                        pr_err("Multiple LLC scale values, disabling\n");
                        ret = -EINVAL;
                        goto out;
                }
        }

        /*
         * A reasonable upper limit on the max threshold is the number
         * of lines tagged per RMID if all RMIDs have the same number of
         * lines tagged in the LLC.
         *
         * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
         */
        __intel_cqm_max_threshold =
                boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);

        /* Publish the occupancy scale as the llc_occupancy.scale string. */
        snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
        str = kstrdup(scale, GFP_KERNEL);
        if (!str) {
                ret = -ENOMEM;
                goto out;
        }

        event_attr_intel_cqm_llc_scale.event_str = str;

        ret = intel_cqm_setup_rmid_cache();
        if (ret)
                goto out;

        /*
         * An MBM setup failure is only fatal when CQM is not available
         * either; otherwise initialisation carries on.
         */
        if (mbm_enabled)
                ret = intel_mbm_init();
        if (ret && !cqm_enabled)
                goto out;

        /* Expose only the events of the features that are enabled. */
        if (cqm_enabled && mbm_enabled)
                intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr;
        else if (!cqm_enabled && mbm_enabled)
                intel_cqm_events_group.attrs = intel_mbm_events_attr;
        else if (cqm_enabled && !mbm_enabled)
                intel_cqm_events_group.attrs = intel_cqm_events_attr;

        ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
        if (ret) {
                pr_err("Intel CQM perf registration failed: %d\n", ret);
                goto out;
        }

        if (cqm_enabled)
                pr_info("Intel CQM monitoring enabled\n");
        if (mbm_enabled)
                pr_info("Intel MBM enabled\n");

        /*
         * Setup the hot cpu notifier once we are sure cqm
         * is enabled to avoid notifier leak.
         */
        cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_STARTING,
                          "perf/x86/cqm:starting",
                          intel_cqm_cpu_starting, NULL);
        cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "perf/x86/cqm:online",
                          NULL, intel_cqm_cpu_exit);

out:
        /* Reached on success as well; pairs with get_online_cpus() above. */
        put_online_cpus();

        if (ret) {
                kfree(str);
                cqm_cleanup();
                mbm_cleanup();
        }

        return ret;
}
device_initcall(intel_cqm_init);