]>
Commit | Line | Data |
---|---|---|
83d4a977 VS |
1 | /* |
2 | * Resource Director Technology(RDT) | |
3 | * - Monitoring code | |
4 | * | |
5 | * Copyright (C) 2017 Intel Corporation | |
6 | * | |
7 | * Author: | |
8 | * Vikas Shivappa <vikas.shivappa@intel.com> | |
9 | * | |
10 | * This replaces the cqm.c based on perf but we reuse a lot of | |
11 | * code and datastructures originally from Peter Zijlstra and Matt Fleming. | |
12 | * | |
13 | * This program is free software; you can redistribute it and/or modify it | |
14 | * under the terms and conditions of the GNU General Public License, | |
15 | * version 2, as published by the Free Software Foundation. | |
16 | * | |
17 | * This program is distributed in the hope it will be useful, but WITHOUT | |
18 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
19 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
20 | * more details. | |
21 | * | |
22 | * More information about RDT can be found in the Intel (R) x86 Architecture | |
23 | * Software Developer Manual June 2016, volume 3, section 17.17. | |
24 | */ | |
25 | ||
26 | #include <linux/module.h> | |
27 | #include <linux/slab.h> | |
28 | #include <asm/cpu_device_id.h> | |
29 | #include "intel_rdt.h" | |
30 | ||
c6353f9a VS |
/* MSR from which the monitoring counter value is read (see __rmid_read()). */
#define MSR_IA32_QM_CTR			0x0c8e
/* MSR used to select the event id / RMID pair to be read. */
#define MSR_IA32_QM_EVTSEL		0x0c8d
33 | ||
83d4a977 VS |
34 | struct rmid_entry { |
35 | u32 rmid; | |
ac2fc5ad | 36 | int busy; |
83d4a977 VS |
37 | struct list_head list; |
38 | }; | |
39 | ||
40 | /** | |
41 | * @rmid_free_lru A least recently used list of free RMIDs | |
42 | * These RMIDs are guaranteed to have an occupancy less than the | |
43 | * threshold occupancy | |
44 | */ | |
45 | static LIST_HEAD(rmid_free_lru); | |
46 | ||
47 | /** | |
ac2fc5ad | 48 | * @rmid_limbo_count count of currently unused but (potentially) |
83d4a977 | 49 | * dirty RMIDs. |
ac2fc5ad | 50 | * This counts RMIDs that no one is currently using but that |
83d4a977 VS |
51 | * may have a occupancy value > intel_cqm_threshold. User can change |
52 | * the threshold occupancy value. | |
53 | */ | |
ac2fc5ad | 54 | unsigned int rmid_limbo_count; |
83d4a977 VS |
55 | |
56 | /** | |
57 | * @rmid_entry - The entry in the limbo and free lists. | |
58 | */ | |
59 | static struct rmid_entry *rmid_ptrs; | |
60 | ||
61 | /* | |
62 | * Global boolean for rdt_monitor which is true if any | |
63 | * resource monitoring is enabled. | |
64 | */ | |
65 | bool rdt_mon_capable; | |
66 | ||
67 | /* | |
68 | * Global to indicate which monitoring events are enabled. | |
69 | */ | |
70 | unsigned int rdt_mon_features; | |
71 | ||
72 | /* | |
73 | * This is the threshold cache occupancy at which we will consider an | |
74 | * RMID available for re-allocation. | |
75 | */ | |
76 | unsigned int intel_cqm_threshold; | |
77 | ||
78 | static inline struct rmid_entry *__rmid_entry(u32 rmid) | |
79 | { | |
80 | struct rmid_entry *entry; | |
81 | ||
82 | entry = &rmid_ptrs[rmid]; | |
83 | WARN_ON(entry->rmid != rmid); | |
84 | ||
85 | return entry; | |
86 | } | |
87 | ||
c6353f9a VS |
88 | static u64 __rmid_read(u32 rmid, u32 eventid) |
89 | { | |
90 | u64 val; | |
91 | ||
92 | /* | |
93 | * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured | |
94 | * with a valid event code for supported resource type and the bits | |
95 | * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, | |
96 | * IA32_QM_CTR.data (bits 61:0) reports the monitored data. | |
97 | * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) | |
98 | * are error bits. | |
99 | */ | |
100 | wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); | |
101 | rdmsrl(MSR_IA32_QM_CTR, val); | |
102 | ||
103 | return val; | |
104 | } | |
105 | ||
ac2fc5ad | 106 | static bool rmid_dirty(struct rmid_entry *entry) |
c6353f9a | 107 | { |
ac2fc5ad | 108 | u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); |
c6353f9a | 109 | |
ac2fc5ad | 110 | return val >= intel_cqm_threshold; |
c6353f9a VS |
111 | } |
112 | ||
113 | /* | |
ac2fc5ad VS |
114 | * Check the RMIDs that are marked as busy for this domain. If the |
115 | * reported LLC occupancy is below the threshold clear the busy bit and | |
116 | * decrement the count. If the busy count gets to zero on an RMID, we | |
117 | * free the RMID | |
c6353f9a | 118 | */ |
ac2fc5ad | 119 | void __check_limbo(struct rdt_domain *d, bool force_free) |
c6353f9a | 120 | { |
ac2fc5ad | 121 | struct rmid_entry *entry; |
c6353f9a | 122 | struct rdt_resource *r; |
ac2fc5ad | 123 | u32 crmid = 1, nrmid; |
c6353f9a VS |
124 | |
125 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | |
126 | ||
c6353f9a | 127 | /* |
ac2fc5ad VS |
128 | * Skip RMID 0 and start from RMID 1 and check all the RMIDs that |
129 | * are marked as busy for occupancy < threshold. If the occupancy | |
130 | * is less than the threshold decrement the busy counter of the | |
131 | * RMID and move it to the free list when the counter reaches 0. | |
c6353f9a | 132 | */ |
ac2fc5ad VS |
133 | for (;;) { |
134 | nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); | |
135 | if (nrmid >= r->num_rmid) | |
136 | break; | |
137 | ||
138 | entry = __rmid_entry(nrmid); | |
139 | if (force_free || !rmid_dirty(entry)) { | |
140 | clear_bit(entry->rmid, d->rmid_busy_llc); | |
141 | if (!--entry->busy) { | |
142 | rmid_limbo_count--; | |
c6353f9a | 143 | list_add_tail(&entry->list, &rmid_free_lru); |
c6353f9a VS |
144 | } |
145 | } | |
ac2fc5ad | 146 | crmid = nrmid + 1; |
c6353f9a | 147 | } |
ac2fc5ad | 148 | } |
c6353f9a | 149 | |
ac2fc5ad VS |
150 | bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) |
151 | { | |
152 | return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; | |
c6353f9a VS |
153 | } |
154 | ||
155 | /* | |
156 | * As of now the RMIDs allocation is global. | |
157 | * However we keep track of which packages the RMIDs | |
158 | * are used to optimize the limbo list management. | |
159 | */ | |
160 | int alloc_rmid(void) | |
161 | { | |
162 | struct rmid_entry *entry; | |
c6353f9a VS |
163 | |
164 | lockdep_assert_held(&rdtgroup_mutex); | |
165 | ||
ac2fc5ad VS |
166 | if (list_empty(&rmid_free_lru)) |
167 | return rmid_limbo_count ? -EBUSY : -ENOSPC; | |
c6353f9a VS |
168 | |
169 | entry = list_first_entry(&rmid_free_lru, | |
170 | struct rmid_entry, list); | |
171 | list_del(&entry->list); | |
172 | ||
173 | return entry->rmid; | |
174 | } | |
175 | ||
176 | static void add_rmid_to_limbo(struct rmid_entry *entry) | |
177 | { | |
178 | struct rdt_resource *r; | |
179 | struct rdt_domain *d; | |
ac2fc5ad | 180 | int cpu; |
c6353f9a VS |
181 | u64 val; |
182 | ||
183 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | |
184 | ||
ac2fc5ad | 185 | entry->busy = 0; |
c6353f9a VS |
186 | cpu = get_cpu(); |
187 | list_for_each_entry(d, &r->domains, list) { | |
188 | if (cpumask_test_cpu(cpu, &d->cpu_mask)) { | |
189 | val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); | |
190 | if (val <= intel_cqm_threshold) | |
191 | continue; | |
192 | } | |
ac2fc5ad VS |
193 | |
194 | /* | |
195 | * For the first limbo RMID in the domain, | |
196 | * setup up the limbo worker. | |
197 | */ | |
198 | if (!has_busy_rmid(r, d)) | |
199 | cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); | |
c6353f9a | 200 | set_bit(entry->rmid, d->rmid_busy_llc); |
ac2fc5ad | 201 | entry->busy++; |
c6353f9a VS |
202 | } |
203 | put_cpu(); | |
204 | ||
ac2fc5ad VS |
205 | if (entry->busy) |
206 | rmid_limbo_count++; | |
207 | else | |
c6353f9a | 208 | list_add_tail(&entry->list, &rmid_free_lru); |
c6353f9a VS |
209 | } |
210 | ||
211 | void free_rmid(u32 rmid) | |
212 | { | |
213 | struct rmid_entry *entry; | |
214 | ||
215 | if (!rmid) | |
216 | return; | |
217 | ||
218 | lockdep_assert_held(&rdtgroup_mutex); | |
219 | ||
220 | entry = __rmid_entry(rmid); | |
221 | ||
222 | if (is_llc_occupancy_enabled()) | |
223 | add_rmid_to_limbo(entry); | |
224 | else | |
225 | list_add_tail(&entry->list, &rmid_free_lru); | |
226 | } | |
227 | ||
8efeea83 VS |
228 | static int __mon_event_count(u32 rmid, struct rmid_read *rr) |
229 | { | |
7ed47cce TL |
230 | u64 chunks, shift, tval; |
231 | struct mbm_state *m; | |
8efeea83 VS |
232 | |
233 | tval = __rmid_read(rmid, rr->evtid); | |
234 | if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { | |
235 | rr->val = tval; | |
236 | return -EINVAL; | |
237 | } | |
238 | switch (rr->evtid) { | |
239 | case QOS_L3_OCCUP_EVENT_ID: | |
240 | rr->val += tval; | |
241 | return 0; | |
7ed47cce TL |
242 | case QOS_L3_MBM_TOTAL_EVENT_ID: |
243 | m = &rr->d->mbm_total[rmid]; | |
244 | break; | |
245 | case QOS_L3_MBM_LOCAL_EVENT_ID: | |
246 | m = &rr->d->mbm_local[rmid]; | |
247 | break; | |
8efeea83 VS |
248 | default: |
249 | /* | |
250 | * Code would never reach here because | |
251 | * an invalid event id would fail the __rmid_read. | |
252 | */ | |
253 | return -EINVAL; | |
254 | } | |
00d7dbe2 VS |
255 | |
256 | if (rr->first) { | |
257 | m->prev_msr = tval; | |
258 | m->chunks = 0; | |
259 | return 0; | |
260 | } | |
261 | ||
7ed47cce TL |
262 | shift = 64 - MBM_CNTR_WIDTH; |
263 | chunks = (tval << shift) - (m->prev_msr << shift); | |
264 | chunks >>= shift; | |
265 | m->chunks += chunks; | |
266 | m->prev_msr = tval; | |
267 | ||
268 | rr->val += m->chunks; | |
269 | return 0; | |
8efeea83 VS |
270 | } |
271 | ||
272 | /* | |
273 | * This is called via IPI to read the CQM/MBM counters | |
274 | * on a domain. | |
275 | */ | |
276 | void mon_event_count(void *info) | |
277 | { | |
278 | struct rdtgroup *rdtgrp, *entry; | |
279 | struct rmid_read *rr = info; | |
280 | struct list_head *head; | |
281 | ||
282 | rdtgrp = rr->rgrp; | |
283 | ||
284 | if (__mon_event_count(rdtgrp->mon.rmid, rr)) | |
285 | return; | |
286 | ||
287 | /* | |
288 | * For Ctrl groups read data from child monitor groups. | |
289 | */ | |
290 | head = &rdtgrp->mon.crdtgrp_list; | |
291 | ||
292 | if (rdtgrp->type == RDTCTRL_GROUP) { | |
293 | list_for_each_entry(entry, head, mon.crdtgrp_list) { | |
294 | if (__mon_event_count(entry->mon.rmid, rr)) | |
295 | return; | |
296 | } | |
297 | } | |
298 | } | |
646779c7 | 299 | |
8c27c393 VS |
300 | static void mbm_update(struct rdt_domain *d, int rmid) |
301 | { | |
302 | struct rmid_read rr; | |
303 | ||
304 | rr.first = false; | |
305 | rr.d = d; | |
306 | ||
307 | /* | |
308 | * This is protected from concurrent reads from user | |
309 | * as both the user and we hold the global mutex. | |
310 | */ | |
311 | if (is_mbm_total_enabled()) { | |
312 | rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; | |
313 | __mon_event_count(rmid, &rr); | |
314 | } | |
315 | if (is_mbm_local_enabled()) { | |
316 | rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; | |
317 | __mon_event_count(rmid, &rr); | |
318 | } | |
319 | } | |
320 | ||
ac2fc5ad VS |
321 | /* |
322 | * Handler to scan the limbo list and move the RMIDs | |
323 | * to free list whose occupancy < threshold_occupancy. | |
324 | */ | |
325 | void cqm_handle_limbo(struct work_struct *work) | |
326 | { | |
327 | unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); | |
328 | int cpu = smp_processor_id(); | |
329 | struct rdt_resource *r; | |
330 | struct rdt_domain *d; | |
331 | ||
332 | mutex_lock(&rdtgroup_mutex); | |
333 | ||
334 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | |
335 | d = get_domain_from_cpu(cpu, r); | |
336 | ||
337 | if (!d) { | |
338 | pr_warn_once("Failure to get domain for limbo worker\n"); | |
339 | goto out_unlock; | |
340 | } | |
341 | ||
342 | __check_limbo(d, false); | |
343 | ||
344 | if (has_busy_rmid(r, d)) | |
345 | schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); | |
346 | ||
347 | out_unlock: | |
348 | mutex_unlock(&rdtgroup_mutex); | |
349 | } | |
350 | ||
351 | void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) | |
352 | { | |
353 | unsigned long delay = msecs_to_jiffies(delay_ms); | |
354 | struct rdt_resource *r; | |
355 | int cpu; | |
356 | ||
357 | r = &rdt_resources_all[RDT_RESOURCE_L3]; | |
358 | ||
359 | cpu = cpumask_any(&dom->cpu_mask); | |
360 | dom->cqm_work_cpu = cpu; | |
361 | ||
362 | schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); | |
363 | } | |
364 | ||
8c27c393 VS |
365 | void mbm_handle_overflow(struct work_struct *work) |
366 | { | |
367 | unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); | |
368 | struct rdtgroup *prgrp, *crgrp; | |
369 | int cpu = smp_processor_id(); | |
370 | struct list_head *head; | |
371 | struct rdt_domain *d; | |
372 | ||
373 | mutex_lock(&rdtgroup_mutex); | |
374 | ||
375 | if (!static_branch_likely(&rdt_enable_key)) | |
376 | goto out_unlock; | |
377 | ||
378 | d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); | |
379 | if (!d) | |
380 | goto out_unlock; | |
381 | ||
382 | list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { | |
383 | mbm_update(d, prgrp->mon.rmid); | |
384 | ||
385 | head = &prgrp->mon.crdtgrp_list; | |
386 | list_for_each_entry(crgrp, head, mon.crdtgrp_list) | |
387 | mbm_update(d, crgrp->mon.rmid); | |
388 | } | |
389 | ||
390 | schedule_delayed_work_on(cpu, &d->mbm_over, delay); | |
ac2fc5ad | 391 | |
8c27c393 VS |
392 | out_unlock: |
393 | mutex_unlock(&rdtgroup_mutex); | |
394 | } | |
395 | ||
aa0998e2 | 396 | void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) |
8c27c393 | 397 | { |
aa0998e2 | 398 | unsigned long delay = msecs_to_jiffies(delay_ms); |
8c27c393 VS |
399 | int cpu; |
400 | ||
401 | if (!static_branch_likely(&rdt_enable_key)) | |
402 | return; | |
403 | cpu = cpumask_any(&dom->cpu_mask); | |
404 | dom->mbm_work_cpu = cpu; | |
405 | schedule_delayed_work_on(cpu, &dom->mbm_over, delay); | |
406 | } | |
407 | ||
83d4a977 VS |
408 | static int dom_data_init(struct rdt_resource *r) |
409 | { | |
410 | struct rmid_entry *entry = NULL; | |
411 | int i, nr_rmids; | |
412 | ||
413 | nr_rmids = r->num_rmid; | |
414 | rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); | |
415 | if (!rmid_ptrs) | |
416 | return -ENOMEM; | |
417 | ||
418 | for (i = 0; i < nr_rmids; i++) { | |
419 | entry = &rmid_ptrs[i]; | |
420 | INIT_LIST_HEAD(&entry->list); | |
421 | ||
422 | entry->rmid = i; | |
423 | list_add_tail(&entry->list, &rmid_free_lru); | |
424 | } | |
425 | ||
426 | /* | |
427 | * RMID 0 is special and is always allocated. It's used for all | |
428 | * tasks that are not monitored. | |
429 | */ | |
430 | entry = __rmid_entry(0); | |
431 | list_del(&entry->list); | |
432 | ||
433 | return 0; | |
434 | } | |
435 | ||
436 | static struct mon_evt llc_occupancy_event = { | |
437 | .name = "llc_occupancy", | |
438 | .evtid = QOS_L3_OCCUP_EVENT_ID, | |
439 | }; | |
440 | ||
7ed47cce TL |
441 | static struct mon_evt mbm_total_event = { |
442 | .name = "mbm_total_bytes", | |
443 | .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, | |
444 | }; | |
445 | ||
446 | static struct mon_evt mbm_local_event = { | |
447 | .name = "mbm_local_bytes", | |
448 | .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, | |
449 | }; | |
450 | ||
83d4a977 VS |
451 | /* |
452 | * Initialize the event list for the resource. | |
453 | * | |
454 | * Note that MBM events are also part of RDT_RESOURCE_L3 resource | |
455 | * because as per the SDM the total and local memory bandwidth | |
456 | * are enumerated as part of L3 monitoring. | |
457 | */ | |
458 | static void l3_mon_evt_init(struct rdt_resource *r) | |
459 | { | |
460 | INIT_LIST_HEAD(&r->evt_list); | |
461 | ||
462 | if (is_llc_occupancy_enabled()) | |
463 | list_add_tail(&llc_occupancy_event.list, &r->evt_list); | |
7ed47cce TL |
464 | if (is_mbm_total_enabled()) |
465 | list_add_tail(&mbm_total_event.list, &r->evt_list); | |
466 | if (is_mbm_local_enabled()) | |
467 | list_add_tail(&mbm_local_event.list, &r->evt_list); | |
83d4a977 VS |
468 | } |
469 | ||
470 | int rdt_get_mon_l3_config(struct rdt_resource *r) | |
471 | { | |
472 | int ret; | |
473 | ||
474 | r->mon_scale = boot_cpu_data.x86_cache_occ_scale; | |
475 | r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; | |
476 | ||
477 | /* | |
478 | * A reasonable upper limit on the max threshold is the number | |
479 | * of lines tagged per RMID if all RMIDs have the same number of | |
480 | * lines tagged in the LLC. | |
481 | * | |
482 | * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. | |
483 | */ | |
484 | intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid; | |
485 | ||
486 | /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ | |
487 | intel_cqm_threshold /= r->mon_scale; | |
488 | ||
489 | ret = dom_data_init(r); | |
490 | if (ret) | |
491 | return ret; | |
492 | ||
493 | l3_mon_evt_init(r); | |
494 | ||
495 | r->mon_capable = true; | |
496 | r->mon_enabled = true; | |
497 | ||
498 | return 0; | |
499 | } |