/*
 * Copyright(c) 2015 - 2017 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/interrupt.h>

#include "hfi.h"
#include "affinity.h"
#include "sdma.h"
#include "trace.h"

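/*
 * Driver-wide affinity state: one hfi1_affinity_node per NUMA node is
 * kept on this list, protected by node_affinity.lock.
 */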
struct hfi1_affinity_node_list node_affinity = {
	.list = LIST_HEAD_INIT(node_affinity.list),
	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
};

/* Name of IRQ types, indexed by enum irq_type */
static const char * const irq_type_names[] = {
	"SDMA",
	"RCVCTXT",
	"GENERAL",
	"OTHER",
};

/* Per NUMA node count of HFI devices */
static unsigned int *hfi1_per_node_cntr;

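/*
 * Reset a CPU mask set: clear both the base mask and the 'used'
 * accounting mask, and restart the allocation generation counter.
 */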
static inline void init_cpu_mask_set(struct cpu_mask_set *set)
{
	cpumask_clear(&set->mask);
	cpumask_clear(&set->used);
	set->gen = 0;
}

/* Initialize non-HT cpu cores mask */
void init_real_cpu_mask(void)
{
	int possible, curr_cpu, i, ht;

	cpumask_clear(&node_affinity.real_cpu_mask);

	/* Start with cpu online mask as the real cpu mask */
	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);

	/*
	 * Remove HT cores from the real cpu mask. Do this in two steps below.
	 */
	possible = cpumask_weight(&node_affinity.real_cpu_mask);
	ht = cpumask_weight(topology_sibling_cpumask(
				cpumask_first(&node_affinity.real_cpu_mask)));
	/*
	 * Step 1. Skip over the first N HT siblings and use them as the
	 * "real" cores. Assumes that HT cores are not enumerated in
	 * succession (except in the single core case).
	 */
	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
	for (i = 0; i < possible / ht; i++)
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	/*
	 * Step 2. Remove the remaining HT siblings. Use cpumask_next() to
	 * skip any gaps.
	 */
	for (; i < possible; i++) {
		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
	}
}

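/*
 * Set up the driver-wide affinity state from the online-CPU topology
 * and count the HFI devices present on each NUMA node by walking the
 * driver's PCI device table.
 */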
int node_affinity_init(void)
{
	int node;
	struct pci_dev *dev = NULL;
	const struct pci_device_id *ids = hfi1_pci_tbl;

	cpumask_clear(&node_affinity.proc.used);
	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);

	node_affinity.proc.gen = 0;
	node_affinity.num_core_siblings =
				cpumask_weight(topology_sibling_cpumask(
					cpumask_first(&node_affinity.proc.mask)
					));
	node_affinity.num_possible_nodes = num_possible_nodes();
	node_affinity.num_online_nodes = num_online_nodes();
	node_affinity.num_online_cpus = num_online_cpus();

	/*
	 * The real cpu mask is part of the affinity struct but it has to be
	 * initialized early. It is needed to calculate the number of user
	 * contexts in set_up_context_variables().
	 */
	init_real_cpu_mask();

	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
				     sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
	if (!hfi1_per_node_cntr)
		return -ENOMEM;

	while (ids->vendor) {
		dev = NULL;
		while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
			node = pcibus_to_node(dev->bus);
			if (node < 0)
				node = numa_node_id();

			hfi1_per_node_cntr[node]++;
		}
		ids++;
	}

	return 0;
}

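/*
 * Free every per-node entry on the global affinity list along with the
 * per-node device counters.
 */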
void node_affinity_destroy(void)
{
	struct list_head *pos, *q;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	list_for_each_safe(pos, q, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node,
				   list);
		list_del(pos);
		kfree(entry);
	}
	mutex_unlock(&node_affinity.lock);
	kfree(hfi1_per_node_cntr);
}

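/*
 * Allocate and minimally initialize a per-NUMA-node affinity entry.
 * Returns NULL on allocation failure.
 */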
static struct hfi1_affinity_node *node_affinity_allocate(int node)
{
	struct hfi1_affinity_node *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return NULL;
	entry->node = node;
	INIT_LIST_HEAD(&entry->list);

	return entry;
}

/*
 * Append an entry to the global affinity list.
 * It *must* be called with node_affinity.lock held.
 */
static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
{
	list_add_tail(&entry->list, &node_affinity.list);
}

/* It must be called with node_affinity.lock held */
static struct hfi1_affinity_node *node_affinity_lookup(int node)
{
	struct list_head *pos;
	struct hfi1_affinity_node *entry;

	list_for_each(pos, &node_affinity.list) {
		entry = list_entry(pos, struct hfi1_affinity_node, list);
		if (entry->node == node)
			return entry;
	}

	return NULL;
}

/*
 * Interrupt affinity.
 *
 * Non-receive interrupts get a default mask that starts as the node's
 * CPUs with hyper-threaded siblings removed and with each CPU already
 * assigned to a receive interrupt removed.
 *
 * Receive interrupts are assigned node-relative CPUs in order, wrapping
 * back to the first node-relative CPU as necessary.
 *
 */
int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
{
	int node = pcibus_to_node(dd->pcidev->bus);
	struct hfi1_affinity_node *entry;
	const struct cpumask *local_mask;
	int curr_cpu, possible, i;

	if (node < 0)
		node = numa_node_id();
	dd->node = node;

	local_mask = cpumask_of_node(dd->node);
	if (cpumask_first(local_mask) >= nr_cpu_ids)
		local_mask = topology_core_cpumask(0);

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	/*
	 * If this is the first time this NUMA node's affinity is used,
	 * create an entry in the global affinity structure and initialize it.
	 */
	if (!entry) {
		entry = node_affinity_allocate(node);
		if (!entry) {
			dd_dev_err(dd,
				   "Unable to allocate global affinity node\n");
			mutex_unlock(&node_affinity.lock);
			return -ENOMEM;
		}
		init_cpu_mask_set(&entry->def_intr);
		init_cpu_mask_set(&entry->rcv_intr);
		cpumask_clear(&entry->general_intr_mask);
		/* Use the "real" cpu mask of this node as the default */
		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
			    local_mask);

		/* fill in the receive list */
		possible = cpumask_weight(&entry->def_intr.mask);
		curr_cpu = cpumask_first(&entry->def_intr.mask);

		if (possible == 1) {
			/* only one CPU, everyone will use it */
			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
		} else {
			/*
			 * The general/control context will be the first CPU in
			 * the default list, so it is removed from the default
			 * list and added to the general interrupt list.
			 */
			cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
			cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
			curr_cpu = cpumask_next(curr_cpu,
						&entry->def_intr.mask);

			/*
			 * Remove the remaining kernel receive queues from
			 * the default list and add them to the receive list.
			 */
			for (i = 0;
			     i < (dd->n_krcv_queues - 1) *
				  hfi1_per_node_cntr[dd->node];
			     i++) {
				cpumask_clear_cpu(curr_cpu,
						  &entry->def_intr.mask);
				cpumask_set_cpu(curr_cpu,
						&entry->rcv_intr.mask);
				curr_cpu = cpumask_next(curr_cpu,
							&entry->def_intr.mask);
				if (curr_cpu >= nr_cpu_ids)
					break;
			}

			/*
			 * If there ends up being 0 CPU cores leftover for SDMA
			 * engines, use the same CPU cores as general/control
			 * context.
			 */
			if (cpumask_weight(&entry->def_intr.mask) == 0)
				cpumask_copy(&entry->def_intr.mask,
					     &entry->general_intr_mask);
		}

		node_affinity_add_tail(entry);
	}
	mutex_unlock(&node_affinity.lock);
	return 0;
}

/*
 * Function updates the irq affinity hint for msix after it has been changed
 * by the user using the /proc/irq interface. This function only accepts
 * one cpu in the mask.
 */
static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
{
	struct sdma_engine *sde = msix->arg;
	struct hfi1_devdata *dd = sde->dd;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set;
	int i, old_cpu;

	if (cpu > num_online_cpus() || cpu == sde->cpu)
		return;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);
	if (!entry)
		goto unlock;

	old_cpu = sde->cpu;
	sde->cpu = cpu;
	cpumask_clear(&msix->mask);
	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
		   msix->irq, irq_type_names[msix->type],
		   sde->this_idx, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	/*
	 * Set the new cpu in the hfi1_affinity_node and clean
	 * the old cpu if it is not used by any other IRQ
	 */
	set = &entry->def_intr;
	cpumask_set_cpu(cpu, &set->mask);
	cpumask_set_cpu(cpu, &set->used);
	for (i = 0; i < dd->num_msix_entries; i++) {
		struct hfi1_msix_entry *other_msix;

		other_msix = &dd->msix_entries[i];
		if (other_msix->type != IRQ_SDMA || other_msix == msix)
			continue;

		if (cpumask_test_cpu(old_cpu, &other_msix->mask))
			goto unlock;
	}
	cpumask_clear_cpu(old_cpu, &set->mask);
	cpumask_clear_cpu(old_cpu, &set->used);
unlock:
	mutex_unlock(&node_affinity.lock);
}

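/*
 * irq_affinity_notify callback: forwards the first CPU of the new mask
 * to hfi1_update_sdma_affinity(), which does the actual bookkeeping.
 */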
static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
				     const cpumask_t *mask)
{
	int cpu = cpumask_first(mask);
	struct hfi1_msix_entry *msix = container_of(notify,
						    struct hfi1_msix_entry,
						    notify);

	/* Only one CPU configuration supported currently */
	hfi1_update_sdma_affinity(msix, cpu);
}

static void hfi1_irq_notifier_release(struct kref *ref)
{
	/*
	 * This is required by affinity notifier. We don't have anything to
	 * free here.
	 */
}

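/*
 * Register an affinity notifier on the SDMA IRQ so that changes made
 * through /proc/irq are reflected in the driver's affinity accounting.
 */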
static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	notify->irq = msix->irq;
	notify->notify = hfi1_irq_notifier_notify;
	notify->release = hfi1_irq_notifier_release;

	if (irq_set_affinity_notifier(notify->irq, notify))
		pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

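/* Unregister the notifier installed by hfi1_setup_sdma_notifier() */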
static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
{
	struct irq_affinity_notify *notify = &msix->notify;

	if (irq_set_affinity_notifier(notify->irq, NULL))
		pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
		       notify->irq);
}

/*
 * Function sets the irq affinity for msix.
 * It *must* be called with node_affinity.lock held.
 */
static int get_irq_affinity(struct hfi1_devdata *dd,
			    struct hfi1_msix_entry *msix)
{
	cpumask_var_t diff;
	struct hfi1_affinity_node *entry;
	struct cpu_mask_set *set = NULL;
	struct sdma_engine *sde = NULL;
	struct hfi1_ctxtdata *rcd = NULL;
	char extra[64];
	int cpu = -1;

	extra[0] = '\0';
	cpumask_clear(&msix->mask);

	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		sde = (struct sdma_engine *)msix->arg;
		scnprintf(extra, 64, "engine %u", sde->this_idx);
		set = &entry->def_intr;
		break;
	case IRQ_GENERAL:
		cpu = cpumask_first(&entry->general_intr_mask);
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		if (rcd->ctxt == HFI1_CTRL_CTXT)
			cpu = cpumask_first(&entry->general_intr_mask);
		else
			set = &entry->rcv_intr;
		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
		break;
	default:
		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
		return -EINVAL;
	}

	/*
	 * The general and control contexts are placed on a particular
	 * CPU, which is set above. Skip accounting for it. Everything else
	 * finds its CPU here.
	 */
	if (cpu == -1 && set) {
		if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
			return -ENOMEM;

		if (cpumask_equal(&set->mask, &set->used)) {
			/*
			 * We've used up all the CPUs, bump up the generation
			 * and reset the 'used' map
			 */
			set->gen++;
			cpumask_clear(&set->used);
		}
		cpumask_andnot(diff, &set->mask, &set->used);
		cpu = cpumask_first(diff);
		cpumask_set_cpu(cpu, &set->used);

		free_cpumask_var(diff);
	}

	cpumask_set_cpu(cpu, &msix->mask);
	dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
		    msix->irq, irq_type_names[msix->type],
		    extra, cpu);
	irq_set_affinity_hint(msix->irq, &msix->mask);

	if (msix->type == IRQ_SDMA) {
		sde->cpu = cpu;
		hfi1_setup_sdma_notifier(msix);
	}

	return 0;
}

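/* Locked wrapper around get_irq_affinity() */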
int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
{
	int ret;

	mutex_lock(&node_affinity.lock);
	ret = get_irq_affinity(dd, msix);
	mutex_unlock(&node_affinity.lock);
	return ret;
}

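/*
 * Release the CPUs accounted to an MSI-X vector and clear its affinity
 * hint.
 */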
void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
			   struct hfi1_msix_entry *msix)
{
	struct cpu_mask_set *set = NULL;
	struct hfi1_ctxtdata *rcd;
	struct hfi1_affinity_node *entry;

	mutex_lock(&node_affinity.lock);
	entry = node_affinity_lookup(dd->node);

	switch (msix->type) {
	case IRQ_SDMA:
		set = &entry->def_intr;
		hfi1_cleanup_sdma_notifier(msix);
		break;
	case IRQ_GENERAL:
		/* Don't do accounting for general contexts */
		break;
	case IRQ_RCVCTXT:
		rcd = (struct hfi1_ctxtdata *)msix->arg;
		/* Don't do accounting for control contexts */
		if (rcd->ctxt != HFI1_CTRL_CTXT)
			set = &entry->rcv_intr;
		break;
	default:
		mutex_unlock(&node_affinity.lock);
		return;
	}

	if (set) {
		cpumask_andnot(&set->used, &set->used, &msix->mask);
		if (cpumask_empty(&set->used) && set->gen) {
			set->gen--;
			cpumask_copy(&set->used, &set->mask);
		}
	}

	irq_set_affinity_hint(msix->irq, NULL);
	cpumask_clear(&msix->mask);
	mutex_unlock(&node_affinity.lock);
}

/* This should be called with node_affinity.lock held */
static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				struct hfi1_affinity_node_list *affinity)
{
	int possible, curr_cpu, i;
	uint num_cores_per_socket = node_affinity.num_online_cpus /
					affinity->num_core_siblings /
						node_affinity.num_online_nodes;

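	/*
	 * Illustrative (hypothetical) topology: with 2 online NUMA nodes,
	 * 2 HW threads per core, and 40 online CPUs, this works out to
	 * 40 / 2 / 2 = 10 cores per socket; hw_thread_no then selects
	 * which sibling set of those cores this mask describes.
	 */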
	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
	if (affinity->num_core_siblings > 0) {
		/* Removing other siblings not needed for now */
		possible = cpumask_weight(hw_thread_mask);
		curr_cpu = cpumask_first(hw_thread_mask);
		for (i = 0;
		     i < num_cores_per_socket * node_affinity.num_online_nodes;
		     i++)
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);

		for (; i < possible; i++) {
			cpumask_clear_cpu(curr_cpu, hw_thread_mask);
			curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
		}

		/* Identifying correct HW threads within physical cores */
		cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				   num_cores_per_socket *
				   node_affinity.num_online_nodes *
				   hw_thread_no);
	}
}

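/*
 * Recommend a CPU for a user process opening a context on the device
 * attached to NUMA node 'node'; returns -1 if no recommendation is made.
 */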
int hfi1_get_proc_affinity(int node)
{
	int cpu = -1, ret, i;
	struct hfi1_affinity_node *entry;
	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
	const struct cpumask *node_mask,
		*proc_mask = &current->cpus_allowed;
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	/*
	 * check whether process/context affinity has already
	 * been set
	 */
	if (cpumask_weight(proc_mask) == 1) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		/*
		 * Mark the pre-set CPU as used. This is atomic so we don't
		 * need the lock
		 */
		cpu = cpumask_first(proc_mask);
		cpumask_set_cpu(cpu, &set->used);
		goto done;
	} else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
		hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
			  current->pid, current->comm,
			  cpumask_pr_args(proc_mask));
		goto done;
	}

	/*
	 * The process does not have a preset CPU affinity so find one to
	 * recommend using the following algorithm:
	 *
	 * For each user process that is opening a context on HFI Y:
	 *  a) If all cores are filled, reinitialize the bitmask
	 *  b) Fill real cores first, then HT cores (First set of HT
	 *     cores on all physical cores, then second set of HT core,
	 *     and, so on) in the following order:
	 *
	 *     1. Same NUMA node as HFI Y and not running an IRQ
	 *        handler
	 *     2. Same NUMA node as HFI Y and running an IRQ handler
	 *     3. Different NUMA node to HFI Y and not running an IRQ
	 *        handler
	 *     4. Different NUMA node to HFI Y and running an IRQ
	 *        handler
	 *  c) Mark core as filled in the bitmask. As user processes are
	 *     done, clear cores from the bitmask.
	 */

	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
	if (!ret)
		goto done;
	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
	if (!ret)
		goto free_diff;
	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
	if (!ret)
		goto free_hw_thread_mask;
	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
	if (!ret)
		goto free_available_mask;

	mutex_lock(&affinity->lock);
	/*
	 * If we've used all available HW threads, clear the mask and start
	 * overloading.
	 */
	if (cpumask_equal(&set->mask, &set->used)) {
		set->gen++;
		cpumask_clear(&set->used);
	}

	/*
	 * If NUMA node has CPUs used by interrupt handlers, include them in the
	 * interrupt handler mask.
	 */
	entry = node_affinity_lookup(node);
	if (entry) {
		cpumask_copy(intrs_mask, (entry->def_intr.gen ?
					  &entry->def_intr.mask :
					  &entry->def_intr.used));
		cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
						    &entry->rcv_intr.mask :
						    &entry->rcv_intr.used));
		cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
	}
	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
		  cpumask_pr_args(intrs_mask));

	cpumask_copy(hw_thread_mask, &set->mask);

	/*
	 * If HT cores are enabled, identify which HW threads within the
	 * physical cores should be used.
	 */
	if (affinity->num_core_siblings > 0) {
		for (i = 0; i < affinity->num_core_siblings; i++) {
			find_hw_thread_mask(i, hw_thread_mask, affinity);

			/*
			 * If there's at least one available core for this HW
			 * thread number, stop looking for a core.
			 *
			 * diff will always be not empty at least once in this
			 * loop as the used mask gets reset when
			 * (set->mask == set->used) before this loop.
			 */
			cpumask_andnot(diff, hw_thread_mask, &set->used);
			if (!cpumask_empty(diff))
				break;
		}
	}
	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
		  cpumask_pr_args(hw_thread_mask));

	node_mask = cpumask_of_node(node);
	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
		  cpumask_pr_args(node_mask));

	/* Get cpumask of available CPUs on preferred NUMA */
	cpumask_and(available_mask, hw_thread_mask, node_mask);
	cpumask_andnot(available_mask, available_mask, &set->used);
	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
		  cpumask_pr_args(available_mask));

	/*
	 * At first, we don't want to place processes on the same
	 * CPUs as interrupt handlers. Then, CPUs running interrupt
	 * handlers are used.
	 *
	 * 1) If diff is not empty, then there are CPUs not running
	 *    interrupt handlers available, so diff gets copied
	 *    over to available_mask.
	 * 2) If diff is empty, then all CPUs not running interrupt
	 *    handlers are taken, so available_mask contains all
	 *    available CPUs running interrupt handlers.
	 * 3) If available_mask is empty, then all CPUs on the
	 *    preferred NUMA node are taken, so other NUMA nodes are
	 *    used for process assignments using the same method as
	 *    the preferred NUMA node.
	 */
	cpumask_andnot(diff, available_mask, intrs_mask);
	if (!cpumask_empty(diff))
		cpumask_copy(available_mask, diff);

	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
	if (cpumask_empty(available_mask)) {
		cpumask_andnot(available_mask, hw_thread_mask, &set->used);
		/* Excluding preferred NUMA cores */
		cpumask_andnot(available_mask, available_mask, node_mask);
		hfi1_cdbg(PROC,
			  "Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
			  cpumask_pr_args(available_mask));

		/*
		 * At first, we don't want to place processes on the same
		 * CPUs as interrupt handlers.
		 */
		cpumask_andnot(diff, available_mask, intrs_mask);
		if (!cpumask_empty(diff))
			cpumask_copy(available_mask, diff);
	}
	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
		  cpumask_pr_args(available_mask));

	cpu = cpumask_first(available_mask);
	if (cpu >= nr_cpu_ids) /* empty */
		cpu = -1;
	else
		cpumask_set_cpu(cpu, &set->used);

	mutex_unlock(&affinity->lock);
	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);

	free_cpumask_var(intrs_mask);
free_available_mask:
	free_cpumask_var(available_mask);
free_hw_thread_mask:
	free_cpumask_var(hw_thread_mask);
free_diff:
	free_cpumask_var(diff);
done:
	return cpu;
}

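/* Return a CPU handed out by hfi1_get_proc_affinity() to the pool */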
void hfi1_put_proc_affinity(int cpu)
{
	struct hfi1_affinity_node_list *affinity = &node_affinity;
	struct cpu_mask_set *set = &affinity->proc;

	if (cpu < 0)
		return;

	mutex_lock(&affinity->lock);
	cpumask_clear_cpu(cpu, &set->used);
	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
	if (cpumask_empty(&set->used) && set->gen) {
		set->gen--;
		cpumask_copy(&set->used, &set->mask);
	}
	mutex_unlock(&affinity->lock);
}