/*
 * Copyright(c) 2015 - 2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/pci.h>
#include <linux/netdevice.h>
#include <linux/vmalloc.h>
#include <linux/delay.h>
#include <linux/idr.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/hrtimer.h>
#include <linux/bitmap.h>
#include <linux/numa.h>
#include <rdma/rdma_vt.h>

#include "hfi.h"
#include "device.h"
#include "common.h"
#include "trace.h"
#include "mad.h"
#include "sdma.h"
#include "debugfs.h"
#include "verbs.h"
#include "aspm.h"
#include "affinity.h"
#include "vnic.h"
#include "exp_rcv.h"
#undef pr_fmt
#define pr_fmt(fmt) DRIVER_NAME ": " fmt

#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5

/*
 * min buffers we want to have per context, after driver
 */
#define HFI1_MIN_USER_CTXT_BUFCNT 7

#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352
#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */

#define NUM_IB_PORTS 1
/*
 * Number of user receive contexts we are configured to use (to allow for more
 * pio buffers per ctxt, etc.)  Zero means use one user context per CPU.
 */
int num_user_contexts = -1;
module_param_named(num_user_contexts, num_user_contexts, int, 0444);
MODULE_PARM_DESC(num_user_contexts,
		 "Set max number of user contexts to use (default: -1 will use the real (non-HT) CPU count)");
uint krcvqs[RXE_NUM_DATA_VL];
int krcvqsset;
module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO);
MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL");

/* computed based on above array */
unsigned long n_krcvqs;
static unsigned hfi1_rcvarr_split = 25;
module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO);
MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers");
static uint eager_buffer_size = (8 << 20); /* 8MB */
module_param(eager_buffer_size, uint, S_IRUGO);
MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 8MB");
static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */
module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO);
MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)");
static uint hfi1_hdrq_entsize = 32;
module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444);
MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)");
unsigned int user_credit_return_threshold = 33; /* default is 33% */
module_param(user_credit_return_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(user_credit_return_threshold,
		 "Credit return threshold for user send contexts, return when unreturned credits pass this many blocks (in percent of allocated blocks, 0 is off)");
static inline u64 encode_rcv_header_entry_size(u16 size);

static struct idr hfi1_unit_table;
static int hfi1_create_kctxt(struct hfi1_devdata *dd,
			     struct hfi1_pportdata *ppd)
{
	struct hfi1_ctxtdata *rcd;
	int ret;

	/* Control context has to be always 0 */
	BUILD_BUG_ON(HFI1_CTRL_CTXT != 0);

	ret = hfi1_create_ctxtdata(ppd, dd->node, &rcd);
	if (ret < 0) {
		dd_dev_err(dd, "Kernel receive context allocation failed\n");
		return ret;
	}

	/*
	 * Set up the kernel context flags here and now because they use
	 * default values for all receive side memories.  User contexts will
	 * be handled as they are created.
	 */
	rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
		HFI1_CAP_KGET(NODROP_RHQ_FULL) |
		HFI1_CAP_KGET(NODROP_EGR_FULL) |
		HFI1_CAP_KGET(DMA_RTAIL);

	/* Control context must use DMA_RTAIL */
	if (rcd->ctxt == HFI1_CTRL_CTXT)
		rcd->flags |= HFI1_CAP_DMA_RTAIL;
	rcd->seq_cnt = 1;

	rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
	if (!rcd->sc) {
		dd_dev_err(dd, "Kernel send context allocation failed\n");
		return -ENOMEM;
	}
	hfi1_init_ctxt(rcd->sc);

	return 0;
}
/*
 * Create the receive context array and one or more kernel contexts
 */
int hfi1_create_kctxts(struct hfi1_devdata *dd)
{
	u16 i;
	int ret;

	dd->rcd = kcalloc_node(dd->num_rcv_contexts, sizeof(*dd->rcd),
			       GFP_KERNEL, dd->node);
	if (!dd->rcd)
		return -ENOMEM;

	for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
		ret = hfi1_create_kctxt(dd, dd->pport);
		if (ret)
			goto bail;
	}

	return 0;
bail:
	for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i)
		hfi1_free_ctxt(dd->rcd[i]);

	/* All the contexts should be freed, free the array */
	kfree(dd->rcd);
	dd->rcd = NULL;
	return ret;
}
/*
 * Helper routines for the receive context reference count (rcd and uctxt).
 */
static void hfi1_rcd_init(struct hfi1_ctxtdata *rcd)
{
	kref_init(&rcd->kref);
}
/**
 * hfi1_rcd_free - When reference is zero clean up.
 * @kref: pointer to an initialized rcd data structure
 *
 */
static void hfi1_rcd_free(struct kref *kref)
{
	unsigned long flags;
	struct hfi1_ctxtdata *rcd =
		container_of(kref, struct hfi1_ctxtdata, kref);

	hfi1_free_ctxtdata(rcd->dd, rcd);

	spin_lock_irqsave(&rcd->dd->uctxt_lock, flags);
	rcd->dd->rcd[rcd->ctxt] = NULL;
	spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags);

	kfree(rcd);
}
/**
 * hfi1_rcd_put - decrement reference for rcd
 * @rcd: pointer to an initialized rcd data structure
 *
 * Use this to put a reference after the init.
 */
int hfi1_rcd_put(struct hfi1_ctxtdata *rcd)
{
	if (rcd)
		return kref_put(&rcd->kref, hfi1_rcd_free);

	return 0;
}
/**
 * hfi1_rcd_get - increment reference for rcd
 * @rcd: pointer to an initialized rcd data structure
 *
 * Use this to get a reference after the init.
 */
void hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
{
	kref_get(&rcd->kref);
}
/**
 * allocate_rcd_index - allocate an rcd index from the rcd array
 * @dd: pointer to a valid devdata structure
 * @rcd: rcd data structure to assign
 * @index: pointer to index that is allocated
 *
 * Find an empty index in the rcd array, and assign the given rcd to it.
 * If the array is full, we are EBUSY.
 *
 */
static int allocate_rcd_index(struct hfi1_devdata *dd,
			      struct hfi1_ctxtdata *rcd, u16 *index)
{
	unsigned long flags;
	u32 ctxt;

	spin_lock_irqsave(&dd->uctxt_lock, flags);
	for (ctxt = 0; ctxt < dd->num_rcv_contexts; ctxt++)
		if (!dd->rcd[ctxt])
			break;

	if (ctxt < dd->num_rcv_contexts) {
		rcd->ctxt = ctxt;
		dd->rcd[ctxt] = rcd;
		hfi1_rcd_init(rcd);
	}
	spin_unlock_irqrestore(&dd->uctxt_lock, flags);

	if (ctxt >= dd->num_rcv_contexts)
		return -EBUSY;

	*index = ctxt;

	return 0;
}
/**
 * hfi1_rcd_get_by_index_safe - validate the ctxt index before accessing the
 * rcd array.
 * @dd: pointer to a valid devdata structure
 * @ctxt: the index of a possible rcd
 *
 * This is a wrapper for hfi1_rcd_get_by_index() to validate that the given
 * ctxt index is valid.
 *
 * The caller is responsible for making the _put().
 *
 */
struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd,
						 u16 ctxt)
{
	if (ctxt < dd->num_rcv_contexts)
		return hfi1_rcd_get_by_index(dd, ctxt);

	return NULL;
}
/**
 * hfi1_rcd_get_by_index
 * @dd: pointer to a valid devdata structure
 * @ctxt: the index of a possible rcd
 *
 * We need to protect access to the rcd array.  If access is needed to
 * one or more index, get the protecting spinlock and then increment the
 * kref.
 *
 * The caller is responsible for making the _put().
 *
 */
struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt)
{
	unsigned long flags;
	struct hfi1_ctxtdata *rcd = NULL;

	spin_lock_irqsave(&dd->uctxt_lock, flags);
	if (dd->rcd[ctxt]) {
		rcd = dd->rcd[ctxt];
		hfi1_rcd_get(rcd);
	}
	spin_unlock_irqrestore(&dd->uctxt_lock, flags);

	return rcd;
}
/*
 * Common code for user and kernel context create and setup.
 * NOTE: the initial kref is done here (hfi1_rcd_init()).
 */
int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
			 struct hfi1_ctxtdata **context)
{
	struct hfi1_devdata *dd = ppd->dd;
	struct hfi1_ctxtdata *rcd;
	unsigned kctxt_ngroups = 0;
	u32 base;

	if (dd->rcv_entries.nctxt_extra >
	    dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt)
		kctxt_ngroups = (dd->rcv_entries.nctxt_extra -
			(dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt));
	rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, numa);
	if (rcd) {
		u32 rcvtids, max_entries;
		u16 ctxt;
		int ret;

		ret = allocate_rcd_index(dd, rcd, &ctxt);
		if (ret) {
			*context = NULL;
			kfree(rcd);
			return ret;
		}

		INIT_LIST_HEAD(&rcd->qp_wait_list);
		hfi1_exp_tid_group_init(rcd);
		rcd->ppd = ppd;
		rcd->dd = dd;
		rcd->numa_id = numa;
		rcd->rcv_array_groups = dd->rcv_entries.ngroups;
		rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;

		mutex_init(&rcd->exp_mutex);

		hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);

		/*
		 * Calculate the context's RcvArray entry starting point.
		 * We do this here because we have to take into account all
		 * the RcvArray entries that previous context would have
		 * taken and we have to account for any extra groups assigned
		 * to the static (kernel) or dynamic (vnic/user) contexts.
		 */
		if (ctxt < dd->first_dyn_alloc_ctxt) {
			if (ctxt < kctxt_ngroups) {
				base = ctxt * (dd->rcv_entries.ngroups + 1);
				rcd->rcv_array_groups++;
			} else {
				base = kctxt_ngroups +
					(ctxt * dd->rcv_entries.ngroups);
			}
		} else {
			u16 ct = ctxt - dd->first_dyn_alloc_ctxt;

			base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) +
				kctxt_ngroups);
			if (ct < dd->rcv_entries.nctxt_extra) {
				base += ct * (dd->rcv_entries.ngroups + 1);
				rcd->rcv_array_groups++;
			} else {
				base += dd->rcv_entries.nctxt_extra +
					(ct * dd->rcv_entries.ngroups);
			}
		}
		rcd->eager_base = base * dd->rcv_entries.group_size;

		rcd->rcvhdrq_cnt = rcvhdrcnt;
		rcd->rcvhdrqentsize = hfi1_hdrq_entsize;
		rcd->rhf_offset =
			rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
		/*
		 * Simple Eager buffer allocation: we have already
		 * pre-allocated the number of RcvArray entry groups.
		 * Each ctxtdata structure holds the number of groups
		 * for that context.
		 *
		 * To follow CSR requirements and maintain cacheline
		 * alignment, make sure all sizes and bases are multiples
		 * of group_size.
		 *
		 * The expected entry count is what is left after assigning
		 * eager.
		 */
		max_entries = rcd->rcv_array_groups *
			dd->rcv_entries.group_size;
		rcvtids = ((max_entries * hfi1_rcvarr_split) / 100);
		rcd->egrbufs.count = round_down(rcvtids,
						dd->rcv_entries.group_size);
		if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) {
			dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n",
				   rcd->ctxt);
			rcd->egrbufs.count = MAX_EAGER_ENTRIES;
		}
		hfi1_cdbg(PROC,
			  "ctxt%u: max Eager buffer RcvArray entries: %u\n",
			  rcd->ctxt, rcd->egrbufs.count);

		/*
		 * Allocate array that will hold the eager buffer accounting
		 * data.
		 * This will allocate the maximum possible buffer count based
		 * on the value of the RcvArray split parameter.
		 * The resulting value will be rounded down to the closest
		 * multiple of dd->rcv_entries.group_size.
		 */
		rcd->egrbufs.buffers =
			kcalloc_node(rcd->egrbufs.count,
				     sizeof(*rcd->egrbufs.buffers),
				     GFP_KERNEL, numa);
		if (!rcd->egrbufs.buffers)
			goto bail;
		rcd->egrbufs.rcvtids =
			kcalloc_node(rcd->egrbufs.count,
				     sizeof(*rcd->egrbufs.rcvtids),
				     GFP_KERNEL, numa);
		if (!rcd->egrbufs.rcvtids)
			goto bail;
		rcd->egrbufs.size = eager_buffer_size;
		/*
		 * The size of the buffers programmed into the RcvArray
		 * entries needs to be big enough to handle the highest
		 * MTU supported.
		 */
		if (rcd->egrbufs.size < hfi1_max_mtu) {
			rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
			hfi1_cdbg(PROC,
				  "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
				  rcd->ctxt, rcd->egrbufs.size);
		}
		rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;

		/* Applicable only for statically created kernel contexts */
		if (ctxt < dd->first_dyn_alloc_ctxt) {
			rcd->opstats = kzalloc_node(sizeof(*rcd->opstats),
						    GFP_KERNEL, numa);
			if (!rcd->opstats)
				goto bail;
		}

		*context = rcd;
		return 0;
	}

bail:
	*context = NULL;
	hfi1_free_ctxt(rcd);
	return -ENOMEM;
}
/**
 * hfi1_free_ctxt
 * @rcd: pointer to an initialized rcd data structure
 *
 * This wrapper is the free function that matches hfi1_create_ctxtdata().
 * When a context is done being used (kernel or user), this function is called
 * for the "final" put to match the kref init from hfi1_create_ctxtdata().
 * Other users of the context do a get/put sequence to make sure that the
 * structure isn't removed while in use.
 */
void hfi1_free_ctxt(struct hfi1_ctxtdata *rcd)
{
	hfi1_rcd_put(rcd);
}
/*
 * Convert a receive header entry size to the encoding used in the CSR.
 *
 * Return a zero if the given size is invalid.
 */
static inline u64 encode_rcv_header_entry_size(u16 size)
{
	/* there are only 3 valid receive header entry sizes */
	if (size == 2)
		return 1;
	if (size == 16)
		return 2;
	if (size == 32)
		return 4;
	return 0; /* invalid */
}
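/*
 * Worked example (assuming the mapping above): a 32-dword entry is
 * 32 * 4 = 128 bytes and encodes to 4, which is what gets programmed
 * into RcvHdrEntSize in hfi1_create_rcvhdrq() below.  An unsupported
 * size such as 8 returns 0 and is rejected at probe time by init_one(),
 * which uses this function as a sanitization check.
 */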
/*
 * Select the largest ccti value over all SLs to determine the intra-
 * packet gap for the link.
 *
 * called with cca_timer_lock held (to protect access to cca_timer
 * array), and rcu_read_lock() (to protect access to cc_state).
 */
void set_link_ipg(struct hfi1_pportdata *ppd)
{
	struct hfi1_devdata *dd = ppd->dd;
	struct cc_state *cc_state;
	int i;
	u16 cce, ccti_limit, max_ccti = 0;
	u16 shift, mult;
	u64 src;
	u32 current_egress_rate; /* Mbits /sec */
	u32 max_pkt_time;
	/*
	 * max_pkt_time is the maximum packet egress time in units
	 * of the fabric clock period 1/(805 MHz).
	 */

	cc_state = get_cc_state(ppd);

	if (!cc_state)
		/*
		 * This should _never_ happen - rcu_read_lock() is held,
		 * and set_link_ipg() should not be called if cc_state
		 * is NULL.
		 */
		return;

	for (i = 0; i < OPA_MAX_SLS; i++) {
		u16 ccti = ppd->cca_timer[i].ccti;

		if (ccti > max_ccti)
			max_ccti = ccti;
	}

	ccti_limit = cc_state->cct.ccti_limit;
	if (max_ccti > ccti_limit)
		max_ccti = ccti_limit;

	cce = cc_state->cct.entries[max_ccti].entry;
	shift = (cce & 0xc000) >> 14;
	mult = (cce & 0x3fff);

	current_egress_rate = active_egress_rate(ppd);

	max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate);

	src = (max_pkt_time >> shift) * mult;

	src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK;
	src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT;

	write_csr(dd, SEND_STATIC_RATE_CONTROL, src);
}
static enum hrtimer_restart cca_timer_fn(struct hrtimer *t)
{
	struct cca_timer *cca_timer;
	struct hfi1_pportdata *ppd;
	int sl;
	u16 ccti_timer, ccti_min;
	struct cc_state *cc_state;
	unsigned long flags;
	enum hrtimer_restart ret = HRTIMER_NORESTART;

	cca_timer = container_of(t, struct cca_timer, hrtimer);
	ppd = cca_timer->ppd;
	sl = cca_timer->sl;

	rcu_read_lock();

	cc_state = get_cc_state(ppd);

	if (!cc_state) {
		rcu_read_unlock();
		return HRTIMER_NORESTART;
	}

	/*
	 * 1) decrement ccti for SL
	 * 2) calculate IPG for link (set_link_ipg())
	 * 3) restart timer, unless ccti is at min value
	 */

	ccti_min = cc_state->cong_setting.entries[sl].ccti_min;
	ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;

	spin_lock_irqsave(&ppd->cca_timer_lock, flags);

	if (cca_timer->ccti > ccti_min) {
		cca_timer->ccti--;
		set_link_ipg(ppd);
	}

	if (cca_timer->ccti > ccti_min) {
		unsigned long nsec = 1024 * ccti_timer;
		/* ccti_timer is in units of 1.024 usec */
		hrtimer_forward_now(t, ns_to_ktime(nsec));
		ret = HRTIMER_RESTART;
	}

	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
	rcu_read_unlock();
	return ret;
}
/*
 * Common code for initializing the physical port structure.
 */
void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd,
			 struct hfi1_devdata *dd, u8 hw_pidx, u8 port)
{
	int i;
	uint default_pkey_idx;
	struct cc_state *cc_state;

	ppd->dd = dd;
	ppd->hw_pidx = hw_pidx;
	ppd->port = port; /* IB port number, not index */
	ppd->prev_link_width = LINK_WIDTH_DEFAULT;
	/*
	 * There are C_VL_COUNT number of PortVLXmitWait counters.
	 * Adding 1 to C_VL_COUNT to include the PortXmitWait counter.
	 */
	for (i = 0; i < C_VL_COUNT + 1; i++) {
		ppd->port_vl_xmit_wait_last[i] = 0;
		ppd->vl_xmit_flit_cnt[i] = 0;
	}

	default_pkey_idx = 1;

	ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY;
	ppd->part_enforce |= HFI1_PART_ENFORCE_IN;

	if (loopback) {
		dd_dev_err(dd, "Faking data partition 0x8001 in idx %u\n",
			   !default_pkey_idx);
		ppd->pkeys[!default_pkey_idx] = 0x8001;
	}

	INIT_WORK(&ppd->link_vc_work, handle_verify_cap);
	INIT_WORK(&ppd->link_up_work, handle_link_up);
	INIT_WORK(&ppd->link_down_work, handle_link_down);
	INIT_WORK(&ppd->freeze_work, handle_freeze);
	INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade);
	INIT_WORK(&ppd->sma_message_work, handle_sma_message);
	INIT_WORK(&ppd->link_bounce_work, handle_link_bounce);
	INIT_DELAYED_WORK(&ppd->start_link_work, handle_start_link);
	INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work);
	INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);

	mutex_init(&ppd->hls_lock);
	spin_lock_init(&ppd->qsfp_info.qsfp_lock);

	ppd->qsfp_info.ppd = ppd;
	ppd->sm_trap_qp = 0x0;
	ppd->sa_qp = 0x1;

	ppd->hfi1_wq = NULL;

	spin_lock_init(&ppd->cca_timer_lock);

	for (i = 0; i < OPA_MAX_SLS; i++) {
		hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC,
			     HRTIMER_MODE_REL);
		ppd->cca_timer[i].ppd = ppd;
		ppd->cca_timer[i].sl = i;
		ppd->cca_timer[i].ccti = 0;
		ppd->cca_timer[i].hrtimer.function = cca_timer_fn;
	}

	ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT;

	spin_lock_init(&ppd->cc_state_lock);
	spin_lock_init(&ppd->cc_log_lock);
	cc_state = kzalloc(sizeof(*cc_state), GFP_KERNEL);
	RCU_INIT_POINTER(ppd->cc_state, cc_state);
	if (!cc_state)
		goto bail;
	return;

bail:
	dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port);
}
/*
 * Do initialization for device that is only needed on
 * first detect, not on resets.
 */
static int loadtime_init(struct hfi1_devdata *dd)
{
	return 0;
}
/**
 * init_after_reset - re-initialize after a reset
 * @dd: the hfi1_ib device
 *
 * sanity check at least some of the values after reset, and
 * ensure no receive or transmit (explicitly, in case reset
 * failed
 */
static int init_after_reset(struct hfi1_devdata *dd)
{
	int i;
	struct hfi1_ctxtdata *rcd;
	/*
	 * Ensure chip does no sends or receives, tail updates, or
	 * pioavail updates while we re-initialize.  This is mostly
	 * for the driver data structures, not chip registers.
	 */
	for (i = 0; i < dd->num_rcv_contexts; i++) {
		rcd = hfi1_rcd_get_by_index(dd, i);
		hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
			     HFI1_RCVCTRL_INTRAVAIL_DIS |
			     HFI1_RCVCTRL_TAILUPD_DIS, rcd);
		hfi1_rcd_put(rcd);
	}
	pio_send_control(dd, PSC_GLOBAL_DISABLE);
	for (i = 0; i < dd->num_send_contexts; i++)
		sc_disable(dd->send_contexts[i].sc);

	return 0;
}
static void enable_chip(struct hfi1_devdata *dd)
{
	struct hfi1_ctxtdata *rcd;
	u32 rcvmask;
	u16 i;

	/* enable PIO send */
	pio_send_control(dd, PSC_GLOBAL_ENABLE);

	/*
	 * Enable kernel ctxts' receive and receive interrupt.
	 * Other ctxts done as user opens and initializes them.
	 */
	for (i = 0; i < dd->first_dyn_alloc_ctxt; ++i) {
		rcd = hfi1_rcd_get_by_index(dd, i);
		if (!rcd)
			continue;
		rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB;
		rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ?
			HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
		if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
			rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
		if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_RHQ_FULL))
			rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
		if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
			rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
		hfi1_rcvctrl(dd, rcvmask, rcd);
		sc_enable(rcd->sc);
		hfi1_rcd_put(rcd);
	}
}
/**
 * create_workqueues - create per port workqueues
 * @dd: the hfi1_ib device
 */
static int create_workqueues(struct hfi1_devdata *dd)
{
	int pidx;
	struct hfi1_pportdata *ppd;

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		if (!ppd->hfi1_wq) {
			ppd->hfi1_wq =
				alloc_workqueue(
				    "hfi%d_%d",
				    WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
				    HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
				    dd->unit, pidx);
			if (!ppd->hfi1_wq)
				goto wq_error;
		}
		if (!ppd->link_wq) {
			/*
			 * Make the link workqueue single-threaded to enforce
			 * serialization.
			 */
			ppd->link_wq =
				alloc_workqueue(
				    "hfi_link_%d_%d",
				    WQ_SYSFS | WQ_MEM_RECLAIM | WQ_UNBOUND,
				    1, /* max_active */
				    dd->unit, pidx);
			if (!ppd->link_wq)
				goto wq_error;
		}
	}
	return 0;
wq_error:
	pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		if (ppd->hfi1_wq) {
			destroy_workqueue(ppd->hfi1_wq);
			ppd->hfi1_wq = NULL;
		}
		if (ppd->link_wq) {
			destroy_workqueue(ppd->link_wq);
			ppd->link_wq = NULL;
		}
	}
	return -ENOMEM;
}
/*
 * enable_general_intr() - Enable the IRQs that will be handled by the
 * general interrupt handler.
 * @dd: valid devdata
 *
 */
static void enable_general_intr(struct hfi1_devdata *dd)
{
	set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true);
	set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true);
	set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true);
	set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true);
	set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true);
	set_intr_bits(dd, IS_DC_START, IS_DC_END, true);
	set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true);
}
/**
 * hfi1_init - do the actual initialization sequence on the chip
 * @dd: the hfi1_ib device
 * @reinit: re-initializing, so don't allocate new memory
 *
 * Do the actual initialization sequence on the chip.  This is done
 * both from the init routine called from the PCI infrastructure, and
 * when we reset the chip, or detect that it was reset internally,
 * or it's administratively re-enabled.
 *
 * Memory allocation here and in called routines is only done in
 * the first case (reinit == 0).  We have to be careful, because even
 * without memory allocation, we need to re-write all the chip registers
 * TIDs, etc. after the reset or enable has completed.
 */
int hfi1_init(struct hfi1_devdata *dd, int reinit)
{
	int ret = 0, pidx, lastfail = 0;
	unsigned long len;
	u16 i;
	struct hfi1_ctxtdata *rcd;
	struct hfi1_pportdata *ppd;

	/* Set up send low level handlers */
	dd->process_pio_send = hfi1_verbs_send_pio;
	dd->process_dma_send = hfi1_verbs_send_dma;
	dd->pio_inline_send = pio_copy;
	dd->process_vnic_dma_send = hfi1_vnic_send_dma;

	if (is_ax(dd)) {
		atomic_set(&dd->drop_packet, DROP_PACKET_ON);
		dd->do_drop = 1;
	} else {
		atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
		dd->do_drop = 0;
	}

	/* make sure the link is not "up" */
	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		ppd->linkup = 0;
	}

	if (reinit)
		ret = init_after_reset(dd);
	else
		ret = loadtime_init(dd);
	if (ret)
		goto done;

	/* allocate dummy tail memory for all receive contexts */
	dd->rcvhdrtail_dummy_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
							 sizeof(u64),
							 &dd->rcvhdrtail_dummy_dma,
							 GFP_KERNEL);

	if (!dd->rcvhdrtail_dummy_kvaddr) {
		dd_dev_err(dd, "cannot allocate dummy tail memory\n");
		ret = -ENOMEM;
		goto done;
	}

	/* dd->rcd can be NULL if early initialization failed */
	for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) {
		/*
		 * Set up the (kernel) rcvhdr queue and egr TIDs.  If doing
		 * re-init, the simplest way to handle this is to free
		 * existing, and re-allocate.
		 * Need to re-create rest of ctxt 0 ctxtdata as well.
		 */
		rcd = hfi1_rcd_get_by_index(dd, i);
		if (!rcd)
			continue;

		rcd->do_interrupt = &handle_receive_interrupt;

		lastfail = hfi1_create_rcvhdrq(dd, rcd);
		if (!lastfail)
			lastfail = hfi1_setup_eagerbufs(rcd);
		if (lastfail) {
			dd_dev_err(dd,
				   "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
			ret = lastfail;
		}
		hfi1_rcd_put(rcd);
	}

	/* Allocate enough memory for user event notification. */
	len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS *
			 sizeof(*dd->events));
	dd->events = vmalloc_user(len);
	if (!dd->events)
		dd_dev_err(dd, "Failed to allocate user events page\n");
	/*
	 * Allocate a page for device and port status.
	 * Page will be shared amongst all user processes.
	 */
	dd->status = vmalloc_user(PAGE_SIZE);
	if (!dd->status)
		dd_dev_err(dd, "Failed to allocate dev status page\n");
	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		if (dd->status)
			/* Currently, we only have one port */
			ppd->statusp = &dd->status->port;

		set_mtu(ppd);
	}

	/* enable chip even if we have an error, so we can debug cause */
	enable_chip(dd);

done:
	/*
	 * Set status even if port serdes is not initialized
	 * so that diags will work.
	 */
	if (dd->status)
		dd->status->dev |= HFI1_STATUS_CHIP_PRESENT |
			HFI1_STATUS_INITTED;
	if (!ret) {
		/* enable all interrupts from the chip */
		enable_general_intr(dd);
		init_qsfp_int(dd);

		/* chip is OK for user apps; mark it as initialized */
		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
			ppd = dd->pport + pidx;

			/*
			 * start the serdes - must be after interrupts are
			 * enabled so we are notified when the link goes up
			 */
			lastfail = bringup_serdes(ppd);
			if (lastfail)
				dd_dev_info(dd,
					    "Failed to bring up port %u\n",
					    ppd->port);

			/*
			 * Set status even if port serdes is not initialized
			 * so that diags will work.
			 */
			if (ppd->statusp)
				*ppd->statusp |= HFI1_STATUS_CHIP_PRESENT |
					HFI1_STATUS_INITTED;
			if (!ppd->link_speed_enabled)
				continue;
		}
	}

	/* if ret is non-zero, we probably should do some cleanup here... */
	return ret;
}
static inline struct hfi1_devdata *__hfi1_lookup(int unit)
{
	return idr_find(&hfi1_unit_table, unit);
}

struct hfi1_devdata *hfi1_lookup(int unit)
{
	struct hfi1_devdata *dd;
	unsigned long flags;

	spin_lock_irqsave(&hfi1_devs_lock, flags);
	dd = __hfi1_lookup(unit);
	spin_unlock_irqrestore(&hfi1_devs_lock, flags);

	return dd;
}
/*
 * Stop the timers during unit shutdown, or after an error late
 * in initialization.
 */
static void stop_timers(struct hfi1_devdata *dd)
{
	struct hfi1_pportdata *ppd;
	int pidx;

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		if (ppd->led_override_timer.function) {
			del_timer_sync(&ppd->led_override_timer);
			atomic_set(&ppd->led_override_timer_active, 0);
		}
	}
}
/**
 * shutdown_device - shut down a device
 * @dd: the hfi1_ib device
 *
 * This is called to make the device quiet when we are about to
 * unload the driver, and also when the device is administratively
 * disabled.   It does not free any data structures.
 * Everything it does has to be setup again by hfi1_init(dd, 1)
 */
static void shutdown_device(struct hfi1_devdata *dd)
{
	struct hfi1_pportdata *ppd;
	struct hfi1_ctxtdata *rcd;
	unsigned pidx;
	int i;

	if (dd->flags & HFI1_SHUTDOWN)
		return;
	dd->flags |= HFI1_SHUTDOWN;

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;

		ppd->linkup = 0;
		if (ppd->statusp)
			*ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
					   HFI1_STATUS_IB_READY);
	}
	dd->flags &= ~HFI1_INITTED;

	/* mask and clean up interrupts */
	set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
	msix_clean_up_interrupts(dd);

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;
		for (i = 0; i < dd->num_rcv_contexts; i++) {
			rcd = hfi1_rcd_get_by_index(dd, i);
			hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS |
				     HFI1_RCVCTRL_CTXT_DIS |
				     HFI1_RCVCTRL_INTRAVAIL_DIS |
				     HFI1_RCVCTRL_PKEY_DIS |
				     HFI1_RCVCTRL_ONE_PKT_EGR_DIS, rcd);
			hfi1_rcd_put(rcd);
		}
		/*
		 * Gracefully stop all sends allowing any in progress to
		 * trickle out first.
		 */
		for (i = 0; i < dd->num_send_contexts; i++)
			sc_flush(dd->send_contexts[i].sc);
	}

	/*
	 * Enough for anything that's going to trickle out to have actually
	 * done so.
	 */
	udelay(20);

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		ppd = dd->pport + pidx;

		/* disable all contexts */
		for (i = 0; i < dd->num_send_contexts; i++)
			sc_disable(dd->send_contexts[i].sc);
		/* disable the send device */
		pio_send_control(dd, PSC_GLOBAL_DISABLE);

		shutdown_led_override(ppd);

		/*
		 * Clear SerdesEnable.
		 * We can't count on interrupts since we are stopping.
		 */
		hfi1_quiet_serdes(ppd);

		if (ppd->hfi1_wq) {
			destroy_workqueue(ppd->hfi1_wq);
			ppd->hfi1_wq = NULL;
		}
		if (ppd->link_wq) {
			destroy_workqueue(ppd->link_wq);
			ppd->link_wq = NULL;
		}
	}
	sdma_exit(dd);
}
/**
 * hfi1_free_ctxtdata - free a context's allocated data
 * @dd: the hfi1_ib device
 * @rcd: the ctxtdata structure
 *
 * free up any allocated data for a context
 * It should never change any chip state, or global driver state.
 */
void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
	u32 e;

	if (!rcd)
		return;

	if (rcd->rcvhdrq) {
		dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd),
				  rcd->rcvhdrq, rcd->rcvhdrq_dma);
		rcd->rcvhdrq = NULL;
		if (rcd->rcvhdrtail_kvaddr) {
			dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
					  (void *)rcd->rcvhdrtail_kvaddr,
					  rcd->rcvhdrqtailaddr_dma);
			rcd->rcvhdrtail_kvaddr = NULL;
		}
	}

	/* all the RcvArray entries should have been cleared by now */
	kfree(rcd->egrbufs.rcvtids);
	rcd->egrbufs.rcvtids = NULL;

	for (e = 0; e < rcd->egrbufs.alloced; e++) {
		if (rcd->egrbufs.buffers[e].dma)
			dma_free_coherent(&dd->pcidev->dev,
					  rcd->egrbufs.buffers[e].len,
					  rcd->egrbufs.buffers[e].addr,
					  rcd->egrbufs.buffers[e].dma);
	}
	kfree(rcd->egrbufs.buffers);
	rcd->egrbufs.alloced = 0;
	rcd->egrbufs.buffers = NULL;

	sc_free(rcd->sc);
	rcd->sc = NULL;

	vfree(rcd->subctxt_uregbase);
	vfree(rcd->subctxt_rcvegrbuf);
	vfree(rcd->subctxt_rcvhdr_base);
	kfree(rcd->opstats);

	rcd->subctxt_uregbase = NULL;
	rcd->subctxt_rcvegrbuf = NULL;
	rcd->subctxt_rcvhdr_base = NULL;
	rcd->opstats = NULL;
}
/*
 * Release our hold on the shared asic data.  If we are the last one,
 * return the structure to be finalized outside the lock.  Must be
 * holding hfi1_devs_lock.
 */
static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd)
{
	struct hfi1_asic_data *ad;
	int other;

	if (!dd->asic_data)
		return NULL;
	dd->asic_data->dds[dd->hfi1_id] = NULL;
	other = dd->hfi1_id ? 0 : 1;
	ad = dd->asic_data;
	dd->asic_data = NULL;
	/* return NULL if the other dd still has a link */
	return ad->dds[other] ? NULL : ad;
}

static void finalize_asic_data(struct hfi1_devdata *dd,
			       struct hfi1_asic_data *ad)
{
	clean_up_i2c(dd, ad);
	kfree(ad);
}
/**
 * hfi1_clean_devdata - cleans up per-unit data structure
 * @dd: pointer to a valid devdata structure
 *
 * It cleans up all data structures set up by
 * hfi1_alloc_devdata().
 */
static void hfi1_clean_devdata(struct hfi1_devdata *dd)
{
	struct hfi1_asic_data *ad;
	unsigned long flags;

	spin_lock_irqsave(&hfi1_devs_lock, flags);
	if (!list_empty(&dd->list)) {
		idr_remove(&hfi1_unit_table, dd->unit);
		list_del_init(&dd->list);
	}
	ad = release_asic_data(dd);
	spin_unlock_irqrestore(&hfi1_devs_lock, flags);

	finalize_asic_data(dd, ad);
	free_platform_config(dd);
	rcu_barrier(); /* wait for rcu callbacks to complete */
	free_percpu(dd->int_counter);
	free_percpu(dd->rcv_limit);
	free_percpu(dd->send_schedule);
	free_percpu(dd->tx_opstats);
	dd->int_counter   = NULL;
	dd->rcv_limit     = NULL;
	dd->send_schedule = NULL;
	dd->tx_opstats    = NULL;
	kfree(dd->comp_vect);
	dd->comp_vect = NULL;
	sdma_clean(dd, dd->num_sdma);
	rvt_dealloc_device(&dd->verbs_dev.rdi);
}
static void __hfi1_free_devdata(struct kobject *kobj)
{
	struct hfi1_devdata *dd =
		container_of(kobj, struct hfi1_devdata, kobj);

	hfi1_clean_devdata(dd);
}

static struct kobj_type hfi1_devdata_type = {
	.release = __hfi1_free_devdata,
};

void hfi1_free_devdata(struct hfi1_devdata *dd)
{
	kobject_put(&dd->kobj);
}
/**
 * hfi1_alloc_devdata - Allocate our primary per-unit data structure.
 * @pdev: Valid PCI device
 * @extra: How many bytes to alloc past the default
 *
 * Must be done via verbs allocator, because the verbs cleanup process
 * both does cleanup and free of the data structure.
 * "extra" is for chip-specific data.
 *
 * Use the idr mechanism to get a unit number for this unit.
 */
static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
					       size_t extra)
{
	unsigned long flags;
	struct hfi1_devdata *dd;
	int ret, nports;

	/* extra is * number of ports */
	nports = extra / sizeof(struct hfi1_pportdata);

	dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra,
						     nports);
	if (!dd)
		return ERR_PTR(-ENOMEM);
	dd->num_pports = nports;
	dd->pport = (struct hfi1_pportdata *)(dd + 1);
	dd->pcidev = pdev;
	pci_set_drvdata(pdev, dd);

	INIT_LIST_HEAD(&dd->list);
	idr_preload(GFP_KERNEL);
	spin_lock_irqsave(&hfi1_devs_lock, flags);

	ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
	if (ret >= 0) {
		dd->unit = ret;
		list_add(&dd->list, &hfi1_dev_list);
	}
	dd->node = NUMA_NO_NODE;

	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
	idr_preload_end();

	if (ret < 0) {
		dev_err(&pdev->dev,
			"Could not allocate unit ID: error %d\n", -ret);
		goto bail;
	}
	rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit);

	/*
	 * Initialize all locks for the device. This needs to be as early as
	 * possible so locks are usable.
	 */
	spin_lock_init(&dd->sc_lock);
	spin_lock_init(&dd->sendctrl_lock);
	spin_lock_init(&dd->rcvctrl_lock);
	spin_lock_init(&dd->uctxt_lock);
	spin_lock_init(&dd->hfi1_diag_trans_lock);
	spin_lock_init(&dd->sc_init_lock);
	spin_lock_init(&dd->dc8051_memlock);
	seqlock_init(&dd->sc2vl_lock);
	spin_lock_init(&dd->sde_map_lock);
	spin_lock_init(&dd->pio_map_lock);
	mutex_init(&dd->dc8051_lock);
	init_waitqueue_head(&dd->event_queue);
	spin_lock_init(&dd->irq_src_lock);

	dd->int_counter = alloc_percpu(u64);
	if (!dd->int_counter) {
		ret = -ENOMEM;
		goto bail;
	}

	dd->rcv_limit = alloc_percpu(u64);
	if (!dd->rcv_limit) {
		ret = -ENOMEM;
		goto bail;
	}

	dd->send_schedule = alloc_percpu(u64);
	if (!dd->send_schedule) {
		ret = -ENOMEM;
		goto bail;
	}

	dd->tx_opstats = alloc_percpu(struct hfi1_opcode_stats_perctx);
	if (!dd->tx_opstats) {
		ret = -ENOMEM;
		goto bail;
	}

	dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL);
	if (!dd->comp_vect) {
		ret = -ENOMEM;
		goto bail;
	}

	kobject_init(&dd->kobj, &hfi1_devdata_type);
	return dd;

bail:
	hfi1_clean_devdata(dd);
	return ERR_PTR(ret);
}
/*
 * Called from freeze mode handlers, and from PCI error
 * reporting code.  Should be paranoid about state of
 * system and data structures.
 */
void hfi1_disable_after_error(struct hfi1_devdata *dd)
{
	if (dd->flags & HFI1_INITTED) {
		u32 pidx;

		dd->flags &= ~HFI1_INITTED;
		if (dd->pport)
			for (pidx = 0; pidx < dd->num_pports; ++pidx) {
				struct hfi1_pportdata *ppd;

				ppd = dd->pport + pidx;
				if (dd->flags & HFI1_PRESENT)
					set_link_state(ppd, HLS_DN_DISABLE);

				if (ppd->statusp)
					*ppd->statusp &= ~HFI1_STATUS_IB_READY;
			}
	}

	/*
	 * Mark as having had an error for driver, and also
	 * for /sys and status word mapped to user programs.
	 * This marks unit as not usable, until reset.
	 */
	if (dd->status)
		dd->status->dev |= HFI1_STATUS_HWERROR;
}
static void remove_one(struct pci_dev *);
static int init_one(struct pci_dev *, const struct pci_device_id *);
static void shutdown_one(struct pci_dev *);

#define DRIVER_LOAD_MSG		"Intel " DRIVER_NAME " loaded: "
#define PFX DRIVER_NAME ": "

const struct pci_device_id hfi1_pci_tbl[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) },
	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) },
	{ 0, }
};

MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl);

static struct pci_driver hfi1_pci_driver = {
	.name = DRIVER_NAME,
	.probe = init_one,
	.remove = remove_one,
	.shutdown = shutdown_one,
	.id_table = hfi1_pci_tbl,
	.err_handler = &hfi1_pci_err_handler,
};
static void __init compute_krcvqs(void)
{
	int i;

	for (i = 0; i < krcvqsset; i++)
		n_krcvqs += krcvqs[i];
}
/*
 * Do all the generic driver unit- and chip-independent memory
 * allocation and initialization.
 */
static int __init hfi1_mod_init(void)
{
	int ret;

	ret = dev_init();
	if (ret)
		goto bail;

	ret = node_affinity_init();
	if (ret)
		goto bail;

	/* validate max MTU before any devices start */
	if (!valid_opa_max_mtu(hfi1_max_mtu)) {
		pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n",
		       hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU);
		hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
	}
	/* valid CUs run from 1-128 in powers of 2 */
	if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu))
		hfi1_cu = 1;
	/* valid credit return threshold is 0-100, variable is unsigned */
	if (user_credit_return_threshold > 100)
		user_credit_return_threshold = 100;

	compute_krcvqs();
	/*
	 * sanitize receive interrupt count, time must wait until after
	 * the hardware type is known
	 */
	if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK)
		rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK;
	/* reject invalid combinations */
	if (rcv_intr_count == 0 && rcv_intr_timeout == 0) {
		pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n");
		rcv_intr_count = 1;
	}
	if (rcv_intr_count > 1 && rcv_intr_timeout == 0) {
		/*
		 * Avoid indefinite packet delivery by requiring a timeout
		 * if count is > 1.
		 */
		pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n");
		rcv_intr_timeout = 1;
	}
	if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) {
		/*
		 * The dynamic algorithm expects a non-zero timeout
		 * and a count > 1.
		 */
		pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n");
		rcv_intr_dynamic = 0;
	}

	/* sanitize link CRC options */
	link_crc_mask &= SUPPORTED_CRCS;

	/*
	 * These must be called before the driver is registered with
	 * the PCI subsystem.
	 */
	idr_init(&hfi1_unit_table);

	hfi1_dbg_init();
	ret = pci_register_driver(&hfi1_pci_driver);
	if (ret < 0) {
		pr_err("Unable to register driver: error %d\n", -ret);
		goto bail_dev;
	}
	goto bail; /* all OK */

bail_dev:
	hfi1_dbg_exit();
	idr_destroy(&hfi1_unit_table);
	dev_cleanup();
bail:
	return ret;
}

module_init(hfi1_mod_init);
);
1526 * Do the non-unit driver cleanup, memory free, etc. at unload.
1528 static void __exit
hfi1_mod_cleanup(void)
1530 pci_unregister_driver(&hfi1_pci_driver
);
1531 node_affinity_destroy_all();
1534 idr_destroy(&hfi1_unit_table
);
1535 dispose_firmware(); /* asymmetric with obtain_firmware() */
1539 module_exit(hfi1_mod_cleanup
);
/* this can only be called after a successful initialization */
static void cleanup_device_data(struct hfi1_devdata *dd)
{
	int ctxt;
	int pidx;

	/* users can't do anything more with chip */
	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		struct hfi1_pportdata *ppd = &dd->pport[pidx];
		struct cc_state *cc_state;
		int i;

		if (ppd->statusp)
			*ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT;

		for (i = 0; i < OPA_MAX_SLS; i++)
			hrtimer_cancel(&ppd->cca_timer[i].hrtimer);

		spin_lock(&ppd->cc_state_lock);
		cc_state = get_cc_state_protected(ppd);
		RCU_INIT_POINTER(ppd->cc_state, NULL);
		spin_unlock(&ppd->cc_state_lock);

		if (cc_state)
			kfree_rcu(cc_state, rcu);
	}

	free_credit_return(dd);

	if (dd->rcvhdrtail_dummy_kvaddr) {
		dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
				  (void *)dd->rcvhdrtail_dummy_kvaddr,
				  dd->rcvhdrtail_dummy_dma);
		dd->rcvhdrtail_dummy_kvaddr = NULL;
	}

	/*
	 * Free any resources still in use (usually just kernel contexts)
	 * at unload; we do for ctxtcnt, because that's what we allocate.
	 */
	for (ctxt = 0; dd->rcd && ctxt < dd->num_rcv_contexts; ctxt++) {
		struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];

		if (rcd) {
			hfi1_clear_tids(rcd);
			hfi1_free_ctxt(rcd);
		}
	}

	kfree(dd->rcd);
	dd->rcd = NULL;

	free_pio_map(dd);
	/* must follow rcv context free - need to remove rcv's hooks */
	for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++)
		sc_free(dd->send_contexts[ctxt].sc);
	dd->num_send_contexts = 0;
	kfree(dd->send_contexts);
	dd->send_contexts = NULL;
	kfree(dd->hw_to_sw);
	dd->hw_to_sw = NULL;
	kfree(dd->boardname);
	vfree(dd->events);
	vfree(dd->status);
}
/*
 * Clean up on unit shutdown, or error during unit load after
 * successful initialization.
 */
static void postinit_cleanup(struct hfi1_devdata *dd)
{
	hfi1_start_cleanup(dd);
	hfi1_comp_vectors_clean_up(dd);
	hfi1_dev_affinity_clean_up(dd);

	hfi1_pcie_ddcleanup(dd);
	hfi1_pcie_cleanup(dd->pcidev);

	cleanup_device_data(dd);

	hfi1_free_devdata(dd);
}
static int init_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt)
{
	if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
		dd_dev_err(dd, "Receive header queue count too small\n");
		return -EINVAL;
	}

	if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
		dd_dev_err(dd,
			   "Receive header queue count cannot be greater than %u\n",
			   HFI1_MAX_HDRQ_EGRBUF_CNT);
		return -EINVAL;
	}

	if (thecnt % HDRQ_INCREMENT) {
		dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n",
			   thecnt, HDRQ_INCREMENT);
		return -EINVAL;
	}

	return 0;
}
static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
{
	int ret = 0, j, pidx, initfail;
	struct hfi1_devdata *dd;
	struct hfi1_pportdata *ppd;

	/* First, lock the non-writable module parameters */
	HFI1_CAP_LOCK();

	/* Validate dev ids */
	if (!(ent->device == PCI_DEVICE_ID_INTEL0 ||
	      ent->device == PCI_DEVICE_ID_INTEL1)) {
		dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n",
			ent->device);
		ret = -ENODEV;
		goto bail;
	}

	/* Allocate the dd so we can get to work */
	dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
				sizeof(struct hfi1_pportdata));
	if (IS_ERR(dd)) {
		ret = PTR_ERR(dd);
		goto bail;
	}

	/* Validate some global module parameters */
	ret = init_validate_rcvhdrcnt(dd, rcvhdrcnt);
	if (ret)
		goto bail;

	/* use the encoding function as a sanitization check */
	if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
		dd_dev_err(dd, "Invalid HdrQ Entry size %u\n",
			   hfi1_hdrq_entsize);
		ret = -EINVAL;
		goto bail;
	}

	/* The receive eager buffer size must be set before the receive
	 * contexts are created.
	 *
	 * Set the eager buffer size.  Validate that it falls in a range
	 * allowed by the hardware - all powers of 2 between the min and
	 * max.  The maximum valid MTU is within the eager buffer range
	 * so we do not need to cap the max_mtu by an eager buffer size
	 * setting.
	 */
	if (eager_buffer_size) {
		if (!is_power_of_2(eager_buffer_size))
			eager_buffer_size =
				roundup_pow_of_two(eager_buffer_size);
		eager_buffer_size =
			clamp_val(eager_buffer_size,
				  MIN_EAGER_BUFFER * 8,
				  MAX_EAGER_BUFFER_TOTAL);
		dd_dev_info(dd, "Eager buffer size %u\n",
			    eager_buffer_size);
	} else {
		dd_dev_err(dd, "Invalid Eager buffer size of 0\n");
		ret = -EINVAL;
		goto bail;
	}

	/* restrict value of hfi1_rcvarr_split */
	hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);

	ret = hfi1_pcie_init(dd);
	if (ret)
		goto bail;

	/*
	 * Do device-specific initialization, function table setup, dd
	 * allocation, etc.
	 */
	ret = hfi1_init_dd(dd);
	if (ret)
		goto clean_bail; /* error already printed */

	ret = create_workqueues(dd);
	if (ret)
		goto clean_bail;

	/* do the generic initialization */
	initfail = hfi1_init(dd, 0);

	/* setup vnic */
	hfi1_vnic_setup(dd);

	ret = hfi1_register_ib_device(dd);

	/*
	 * Now ready for use.  this should be cleared whenever we
	 * detect a reset, or initiate one.  If earlier failure,
	 * we still create devices, so diags, etc. can be used
	 * to determine cause of problem.
	 */
	if (!initfail && !ret) {
		dd->flags |= HFI1_INITTED;
		/* create debugfs files after init and ib register */
		hfi1_dbg_ibdev_init(&dd->verbs_dev);
	}

	j = hfi1_device_create(dd);
	if (j)
		dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);

	if (initfail || ret) {
		msix_clean_up_interrupts(dd);
		stop_timers(dd);
		flush_workqueue(ib_wq);
		for (pidx = 0; pidx < dd->num_pports; ++pidx) {
			hfi1_quiet_serdes(dd->pport + pidx);
			ppd = dd->pport + pidx;
			if (ppd->hfi1_wq) {
				destroy_workqueue(ppd->hfi1_wq);
				ppd->hfi1_wq = NULL;
			}
			if (ppd->link_wq) {
				destroy_workqueue(ppd->link_wq);
				ppd->link_wq = NULL;
			}
		}
		if (!j)
			hfi1_device_remove(dd);
		if (!ret)
			hfi1_unregister_ib_device(dd);
		hfi1_vnic_cleanup(dd);
		postinit_cleanup(dd);
		if (j)
			ret = j;
		goto bail; /* everything already cleaned */
	}

	sdma_start(dd);

	return 0;

clean_bail:
	hfi1_pcie_cleanup(pdev);
bail:
	return ret;
}
static void wait_for_clients(struct hfi1_devdata *dd)
{
	/*
	 * Remove the device init value and complete the device if there is
	 * no clients or wait for active clients to finish.
	 */
	if (atomic_dec_and_test(&dd->user_refcount))
		complete(&dd->user_comp);

	wait_for_completion(&dd->user_comp);
}
static void remove_one(struct pci_dev *pdev)
{
	struct hfi1_devdata *dd = pci_get_drvdata(pdev);

	/* close debugfs files before ib unregister */
	hfi1_dbg_ibdev_exit(&dd->verbs_dev);

	/* remove the /dev hfi1 interface */
	hfi1_device_remove(dd);

	/* wait for existing user space clients to finish */
	wait_for_clients(dd);

	/* unregister from IB core */
	hfi1_unregister_ib_device(dd);

	/* cleanup vnic */
	hfi1_vnic_cleanup(dd);

	/*
	 * Disable the IB link, disable interrupts on the device,
	 * clear dma engines, etc.
	 */
	shutdown_device(dd);

	stop_timers(dd);

	/* wait until all of our (qsfp) queue_work() calls complete */
	flush_workqueue(ib_wq);

	postinit_cleanup(dd);
}
static void shutdown_one(struct pci_dev *pdev)
{
	struct hfi1_devdata *dd = pci_get_drvdata(pdev);

	shutdown_device(dd);
}
/**
 * hfi1_create_rcvhdrq - create a receive header queue
 * @dd: the hfi1_ib device
 * @rcd: the context data
 *
 * This must be contiguous memory (from an i/o perspective), and must be
 * DMA'able (which means for some systems, it will go through an IOMMU,
 * or be forced into a low address range).
 */
int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
{
	unsigned amt;
	u64 reg;

	if (!rcd->rcvhdrq) {
		gfp_t gfp_flags;

		amt = rcvhdrq_size(rcd);

		if (rcd->ctxt < dd->first_dyn_alloc_ctxt || rcd->is_vnic)
			gfp_flags = GFP_KERNEL;
		else
			gfp_flags = GFP_USER;
		rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt,
						  &rcd->rcvhdrq_dma,
						  gfp_flags | __GFP_COMP);

		if (!rcd->rcvhdrq) {
			dd_dev_err(dd,
				   "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
				   amt, rcd->ctxt);
			goto bail;
		}

		if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
		    HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) {
			rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
								    PAGE_SIZE,
								    &rcd->rcvhdrqtailaddr_dma,
								    gfp_flags);
			if (!rcd->rcvhdrtail_kvaddr)
				goto bail_free;
		}
	}
	/*
	 * These values are per-context:
	 *	RcvHdrCnt
	 *	RcvHdrEntSize
	 *	RcvHdrSize
	 */
	reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
			& RCV_HDR_CNT_CNT_MASK)
		<< RCV_HDR_CNT_CNT_SHIFT;
	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
	reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
			& RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
		<< RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
	reg = ((u64)DEFAULT_RCVHDRSIZE & RCV_HDR_SIZE_HDR_SIZE_MASK)
		<< RCV_HDR_SIZE_HDR_SIZE_SHIFT;
	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);

	/*
	 * Program dummy tail address for every receive context
	 * before enabling any receive context
	 */
	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR,
			dd->rcvhdrtail_dummy_dma);

	return 0;

bail_free:
	dd_dev_err(dd,
		   "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
		   rcd->ctxt);
	dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
			  rcd->rcvhdrq_dma);
	rcd->rcvhdrq = NULL;
bail:
	return -ENOMEM;
}
/**
 * allocate eager buffers, both kernel and user contexts.
 * @rcd: the context we are setting up.
 *
 * Allocate the eager TID buffers and program them into hip.
 * They are no longer completely contiguous, we do multiple allocation
 * calls.  Otherwise we get the OOM code involved, by asking for too
 * much per call, with disastrous results on some kernels.
 */
int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
{
	struct hfi1_devdata *dd = rcd->dd;
	u32 max_entries, egrtop, alloced_bytes = 0;
	gfp_t gfp_flags;
	u16 order, idx = 0;
	int ret = 0;
	u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu);

	/*
	 * GFP_USER, but without GFP_FS, so buffer cache can be
	 * coalesced (we hope); otherwise, even at order 4,
	 * heavy filesystem activity makes these fail, and we can
	 * use compound pages.
	 */
	gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP;

	/*
	 * The minimum size of the eager buffers is a groups of MTU-sized
	 * buffers.
	 * The global eager_buffer_size parameter is checked against the
	 * theoretical lower limit of the value. Here, we check against the
	 * MTU.
	 */
	if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size))
		rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size;
	/*
	 * If using one-pkt-per-egr-buffer, lower the eager buffer
	 * size to the max MTU (page-aligned).
	 */
	if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR))
		rcd->egrbufs.rcvtid_size = round_mtu;

	/*
	 * Eager buffers sizes of 1MB or less require smaller TID sizes
	 * to satisfy the "multiple of 8 RcvArray entries" requirement.
	 */
	if (rcd->egrbufs.size <= (1 << 20))
		rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu,
			rounddown_pow_of_two(rcd->egrbufs.size / 8));

	while (alloced_bytes < rcd->egrbufs.size &&
	       rcd->egrbufs.alloced < rcd->egrbufs.count) {
		rcd->egrbufs.buffers[idx].addr =
			dma_alloc_coherent(&dd->pcidev->dev,
					   rcd->egrbufs.rcvtid_size,
					   &rcd->egrbufs.buffers[idx].dma,
					   gfp_flags);
		if (rcd->egrbufs.buffers[idx].addr) {
			rcd->egrbufs.buffers[idx].len =
				rcd->egrbufs.rcvtid_size;
			rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr =
				rcd->egrbufs.buffers[idx].addr;
			rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].dma =
				rcd->egrbufs.buffers[idx].dma;
			rcd->egrbufs.alloced++;
			alloced_bytes += rcd->egrbufs.rcvtid_size;
			idx++;
		} else {
			u32 new_size, i, j;
			u64 offset = 0;

			/*
			 * Fail the eager buffer allocation if:
			 *   - we are already using the lowest acceptable size
			 *   - we are using one-pkt-per-egr-buffer (this implies
			 *     that we are accepting only one size)
			 */
			if (rcd->egrbufs.rcvtid_size == round_mtu ||
			    !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) {
				dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n",
					   rcd->ctxt);
				ret = -ENOMEM;
				goto bail_rcvegrbuf_phys;
			}

			new_size = rcd->egrbufs.rcvtid_size / 2;

			/*
			 * If the first attempt to allocate memory failed, don't
			 * fail everything but continue with the next lower
			 * size.
			 */
			if (idx == 0) {
				rcd->egrbufs.rcvtid_size = new_size;
				continue;
			}

			/*
			 * Re-partition already allocated buffers to a smaller
			 * size.
			 */
			rcd->egrbufs.alloced = 0;
			for (i = 0, j = 0, offset = 0; j < idx; i++) {
				if (i >= rcd->egrbufs.count)
					break;
				rcd->egrbufs.rcvtids[i].dma =
					rcd->egrbufs.buffers[j].dma + offset;
				rcd->egrbufs.rcvtids[i].addr =
					rcd->egrbufs.buffers[j].addr + offset;
				rcd->egrbufs.alloced++;
				if ((rcd->egrbufs.buffers[j].dma + offset +
				     new_size) ==
				    (rcd->egrbufs.buffers[j].dma +
				     rcd->egrbufs.buffers[j].len)) {
					j++;
					offset = 0;
				} else {
					offset += new_size;
				}
			}
			rcd->egrbufs.rcvtid_size = new_size;
		}
	}
	rcd->egrbufs.numbufs = idx;
	rcd->egrbufs.size = alloced_bytes;

	hfi1_cdbg(PROC,
		  "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
		  rcd->ctxt, rcd->egrbufs.alloced,
		  rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);

	/*
	 * Set the contexts rcv array head update threshold to the closest
	 * power of 2 (so we can use a mask instead of modulo) below half
	 * the allocated entries.
	 */
	rcd->egrbufs.threshold =
		rounddown_pow_of_two(rcd->egrbufs.alloced / 2);
	/*
	 * Compute the expected RcvArray entry base. This is done after
	 * allocating the eager buffers in order to maximize the
	 * expected RcvArray entries for the context.
	 */
	max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size;
	egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size);
	rcd->expected_count = max_entries - egrtop;
	if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2)
		rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2;

	rcd->expected_base = rcd->eager_base + egrtop;
	hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n",
		  rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count,
		  rcd->eager_base, rcd->expected_base);

	if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) {
		hfi1_cdbg(PROC,
			  "ctxt%u: current Eager buffer size is invalid %u\n",
			  rcd->ctxt, rcd->egrbufs.rcvtid_size);
		ret = -EINVAL;
		goto bail_rcvegrbuf_phys;
	}

	for (idx = 0; idx < rcd->egrbufs.alloced; idx++) {
		hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER,
			     rcd->egrbufs.rcvtids[idx].dma, order);
		cond_resched();
	}

	return 0;

bail_rcvegrbuf_phys:
	for (idx = 0; idx < rcd->egrbufs.alloced &&
	     rcd->egrbufs.buffers[idx].addr;
	     idx++) {
		dma_free_coherent(&dd->pcidev->dev,
				  rcd->egrbufs.buffers[idx].len,
				  rcd->egrbufs.buffers[idx].addr,
				  rcd->egrbufs.buffers[idx].dma);
		rcd->egrbufs.buffers[idx].addr = NULL;
		rcd->egrbufs.buffers[idx].dma = 0;
		rcd->egrbufs.buffers[idx].len = 0;
	}

	return ret;
}