IB/hfi1: Do not free hfi1 cdev parent structure early

diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c
index 16eb653903e0b873909f3cf3175cf597fae88ea8..d8cc329901d060f96887d97fb9e7ef79a10a5530 100644
--- a/drivers/staging/rdma/hfi1/chip.c
+++ b/drivers/staging/rdma/hfi1/chip.c
@@ -123,6 +123,8 @@ struct flag_table {
 
 #define MIN_KERNEL_KCTXTS         2
 #define FIRST_KERNEL_KCTXT        1
+/* sizes for both the QP and RSM map tables */
+#define NUM_MAP_ENTRIES                256
 #define NUM_MAP_REGS             32
 
 /* Bit offset into the GUID which carries HFI id information */
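Reviewer note on the sizing: the RSM map table holds NUM_MAP_ENTRIES (256) 8-bit entries packed eight per 64-bit CSR, which is why NUM_MAP_REGS is 32. A minimal sketch of the index math this patch uses whenever it touches an entry (the same regidx/regoff arithmetic recurs in init_qos() and init_user_fecn_handling() below):

/* Sketch only: locate map entry idx within the packed RSM map CSRs. */
static inline void rsm_entry_pos(unsigned int idx,
				 unsigned int *regidx, unsigned int *regoff)
{
	*regidx = idx / 8;		/* which 64-bit map register */
	*regoff = (idx % 8) * 8;	/* bit offset within that register */
}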
@@ -1029,9 +1031,12 @@ static int thermal_init(struct hfi1_devdata *dd);
 static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
                                  int msecs);
 static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
 static void handle_temp_err(struct hfi1_devdata *);
 static void dc_shutdown(struct hfi1_devdata *);
 static void dc_start(struct hfi1_devdata *);
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+                          unsigned int *np);
 
 /*
  * Error interrupt table entry.  This is used as input to the interrupt
@@ -5661,7 +5666,7 @@ static int sc_to_vl(struct hfi1_devdata *dd, int sw_index)
        sci = &dd->send_contexts[sw_index];
 
        /* there is no information for user (PSM) and ack contexts */
-       if (sci->type != SC_KERNEL)
+       if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15))
                return -1;
 
        sc = sci->sc;
@@ -6100,7 +6105,7 @@ int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
        }
 
        /* this access is valid only when the link is up */
-       if ((ppd->host_link_state & HLS_UP) == 0) {
+       if (ppd->host_link_state & HLS_DOWN) {
                dd_dev_info(dd, "%s: link state %s not up\n",
                            __func__, link_state_name(ppd->host_link_state));
                ret = -EBUSY;
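The HLS_UP tests converted here and in later hunks read as exact complements, so presumably the companion header change defines the down mask along these lines (an assumption; the hfi1.h hunk is not shown in this diff):

/* Assumed companion definition in hfi1.h (not part of this hunk). */
#define HLS_DOWN ~(HLS_UP)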
@@ -6199,18 +6204,13 @@ static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
 
 /*
  * Handle host requests from the 8051.
- *
- * This is a work-queue function outside of the interrupt.
  */
-void handle_8051_request(struct work_struct *work)
+static void handle_8051_request(struct hfi1_pportdata *ppd)
 {
-       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                       dc_host_req_work);
        struct hfi1_devdata *dd = ppd->dd;
        u64 reg;
        u16 data = 0;
-       u8 type, i, lanes, *cache = ppd->qsfp_info.cache;
-       u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS];
+       u8 type;
 
        reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
        if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
@@ -6231,46 +6231,11 @@ void handle_8051_request(struct work_struct *work)
        case HREQ_READ_CONFIG:
        case HREQ_SET_TX_EQ_ABS:
        case HREQ_SET_TX_EQ_REL:
+       case HREQ_ENABLE:
                dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
                            type);
                hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
                break;
-
-       case HREQ_ENABLE:
-               lanes = data & 0xF;
-               for (i = 0; lanes; lanes >>= 1, i++) {
-                       if (!(lanes & 1))
-                               continue;
-                       if (data & 0x200) {
-                               /* enable TX CDR */
-                               if (cache[QSFP_MOD_PWR_OFFS] & 0x8 &&
-                                   cache[QSFP_CDR_INFO_OFFS] & 0x80)
-                                       cdr_ctrl_byte |= (1 << (i + 4));
-                       } else {
-                               /* disable TX CDR */
-                               if (cache[QSFP_MOD_PWR_OFFS] & 0x8 &&
-                                   cache[QSFP_CDR_INFO_OFFS] & 0x80)
-                                       cdr_ctrl_byte &= ~(1 << (i + 4));
-                       }
-
-                       if (data & 0x800) {
-                               /* enable RX CDR */
-                               if (cache[QSFP_MOD_PWR_OFFS] & 0x4 &&
-                                   cache[QSFP_CDR_INFO_OFFS] & 0x40)
-                                       cdr_ctrl_byte |= (1 << i);
-                       } else {
-                               /* disable RX CDR */
-                               if (cache[QSFP_MOD_PWR_OFFS] & 0x4 &&
-                                   cache[QSFP_CDR_INFO_OFFS] & 0x40)
-                                       cdr_ctrl_byte &= ~(1 << i);
-                       }
-               }
-               one_qsfp_write(ppd, dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS,
-                              &cdr_ctrl_byte, 1);
-               hreq_response(dd, HREQ_SUCCESS, data);
-               refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-               break;
-
        case HREQ_CONFIG_DONE:
                hreq_response(dd, HREQ_SUCCESS, 0);
                break;
@@ -6278,7 +6243,6 @@ void handle_8051_request(struct work_struct *work)
        case HREQ_INTERFACE_TEST:
                hreq_response(dd, HREQ_SUCCESS, data);
                break;
-
        default:
                dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
                hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
@@ -6849,6 +6813,75 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd)
        ppd->neighbor_fm_security = 0;
 }
 
+static const char * const link_down_reason_strs[] = {
+       [OPA_LINKDOWN_REASON_NONE] = "None",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Receive error 0",
+       [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length",
+       [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long",
+       [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short",
+       [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID",
+       [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID",
+       [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2",
+       [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8",
+       [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10",
+       [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error",
+       [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15",
+       [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15",
+       [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance",
+       [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance",
+       [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance",
+       [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack",
+       [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker",
+       [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt",
+       [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit",
+       [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29",
+       [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30",
+       [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] =
+                                       "Excessive buffer overrun",
+       [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown",
+       [OPA_LINKDOWN_REASON_REBOOT] = "Reboot",
+       [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown",
+       [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce",
+       [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy",
+       [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy",
+       [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected",
+       [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] =
+                                       "Local media not installed",
+       [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed",
+       [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config",
+       [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] =
+                                       "End to end not installed",
+       [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy",
+       [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy",
+       [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy",
+       [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management",
+       [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled",
+       [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient"
+};
+
+/* return the neighbor link down reason string */
+static const char *link_down_reason_str(u8 reason)
+{
+       const char *str = NULL;
+
+       if (reason < ARRAY_SIZE(link_down_reason_strs))
+               str = link_down_reason_strs[reason];
+       if (!str)
+               str = "(invalid)";
+
+       return str;
+}
+
 /*
  * Handle a link down interrupt from the 8051.
  *
@@ -6857,8 +6890,11 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd)
 void handle_link_down(struct work_struct *work)
 {
        u8 lcl_reason, neigh_reason = 0;
+       u8 link_down_reason;
        struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
-                                                               link_down_work);
+                                                 link_down_work);
+       int was_up;
+       static const char ldr_str[] = "Link down reason: ";
 
        if ((ppd->host_link_state &
             (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) &&
@@ -6867,20 +6903,63 @@ void handle_link_down(struct work_struct *work)
                        HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED);
 
        /* Go offline first, then deal with reading/writing through 8051 */
+       was_up = !!(ppd->host_link_state & HLS_UP);
        set_link_state(ppd, HLS_DN_OFFLINE);
 
-       lcl_reason = 0;
-       read_planned_down_reason_code(ppd->dd, &neigh_reason);
+       if (was_up) {
+               lcl_reason = 0;
+               /* link down reason is only valid if the link was up */
+               read_link_down_reason(ppd->dd, &link_down_reason);
+               switch (link_down_reason) {
+               case LDR_LINK_TRANSFER_ACTIVE_LOW:
+                       /* the link went down, no idle message reason */
+                       dd_dev_info(ppd->dd, "%sUnexpected link down\n",
+                                   ldr_str);
+                       break;
+               case LDR_RECEIVED_LINKDOWN_IDLE_MSG:
+                       /*
+                        * The neighbor reason is only valid if an idle message
+                        * was received for it.
+                        */
+                       read_planned_down_reason_code(ppd->dd, &neigh_reason);
+                       dd_dev_info(ppd->dd,
+                                   "%sNeighbor link down message %d, %s\n",
+                                   ldr_str, neigh_reason,
+                                   link_down_reason_str(neigh_reason));
+                       break;
+               case LDR_RECEIVED_HOST_OFFLINE_REQ:
+                       dd_dev_info(ppd->dd,
+                                   "%sHost requested link to go offline\n",
+                                   ldr_str);
+                       break;
+               default:
+                       dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n",
+                                   ldr_str, link_down_reason);
+                       break;
+               }
 
-       /*
-        * If no reason, assume peer-initiated but missed
-        * LinkGoingDown idle flits.
-        */
-       if (neigh_reason == 0)
-               lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+               /*
+                * If no reason, assume peer-initiated but missed
+                * LinkGoingDown idle flits.
+                */
+               if (neigh_reason == 0)
+                       lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+       } else {
+               /* went down while polling or going up */
+               lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT;
+       }
 
        set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
 
+       /* inform the SMA when the link transitions from up to down */
+       if (was_up && ppd->local_link_down_reason.sma == 0 &&
+           ppd->neigh_link_down_reason.sma == 0) {
+               ppd->local_link_down_reason.sma =
+                                       ppd->local_link_down_reason.latest;
+               ppd->neigh_link_down_reason.sma =
+                                       ppd->neigh_link_down_reason.latest;
+       }
+
        reset_neighbor_info(ppd);
 
        /* disable the port */
@@ -6890,7 +6969,7 @@ void handle_link_down(struct work_struct *work)
         * If there is no cable attached, turn the DC off. Otherwise,
         * start the link bring up.
         */
-       if (!qsfp_mod_present(ppd)) {
+       if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) {
                dc_shutdown(ppd->dd);
        } else {
                tune_serdes(ppd);
@@ -7350,7 +7429,7 @@ void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths)
 retry:
        mutex_lock(&ppd->hls_lock);
        /* only apply if the link is up */
-       if (!(ppd->host_link_state & HLS_UP)) {
+       if (ppd->host_link_state & HLS_DOWN) {
                /* still going up..wait and retry */
                if (ppd->host_link_state & HLS_GOING_UP) {
                        if (++tries < 1000) {
@@ -7373,7 +7452,11 @@ retry:
                ppd->link_width_downgrade_rx_active = rx;
        }
 
-       if (lwde == 0) {
+       if (ppd->link_width_downgrade_tx_active == 0 ||
+           ppd->link_width_downgrade_rx_active == 0) {
+               /* the 8051 reported a dead link as a downgrade */
+               dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n");
+       } else if (lwde == 0) {
                /* downgrade is disabled */
 
                /* bounce if not at starting active width */
@@ -7534,7 +7617,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
                        host_msg &= ~(u64)LINKUP_ACHIEVED;
                }
                if (host_msg & EXT_DEVICE_CFG_REQ) {
-                       queue_work(ppd->hfi1_wq, &ppd->dc_host_req_work);
+                       handle_8051_request(ppd);
                        host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
                }
                if (host_msg & VERIFY_CAP_FRAME) {
@@ -8660,6 +8743,14 @@ static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
        *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
 }
 
+static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr)
+{
+       u32 frame;
+
+       read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame);
+       *ldr = (frame & 0xff);
+}
+
 static int read_tx_settings(struct hfi1_devdata *dd,
                            u8 *enable_lane_tx,
                            u8 *tx_polarity_inversion,
@@ -9049,9 +9140,9 @@ set_local_link_attributes_fail:
 }
 
 /*
- * Call this to start the link.  Schedule a retry if the cable is not
- * present or if unable to start polling.  Do not do anything if the
- * link is disabled.  Returns 0 if link is disabled or moved to polling
+ * Call this to start the link.
+ * Do not do anything if the link is disabled.
+ * Returns 0 if link is disabled, moved to polling, or the driver is not ready.
  */
 int start_link(struct hfi1_pportdata *ppd)
 {
@@ -9068,15 +9159,7 @@ int start_link(struct hfi1_pportdata *ppd)
                return 0;
        }
 
-       if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES ||
-           loopback == LOOPBACK_LCB ||
-           ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
-               return set_link_state(ppd, HLS_DN_POLL);
-
-       dd_dev_info(ppd->dd,
-                   "%s: stopping link start because no cable is present\n",
-                   __func__);
-       return -EAGAIN;
+       return set_link_state(ppd, HLS_DN_POLL);
 }
 
 static void wait_for_qsfp_init(struct hfi1_pportdata *ppd)
@@ -9129,9 +9212,6 @@ void reset_qsfp(struct hfi1_pportdata *ppd)
 
        /* Reset the QSFP */
        mask = (u64)QSFP_HFI0_RESET_N;
-       qsfp_mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE);
-       qsfp_mask |= mask;
-       write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE, qsfp_mask);
 
        qsfp_mask = read_csr(dd,
                             dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
@@ -9169,6 +9249,12 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
                dd_dev_info(dd, "%s: QSFP cable temperature too low\n",
                            __func__);
 
+       /*
+        * The remaining alarms/warnings don't matter if the link is down.
+        */
+       if (ppd->host_link_state & HLS_DOWN)
+               return 0;
+
        if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
            (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
                dd_dev_info(dd, "%s: QSFP supply voltage too high\n",
@@ -9247,7 +9333,7 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
        return 0;
 }
 
-/* This routine will only be scheduled if the QSFP module is present */
+/* This routine will only be scheduled if the QSFP module-present signal is asserted */
 void qsfp_event(struct work_struct *work)
 {
        struct qsfp_data *qd;
@@ -9263,9 +9349,8 @@ void qsfp_event(struct work_struct *work)
                return;
 
        /*
-        * Turn DC back on after cables has been
-        * re-inserted. Up until now, the DC has been in
-        * reset to save power.
+        * Turn DC back on after cable has been re-inserted. Up until
+        * now, the DC has been in reset to save power.
         */
        dc_start(dd);
 
@@ -9397,7 +9482,15 @@ int bringup_serdes(struct hfi1_pportdata *ppd)
                        return ret;
        }
 
-       /* tune the SERDES to a ballpark setting for
+       get_port_type(ppd);
+       if (ppd->port_type == PORT_TYPE_QSFP) {
+               set_qsfp_int_n(ppd, 0);
+               wait_for_qsfp_init(ppd);
+               set_qsfp_int_n(ppd, 1);
+       }
+
+       /*
+        * Tune the SerDes to a ballpark setting for
         * optimal signal and bit error rate
         * Needs to be done before starting the link
         */
@@ -9676,6 +9769,7 @@ static void set_send_length(struct hfi1_pportdata *ppd)
                              & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
                SEND_LEN_CHECK1_LEN_VL15_SHIFT;
        int i;
+       u32 thres;
 
        for (i = 0; i < ppd->vls_supported; i++) {
                if (dd->vld[i].mtu > maxvlmtu)
@@ -9694,16 +9788,17 @@ static void set_send_length(struct hfi1_pportdata *ppd)
        /* adjust kernel credit return thresholds based on new MTUs */
        /* all kernel receive contexts have the same hdrqentsize */
        for (i = 0; i < ppd->vls_supported; i++) {
-               sc_set_cr_threshold(dd->vld[i].sc,
-                                   sc_mtu_to_threshold(dd->vld[i].sc,
-                                                       dd->vld[i].mtu,
-                                                       dd->rcd[0]->
-                                                       rcvhdrqentsize));
-       }
-       sc_set_cr_threshold(dd->vld[15].sc,
-                           sc_mtu_to_threshold(dd->vld[15].sc,
-                                               dd->vld[15].mtu,
+               thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
+                           sc_mtu_to_threshold(dd->vld[i].sc,
+                                               dd->vld[i].mtu,
                                                dd->rcd[0]->rcvhdrqentsize));
+               sc_set_cr_threshold(dd->vld[i].sc, thres);
+       }
+       thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50),
+                   sc_mtu_to_threshold(dd->vld[15].sc,
+                                       dd->vld[15].mtu,
+                                       dd->rcd[0]->rcvhdrqentsize));
+       sc_set_cr_threshold(dd->vld[15].sc, thres);
 
        /* Adjust maximum MTU for the port in DC */
        dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
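The reworked threshold computation above caps the MTU-derived credit-return threshold at 50% of the context's credits. For illustration (hypothetical numbers, not from the patch):

/*
 * Illustration with made-up values: a context with 64 credits gives
 * sc_percent_to_threshold(sc, 50) == 32; if a large MTU maps to a
 * threshold of 40, min() clamps the programmed value to 32 so a
 * credit return is requested no later than the half-way point.
 */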
@@ -9989,7 +10084,7 @@ u32 driver_physical_state(struct hfi1_pportdata *ppd)
  */
 u32 driver_logical_state(struct hfi1_pportdata *ppd)
 {
-       if (ppd->host_link_state && !(ppd->host_link_state & HLS_UP))
+       if (ppd->host_link_state && (ppd->host_link_state & HLS_DOWN))
                return IB_PORT_DOWN;
 
        switch (ppd->host_link_state & HLS_UP) {
@@ -10030,7 +10125,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
        struct hfi1_devdata *dd = ppd->dd;
        struct ib_event event = {.device = NULL};
        int ret1, ret = 0;
-       int was_up, is_down;
        int orig_new_state, poll_bounce;
 
        mutex_lock(&ppd->hls_lock);
@@ -10049,8 +10143,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
                    poll_bounce ? "(bounce) " : "",
                    link_state_reason_name(ppd, state));
 
-       was_up = !!(ppd->host_link_state & HLS_UP);
-
        /*
         * If we're going to a (HLS_*) link state that implies the logical
         * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
@@ -10261,17 +10353,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state)
                break;
        }
 
-       is_down = !!(ppd->host_link_state & (HLS_DN_POLL |
-                       HLS_DN_DISABLE | HLS_DN_OFFLINE));
-
-       if (was_up && is_down && ppd->local_link_down_reason.sma == 0 &&
-           ppd->neigh_link_down_reason.sma == 0) {
-               ppd->local_link_down_reason.sma =
-                 ppd->local_link_down_reason.latest;
-               ppd->neigh_link_down_reason.sma =
-                 ppd->neigh_link_down_reason.latest;
-       }
-
        goto done;
 
 unexpected:
@@ -12673,22 +12754,24 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
        int total_contexts;
        int ret;
        unsigned ngroups;
+       int qos_rmt_count;
+       int user_rmt_reduced;
 
        /*
-        * Kernel contexts: (to be fixed later):
-        * - min or 2 or 1 context/numa
+        * Kernel receive contexts:
+        * - min of 2 or 1 context/numa (excluding control context)
         * - Context 0 - control context (VL15/multicast/error)
-        * - Context 1 - default context
+        * - Context 1 - first kernel context
+        * - Context 2 - second kernel context
+        * ...
         */
        if (n_krcvqs)
                /*
-                * Don't count context 0 in n_krcvqs since
-                * is isn't used for normal verbs traffic.
-                *
-                * krcvqs will reflect number of kernel
-                * receive contexts above 0.
+                * n_krcvqs is the sum of module parameter kernel receive
+                * contexts, krcvqs[].  It does not include the control
+                * context, so add that.
                 */
-               num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS - 1;
+               num_kernel_contexts = n_krcvqs + 1;
        else
                num_kernel_contexts = num_online_nodes() + 1;
        num_kernel_contexts =
@@ -12705,12 +12788,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
                num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
        }
        /*
-        * User contexts: (to be fixed later)
-        *      - default to 1 user context per CPU if num_user_contexts is
-        *        negative
+        * User contexts:
+        *      - default to 1 user context per real (non-HT) CPU core if
+        *        num_user_contexts is negative
         */
        if (num_user_contexts < 0)
-               num_user_contexts = num_online_cpus();
+               num_user_contexts =
+                       cpumask_weight(&dd->affinity->real_cpu_mask);
 
        total_contexts = num_kernel_contexts + num_user_contexts;
 
@@ -12727,6 +12811,19 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
                total_contexts = num_kernel_contexts + num_user_contexts;
        }
 
+       /* each user context requires an entry in the RMT */
+       qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
+       if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) {
+               user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+               dd_dev_err(dd,
+                          "RMT size is reducing the number of user receive contexts from %d to %d\n",
+                          (int)num_user_contexts,
+                          user_rmt_reduced);
+               /* recalculate */
+               num_user_contexts = user_rmt_reduced;
+               total_contexts = num_kernel_contexts + num_user_contexts;
+       }
+
        /* the first N are kernel contexts, the rest are user contexts */
        dd->num_rcv_contexts = total_contexts;
        dd->n_krcv_queues = num_kernel_contexts;
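To make the new clamp concrete (hypothetical counts, not from the patch):

/*
 * Worked example: qos_rmt_count = 64 and num_user_contexts = 224
 * gives 64 + 224 > NUM_MAP_ENTRIES (256), so num_user_contexts is
 * clamped to 256 - 64 = 192 and total_contexts is recomputed.
 */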
@@ -12776,12 +12873,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
                dd->num_send_contexts = ret;
                dd_dev_info(
                        dd,
-                       "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n",
+                       "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n",
                        dd->chip_send_contexts,
                        dd->num_send_contexts,
                        dd->sc_sizes[SC_KERNEL].count,
                        dd->sc_sizes[SC_ACK].count,
-                       dd->sc_sizes[SC_USER].count);
+                       dd->sc_sizes[SC_USER].count,
+                       dd->sc_sizes[SC_VL15].count);
                ret = 0;        /* success */
        }
 
@@ -13451,122 +13549,224 @@ static void init_qpmap_table(struct hfi1_devdata *dd,
        int i;
        u64 ctxt = first_ctxt;
 
-       for (i = 0; i < 256;) {
+       for (i = 0; i < 256; i++) {
                reg |= ctxt << (8 * (i % 8));
-               i++;
                ctxt++;
                if (ctxt > last_ctxt)
                        ctxt = first_ctxt;
-               if (i % 8 == 0) {
+               if (i % 8 == 7) {
                        write_csr(dd, regno, reg);
                        reg = 0;
                        regno += 8;
                }
        }
-       if (i % 8)
-               write_csr(dd, regno, reg);
 
        add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
                        | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
 }
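With the loop rewritten, each 64-bit QP map register is flushed exactly when its last byte lane (i % 8 == 7) has been filled; since 256 is a multiple of 8, the old trailing partial-register write was dead code. A self-contained sketch of the byte packing, assuming byte lane 0 maps the lowest QPN:

/* Sketch only: pack eight 8-bit context numbers into one QP map register. */
static u64 pack_qpmap_reg(const u8 ctxts[8])
{
	u64 reg = 0;
	int i;

	for (i = 0; i < 8; i++)
		reg |= (u64)ctxts[i] << (8 * i);
	return reg;
}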
 
-/**
- * init_qos - init RX qos
- * @dd - device data
- * @first_context
- *
- * This routine initializes Rule 0 and the
- * RSM map table to implement qos.
- *
- * If all of the limit tests succeed,
- * qos is applied based on the array
- * interpretation of krcvqs where
- * entry 0 is VL0.
- *
- * The number of vl bits (n) and the number of qpn
- * bits (m) are computed to feed both the RSM map table
- * and the single rule.
- *
+struct rsm_map_table {
+       u64 map[NUM_MAP_REGS];
+       unsigned int used;
+};
+
+struct rsm_rule_data {
+       u8 offset;
+       u8 pkt_type;
+       u32 field1_off;
+       u32 field2_off;
+       u32 index1_off;
+       u32 index1_width;
+       u32 index2_off;
+       u32 index2_width;
+       u32 mask1;
+       u32 value1;
+       u32 mask2;
+       u32 value2;
+};
+
+/*
+ * Return an initialized RMT map table for users to fill in.  OK if it
+ * returns NULL, indicating no table.
+ */
+static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd)
+{
+       struct rsm_map_table *rmt;
+       u8 rxcontext = is_ax(dd) ? 0 : 0xff;  /* 0 is default if a0 ver. */
+
+       rmt = kmalloc(sizeof(*rmt), GFP_KERNEL);
+       if (rmt) {
+               memset(rmt->map, rxcontext, sizeof(rmt->map));
+               rmt->used = 0;
+       }
+
+       return rmt;
+}
+
+/*
+ * Write the final RMT map table to the chip and free the table.  OK if
+ * table is NULL.
+ */
+static void complete_rsm_map_table(struct hfi1_devdata *dd,
+                                  struct rsm_map_table *rmt)
+{
+       int i;
+
+       if (rmt) {
+               /* write table to chip */
+               for (i = 0; i < NUM_MAP_REGS; i++)
+                       write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]);
+
+               /* enable RSM */
+               add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+       }
+}
+
+/*
+ * Add a receive side mapping rule.
  */
-static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
+static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index,
+                        struct rsm_rule_data *rrd)
+{
+       write_csr(dd, RCV_RSM_CFG + (8 * rule_index),
+                 (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT |
+                 1ull << rule_index | /* enable bit */
+                 (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+       write_csr(dd, RCV_RSM_SELECT + (8 * rule_index),
+                 (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
+                 (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
+                 (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
+                 (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
+                 (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
+                 (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+       write_csr(dd, RCV_RSM_MATCH + (8 * rule_index),
+                 (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT |
+                 (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT |
+                 (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT |
+                 (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT);
+}
+
+/* return the number of RSM map table entries that will be used for QOS */
+static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+                          unsigned int *np)
 {
+       int i;
+       unsigned int m, n;
        u8 max_by_vl = 0;
-       unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
-       u64 *rsmmap;
-       u64 reg;
-       u8  rxcontext = is_ax(dd) ? 0 : 0xff;  /* 0 is default if a0 ver. */
 
-       /* validate */
+       /* is QOS active at all? */
        if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
            num_vls == 1 ||
            krcvqsset <= 1)
-               goto bail;
-       for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++)
+               goto no_qos;
+
+       /* determine bits for qpn */
+       for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++)
                if (krcvqs[i] > max_by_vl)
                        max_by_vl = krcvqs[i];
        if (max_by_vl > 32)
-               goto bail;
-       qpns_per_vl = __roundup_pow_of_two(max_by_vl);
-       /* determine bits vl */
-       n = ilog2(num_vls);
-       /* determine bits for qpn */
-       m = ilog2(qpns_per_vl);
+               goto no_qos;
+       m = ilog2(__roundup_pow_of_two(max_by_vl));
+
+       /* determine bits for vl */
+       n = ilog2(__roundup_pow_of_two(num_vls));
+
+       /* reject if too much is used */
        if ((m + n) > 7)
+               goto no_qos;
+
+       if (mp)
+               *mp = m;
+       if (np)
+               *np = n;
+
+       return 1 << (m + n);
+
+no_qos:
+       if (mp)
+               *mp = 0;
+       if (np)
+               *np = 0;
+       return 0;
+}
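A worked example of the sizing (hypothetical module parameters, not from the patch):

/*
 * Hypothetical inputs: num_vls = 4, krcvqs = {2, 3, 2, 2}.
 *   max_by_vl = 3
 *   m = ilog2(__roundup_pow_of_two(3)) = 2    qpn bits
 *   n = ilog2(__roundup_pow_of_two(4)) = 2    vl bits
 *   m + n = 4 <= 7, so QOS is usable and the rule consumes
 *   1 << (m + n) = 16 RSM map table entries.
 */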
+
+/**
+ * init_qos - init RX qos
+ * @dd - device data
+ * @rmt - RSM map table
+ *
+ * This routine initializes Rule 0 and the RSM map table to implement
+ * quality of service (qos).
+ *
+ * If all of the limit tests succeed, qos is applied based on the array
+ * interpretation of krcvqs where entry 0 is VL0.
+ *
+ * The number of vl bits (n) and the number of qpn bits (m) are computed to
+ * feed both the RSM map table and the single rule.
+ */
+static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
+{
+       struct rsm_rule_data rrd;
+       unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
+       unsigned int rmt_entries;
+       u64 reg;
+
+       if (!rmt)
                goto bail;
-       if (num_vls * qpns_per_vl > dd->chip_rcv_contexts)
+       rmt_entries = qos_rmt_entries(dd, &m, &n);
+       if (rmt_entries == 0)
                goto bail;
-       rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL);
-       if (!rsmmap)
+       qpns_per_vl = 1 << m;
+
+       /* enough room in the map table? */
+       rmt_entries = 1 << (m + n);
+       if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES)
                goto bail;
-       memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64));
-       /* init the local copy of the table */
-       for (i = 0, ctxt = first_ctxt; i < num_vls; i++) {
+
+       /* add qos entries to the RSM map table */
+       for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) {
                unsigned tctxt;
 
                for (qpn = 0, tctxt = ctxt;
                     krcvqs[i] && qpn < qpns_per_vl; qpn++) {
                        unsigned idx, regoff, regidx;
 
-                       /* generate index <= 128 */
-                       idx = (qpn << n) ^ i;
+                       /* generate the index the hardware will produce */
+                       idx = rmt->used + ((qpn << n) ^ i);
                        regoff = (idx % 8) * 8;
                        regidx = idx / 8;
-                       reg = rsmmap[regidx];
-                       /* replace 0xff with context number */
+                       /* replace default with context number */
+                       reg = rmt->map[regidx];
                        reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
                                << regoff);
                        reg |= (u64)(tctxt++) << regoff;
-                       rsmmap[regidx] = reg;
+                       rmt->map[regidx] = reg;
                        if (tctxt == ctxt + krcvqs[i])
                                tctxt = ctxt;
                }
                ctxt += krcvqs[i];
        }
-       /* flush cached copies to chip */
-       for (i = 0; i < NUM_MAP_REGS; i++)
-               write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]);
-       /* add rule0 */
-       write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */,
-                 RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK <<
-                 RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
-                 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
-       write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */,
-                 LRH_BTH_MATCH_OFFSET << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
-                 LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
-                 LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
-                 ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
-                 QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
-                 ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
-       write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */,
-                 LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
-                 LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
-                 LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
-                 LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
-       /* Enable RSM */
-       add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
-       kfree(rsmmap);
-       /* map everything else to first context */
-       init_qpmap_table(dd, FIRST_KERNEL_KCTXT, MIN_KERNEL_KCTXTS - 1);
+
+       rrd.offset = rmt->used;
+       rrd.pkt_type = 2;
+       rrd.field1_off = LRH_BTH_MATCH_OFFSET;
+       rrd.field2_off = LRH_SC_MATCH_OFFSET;
+       rrd.index1_off = LRH_SC_SELECT_OFFSET;
+       rrd.index1_width = n;
+       rrd.index2_off = QPN_SELECT_OFFSET;
+       rrd.index2_width = m + n;
+       rrd.mask1 = LRH_BTH_MASK;
+       rrd.value1 = LRH_BTH_VALUE;
+       rrd.mask2 = LRH_SC_MASK;
+       rrd.value2 = LRH_SC_VALUE;
+
+       /* add rule 0 */
+       add_rsm_rule(dd, 0, &rrd);
+
+       /* mark RSM map entries as used */
+       rmt->used += rmt_entries;
+       /* map everything else to the mcast/err/vl15 context */
+       init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT);
        dd->qos_shift = n + 1;
        return;
 bail:
@@ -13574,13 +13774,86 @@ bail:
        init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
 }
 
+static void init_user_fecn_handling(struct hfi1_devdata *dd,
+                                   struct rsm_map_table *rmt)
+{
+       struct rsm_rule_data rrd;
+       u64 reg;
+       int i, idx, regoff, regidx;
+       u8 offset;
+
+       /* there needs to be enough room in the map table */
+       if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
+               dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+               return;
+       }
+
+       /*
+        * RSM will extract the destination context as an index into the
+        * map table.  The destination contexts are a sequential block
+        * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive).
+        * Map entries are accessed as offset + extracted value.  Adjust
+        * the added offset so this sequence can be placed anywhere in
+        * the table - as long as the entries themselves do not wrap.
+        * There are only enough bits in offset for the table size, so
+        * start with that to allow for a "negative" offset.
+        */
+       offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
+                                               (int)dd->first_user_ctxt);
+
+       for (i = dd->first_user_ctxt, idx = rmt->used;
+                               i < dd->num_rcv_contexts; i++, idx++) {
+               /* replace with identity mapping */
+               regoff = (idx % 8) * 8;
+               regidx = idx / 8;
+               reg = rmt->map[regidx];
+               reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff);
+               reg |= (u64)i << regoff;
+               rmt->map[regidx] = reg;
+       }
+
+       /*
+        * For RSM intercept of Expected FECN packets:
+        * o packet type 0 - expected
+        * o match on F (bit 95), using select/match 1, and
+        * o match on SH (bit 133), using select/match 2.
+        *
+        * Use index 1 to extract the 8-bit receive context from DestQP
+        * (start at bit 64).  Use that as the RSM map table index.
+        */
+       rrd.offset = offset;
+       rrd.pkt_type = 0;
+       rrd.field1_off = 95;
+       rrd.field2_off = 133;
+       rrd.index1_off = 64;
+       rrd.index1_width = 8;
+       rrd.index2_off = 0;
+       rrd.index2_width = 0;
+       rrd.mask1 = 1;
+       rrd.value1 = 1;
+       rrd.mask2 = 1;
+       rrd.value2 = 1;
+
+       /* add rule 1 */
+       add_rsm_rule(dd, 1, &rrd);
+
+       rmt->used += dd->num_user_contexts;
+}
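The offset arithmetic above leans on 8-bit wraparound so the identity-mapped block can start anywhere in the table. A worked example with hypothetical sizes:

/*
 * Hypothetical sizes: rmt->used = 16, first_user_ctxt = 8.
 *   offset = (u8)(256 + 16 - 8) = (u8)264 = 8
 * A FECN packet for context 8 then indexes map entry 8 + 8 = 16
 * (the first free entry), context 9 indexes entry 17, and so on:
 * the user contexts form an identity-mapped block starting at
 * rmt->used, with the 8-bit sum wrapping mod 256.
 */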
+
 static void init_rxe(struct hfi1_devdata *dd)
 {
+       struct rsm_map_table *rmt;
+
        /* enable all receive errors */
        write_csr(dd, RCV_ERR_MASK, ~0ull);
-       /* setup QPN map table - start where VL15 context leaves off */
-       init_qos(dd, dd->n_krcv_queues > MIN_KERNEL_KCTXTS ?
-                MIN_KERNEL_KCTXTS : 0);
+
+       rmt = alloc_rsm_map_table(dd);
+       /* set up QOS, including the QPN map table */
+       init_qos(dd, rmt);
+       init_user_fecn_handling(dd, rmt);
+       complete_rsm_map_table(dd, rmt);
+       kfree(rmt);
+
        /*
         * make sure RcvCtrl.RcvWcb <= PCIe Device Control
         * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
@@ -13762,6 +14035,7 @@ int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
        write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
        reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
        reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+       reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK;
        write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
 done:
        return ret;
@@ -14148,6 +14422,19 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
                 (dd->revision >> CCE_REVISION_SW_SHIFT)
                    & CCE_REVISION_SW_MASK);
 
+       /*
+        * The real cpu mask is part of the affinity struct but has to be
+        * initialized earlier than the rest of the affinity struct because it
+        * is needed to calculate the number of user contexts in
+        * set_up_context_variables(). However, hfi1_dev_affinity_init(),
+        * which initializes the rest of the affinity struct members,
+        * depends on set_up_context_variables() for the number of kernel
+        * contexts, so it cannot be called before set_up_context_variables().
+        */
+       ret = init_real_cpu_mask(dd);
+       if (ret)
+               goto bail_cleanup;
+
        ret = set_up_context_variables(dd);
        if (ret)
                goto bail_cleanup;
@@ -14161,9 +14448,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
        /* set up KDETH QP prefix in both RX and TX CSRs */
        init_kdeth_qp(dd);
 
-       ret = hfi1_dev_affinity_init(dd);
-       if (ret)
-               goto bail_cleanup;
+       hfi1_dev_affinity_init(dd);
 
        /* send contexts must be set up before receive contexts */
        ret = init_send_contexts(dd);
@@ -14303,7 +14588,7 @@ u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
                   (reason), (ret))
 
 /*
- * Initialize the Avago Thermal sensor.
+ * Initialize the thermal sensor.
  *
  * After initialization, enable polling of thermal sensor through
  * SBus interface. In order for this to work, the SBus Master