}
}
+static int mlx4_internal_err_ret_value(struct mlx4_dev *dev, u16 op,
+ u8 op_modifier)
+{
+ switch (op) {
+ case MLX4_CMD_UNMAP_ICM:
+ case MLX4_CMD_UNMAP_ICM_AUX:
+ case MLX4_CMD_UNMAP_FA:
+ case MLX4_CMD_2RST_QP:
+ case MLX4_CMD_HW2SW_EQ:
+ case MLX4_CMD_HW2SW_CQ:
+ case MLX4_CMD_HW2SW_SRQ:
+ case MLX4_CMD_HW2SW_MPT:
+ case MLX4_CMD_CLOSE_HCA:
+ case MLX4_QP_FLOW_STEERING_DETACH:
+ case MLX4_CMD_FREE_RES:
+ case MLX4_CMD_CLOSE_PORT:
+ return CMD_STAT_OK;
+
+ case MLX4_CMD_QP_ATTACH:
+ /* In the Detach case (op_modifier == 0), return success */
+ if (op_modifier == 0)
+ return CMD_STAT_OK;
+ return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
+
+ default:
+ return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
+ }
+}
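Everything in the benign list above is a teardown command: once the fatal-error path resets the chip, the resources these commands release are gone anyway, so returning CMD_STAT_OK (0) lets host-side cleanup run to completion instead of wedging on a dead device. A hypothetical caller, just to illustrate the effect (not part of this patch; real CQ teardown actually goes through mlx4_cmd_box() with a mailbox):

	/* With MLX4_DEVICE_STATE_INTERNAL_ERROR set, this returns 0 because
	 * MLX4_CMD_HW2SW_CQ is in the benign list, so the driver can still
	 * free its host-side CQ state; a command outside the list would get
	 * the errno mapped from CMD_STAT_INTERNAL_ERR (-EIO) instead.
	 */
	err = mlx4_cmd(dev, 0, cqn, 0, MLX4_CMD_HW2SW_CQ,
		       MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);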
+
+static int mlx4_closing_cmd_fatal_error(u16 op, u8 fw_status)
+{
+ /* Any error during the closing commands below is considered fatal */
+ if (op == MLX4_CMD_CLOSE_HCA ||
+ op == MLX4_CMD_HW2SW_EQ ||
+ op == MLX4_CMD_HW2SW_CQ ||
+ op == MLX4_CMD_2RST_QP ||
+ op == MLX4_CMD_HW2SW_SRQ ||
+ op == MLX4_CMD_SYNC_TPT ||
+ op == MLX4_CMD_UNMAP_ICM ||
+ op == MLX4_CMD_UNMAP_ICM_AUX ||
+ op == MLX4_CMD_UNMAP_FA)
+ return 1;
+ /* An error on MLX4_CMD_HW2SW_MPT is fatal except when the fw status
+ * equals CMD_STAT_REG_BOUND.
+ * This status indicates that the memory region has memory windows
+ * bound to it, which may result from invalid user space usage and is
+ * not fatal.
+ */
+ if (op == MLX4_CMD_HW2SW_MPT && fw_status != CMD_STAT_REG_BOUND)
+ return 1;
+ return 0;
+}
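The CMD_STAT_REG_BOUND carve-out matters because user space can legitimately reach it: destroying a memory region while memory windows are still bound to it makes firmware fail HW2SW_MPT with that status, which says nothing about device health. A quick truth table of the classification above (illustration only):

	/*
	 * mlx4_closing_cmd_fatal_error(MLX4_CMD_HW2SW_MPT,
	 *				CMD_STAT_REG_BOUND)    -> 0 (not fatal)
	 * mlx4_closing_cmd_fatal_error(MLX4_CMD_HW2SW_MPT,
	 *				CMD_STAT_INTERNAL_ERR) -> 1 (fatal)
	 * mlx4_closing_cmd_fatal_error(MLX4_CMD_CLOSE_HCA,
	 *				CMD_STAT_REG_BOUND)    -> 1 (any error on
	 *							  a listed closing
	 *							  command is fatal)
	 */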
+
+static int mlx4_cmd_reset_flow(struct mlx4_dev *dev, u16 op, u8 op_modifier,
+ int err)
+{
+ /* Only when the reset flow is really active is the return code based
+ * on the command; otherwise the current error code is returned.
+ */
+ if (mlx4_internal_err_reset) {
+ mlx4_enter_error_state(dev->persist);
+ err = mlx4_internal_err_ret_value(dev, op, op_modifier);
+ }
+
+ return err;
+}
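mlx4_internal_err_reset is presumably the driver's existing internal_err_reset module parameter (declared in catas.c), so the administrator keeps a knob to opt out of the automatic reset. The two resulting behaviors, sketched:

	/*
	 * internal_err_reset = 1 (default): move the device to the
	 *	INTERNAL_ERROR state (resetting the chip) and return the
	 *	per-command value from mlx4_internal_err_ret_value().
	 * internal_err_reset = 0: skip the reset flow and hand the
	 *	caller's original error code back unchanged.
	 */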
+
static int comm_pending(struct mlx4_dev *dev)
{
struct mlx4_priv *priv = mlx4_priv(dev);
cmd->free_head = context->next;
spin_unlock(&cmd->context_lock);
- init_completion(&context->done);
+ reinit_completion(&context->done);
mlx4_comm_cmd_post(dev, op, param);
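The switch from init_completion() to reinit_completion() is what makes the wake-everything scheme (mlx4_cmd_wake_completions() below) safe: init_completion() re-initializes the completion's wait-queue head, which would race with a concurrent complete() from the error path, while reinit_completion() only clears the done counter. For reference, its definition in <linux/completion.h> at the time of this patch:

	static inline void reinit_completion(struct completion *x)
	{
		x->done = 0;
	}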
{
struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
u32 __iomem *hcr = cmd->hcr;
- int ret = -EAGAIN;
+ int ret = -EIO;
unsigned long end;
- mutex_lock(&cmd->hcr_mutex);
-
- if (pci_channel_offline(dev->persist->pdev)) {
+ mutex_lock(&dev->persist->device_state_mutex);
+ /* To avoid writing to unknown addresses after the device state has
+ * changed to internal error and the chip has been reset, check the
+ * INTERNAL_ERROR flag, which is updated under the device_state_mutex
+ * lock.
+ */
+ if (pci_channel_offline(dev->persist->pdev) ||
+ (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) {
/*
* Device is going through error recovery
* and cannot accept commands.
*/
- ret = -EIO;
goto out;
}
* Device is going through error recovery
* and cannot accept commands.
*/
- ret = -EIO;
goto out;
}
ret = 0;
out:
- mutex_unlock(&cmd->hcr_mutex);
+ if (ret)
+ mlx4_warn(dev, "Could not post command 0x%x: ret=%d, in_param=0x%llx, in_mod=0x%x, op_mod=0x%x\n",
+ op, ret, in_param, in_modifier, op_modifier);
+ mutex_unlock(&dev->persist->device_state_mutex);
+
return ret;
}
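Holding device_state_mutex across the HCR write only closes the race if the reset side takes the same mutex before touching the chip. A minimal sketch of that counterpart, assuming it lives in catas.c; mlx4_do_reset() is a hypothetical stand-in for the real reset helper:

	void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
	{
		if (!mlx4_internal_err_reset)
			return;

		mutex_lock(&persist->device_state_mutex);
		if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
			mutex_unlock(&persist->device_state_mutex);
			return;		/* chip was already reset */
		}

		/* No mlx4_cmd_post() can be mid-write to the HCR here;
		 * posters arriving later will see the flag and bail out. */
		mlx4_do_reset(persist->dev);		/* hypothetical name */
		persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR;
		mutex_unlock(&persist->device_state_mutex);

		/* HW is gone: release everyone sleeping in mlx4_cmd_wait() */
		mlx4_cmd_wake_completions(persist->dev);
	}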
down(&priv->cmd.poll_sem);
- if (pci_channel_offline(dev->persist->pdev)) {
+ if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
/*
* Device is going through error recovery
* and cannot accept commands.
*/
- err = -EIO;
+ err = mlx4_internal_err_ret_value(dev, op, op_modifier);
goto out;
}
err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0);
if (err)
- goto out;
+ goto out_reset;
end = msecs_to_jiffies(timeout) + jiffies;
while (cmd_pending(dev) && time_before(jiffies, end)) {
* and cannot accept commands.
*/
err = -EIO;
+ goto out_reset;
+ }
+
+ if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ err = mlx4_internal_err_ret_value(dev, op, op_modifier);
goto out;
}
if (cmd_pending(dev)) {
mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n",
op);
- err = -ETIMEDOUT;
- goto out;
+ err = -EIO;
+ goto out_reset;
}
if (out_is_imm)
stat = be32_to_cpu((__force __be32)
__raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24;
err = mlx4_status_to_errno(stat);
- if (err)
+ if (err) {
mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
op, stat);
+ if (mlx4_closing_cmd_fatal_error(op, stat))
+ goto out_reset;
+ goto out;
+ }
+out_reset:
+ if (err)
+ err = mlx4_cmd_reset_flow(dev, op, op_modifier, err);
out:
up(&priv->cmd.poll_sem);
return err;
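Taken together, the mlx4_cmd_poll() hunks above route errors as follows (summary only):

	/*
	 * post fails			-> out_reset (reset flow picks err)
	 * PCI channel offline		-> err = -EIO, out_reset
	 * INTERNAL_ERROR already set	-> mapped err, out (already reset)
	 * go-bit timeout		-> err = -EIO, out_reset
	 * fw error, fatal closing cmd	-> out_reset
	 * fw error, anything else	-> out (no reset)
	 */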
goto out;
}
- init_completion(&context->done);
+ reinit_completion(&context->done);
- mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
- in_modifier, op_modifier, op, context->token, 1);
+ err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0,
+ in_modifier, op_modifier, op, context->token, 1);
+ if (err)
+ goto out_reset;
if (!wait_for_completion_timeout(&context->done,
msecs_to_jiffies(timeout))) {
mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n",
op);
- err = -EBUSY;
- goto out;
+ err = -EIO;
+ goto out_reset;
}
err = context->result;
else
mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
op, context->fw_status);
+ if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
+ err = mlx4_internal_err_ret_value(dev, op, op_modifier);
+ else if (mlx4_closing_cmd_fatal_error(op, context->fw_status))
+ goto out_reset;
+
goto out;
}
if (out_is_imm)
*out_param = context->out_param;
+out_reset:
+ if (err)
+ err = mlx4_cmd_reset_flow(dev, op, op_modifier, err);
out:
spin_lock(&cmd->context_lock);
context->next = cmd->free_head;
u16 op, unsigned long timeout, int native)
{
if (pci_channel_offline(dev->persist->pdev))
- return -EIO;
+ return mlx4_cmd_reset_flow(dev, op, op_modifier, -EIO);
if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) {
+ if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
+ return mlx4_internal_err_ret_value(dev, op,
+ op_modifier);
if (mlx4_priv(dev)->cmd.use_events)
return mlx4_cmd_wait(dev, in_param, out_param,
out_is_imm, in_modifier,
int flags = 0;
if (!priv->cmd.initialized) {
- mutex_init(&priv->cmd.hcr_mutex);
mutex_init(&priv->cmd.slave_cmd_mutex);
sema_init(&priv->cmd.poll_sem, 1);
priv->cmd.use_events = 0;
for (i = 0; i < priv->cmd.max_cmds; ++i) {
priv->cmd.context[i].token = i;
priv->cmd.context[i].next = i + 1;
+ /* To support the fatal error flow, initialize all cmd contexts so
+ * that completions can be simulated with complete() at any time.
+ */
+ init_completion(&priv->cmd.context[i].done);
}
priv->cmd.context[priv->cmd.max_cmds - 1].next = -1;
return slave - 1;
}
+void mlx4_cmd_wake_completions(struct mlx4_dev *dev)
+{
+ struct mlx4_priv *priv = mlx4_priv(dev);
+ struct mlx4_cmd_context *context;
+ int i;
+
+ spin_lock(&priv->cmd.context_lock);
+ if (priv->cmd.context) {
+ for (i = 0; i < priv->cmd.max_cmds; ++i) {
+ context = &priv->cmd.context[i];
+ context->fw_status = CMD_STAT_INTERNAL_ERR;
+ context->result =
+ mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
+ complete(&context->done);
+ }
+ }
+ spin_unlock(&priv->cmd.context_lock);
+}
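Two details make this wake-everything approach safe. First, fw_status and result are stored before complete(), and the waiter in mlx4_cmd_wait() reads them only after the completion fires, so the completion's internal locking orders the stores and no waiter can observe a stale result. Second, completing contexts that sit idle on the free list is harmless: the next user calls reinit_completion() before posting, which discards the spurious done count. The pairing, condensed:

	/*
	 * waker (above)                     waiter (mlx4_cmd_wait)
	 * -------------                     ----------------------
	 * context->result = -EIO;
	 * complete(&context->done);  -----> wait_for_completion_timeout()
	 *                                   err = context->result; (= -EIO)
	 */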
+
struct mlx4_active_ports mlx4_get_active_ports(struct mlx4_dev *dev, int slave)
{
struct mlx4_active_ports actv_ports;