8629 nvme: rework command abortion
don't block in nvme_bd_cmd
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

@@ -54,15 +54,12 @@
  * hold up to 4096 admin commands.
  *
  * From the hardware perspective both queues of a queue pair are independent,
  * but they share some driver state: the command array (holding pointers to
  * commands currently being processed by the hardware) and the active command
- * counter. Access to the submission side of a queue pair and the shared state
- * is protected by nq_mutex. The completion side of a queue pair does not need
- * that protection apart from its access to the shared state; it is called only
- * in the interrupt handler which does not run concurrently for the same
- * interrupt vector.
+ * counter. Access to a queue pair and the shared state is protected by
+ * nq_mutex.
  *
  * When a command is submitted to a queue pair the active command counter is
  * incremented and a pointer to the command is stored in the command array. The
  * array index is used as command identifier (CID) in the submission queue
  * entry. Some commands may take a very long time to complete, and if the queue
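
In code terms, the bookkeeping described above boils down to something like
the following sketch (simplified: the search for a free slot and the doorbell
write are elided, and this is not the literal driver code):

        mutex_enter(&qp->nq_mutex);
        cmd->nc_sqe.sqe_cid = cid;      /* array index doubles as the CID */
        qp->nq_cmd[cid] = cmd;
        qp->nq_active_cmds++;
        /* ... copy nc_sqe into the submission queue, ring the doorbell ... */
        mutex_exit(&qp->nq_mutex);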

@@ -147,19 +144,48 @@
  * errors indicating a driver bug the driver panics. Almost all other error
  * status values just cause EIO to be returned.
  *
  * Command timeouts are currently detected for all admin commands except
  * asynchronous event requests. If a command times out and the hardware appears
- * to be healthy the driver attempts to abort the command. If this fails the
+ * to be healthy the driver attempts to abort the command. The original command
+ * timeout is also applied to the abort command. If the abort times out too the
  * driver assumes the device to be dead, fences it off, and calls FMA to retire
- * it. In general admin commands are issued at attach time only. No timeout
- * handling of normal I/O commands is presently done.
+ * it. In all other cases the aborted command should return immediately with a
+ * status indicating it was aborted, and the driver will wait indefinitely for
+ * that to happen. No timeout handling of normal I/O commands is presently done.
  *
- * In some cases it may be possible that the ABORT command times out, too. In
- * that case the device is also declared dead and fenced off.
+ * Any command that times out due to the controller dropping dead will be put
+ * on the nvme_lost_cmds list if it references DMA memory. This prevents the
+ * DMA memory from being reused by the system and later being written to by a
+ * "dead" NVMe controller.
  *
  *
+ * Locking:
+ *
+ * Each queue pair has its own nq_mutex, which must be held when accessing the
+ * associated queue registers or the shared state of the queue pair. Callers of
+ * nvme_unqueue_cmd() must make sure that nq_mutex is held, while
+ * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of this
+ * themselves.
+ *
+ * Each command also has its own nc_mutex, which is associated with the
+ * condition variable nc_cv. It is only used on admin commands which are run
+ * synchronously. In that case it must be held across calls to
+ * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
+ * nvme_admin_cmd(). It must also be held whenever the completion state of the
+ * command is changed or while an admin command timeout is handled.
+ *
+ * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
+ * More than one nc_mutex may only be held when aborting commands. In this case,
+ * the nc_mutex of the command to be aborted must be held across the call to
+ * nvme_abort_cmd() to prevent the command from completing while the abort is in
+ * progress.
+ *
+ * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
+ * and exclusive-open flag nm_oexcl.
+ *
+ *
  * Quiesce / Fast Reboot:
  *
  * The driver currently does not support fast reboot. A quiesce(9E) entry point
  * is still provided which is used to send a shutdown notification to the
  * device.

@@ -219,10 +245,11 @@
 #include <sys/atomic.h>
 #include <sys/archsystm.h>
 #include <sys/sata/sata_hba.h>
 #include <sys/stat.h>
 #include <sys/policy.h>
+#include <sys/list.h>
 
 #include <sys/nvme.h>
 
 #ifdef __x86
 #include <sys/x86_archext.h>

@@ -255,16 +282,17 @@
 static int nvme_init(nvme_t *);
 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
 static void nvme_free_cmd(nvme_cmd_t *);
 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
     bd_xfer_t *);
-static int nvme_admin_cmd(nvme_cmd_t *, int);
+static void nvme_admin_cmd(nvme_cmd_t *, int);
 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
+static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
-static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
+static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
 static void nvme_wakeup_cmd(void *);
 static void nvme_async_event_task(void *);
 
 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
 static int nvme_check_vendor_cmd_status(nvme_cmd_t *);

@@ -271,22 +299,22 @@
 static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
 static inline int nvme_check_cmd_status(nvme_cmd_t *);
 
-static void nvme_abort_cmd(nvme_cmd_t *);
+static int nvme_abort_cmd(nvme_cmd_t *, uint_t);
 static void nvme_async_event(nvme_t *);
 static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t,
     boolean_t, uint8_t);
 static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...);
-static void *nvme_identify(nvme_t *, uint32_t);
-static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
+static int nvme_identify(nvme_t *, uint32_t, void **);
+static int nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
     uint32_t *);
-static boolean_t nvme_get_features(nvme_t *, uint32_t, uint8_t, uint32_t *,
+static int nvme_get_features(nvme_t *, uint32_t, uint8_t, uint32_t *,
     void **, size_t *);
-static boolean_t nvme_write_cache_set(nvme_t *, boolean_t);
-static int nvme_set_nqueues(nvme_t *, uint16_t);
+static int nvme_write_cache_set(nvme_t *, boolean_t);
+static int nvme_set_nqueues(nvme_t *, uint16_t *);
 
 static void nvme_free_dma(nvme_dma_t *);
 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
     nvme_dma_t **);
 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,

@@ -459,10 +487,19 @@
         .o_sync_cache   = nvme_bd_sync,
         .o_read         = nvme_bd_read,
         .o_write        = nvme_bd_write,
 };
 
+/*
+ * This list will hold commands that have timed out and couldn't be aborted.
+ * As we don't know what the hardware may still do with the DMA memory, we
+ * can't free them, so we'll keep them forever on this list, where we can
+ * easily look at them with mdb.
+ */
+static struct list nvme_lost_cmds;
+static kmutex_t nvme_lc_mutex;
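
One conceivable way to examine such commands post mortem is from mdb via the
generic list walker (a hypothetical invocation, shown only as an example):

        > nvme_lost_cmds::walk list | ::print nvme_cmd_t nc_sqid nc_sqe nc_dma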
+
 int
 _init(void)
 {
         int error;
 

@@ -471,15 +508,21 @@
                 return (error);
 
         nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
             sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
 
+        mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
+        list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
+            offsetof(nvme_cmd_t, nc_list));
+
         bd_mod_init(&nvme_dev_ops);
 
         error = mod_install(&nvme_modlinkage);
         if (error != DDI_SUCCESS) {
                 ddi_soft_state_fini(&nvme_state);
+                mutex_destroy(&nvme_lc_mutex);
+                list_destroy(&nvme_lost_cmds);
                 bd_mod_fini(&nvme_dev_ops);
         }
 
         return (error);
 }

@@ -487,14 +530,19 @@
 int
 _fini(void)
 {
         int error;
 
+        if (!list_is_empty(&nvme_lost_cmds))
+                return (DDI_FAILURE);
+
         error = mod_remove(&nvme_modlinkage);
         if (error == DDI_SUCCESS) {
                 ddi_soft_state_fini(&nvme_state);
                 kmem_cache_destroy(nvme_cmd_cache);
+                mutex_destroy(&nvme_lc_mutex);
+                list_destroy(&nvme_lost_cmds);
                 bd_mod_fini(&nvme_dev_ops);
         }
 
         return (error);
 }

@@ -800,10 +848,14 @@
 }
 
 static void
 nvme_free_cmd(nvme_cmd_t *cmd)
 {
+        /* Don't free commands on the lost commands list. */
+        if (list_link_active(&cmd->nc_list))
+                return;
+
         if (cmd->nc_dma) {
                 if (cmd->nc_dma->nd_cached)
                         kmem_cache_free(cmd->nc_nvme->n_prp_cache,
                             cmd->nc_dma);
                 else

@@ -866,10 +918,31 @@
 
         mutex_exit(&qp->nq_mutex);
 }
 
 static nvme_cmd_t *
+nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
+{
+        nvme_cmd_t *cmd;
+
+        ASSERT(mutex_owned(&qp->nq_mutex));
+        ASSERT3S(cid, <, qp->nq_nentry);
+
+        cmd = qp->nq_cmd[cid];
+        qp->nq_cmd[cid] = NULL;
+        ASSERT3U(qp->nq_active_cmds, >, 0);
+        qp->nq_active_cmds--;
+        sema_v(&qp->nq_sema);
+
+        ASSERT3P(cmd, !=, NULL);
+        ASSERT3P(cmd->nc_nvme, ==, nvme);
+        ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid);
+
+        return (cmd);
+}
+
+static nvme_cmd_t *
 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
 {
         nvme_reg_cqhdbl_t head = { 0 };
 
         nvme_cqe_t *cqe;

@@ -886,20 +959,14 @@
                 mutex_exit(&qp->nq_mutex);
                 return (NULL);
         }
 
         ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
-        ASSERT(cqe->cqe_cid < qp->nq_nentry);
 
-        cmd = qp->nq_cmd[cqe->cqe_cid];
-        qp->nq_cmd[cqe->cqe_cid] = NULL;
-        qp->nq_active_cmds--;
+        cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
 
-        ASSERT(cmd != NULL);
-        ASSERT(cmd->nc_nvme == nvme);
         ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
-        ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
         bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
 
         qp->nq_sqhead = cqe->cqe_sqhd;
 
         head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;

@@ -908,11 +975,10 @@
         if (qp->nq_cqhead == 0)
                 qp->nq_phase = qp->nq_phase ? 0 : 1;
 
         nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
         mutex_exit(&qp->nq_mutex);
-        sema_v(&qp->nq_sema);
 
         return (cmd);
 }
 
 static int

@@ -1193,11 +1259,17 @@
 static inline int
 nvme_check_cmd_status(nvme_cmd_t *cmd)
 {
         nvme_cqe_t *cqe = &cmd->nc_cqe;
 
-        /* take a shortcut if everything is alright */
+        /*
+         * Take a shortcut if the controller is dead, or if
+         * command status indicates no error.
+         */
+        if (cmd->nc_nvme->n_dead)
+                return (EIO);
+
         if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
             cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
                 return (0);
 
         if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)

@@ -1210,163 +1282,133 @@
                 return (nvme_check_vendor_cmd_status(cmd));
 
         return (nvme_check_unknown_cmd_status(cmd));
 }
 
-/*
- * nvme_abort_cmd_cb -- replaces nc_callback of aborted commands
- *
- * This functions takes care of cleaning up aborted commands. The command
- * status is checked to catch any fatal errors.
- */
-static void
-nvme_abort_cmd_cb(void *arg)
+static int
+nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec)
 {
-        nvme_cmd_t *cmd = arg;
-
-        /*
-         * Grab the command mutex. Once we have it we hold the last reference
-         * to the command and can safely free it.
-         */
-        mutex_enter(&cmd->nc_mutex);
-        (void) nvme_check_cmd_status(cmd);
-        mutex_exit(&cmd->nc_mutex);
-
-        nvme_free_cmd(cmd);
-}
-
-static void
-nvme_abort_cmd(nvme_cmd_t *abort_cmd)
-{
         nvme_t *nvme = abort_cmd->nc_nvme;
         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
         nvme_abort_cmd_t ac = { 0 };
+        int ret = 0;
 
         sema_p(&nvme->n_abort_sema);
 
         ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
         ac.b.ac_sqid = abort_cmd->nc_sqid;
 
-        /*
-         * Drop the mutex of the aborted command. From this point on
-         * we must assume that the abort callback has freed the command.
-         */
-        mutex_exit(&abort_cmd->nc_mutex);
-
         cmd->nc_sqid = 0;
         cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
         cmd->nc_callback = nvme_wakeup_cmd;
         cmd->nc_sqe.sqe_cdw10 = ac.r;
 
         /*
          * Send the ABORT to the hardware. The ABORT command will return _after_
-         * the aborted command has completed (aborted or otherwise).
+         * the aborted command has completed (aborted or otherwise), but since
+         * we still hold the aborted command's mutex its callback hasn't been
+         * processed yet.
          */
-        if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
-                sema_v(&nvme->n_abort_sema);
-                dev_err(nvme->n_dip, CE_WARN,
-                    "!nvme_admin_cmd failed for ABORT");
-                atomic_inc_32(&nvme->n_abort_failed);
-                return;
-        }
+        nvme_admin_cmd(cmd, sec);
         sema_v(&nvme->n_abort_sema);
 
-        if (nvme_check_cmd_status(cmd)) {
+        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!ABORT failed with sct = %x, sc = %x",
                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
                 atomic_inc_32(&nvme->n_abort_failed);
         } else {
-                atomic_inc_32(&nvme->n_cmd_aborted);
+                dev_err(nvme->n_dip, CE_WARN,
+                    "!ABORT of command %d/%d %ssuccessful",
+                    abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid,
+                    cmd->nc_cqe.cqe_dw0 & 1 ? "un" : "");
+                if ((cmd->nc_cqe.cqe_dw0 & 1) == 0)
+                        atomic_inc_32(&nvme->n_cmd_aborted);
         }
 
         nvme_free_cmd(cmd);
+        return (ret);
 }
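
Note the cqe_dw0 checks above: per the NVMe specification, bit 0 of the ABORT
completion's Dword 0 is set when the command was _not_ aborted, so the success
of the abort itself reduces to a sketch like:

        /* CQE Dword 0, bit 0 clear => the command was actually aborted. */
        boolean_t aborted = (cmd->nc_cqe.cqe_dw0 & 1) == 0;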
 
 /*
  * nvme_wait_cmd -- wait for command completion or timeout
  *
- * Returns B_TRUE if the command completed normally.
- *
- * Returns B_FALSE if the command timed out and an abort was attempted. The
- * command mutex will be dropped and the command must be considered freed. The
- * freeing of the command is normally done by the abort command callback.
- *
  * In case of a serious error or a timeout of the abort command the hardware
  * will be declared dead and FMA will be notified.
  */
-static boolean_t
+static void
 nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec)
 {
         clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC);
         nvme_t *nvme = cmd->nc_nvme;
         nvme_reg_csts_t csts;
+        nvme_qpair_t *qp;
 
         ASSERT(mutex_owned(&cmd->nc_mutex));
 
         while (!cmd->nc_completed) {
                 if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)
                         break;
         }
 
         if (cmd->nc_completed)
-                return (B_TRUE);
+                return;
 
         /*
-         * The command timed out. Change the callback to the cleanup function.
-         */
-        cmd->nc_callback = nvme_abort_cmd_cb;
-
-        /*
+         * The command timed out.
+         *
          * Check controller for fatal status, any errors associated with the
          * register or DMA handle, or for a double timeout (abort command timed
          * out). If necessary log a warning and call FMA.
          */
         csts.r = nvme_get32(nvme, NVME_REG_CSTS);
-        dev_err(nvme->n_dip, CE_WARN, "!command timeout, "
-            "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
+        dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, "
+            "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid,
+            cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
         atomic_inc_32(&nvme->n_cmd_timeout);
 
         if (csts.b.csts_cfs ||
             nvme_check_regs_hdl(nvme) ||
             nvme_check_dma_hdl(cmd->nc_dma) ||
             cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
                 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
                 nvme->n_dead = B_TRUE;
-                mutex_exit(&cmd->nc_mutex);
-        } else {
+        } else if (nvme_abort_cmd(cmd, sec) == 0) {
                 /*
-                 * Try to abort the command. The command mutex is released by
-                 * nvme_abort_cmd().
-                 * If the abort succeeds it will have freed the aborted command.
-                 * If the abort fails for other reasons we must assume that the
-                 * command may complete at any time, and the callback will free
-                 * it for us.
+                 * If the abort succeeded the command should complete
+                 * immediately with an appropriate status.
                  */
-                nvme_abort_cmd(cmd);
+                while (!cmd->nc_completed)
+                        cv_wait(&cmd->nc_cv, &cmd->nc_mutex);
+
+                return;
         }
 
-        return (B_FALSE);
+        qp = nvme->n_ioq[cmd->nc_sqid];
+
+        mutex_enter(&qp->nq_mutex);
+        (void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
+        mutex_exit(&qp->nq_mutex);
+
+        /*
+         * As we don't know what the presumed dead hardware might still do with
+         * the DMA memory, we'll put the command on the lost commands list if it
+         * has any DMA memory.
+         */
+        if (cmd->nc_dma != NULL) {
+                mutex_enter(&nvme_lc_mutex);
+                list_insert_head(&nvme_lost_cmds, cmd);
+                mutex_exit(&nvme_lc_mutex);
+        }
 }
 
 static void
 nvme_wakeup_cmd(void *arg)
 {
         nvme_cmd_t *cmd = arg;
 
         mutex_enter(&cmd->nc_mutex);
-        /*
-         * There is a slight chance that this command completed shortly after
-         * the timeout was hit in nvme_wait_cmd() but before the callback was
-         * changed. Catch that case here and clean up accordingly.
-         */
-        if (cmd->nc_callback == nvme_abort_cmd_cb) {
-                mutex_exit(&cmd->nc_mutex);
-                nvme_abort_cmd_cb(cmd);
-                return;
-        }
-
         cmd->nc_completed = B_TRUE;
         cv_signal(&cmd->nc_cv);
         mutex_exit(&cmd->nc_mutex);
 }
 

@@ -1388,11 +1430,11 @@
          *
          * Other possible errors are various scenarios where the async request
          * was aborted, or internal errors in the device. Internal errors are
          * reported to FMA, the command aborts need no special handling here.
          */
-        if (nvme_check_cmd_status(cmd)) {
+        if (nvme_check_cmd_status(cmd) != 0) {
                 dev_err(cmd->nc_nvme->n_dip, CE_WARN,
                     "!async event request returned failure, sct = %x, "
                     "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
                     cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
                     cmd->nc_cqe.cqe_sf.sf_m);

@@ -1520,26 +1562,17 @@
 
         if (health_log)
                 kmem_free(health_log, logsize);
 }
 
-static int
+static void
 nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
 {
         mutex_enter(&cmd->nc_mutex);
         nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
-
-        if (nvme_wait_cmd(cmd, sec) == B_FALSE) {
-                /*
-                 * The command timed out. An abort command was posted that
-                 * will take care of the cleanup.
-                 */
-                return (DDI_FAILURE);
-        }
+        nvme_wait_cmd(cmd, sec);
         mutex_exit(&cmd->nc_mutex);
-
-        return (DDI_SUCCESS);
 }
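
With this change every synchronous admin command follows the same shape; a
condensed sketch of the common pattern (DMA setup and error details elided):

        nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);

        cmd->nc_sqid = 0;                       /* admin queue */
        cmd->nc_callback = nvme_wakeup_cmd;     /* synchronous completion */
        cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;        /* for example */

        nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);    /* submit and wait */

        if ((ret = nvme_check_cmd_status(cmd)) != 0)
                dev_err(nvme->n_dip, CE_WARN, "!IDENTIFY failed");

        nvme_free_cmd(cmd);     /* no-op if the command ended up lost */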
 
 static void
 nvme_async_event(nvme_t *nvme)
 {

@@ -1577,16 +1610,11 @@
          * namespaces in one command. Handle that gracefully.
          */
         if (nsid == (uint32_t)-1)
                 cmd->nc_dontpanic = B_TRUE;
 
-        if ((ret = nvme_admin_cmd(cmd, nvme_format_cmd_timeout))
-            != DDI_SUCCESS) {
-                dev_err(nvme->n_dip, CE_WARN,
-                    "!nvme_admin_cmd failed for FORMAT NVM");
-                return (EIO);
-        }
+        nvme_admin_cmd(cmd, nvme_format_cmd_timeout);
 
         if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!FORMAT failed with sct = %x, sc = %x",
                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);

@@ -1601,11 +1629,11 @@
     ...)
 {
         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
         nvme_getlogpage_t getlogpage = { 0 };
         va_list ap;
-        int ret = DDI_FAILURE;
+        int ret;
 
         va_start(ap, logpage);
 
         cmd->nc_sqid = 0;
         cmd->nc_callback = nvme_wakeup_cmd;

@@ -1636,10 +1664,11 @@
 
         default:
                 dev_err(nvme->n_dip, CE_WARN, "!unknown log page requested: %d",
                     logpage);
                 atomic_inc_32(&nvme->n_unknown_logpage);
+                ret = EINVAL;
+                va_end(ap);
                 goto fail;
         }
 
         va_end(ap);
 

@@ -1649,17 +1678,19 @@
 
         if (nvme_zalloc_dma(nvme, getlogpage.b.lp_numd * sizeof (uint32_t),
             DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!nvme_zalloc_dma failed for GET LOG PAGE");
+                ret = ENOMEM;
                 goto fail;
         }
 
         if (cmd->nc_dma->nd_ncookie > 2) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!too many DMA cookies for GET LOG PAGE");
                 atomic_inc_32(&nvme->n_too_many_cookies);
+                ret = ENOMEM;
                 goto fail;
         }
 
         cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
         if (cmd->nc_dma->nd_ncookie > 1) {

@@ -1667,40 +1698,37 @@
                     &cmd->nc_dma->nd_cookie);
                 cmd->nc_sqe.sqe_dptr.d_prp[1] =
                     cmd->nc_dma->nd_cookie.dmac_laddress;
         }
 
-        if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
-                dev_err(nvme->n_dip, CE_WARN,
-                    "!nvme_admin_cmd failed for GET LOG PAGE");
-                return (ret);
-        }
+        nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
 
-        if (nvme_check_cmd_status(cmd)) {
+        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!GET LOG PAGE failed with sct = %x, sc = %x",
                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
                 goto fail;
         }
 
         *buf = kmem_alloc(*bufsize, KM_SLEEP);
         bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
 
-        ret = DDI_SUCCESS;
-
 fail:
         nvme_free_cmd(cmd);
 
         return (ret);
 }
 
-static void *
-nvme_identify(nvme_t *nvme, uint32_t nsid)
+static int
+nvme_identify(nvme_t *nvme, uint32_t nsid, void **buf)
 {
-        nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
-        void *buf = NULL;
+        nvme_cmd_t *cmd;
+        int ret;
 
+        if (buf == NULL)
+                return (EINVAL);
+
+        cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
+
         cmd->nc_sqid = 0;
         cmd->nc_callback = nvme_wakeup_cmd;
         cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
         cmd->nc_sqe.sqe_nsid = nsid;
         cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL;

@@ -1707,17 +1735,19 @@
 
         if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
             &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!nvme_zalloc_dma failed for IDENTIFY");
+                ret = ENOMEM;
                 goto fail;
         }
 
         if (cmd->nc_dma->nd_ncookie > 2) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!too many DMA cookies for IDENTIFY");
                 atomic_inc_32(&nvme->n_too_many_cookies);
+                ret = ENOMEM;
                 goto fail;
         }
 
         cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
         if (cmd->nc_dma->nd_ncookie > 1) {

@@ -1725,39 +1755,35 @@
                     &cmd->nc_dma->nd_cookie);
                 cmd->nc_sqe.sqe_dptr.d_prp[1] =
                     cmd->nc_dma->nd_cookie.dmac_laddress;
         }
 
-        if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
-                dev_err(nvme->n_dip, CE_WARN,
-                    "!nvme_admin_cmd failed for IDENTIFY");
-                return (NULL);
-        }
+        nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
 
-        if (nvme_check_cmd_status(cmd)) {
+        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!IDENTIFY failed with sct = %x, sc = %x",
                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
                 goto fail;
         }
 
-        buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
-        bcopy(cmd->nc_dma->nd_memp, buf, NVME_IDENTIFY_BUFSIZE);
+        *buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
+        bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE);
 
 fail:
         nvme_free_cmd(cmd);
 
-        return (buf);
+        return (ret);
 }
 
-static boolean_t
+static int
 nvme_set_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t val,
     uint32_t *res)
 {
         _NOTE(ARGUNUSED(nsid));
         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
-        boolean_t ret = B_FALSE;
+        int ret = EINVAL;
 
         ASSERT(res != NULL);
 
         cmd->nc_sqid = 0;
         cmd->nc_callback = nvme_wakeup_cmd;

@@ -1776,38 +1802,33 @@
 
         default:
                 goto fail;
         }
 
-        if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
-                dev_err(nvme->n_dip, CE_WARN,
-                    "!nvme_admin_cmd failed for SET FEATURES");
-                return (ret);
-        }
+        nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
 
-        if (nvme_check_cmd_status(cmd)) {
+        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!SET FEATURES %d failed with sct = %x, sc = %x",
                     feature, cmd->nc_cqe.cqe_sf.sf_sct,
                     cmd->nc_cqe.cqe_sf.sf_sc);
                 goto fail;
         }
 
         *res = cmd->nc_cqe.cqe_dw0;
-        ret = B_TRUE;
 
 fail:
         nvme_free_cmd(cmd);
         return (ret);
 }
 
-static boolean_t
+static int
 nvme_get_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t *res,
     void **buf, size_t *bufsize)
 {
         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
-        boolean_t ret = B_FALSE;
+        int ret = EINVAL;
 
         ASSERT(res != NULL);
 
         if (bufsize != NULL)
                 *bufsize = 0;

@@ -1869,17 +1890,19 @@
         if (bufsize != NULL && *bufsize != 0) {
                 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ,
                     &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
                         dev_err(nvme->n_dip, CE_WARN,
                             "!nvme_zalloc_dma failed for GET FEATURES");
+                        ret = ENOMEM;
                         goto fail;
                 }
 
                 if (cmd->nc_dma->nd_ncookie > 2) {
                         dev_err(nvme->n_dip, CE_WARN,
                             "!too many DMA cookies for GET FEATURES");
                         atomic_inc_32(&nvme->n_too_many_cookies);
+                        ret = ENOMEM;
                         goto fail;
                 }
 
                 cmd->nc_sqe.sqe_dptr.d_prp[0] =
                     cmd->nc_dma->nd_cookie.dmac_laddress;

@@ -1889,17 +1912,13 @@
                         cmd->nc_sqe.sqe_dptr.d_prp[1] =
                             cmd->nc_dma->nd_cookie.dmac_laddress;
                 }
         }
 
-        if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
-                dev_err(nvme->n_dip, CE_WARN,
-                    "!nvme_admin_cmd failed for GET FEATURES");
-                return (ret);
-        }
+        nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
 
-        if (nvme_check_cmd_status(cmd)) {
+        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                 if (feature == NVME_FEAT_LBA_RANGE &&
                     cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
                     cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD)
                         nvme->n_lba_range_supported = B_FALSE;
                 else

@@ -1915,56 +1934,58 @@
                 *buf = kmem_alloc(*bufsize, KM_SLEEP);
                 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
         }
 
         *res = cmd->nc_cqe.cqe_dw0;
-        ret = B_TRUE;
 
 fail:
         nvme_free_cmd(cmd);
         return (ret);
 }
 
-static boolean_t
+static int
 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
 {
         nvme_write_cache_t nwc = { 0 };
 
         if (enable)
                 nwc.b.wc_wce = 1;
 
-        if (!nvme_set_features(nvme, 0, NVME_FEAT_WRITE_CACHE, nwc.r, &nwc.r))
-                return (B_FALSE);
-
-        return (B_TRUE);
+        return (nvme_set_features(nvme, 0, NVME_FEAT_WRITE_CACHE, nwc.r,
+            &nwc.r));
 }
 
 static int
-nvme_set_nqueues(nvme_t *nvme, uint16_t nqueues)
+nvme_set_nqueues(nvme_t *nvme, uint16_t *nqueues)
 {
         nvme_nqueues_t nq = { 0 };
+        int ret;
 
-        nq.b.nq_nsq = nq.b.nq_ncq = nqueues - 1;
+        nq.b.nq_nsq = nq.b.nq_ncq = *nqueues - 1;
 
-        if (!nvme_set_features(nvme, 0, NVME_FEAT_NQUEUES, nq.r, &nq.r)) {
-                return (0);
-        }
+        ret = nvme_set_features(nvme, 0, NVME_FEAT_NQUEUES, nq.r, &nq.r);
 
-        /*
-         * Always use the same number of submission and completion queues, and
-         * never use more than the requested number of queues.
-         */
-        return (MIN(nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq) + 1));
+        if (ret == 0) {
+                /*
+                 * Always use the same number of submission and completion
+                 * queues, and never use more than the requested number of
+                 * queues.
+                 */
+                *nqueues = MIN(*nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq) + 1);
+        }
+
+        return (ret);
 }
 
 static int
 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
 {
         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
         nvme_create_queue_dw10_t dw10 = { 0 };
         nvme_create_cq_dw11_t c_dw11 = { 0 };
         nvme_create_sq_dw11_t s_dw11 = { 0 };
+        int ret;
 
         dw10.b.q_qid = idx;
         dw10.b.q_qsize = qp->nq_nentry - 1;
 
         c_dw11.b.cq_pc = 1;

@@ -1976,22 +1997,17 @@
         cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
         cmd->nc_sqe.sqe_cdw10 = dw10.r;
         cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
         cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_cqdma->nd_cookie.dmac_laddress;
 
-        if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
-                dev_err(nvme->n_dip, CE_WARN,
-                    "!nvme_admin_cmd failed for CREATE CQUEUE");
-                return (DDI_FAILURE);
-        }
+        nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
 
-        if (nvme_check_cmd_status(cmd)) {
+        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!CREATE CQUEUE failed with sct = %x, sc = %x",
                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
-                nvme_free_cmd(cmd);
-                return (DDI_FAILURE);
+                goto fail;
         }
 
         nvme_free_cmd(cmd);
 
         s_dw11.b.sq_pc = 1;

@@ -2003,27 +2019,23 @@
         cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE;
         cmd->nc_sqe.sqe_cdw10 = dw10.r;
         cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
         cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;
 
-        if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
-                dev_err(nvme->n_dip, CE_WARN,
-                    "!nvme_admin_cmd failed for CREATE SQUEUE");
-                return (DDI_FAILURE);
-        }
+        nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
 
-        if (nvme_check_cmd_status(cmd)) {
+        if ((ret = nvme_check_cmd_status(cmd)) != 0) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!CREATE SQUEUE failed with sct = %x, sc = %x",
                     cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
-                nvme_free_cmd(cmd);
-                return (DDI_FAILURE);
+                goto fail;
         }
 
+fail:
         nvme_free_cmd(cmd);
 
-        return (DDI_SUCCESS);
+        return (ret);
 }
 
 static boolean_t
 nvme_reset(nvme_t *nvme, boolean_t quiesce)
 {

@@ -2112,13 +2124,12 @@
         nvme_namespace_t *ns = &nvme->n_ns[nsid - 1];
         nvme_identify_nsid_t *idns;
         int last_rp;
 
         ns->ns_nvme = nvme;
-        idns = nvme_identify(nvme, nsid);
 
-        if (idns == NULL) {
+        if (nvme_identify(nvme, nsid, (void **)&idns) != 0) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!failed to identify namespace %d", nsid);
                 return (DDI_FAILURE);
         }
 

@@ -2204,11 +2215,11 @@
         nvme_reg_acq_t acq = { 0 };
         nvme_reg_cap_t cap;
         nvme_reg_vs_t vs;
         nvme_reg_csts_t csts;
         int i = 0;
-        int nqueues;
+        uint16_t nqueues;
         char model[sizeof (nvme->n_idctl->id_model) + 1];
         char *vendor, *product;
 
         /* Check controller version */
         vs.r = nvme_get32(nvme, NVME_REG_VS);

@@ -2369,12 +2380,11 @@
         nvme_async_event(nvme);
 
         /*
          * Identify Controller
          */
-        nvme->n_idctl = nvme_identify(nvme, 0);
-        if (nvme->n_idctl == NULL) {
+        if (nvme_identify(nvme, 0, (void **)&nvme->n_idctl) != 0) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!failed to identify controller");
                 goto fail;
         }
 

@@ -2459,11 +2469,12 @@
             "volatile-write-cache-present",
             nvme->n_write_cache_present ? 1 : 0);
 
         if (!nvme->n_write_cache_present) {
                 nvme->n_write_cache_enabled = B_FALSE;
-        } else if (!nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)) {
+        } else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)
+            != 0) {
                 dev_err(nvme->n_dip, CE_WARN,
                     "!failed to %sable volatile write cache",
                     nvme->n_write_cache_enabled ? "en" : "dis");
                 /*
                  * Assume the cache is (still) enabled.

@@ -2531,30 +2542,33 @@
         nqueues = nvme->n_intr_cnt;
 
         /*
          * Create I/O queue pairs.
          */
-        nvme->n_ioq_count = nvme_set_nqueues(nvme, nqueues);
-        if (nvme->n_ioq_count == 0) {
+
+        if (nvme_set_nqueues(nvme, &nqueues) != 0) {
                 dev_err(nvme->n_dip, CE_WARN,
-                    "!failed to set number of I/O queues to %d", nqueues);
+                    "!failed to set number of I/O queues to %d",
+                    nvme->n_intr_cnt);
                 goto fail;
         }
 
         /*
          * Reallocate I/O queue array
          */
         kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
         nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
-            (nvme->n_ioq_count + 1), KM_SLEEP);
+            (nqueues + 1), KM_SLEEP);
         nvme->n_ioq[0] = nvme->n_adminq;
 
+        nvme->n_ioq_count = nqueues;
+
         /*
          * If we got less queues than we asked for we might as well give
          * some of the interrupt vectors back to the system.
          */
-        if (nvme->n_ioq_count < nqueues) {
+        if (nvme->n_ioq_count < nvme->n_intr_cnt) {
                 nvme_release_interrupts(nvme);
 
                 if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
                     nvme->n_ioq_count) != DDI_SUCCESS) {
                         dev_err(nvme->n_dip, CE_WARN,

@@ -2577,12 +2591,11 @@
                         dev_err(nvme->n_dip, CE_WARN,
                             "!unable to allocate I/O qpair %d", i);
                         goto fail;
                 }
 
-                if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i)
-                    != DDI_SUCCESS) {
+                if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) {
                         dev_err(nvme->n_dip, CE_WARN,
                             "!unable to create I/O qpair %d", i);
                         goto fail;
                 }
         }

@@ -2612,10 +2625,14 @@
         nvme_cmd_t *cmd;
 
         if (inum >= nvme->n_intr_cnt)
                 return (DDI_INTR_UNCLAIMED);
 
+        if (nvme->n_dead)
+                return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ?
+                    DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);
+
         /*
          * The interrupt vector a queue uses is calculated as queue_idx %
          * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
          * in steps of n_intr_cnt to process all queues using this vector.
          */

@@ -3373,10 +3390,13 @@
                 return (ENXIO);
 
         if (nsid > nvme->n_namespace_count)
                 return (ENXIO);
 
+        if (nvme->n_dead)
+                return (EIO);
+
         nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;
 
         mutex_enter(&nm->nm_mutex);
         if (nm->nm_oexcl) {
                 rv = EBUSY;

@@ -3445,13 +3465,12 @@
                 return (EPERM);
 
         if (nioc->n_len < NVME_IDENTIFY_BUFSIZE)
                 return (EINVAL);
 
-        idctl = nvme_identify(nvme, nsid);
-        if (idctl == NULL)
-                return (EIO);
+        if ((rv = nvme_identify(nvme, nsid, (void **)&idctl)) != 0)
+                return (rv);
 
         if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode)
             != 0)
                 rv = EFAULT;
 

@@ -3614,13 +3633,13 @@
 
         default:
                 return (EINVAL);
         }
 
-        if (nvme_get_features(nvme, nsid, feature, &res, &buf, &bufsize) ==
-            B_FALSE)
-                return (EIO);
+        rv = nvme_get_features(nvme, nsid, feature, &res, &buf, &bufsize);
+        if (rv != 0)
+                return (rv);
 
         if (nioc->n_len < bufsize) {
                 kmem_free(buf, bufsize);
                 return (EINVAL);
         }

@@ -3833,10 +3852,14 @@
 #ifdef _MULTI_DATAMODEL
                 break;
         }
 #endif
 
+        if (nvme->n_dead && cmd != NVME_IOC_DETACH)
+                return (EIO);
+
         if (cmd == NVME_IOC_IDENTIFY_CTRL) {
                 /*
                  * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and
                  * attachment point nodes.
                  */