8628 nvme: use a semaphore to guard submission queue
nvme: don't block in nvme_bd_cmd
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

@@ -42,11 +42,11 @@
  * vector and will post them to a taskq for completion processing.
  *
  *
  * Command Processing:
  *
- * NVMe devices can have up to 65536 I/O queue pairs, with each queue holding up
+ * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
  * to 65536 I/O commands. The driver will configure one I/O queue pair per
  * available interrupt vector, with the queue length usually much smaller than
  * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
  * interrupt vectors will be used.
  *

@@ -67,11 +67,12 @@
  * array index is used as command identifier (CID) in the submission queue
  * entry. Some commands may take a very long time to complete, and if the queue
  * wraps around in that time a submission may find the next array slot to still
  * be used by a long-running command. In this case the array is sequentially
  * searched for the next free slot. The length of the command array is the same
- * as the configured queue length.
+ * as the configured queue length. Queue overrun is prevented by the semaphore,
+ * so a command submission may block if the queue is full.
  *
  *
  * Polled I/O Support:
  *
  * For kernel core dump support the driver can do polled I/O. As interrupts are

@@ -255,11 +256,13 @@
 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
 static void nvme_free_cmd(nvme_cmd_t *);
 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
     bd_xfer_t *);
 static int nvme_admin_cmd(nvme_cmd_t *, int);
-static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
+static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
+static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
+static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
 static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
 static void nvme_wakeup_cmd(void *);
 static void nvme_async_event_task(void *);
 

@@ -269,11 +272,11 @@
 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
 static inline int nvme_check_cmd_status(nvme_cmd_t *);
 
 static void nvme_abort_cmd(nvme_cmd_t *);
-static int nvme_async_event(nvme_t *);
+static void nvme_async_event(nvme_t *);
 static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t,
     boolean_t, uint8_t);
 static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...);
 static void *nvme_identify(nvme_t *, uint32_t);
 static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,

@@ -719,10 +722,11 @@
 nvme_free_qpair(nvme_qpair_t *qp)
 {
         int i;
 
         mutex_destroy(&qp->nq_mutex);
+        sema_destroy(&qp->nq_sema);
 
         if (qp->nq_sqdma != NULL)
                 nvme_free_dma(qp->nq_sqdma);
         if (qp->nq_cqdma != NULL)
                 nvme_free_dma(qp->nq_cqdma);

@@ -744,10 +748,11 @@
 {
         nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
 
         mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
             DDI_INTR_PRI(nvme->n_intr_pri));
+        sema_init(&qp->nq_sema, nentry, NULL, SEMA_DRIVER, NULL);
 
         if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
             DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
                 goto fail;
 

@@ -810,22 +815,33 @@
         mutex_destroy(&cmd->nc_mutex);
 
         kmem_cache_free(nvme_cmd_cache, cmd);
 }
 
+static void
+nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
+{
+        sema_p(&qp->nq_sema);
+        nvme_submit_cmd_common(qp, cmd);
+}
+
 static int
-nvme_submit_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
+nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
 {
-        nvme_reg_sqtdbl_t tail = { 0 };
+        if (sema_tryp(&qp->nq_sema) == 0)
+                return (EAGAIN);
 
-        mutex_enter(&qp->nq_mutex);
+        nvme_submit_cmd_common(qp, cmd);
+        return (0);
+}
 
-        if (qp->nq_active_cmds == qp->nq_nentry) {
-                mutex_exit(&qp->nq_mutex);
-                return (DDI_FAILURE);
-        }
+static void
+nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
+{
+        nvme_reg_sqtdbl_t tail = { 0 };
 
+        mutex_enter(&qp->nq_mutex);
         cmd->nc_completed = B_FALSE;
 
         /*
          * Try to insert the cmd into the active cmd array at the nq_next_cmd
          * slot. If the slot is already occupied advance to the next slot and

@@ -847,11 +863,10 @@
 
         tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
         nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
 
         mutex_exit(&qp->nq_mutex);
-        return (DDI_SUCCESS);
 }
 
 static nvme_cmd_t *
 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
 {

@@ -893,10 +908,11 @@
         if (qp->nq_cqhead == 0)
                 qp->nq_phase = qp->nq_phase ? 0 : 1;
 
         nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
         mutex_exit(&qp->nq_mutex);
+        sema_v(&qp->nq_sema);
 
         return (cmd);
 }
 
 static int

@@ -1361,11 +1377,10 @@
         nvme_t *nvme = cmd->nc_nvme;
         nvme_error_log_entry_t *error_log = NULL;
         nvme_health_log_t *health_log = NULL;
         size_t logsize = 0;
         nvme_async_event_t event;
-        int ret;
 
         /*
          * Check for errors associated with the async request itself. The only
          * command-specific error is "async event limit exceeded", which
          * indicates a programming error in the driver and causes a panic in

@@ -1395,19 +1410,12 @@
 
         event.r = cmd->nc_cqe.cqe_dw0;
 
         /* Clear CQE and re-submit the async request. */
         bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
-        ret = nvme_submit_cmd(nvme->n_adminq, cmd);
+        nvme_submit_admin_cmd(nvme->n_adminq, cmd);
 
-        if (ret != DDI_SUCCESS) {
-                dev_err(nvme->n_dip, CE_WARN,
-                    "!failed to resubmit async event request");
-                atomic_inc_32(&nvme->n_async_resubmit_failed);
-                nvme_free_cmd(cmd);
-        }
-
         switch (event.b.ae_type) {
         case NVME_ASYNC_TYPE_ERROR:
                 if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
                         (void) nvme_get_logpage(nvme, (void **)&error_log,
                             &logsize, event.b.ae_logpage);

@@ -1515,24 +1523,13 @@
 }
 
 static int
 nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
 {
-        int ret;
-
         mutex_enter(&cmd->nc_mutex);
-        ret = nvme_submit_cmd(cmd->nc_nvme->n_adminq, cmd);
+        nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
 
-        if (ret != DDI_SUCCESS) {
-                mutex_exit(&cmd->nc_mutex);
-                dev_err(cmd->nc_nvme->n_dip, CE_WARN,
-                    "!nvme_submit_cmd failed");
-                atomic_inc_32(&cmd->nc_nvme->n_admin_queue_full);
-                nvme_free_cmd(cmd);
-                return (DDI_FAILURE);
-        }
-
         if (nvme_wait_cmd(cmd, sec) == B_FALSE) {
                 /*
                  * The command timed out. An abort command was posted that
                  * will take care of the cleanup.
                  */

@@ -1541,30 +1538,20 @@
         mutex_exit(&cmd->nc_mutex);
 
         return (DDI_SUCCESS);
 }
 
-static int
+static void
 nvme_async_event(nvme_t *nvme)
 {
         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
-        int ret;
 
         cmd->nc_sqid = 0;
         cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
         cmd->nc_callback = nvme_async_event_task;
 
-        ret = nvme_submit_cmd(nvme->n_adminq, cmd);
-
-        if (ret != DDI_SUCCESS) {
-                dev_err(nvme->n_dip, CE_WARN,
-                    "!nvme_submit_cmd failed for ASYNCHRONOUS EVENT");
-                nvme_free_cmd(cmd);
-                return (DDI_FAILURE);
-        }
-
-        return (DDI_SUCCESS);
+        nvme_submit_admin_cmd(nvme->n_adminq, cmd);
 }
 
 static int
 nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms,
     uint8_t pi, boolean_t pil, uint8_t ses)

@@ -2377,15 +2364,11 @@
         }
 
         /*
          * Post an asynchronous event command to catch errors.
          */
-        if (nvme_async_event(nvme) != DDI_SUCCESS) {
-                dev_err(nvme->n_dip, CE_WARN,
-                    "!failed to post async event");
-                goto fail;
-        }
+        nvme_async_event(nvme);
 
         /*
          * Identify Controller
          */
         nvme->n_idctl = nvme_identify(nvme, 0);

@@ -2606,17 +2589,12 @@
 
         /*
          * Post more asynchronous events commands to reduce event reporting
          * latency as suggested by the spec.
          */
-        for (i = 1; i != nvme->n_async_event_limit; i++) {
-                if (nvme_async_event(nvme) != DDI_SUCCESS) {
-                        dev_err(nvme->n_dip, CE_WARN,
-                            "!failed to post async event %d", i);
-                        goto fail;
-                }
-        }
+        for (i = 1; i != nvme->n_async_event_limit; i++)
+                nvme_async_event(nvme);
 
         return (DDI_SUCCESS);
 
 fail:
         (void) nvme_reset(nvme, B_FALSE);

@@ -3276,13 +3254,14 @@
 
 static int
 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
 {
         nvme_t *nvme = ns->ns_nvme;
-        nvme_cmd_t *cmd, *ret;
+        nvme_cmd_t *cmd;
         nvme_qpair_t *ioq;
         boolean_t poll;
+        int ret;
 
         if (nvme->n_dead)
                 return (EIO);
 
         cmd = nvme_create_nvm_cmd(ns, opc, xfer);

@@ -3298,20 +3277,22 @@
          * complete immediately after it was submitted, which means we must
          * treat both cmd and xfer as if they have been freed already.
          */
         poll = (xfer->x_flags & BD_XFER_POLL) != 0;
 
-        if (nvme_submit_cmd(ioq, cmd) != DDI_SUCCESS)
-                return (EAGAIN);
+        ret = nvme_submit_io_cmd(ioq, cmd);
 
+        if (ret != 0)
+                return (ret);
+
         if (!poll)
                 return (0);
 
         do {
-                ret = nvme_retrieve_cmd(nvme, ioq);
-                if (ret != NULL)
-                        nvme_bd_xfer_done(ret);
+                cmd = nvme_retrieve_cmd(nvme, ioq);
+                if (cmd != NULL)
+                        nvme_bd_xfer_done(cmd);
                 else
                         drv_usecwait(10);
         } while (ioq->nq_active_cmds != 0);
 
         return (0);