don't block in nvme_bd_cmd
8628 nvme: use a semaphore to guard submission queue
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

*** 42,52 ****
   * vector and will post them to a taskq for completion processing.
   *
   *
   * Command Processing:
   *
!  * NVMe devices can have up to 65536 I/O queue pairs, with each queue holding up
   * to 65536 I/O commands. The driver will configure one I/O queue pair per
   * available interrupt vector, with the queue length usually much smaller than
   * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
   * interrupt vectors will be used.
   *
--- 42,52 ----
   * vector and will post them to a taskq for completion processing.
   *
   *
   * Command Processing:
   *
!  * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
   * to 65536 I/O commands. The driver will configure one I/O queue pair per
   * available interrupt vector, with the queue length usually much smaller than
   * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
   * interrupt vectors will be used.
   *
*** 67,77 ****
   * array index is used as command identifier (CID) in the submission queue
   * entry. Some commands may take a very long time to complete, and if the queue
   * wraps around in that time a submission may find the next array slot to still
   * be used by a long-running command. In this case the array is sequentially
   * searched for the next free slot. The length of the command array is the same
!  * as the configured queue length.
   *
   *
   * Polled I/O Support:
   *
   * For kernel core dump support the driver can do polled I/O. As interrupts are
--- 67,78 ----
   * array index is used as command identifier (CID) in the submission queue
   * entry. Some commands may take a very long time to complete, and if the queue
   * wraps around in that time a submission may find the next array slot to still
   * be used by a long-running command. In this case the array is sequentially
   * searched for the next free slot. The length of the command array is the same
!  * as the configured queue length. Queue overrun is prevented by the semaphore,
!  * so a command submission may block if the queue is full.
   *
   *
   * Polled I/O Support:
   *
   * For kernel core dump support the driver can do polled I/O. As interrupts are
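The guard described in the new comment is the classic counting-semaphore pattern: the semaphore starts at the queue depth, every submission decrements it (blocking, or failing, when no slot is free), and every completion increments it again. Below is a minimal, self-contained sketch of that pattern, using POSIX semaphores as a stand-in for the kernel's sema_p()/sema_tryp()/sema_v(); QDEPTH and all function names are illustrative only, not part of the driver.

/*
 * Standalone analog of the semaphore guard above. POSIX semaphores stand
 * in for the illumos sema_p()/sema_tryp()/sema_v() primitives; QDEPTH and
 * the function names here are illustrative, not part of the driver.
 */
#include <errno.h>
#include <semaphore.h>
#include <stdio.h>

#define	QDEPTH	4			/* stand-in for the queue length */

static sem_t qsema;

/* Blocking submission (admin path): sleeps until a slot is free. */
static void
submit_blocking(int id)
{
	(void) sem_wait(&qsema);
	(void) printf("cmd %d submitted\n", id);
}

/* Non-blocking submission (I/O path): EAGAIN when the queue is full. */
static int
submit_nonblocking(int id)
{
	if (sem_trywait(&qsema) != 0)
		return (EAGAIN);
	(void) printf("cmd %d submitted\n", id);
	return (0);
}

/* Completion: releases the slot, waking one blocked submitter. */
static void
complete_one(void)
{
	(void) sem_post(&qsema);
}

int
main(void)
{
	int i;

	(void) sem_init(&qsema, 0, QDEPTH);

	for (i = 0; i < QDEPTH; i++)
		submit_blocking(i);

	/* The queue is now full; the non-blocking path must refuse. */
	if (submit_nonblocking(QDEPTH) == EAGAIN)
		(void) printf("cmd %d refused: queue full\n", QDEPTH);

	complete_one();				/* retire one command... */
	return (submit_nonblocking(QDEPTH));	/* ...now it fits */
}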
*** 255,265 ****
  static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
  static void nvme_free_cmd(nvme_cmd_t *);
  static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
      bd_xfer_t *);
  static int nvme_admin_cmd(nvme_cmd_t *, int);
! static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
  static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
  static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
  static void nvme_wakeup_cmd(void *);
  static void nvme_async_event_task(void *);
--- 256,268 ----
  static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
  static void nvme_free_cmd(nvme_cmd_t *);
  static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
      bd_xfer_t *);
  static int nvme_admin_cmd(nvme_cmd_t *, int);
! static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
! static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
! static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
  static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
  static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
  static void nvme_wakeup_cmd(void *);
  static void nvme_async_event_task(void *);
*** 269,279 ****
  static int nvme_check_specific_cmd_status(nvme_cmd_t *);
  static int nvme_check_generic_cmd_status(nvme_cmd_t *);
  static inline int nvme_check_cmd_status(nvme_cmd_t *);
  static void nvme_abort_cmd(nvme_cmd_t *);
! static int nvme_async_event(nvme_t *);
  static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t,
      boolean_t, uint8_t);
  static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...);
  static void *nvme_identify(nvme_t *, uint32_t);
  static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
--- 272,282 ----
  static int nvme_check_specific_cmd_status(nvme_cmd_t *);
  static int nvme_check_generic_cmd_status(nvme_cmd_t *);
  static inline int nvme_check_cmd_status(nvme_cmd_t *);
  static void nvme_abort_cmd(nvme_cmd_t *);
! static void nvme_async_event(nvme_t *);
  static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t,
      boolean_t, uint8_t);
  static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...);
  static void *nvme_identify(nvme_t *, uint32_t);
  static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
*** 719,728 ****
--- 722,732 ----
  nvme_free_qpair(nvme_qpair_t *qp)
  {
  	int i;
  
  	mutex_destroy(&qp->nq_mutex);
+ 	sema_destroy(&qp->nq_sema);
  
  	if (qp->nq_sqdma != NULL)
  		nvme_free_dma(qp->nq_sqdma);
  	if (qp->nq_cqdma != NULL)
  		nvme_free_dma(qp->nq_cqdma);
*** 744,753 ****
--- 748,758 ----
  {
  	nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
  
  	mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
  	    DDI_INTR_PRI(nvme->n_intr_pri));
+ 	sema_init(&qp->nq_sema, nentry, NULL, SEMA_DRIVER, NULL);
  
  	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
  	    DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
  		goto fail;
*** 810,831 ****
  	mutex_destroy(&cmd->nc_mutex);
  	kmem_cache_free(nvme_cmd_cache, cmd);
  }
  
  static int
! nvme_submit_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
  {
! 	nvme_reg_sqtdbl_t tail = { 0 };
!
! 	mutex_enter(&qp->nq_mutex);
! 	if (qp->nq_active_cmds == qp->nq_nentry) {
! 		mutex_exit(&qp->nq_mutex);
! 		return (DDI_FAILURE);
! 	}
  
  	cmd->nc_completed = B_FALSE;
  
  	/*
  	 * Try to insert the cmd into the active cmd array at the nq_next_cmd
  	 * slot. If the slot is already occupied advance to the next slot and
--- 815,847 ----
  	mutex_destroy(&cmd->nc_mutex);
  	kmem_cache_free(nvme_cmd_cache, cmd);
  }
  
+ static void
+ nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
+ {
+ 	sema_p(&qp->nq_sema);
+ 	nvme_submit_cmd_common(qp, cmd);
+ }
+
  static int
! nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
  {
! 	if (sema_tryp(&qp->nq_sema) == 0)
! 		return (EAGAIN);
!
! 	nvme_submit_cmd_common(qp, cmd);
! 	return (0);
! }
!
! static void
! nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
! {
! 	nvme_reg_sqtdbl_t tail = { 0 };
  
+ 	mutex_enter(&qp->nq_mutex);
  	cmd->nc_completed = B_FALSE;
  
  	/*
  	 * Try to insert the cmd into the active cmd array at the nq_next_cmd
  	 * slot. If the slot is already occupied advance to the next slot and
*** 847,857 ****
  	tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
  	nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
  	mutex_exit(&qp->nq_mutex);
- 	return (DDI_SUCCESS);
  }
  
  static nvme_cmd_t *
  nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
  {
--- 863,872 ----
*** 893,902 ****
--- 908,918 ----
  	if (qp->nq_cqhead == 0)
  		qp->nq_phase = qp->nq_phase ? 0 : 1;
  
  	nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
  	mutex_exit(&qp->nq_mutex);
+ 	sema_v(&qp->nq_sema);
  
  	return (cmd);
  }
  
  static int
*** 1361,1371 ****
  	nvme_t *nvme = cmd->nc_nvme;
  	nvme_error_log_entry_t *error_log = NULL;
  	nvme_health_log_t *health_log = NULL;
  	size_t logsize = 0;
  	nvme_async_event_t event;
- 	int ret;
  
  	/*
  	 * Check for errors associated with the async request itself. The only
  	 * command-specific error is "async event limit exceeded", which
  	 * indicates a programming error in the driver and causes a panic in
--- 1377,1386 ----
*** 1395,1413 ****
  	event.r = cmd->nc_cqe.cqe_dw0;
  
  	/* Clear CQE and re-submit the async request. */
  	bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
! 	ret = nvme_submit_cmd(nvme->n_adminq, cmd);
  
- 	if (ret != DDI_SUCCESS) {
- 		dev_err(nvme->n_dip, CE_WARN,
- 		    "!failed to resubmit async event request");
- 		atomic_inc_32(&nvme->n_async_resubmit_failed);
- 		nvme_free_cmd(cmd);
- 	}
-
  	switch (event.b.ae_type) {
  	case NVME_ASYNC_TYPE_ERROR:
  		if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
  			(void) nvme_get_logpage(nvme, (void **)&error_log,
  			    &logsize, event.b.ae_logpage);
--- 1410,1421 ----
  	event.r = cmd->nc_cqe.cqe_dw0;
  
  	/* Clear CQE and re-submit the async request. */
  	bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
! 	nvme_submit_admin_cmd(nvme->n_adminq, cmd);
  
  	switch (event.b.ae_type) {
  	case NVME_ASYNC_TYPE_ERROR:
  		if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
  			(void) nvme_get_logpage(nvme, (void **)&error_log,
  			    &logsize, event.b.ae_logpage);
*** 1515,1538 ****
  }
  
  static int
  nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
  {
- 	int ret;
-
  	mutex_enter(&cmd->nc_mutex);
! 	ret = nvme_submit_cmd(cmd->nc_nvme->n_adminq, cmd);
  
- 	if (ret != DDI_SUCCESS) {
- 		mutex_exit(&cmd->nc_mutex);
- 		dev_err(cmd->nc_nvme->n_dip, CE_WARN,
- 		    "!nvme_submit_cmd failed");
- 		atomic_inc_32(&cmd->nc_nvme->n_admin_queue_full);
- 		nvme_free_cmd(cmd);
- 		return (DDI_FAILURE);
- 	}
-
  	if (nvme_wait_cmd(cmd, sec) == B_FALSE) {
  		/*
  		 * The command timed out. An abort command was posted that
  		 * will take care of the cleanup.
  		 */
--- 1523,1535 ----
  }
  
  static int
  nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
  {
  	mutex_enter(&cmd->nc_mutex);
! 	nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
  
  	if (nvme_wait_cmd(cmd, sec) == B_FALSE) {
  		/*
  		 * The command timed out. An abort command was posted that
  		 * will take care of the cleanup.
  		 */
*** 1541,1570 ****
  	mutex_exit(&cmd->nc_mutex);
  	return (DDI_SUCCESS);
  }
  
! static int
  nvme_async_event(nvme_t *nvme)
  {
  	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
- 	int ret;
  
  	cmd->nc_sqid = 0;
  	cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
  	cmd->nc_callback = nvme_async_event_task;
  
! 	ret = nvme_submit_cmd(nvme->n_adminq, cmd);
!
! 	if (ret != DDI_SUCCESS) {
! 		dev_err(nvme->n_dip, CE_WARN,
! 		    "!nvme_submit_cmd failed for ASYNCHRONOUS EVENT");
! 		nvme_free_cmd(cmd);
! 		return (DDI_FAILURE);
! 	}
!
! 	return (DDI_SUCCESS);
  }
  
  static int
  nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms,
      uint8_t pi, boolean_t pil, uint8_t ses)
--- 1538,1557 ----
  	mutex_exit(&cmd->nc_mutex);
  	return (DDI_SUCCESS);
  }
  
! static void
  nvme_async_event(nvme_t *nvme)
  {
  	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
  
  	cmd->nc_sqid = 0;
  	cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
  	cmd->nc_callback = nvme_async_event_task;
  
! 	nvme_submit_admin_cmd(nvme->n_adminq, cmd);
  }
  
  static int
  nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms,
      uint8_t pi, boolean_t pil, uint8_t ses)
*** 2377,2391 ****
  	}
  
  	/*
  	 * Post an asynchronous event command to catch errors.
  	 */
! 	if (nvme_async_event(nvme) != DDI_SUCCESS) {
! 		dev_err(nvme->n_dip, CE_WARN,
! 		    "!failed to post async event");
! 		goto fail;
! 	}
  
  	/*
  	 * Identify Controller
  	 */
  	nvme->n_idctl = nvme_identify(nvme, 0);
--- 2364,2374 ----
  	}
  
  	/*
  	 * Post an asynchronous event command to catch errors.
  	 */
! 	nvme_async_event(nvme);
  
  	/*
  	 * Identify Controller
  	 */
  	nvme->n_idctl = nvme_identify(nvme, 0);
*** 2606,2622 ****
  	/*
  	 * Post more asynchronous event commands to reduce event reporting
  	 * latency as suggested by the spec.
  	 */
! 	for (i = 1; i != nvme->n_async_event_limit; i++) {
! 		if (nvme_async_event(nvme) != DDI_SUCCESS) {
! 			dev_err(nvme->n_dip, CE_WARN,
! 			    "!failed to post async event %d", i);
! 			goto fail;
! 		}
! 	}
  
  	return (DDI_SUCCESS);
  
  fail:
  	(void) nvme_reset(nvme, B_FALSE);
--- 2589,2600 ----
  	/*
  	 * Post more asynchronous event commands to reduce event reporting
  	 * latency as suggested by the spec.
  	 */
! 	for (i = 1; i != nvme->n_async_event_limit; i++)
! 		nvme_async_event(nvme);
  
  	return (DDI_SUCCESS);
  
  fail:
  	(void) nvme_reset(nvme, B_FALSE);
*** 3276,3288 ****
  static int
  nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
  {
  	nvme_t *nvme = ns->ns_nvme;
! 	nvme_cmd_t *cmd, *ret;
  	nvme_qpair_t *ioq;
  	boolean_t poll;
  
  	if (nvme->n_dead)
  		return (EIO);
  
  	cmd = nvme_create_nvm_cmd(ns, opc, xfer);
--- 3254,3267 ----
  static int
  nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
  {
  	nvme_t *nvme = ns->ns_nvme;
! 	nvme_cmd_t *cmd;
  	nvme_qpair_t *ioq;
  	boolean_t poll;
+ 	int ret;
  
  	if (nvme->n_dead)
  		return (EIO);
  
  	cmd = nvme_create_nvm_cmd(ns, opc, xfer);
*** 3298,3317 ****
  	 * complete immediately after it was submitted, which means we must
  	 * treat both cmd and xfer as if they have been freed already.
  	 */
  	poll = (xfer->x_flags & BD_XFER_POLL) != 0;
! 	if (nvme_submit_cmd(ioq, cmd) != DDI_SUCCESS)
! 		return (EAGAIN);
  
  	if (!poll)
  		return (0);
  
  	do {
! 		ret = nvme_retrieve_cmd(nvme, ioq);
! 		if (ret != NULL)
! 			nvme_bd_xfer_done(ret);
  		else
  			drv_usecwait(10);
  	} while (ioq->nq_active_cmds != 0);
  
  	return (0);
--- 3277,3298 ----
  	 * complete immediately after it was submitted, which means we must
  	 * treat both cmd and xfer as if they have been freed already.
  	 */
  	poll = (xfer->x_flags & BD_XFER_POLL) != 0;
! 	ret = nvme_submit_io_cmd(ioq, cmd);
  
+ 	if (ret != 0)
+ 		return (ret);
+
  	if (!poll)
  		return (0);
  
  	do {
! 		cmd = nvme_retrieve_cmd(nvme, ioq);
! 		if (cmd != NULL)
! 			nvme_bd_xfer_done(cmd);
  		else
  			drv_usecwait(10);
  	} while (ioq->nq_active_cmds != 0);
  
  	return (0);
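With this change nvme_bd_cmd() no longer blocks when the I/O queue is full: the EAGAIN from nvme_submit_io_cmd() is returned to the caller instead of putting the submitting thread to sleep. Purely for illustration, a hypothetical caller that wanted the old blocking behaviour could retry on the new return value; the helper below is not part of this change, and backing off with delay(9F) is just one possible policy.

/*
 * Hypothetical helper, NOT part of this change: retries the non-blocking
 * submission until a queue slot frees up, restoring blocking semantics
 * for a caller that wants them.
 */
static int
submit_io_cmd_blocking(nvme_qpair_t *ioq, nvme_cmd_t *cmd)
{
	int ret;

	while ((ret = nvme_submit_io_cmd(ioq, cmd)) == EAGAIN)
		delay(1);	/* back off for one clock tick */

	return (ret);
}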