don't block in nvme_bd_cmd
8628 nvme: use a semaphore to guard submission queue
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
@@ -42,11 +42,11 @@
* vector and will post them to a taskq for completion processing.
*
*
* Command Processing:
*
- * NVMe devices can have up to 65536 I/O queue pairs, with each queue holding up
+ * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
* to 65536 I/O commands. The driver will configure one I/O queue pair per
* available interrupt vector, with the queue length usually much smaller than
* the maximum of 65536. If the hardware doesn't provide enough queues, fewer
* interrupt vectors will be used.
*
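The sizing rule described in this comment boils down to two clamps. A minimal userland sketch, assuming MIN() from sys/param.h; the function and parameter names are invented for illustration and are not the driver's:

#include <sys/param.h>	/* MIN() */

/* One I/O queue pair per interrupt vector, hardware permitting. */
static unsigned
pick_nqueues(unsigned nintrs, unsigned hw_nqueues)
{
	return (MIN(nintrs, hw_nqueues));
}

/* The configured depth is usually far below the 65536 maximum. */
static unsigned
pick_qlen(unsigned want_qlen, unsigned hw_max_qlen)
{
	return (MIN(want_qlen, hw_max_qlen));
}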
@@ -67,11 +67,12 @@
* array index is used as command identifier (CID) in the submission queue
* entry. Some commands may take a very long time to complete, and if the queue
* wraps around in that time a submission may find the next array slot to still
* be used by a long-running command. In this case the array is sequentially
* searched for the next free slot. The length of the command array is the same
- * as the configured queue length.
+ * as the configured queue length. Queue overrun is prevented by the semaphore,
+ * so a command submission may block if the queue is full.
*
*
* Polled I/O Support:
*
* For kernel core dump support the driver can do polled I/O. As interrupts are
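A userland sketch of the scheme this comment describes, with POSIX semaphores and a pthread mutex standing in for the kernel's ksema_t and kmutex_t; all type and function names here are invented. The counting semaphore tracks free slots, so once sem_wait() returns, the sequential search below is guaranteed to find a free CID:

#include <pthread.h>
#include <semaphore.h>
#include <stddef.h>

#define	QLEN	16	/* stands in for the configured queue length */

struct sketch_qpair {
	sem_t		sq_free;	/* counts free command slots */
	pthread_mutex_t	sq_mutex;	/* guards sq_cmd[] and sq_next */
	void		*sq_cmd[QLEN];	/* in-flight cmds, indexed by CID */
	unsigned	sq_next;	/* next CID to try */
};

static void
sketch_qpair_init(struct sketch_qpair *qp)
{
	(void) sem_init(&qp->sq_free, 0, QLEN);	/* all slots start free */
	(void) pthread_mutex_init(&qp->sq_mutex, NULL);
}

/*
 * Blocking submission: sleeps while the queue is full, preventing
 * overrun. Returns the CID to put into the submission queue entry.
 */
static unsigned
sketch_submit(struct sketch_qpair *qp, void *cmd)
{
	unsigned cid;

	(void) sem_wait(&qp->sq_free);	/* may block until a slot frees */
	(void) pthread_mutex_lock(&qp->sq_mutex);
	cid = qp->sq_next;
	while (qp->sq_cmd[cid] != NULL)	/* skip long-running commands */
		cid = (cid + 1) % QLEN;
	qp->sq_cmd[cid] = cmd;
	qp->sq_next = (cid + 1) % QLEN;
	(void) pthread_mutex_unlock(&qp->sq_mutex);
	return (cid);
}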
@@ -255,11 +256,13 @@
static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
static void nvme_free_cmd(nvme_cmd_t *);
static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
bd_xfer_t *);
static int nvme_admin_cmd(nvme_cmd_t *, int);
-static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
+static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
+static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
+static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
static void nvme_wakeup_cmd(void *);
static void nvme_async_event_task(void *);
@@ -269,11 +272,11 @@
static int nvme_check_specific_cmd_status(nvme_cmd_t *);
static int nvme_check_generic_cmd_status(nvme_cmd_t *);
static inline int nvme_check_cmd_status(nvme_cmd_t *);
static void nvme_abort_cmd(nvme_cmd_t *);
-static int nvme_async_event(nvme_t *);
+static void nvme_async_event(nvme_t *);
static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t,
boolean_t, uint8_t);
static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...);
static void *nvme_identify(nvme_t *, uint32_t);
static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
@@ -719,10 +722,11 @@
nvme_free_qpair(nvme_qpair_t *qp)
{
int i;
mutex_destroy(&qp->nq_mutex);
+ sema_destroy(&qp->nq_sema);
if (qp->nq_sqdma != NULL)
nvme_free_dma(qp->nq_sqdma);
if (qp->nq_cqdma != NULL)
nvme_free_dma(qp->nq_cqdma);
@@ -744,10 +748,11 @@
{
nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
DDI_INTR_PRI(nvme->n_intr_pri));
+ sema_init(&qp->nq_sema, nentry, NULL, SEMA_DRIVER, NULL);
if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
goto fail;
@@ -810,22 +815,33 @@
mutex_destroy(&cmd->nc_mutex);
kmem_cache_free(nvme_cmd_cache, cmd);
}

+static void
+nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
+{
+ sema_p(&qp->nq_sema);
+ nvme_submit_cmd_common(qp, cmd);
+}
+
static int
-nvme_submit_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
+nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
{
- nvme_reg_sqtdbl_t tail = { 0 };
- mutex_enter(&qp->nq_mutex);
- if (qp->nq_active_cmds == qp->nq_nentry) {
- mutex_exit(&qp->nq_mutex);
- return (DDI_FAILURE);
- }
+ if (sema_tryp(&qp->nq_sema) == 0)
+ return (EAGAIN);
+
+ nvme_submit_cmd_common(qp, cmd);
+ return (0);
+}
+
+static void
+nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
+{
+ nvme_reg_sqtdbl_t tail = { 0 };
+ mutex_enter(&qp->nq_mutex);
cmd->nc_completed = B_FALSE;
/*
* Try to insert the cmd into the active cmd array at the nq_next_cmd
* slot. If the slot is already occupied advance to the next slot and
@@ -847,11 +863,10 @@
tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
mutex_exit(&qp->nq_mutex);
- return (DDI_SUCCESS);
}

static nvme_cmd_t *
nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
{
@@ -893,10 +908,11 @@
if (qp->nq_cqhead == 0)
qp->nq_phase = qp->nq_phase ? 0 : 1;
nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
mutex_exit(&qp->nq_mutex);
+ sema_v(&qp->nq_sema);
return (cmd);
}

static int
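This hunk supplies the release half of the accounting: every submission takes one semaphore unit (sema_p() on the blocking admin path, sema_tryp() on the I/O path), and every completion retrieved returns one via sema_v(). A minimal sketch of the pairing, again with POSIX calls standing in for the kernel primitives and invented names:

#include <errno.h>
#include <semaphore.h>

/* I/O path: never sleeps; a full queue is reported as EAGAIN. */
static int
sketch_submit_io(sem_t *qfree)
{
	if (sem_trywait(qfree) != 0)
		return (EAGAIN);	/* no free slot, caller retries */
	/* ... fill in the SQ entry and ring the doorbell here ... */
	return (0);
}

/* Admin path: sleeps until a slot is available, so it cannot fail. */
static void
sketch_submit_admin(sem_t *qfree)
{
	(void) sem_wait(qfree);
	/* ... fill in the SQ entry and ring the doorbell here ... */
}

/* Completion side: each retrieved CQ entry frees exactly one slot. */
static void
sketch_retrieve(sem_t *qfree)
{
	/* ... consume the CQ entry, update the head doorbell ... */
	(void) sem_post(qfree);
}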
@@ -1361,11 +1377,10 @@
nvme_t *nvme = cmd->nc_nvme;
nvme_error_log_entry_t *error_log = NULL;
nvme_health_log_t *health_log = NULL;
size_t logsize = 0;
nvme_async_event_t event;
- int ret;
/*
* Check for errors associated with the async request itself. The only
* command-specific error is "async event limit exceeded", which
* indicates a programming error in the driver and causes a panic in
@@ -1395,19 +1410,12 @@
event.r = cmd->nc_cqe.cqe_dw0;
/* Clear CQE and re-submit the async request. */
bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
- ret = nvme_submit_cmd(nvme->n_adminq, cmd);
- if (ret != DDI_SUCCESS) {
- dev_err(nvme->n_dip, CE_WARN,
- "!failed to resubmit async event request");
- atomic_inc_32(&nvme->n_async_resubmit_failed);
- nvme_free_cmd(cmd);
- }
-
+ nvme_submit_admin_cmd(nvme->n_adminq, cmd);
switch (event.b.ae_type) {
case NVME_ASYNC_TYPE_ERROR:
if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
(void) nvme_get_logpage(nvme, (void **)&error_log,
&logsize, event.b.ae_logpage);
@@ -1515,24 +1523,13 @@
}

static int
nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
{
- int ret;
-
mutex_enter(&cmd->nc_mutex);
- ret = nvme_submit_cmd(cmd->nc_nvme->n_adminq, cmd);
- if (ret != DDI_SUCCESS) {
- mutex_exit(&cmd->nc_mutex);
- dev_err(cmd->nc_nvme->n_dip, CE_WARN,
- "!nvme_submit_cmd failed");
- atomic_inc_32(&cmd->nc_nvme->n_admin_queue_full);
- nvme_free_cmd(cmd);
- return (DDI_FAILURE);
- }
-
+ nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
if (nvme_wait_cmd(cmd, sec) == B_FALSE) {
/*
* The command timed out. An abort command was posted that
* will take care of the cleanup.
*/
@@ -1541,30 +1538,20 @@
mutex_exit(&cmd->nc_mutex);
return (DDI_SUCCESS);
}

-static int
+static void
nvme_async_event(nvme_t *nvme)
{
nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
- int ret;
cmd->nc_sqid = 0;
cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
cmd->nc_callback = nvme_async_event_task;
- ret = nvme_submit_cmd(nvme->n_adminq, cmd);
-
- if (ret != DDI_SUCCESS) {
- dev_err(nvme->n_dip, CE_WARN,
- "!nvme_submit_cmd failed for ASYNCHRONOUS EVENT");
- nvme_free_cmd(cmd);
- return (DDI_FAILURE);
- }
-
- return (DDI_SUCCESS);
+ nvme_submit_admin_cmd(nvme->n_adminq, cmd);
}

static int
nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms,
uint8_t pi, boolean_t pil, uint8_t ses)
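Worth noting in the two hunks above: because sema_p() simply sleeps until a queue slot frees up, nvme_submit_admin_cmd() cannot fail, so the DDI_FAILURE plumbing, the error counters, and the explicit nvme_free_cmd() on the submission path all disappear, and nvme_async_event() can become void.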
@@ -2377,15 +2364,11 @@
}

/*
* Post an asynchronous event command to catch errors.
*/
- if (nvme_async_event(nvme) != DDI_SUCCESS) {
- dev_err(nvme->n_dip, CE_WARN,
- "!failed to post async event");
- goto fail;
- }
+ nvme_async_event(nvme);
/*
* Identify Controller
*/
nvme->n_idctl = nvme_identify(nvme, 0);
@@ -2606,17 +2589,12 @@
/*
* Post more asynchronous event commands to reduce event reporting
* latency as suggested by the spec.
*/
- for (i = 1; i != nvme->n_async_event_limit; i++) {
- if (nvme_async_event(nvme) != DDI_SUCCESS) {
- dev_err(nvme->n_dip, CE_WARN,
- "!failed to post async event %d", i);
- goto fail;
- }
- }
+ for (i = 1; i != nvme->n_async_event_limit; i++)
+ nvme_async_event(nvme);
return (DDI_SUCCESS);
fail:
(void) nvme_reset(nvme, B_FALSE);
@@ -3276,13 +3254,14 @@
static int
nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
{
nvme_t *nvme = ns->ns_nvme;
- nvme_cmd_t *cmd, *ret;
+ nvme_cmd_t *cmd;
nvme_qpair_t *ioq;
boolean_t poll;
+ int ret;
if (nvme->n_dead)
return (EIO);
cmd = nvme_create_nvm_cmd(ns, opc, xfer);
@@ -3298,20 +3277,22 @@
* complete immediately after it was submitted, which means we must
* treat both cmd and xfer as if they have been freed already.
*/
poll = (xfer->x_flags & BD_XFER_POLL) != 0;
- if (nvme_submit_cmd(ioq, cmd) != DDI_SUCCESS)
- return (EAGAIN);
+ ret = nvme_submit_io_cmd(ioq, cmd);
+ if (ret != 0)
+ return (ret);
+
if (!poll)
return (0);
do {
- ret = nvme_retrieve_cmd(nvme, ioq);
- if (ret != NULL)
- nvme_bd_xfer_done(ret);
+ cmd = nvme_retrieve_cmd(nvme, ioq);
+ if (cmd != NULL)
+ nvme_bd_xfer_done(cmd);
else
drv_usecwait(10);
} while (ioq->nq_active_cmds != 0);
return (0);
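With the I/O path now nonblocking, a full queue surfaces to nvme_bd_cmd's caller as EAGAIN instead of sleeping inside the driver. A hypothetical caller-side retry, illustrative only (in the driver the real caller is the blkdev framework, which decides how to handle the error):

#include <errno.h>
#include <semaphore.h>
#include <unistd.h>

static int
sketch_submit_with_retry(sem_t *qfree, int max_tries)
{
	while (max_tries-- > 0) {
		if (sem_trywait(qfree) == 0) {
			/* slot reserved: submit the command here */
			return (0);
		}
		(void) usleep(10);	/* queue full: brief backoff */
	}
	return (EAGAIN);	/* still full, report it upward */
}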