don't block in nvme_bd_cmd
8628 nvme: use a semaphore to guard submission queue
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
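
The mechanism in a nutshell: each queue pair gains a counting semaphore (nq_sema) initialized to the queue depth. Admin submissions go through nvme_submit_admin_cmd(), which acquires the semaphore with sema_p() and may therefore block until a slot frees up; I/O submissions go through nvme_submit_io_cmd(), which uses sema_tryp() and returns EAGAIN instead of blocking, so nvme_bd_cmd() never sleeps on a full queue. nvme_retrieve_cmd() releases the semaphore for every completion it takes off the queue. The sketch below only illustrates that pattern; the sketch_* names are made up for illustration, and the real functions appear further down in the diff.

#include <sys/types.h>
#include <sys/ksynch.h>		/* ksema_t, sema_init(), sema_p(), sema_tryp(), sema_v() */
#include <sys/errno.h>

/* Conceptual sketch only -- not part of the driver. */
typedef struct sketch_qpair {
	ksema_t	sq_sema;		/* counts free submission queue slots */
} sketch_qpair_t;

static void
sketch_qpair_init(sketch_qpair_t *qp, uint_t nentry)
{
	/* one token per submission queue entry */
	sema_init(&qp->sq_sema, nentry, NULL, SEMA_DRIVER, NULL);
}

static void
sketch_submit_admin(sketch_qpair_t *qp)
{
	/* admin commands may block until a slot is available */
	sema_p(&qp->sq_sema);
	/* ... fill the SQ entry and ring the doorbell under a mutex ... */
}

static int
sketch_submit_io(sketch_qpair_t *qp)
{
	/* I/O commands never block; the caller sees EAGAIN instead */
	if (sema_tryp(&qp->sq_sema) == 0)
		return (EAGAIN);
	/* ... fill the SQ entry and ring the doorbell under a mutex ... */
	return (0);
}

static void
sketch_retrieve(sketch_qpair_t *qp)
{
	/* every completion taken off the CQ frees one slot */
	sema_v(&qp->sq_sema);
}
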


  27  * endian systems without changes to the code accessing registers and data
  28  * structures used by the hardware.
  29  *
  30  *
  31  * Interrupt Usage:
  32  *
  33  * The driver will use a single interrupt while configuring the device as the
  34  * specification requires, but contrary to the specification it will try to use
  35  * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
  36  * will switch to multiple-message MSI(-X) if supported. The driver wants to
  37  * have one interrupt vector per CPU, but it will work correctly if fewer are
  38  * available. Interrupts can be shared by queues; the interrupt handler will
  39  * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
  40  * the admin queue will share an interrupt with one I/O queue. The interrupt
  41  * handler will retrieve completed commands from all queues sharing an interrupt
  42  * vector and will post them to a taskq for completion processing.
  43  *
  44  *
  45  * Command Processing:
  46  *
  47  * NVMe devices can have up to 65536 I/O queue pairs, with each queue holding up
  48  * to 65536 I/O commands. The driver will configure one I/O queue pair per
  49  * available interrupt vector, with the queue length usually much smaller than
  50  * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
  51  * interrupt vectors will be used.
  52  *
  53  * Additionally the hardware provides a single special admin queue pair that can
  54  * hold up to 4096 admin commands.
  55  *
  56  * From the hardware perspective both queues of a queue pair are independent,
  57  * but they share some driver state: the command array (holding pointers to
  58  * commands currently being processed by the hardware) and the active command
  59  * counter. Access to the submission side of a queue pair and the shared state
  60  * is protected by nq_mutex. The completion side of a queue pair does not need
  61  * that protection apart from its access to the shared state; it is called only
  62  * in the interrupt handler which does not run concurrently for the same
  63  * interrupt vector.
  64  *
  65  * When a command is submitted to a queue pair the active command counter is
  66  * incremented and a pointer to the command is stored in the command array. The
  67  * array index is used as command identifier (CID) in the submission queue
  68  * entry. Some commands may take a very long time to complete, and if the queue
  69  * wraps around in that time a submission may find the next array slot to still
  70  * be used by a long-running command. In this case the array is sequentially
  71  * searched for the next free slot. The length of the command array is the same
  72  * as the configured queue length.

  73  *
  74  *
  75  * Polled I/O Support:
  76  *
  77  * For kernel core dump support the driver can do polled I/O. As interrupts are
  78  * turned off while dumping, the driver will just submit a command in the regular
  79  * way, and then repeatedly attempt a command retrieval until it gets the
  80  * command back.
  81  *
  82  *
  83  * Namespace Support:
  84  *
  85  * NVMe devices can have multiple namespaces, each being an independent data
  86  * store. The driver supports multiple namespaces and creates a blkdev interface
  87  * for each namespace found. Namespaces can have various attributes to support
  88  * thin provisioning and protection information. This driver does not support
  89  * any of this and ignores namespaces that have these attributes.
  90  *
  91  * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
  92  * (EUI64). This driver uses the EUI64 if present to generate the devid and


 240 
 241 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
 242 int nvme_format_cmd_timeout = 600;
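
As with other kernel tunables, this can presumably be adjusted at boot via /etc/system (example value only):

	set nvme:nvme_format_cmd_timeout = 1200
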
 243 
 244 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
 245 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
 246 static int nvme_quiesce(dev_info_t *);
 247 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
 248 static int nvme_setup_interrupts(nvme_t *, int, int);
 249 static void nvme_release_interrupts(nvme_t *);
 250 static uint_t nvme_intr(caddr_t, caddr_t);
 251 
 252 static void nvme_shutdown(nvme_t *, int, boolean_t);
 253 static boolean_t nvme_reset(nvme_t *, boolean_t);
 254 static int nvme_init(nvme_t *);
 255 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
 256 static void nvme_free_cmd(nvme_cmd_t *);
 257 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
 258     bd_xfer_t *);
 259 static int nvme_admin_cmd(nvme_cmd_t *, int);
 260 static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);


 261 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
 262 static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
 263 static void nvme_wakeup_cmd(void *);
 264 static void nvme_async_event_task(void *);
 265 
 266 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
 267 static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
 268 static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
 269 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
 270 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
 271 static inline int nvme_check_cmd_status(nvme_cmd_t *);
 272 
 273 static void nvme_abort_cmd(nvme_cmd_t *);
 274 static int nvme_async_event(nvme_t *);
 275 static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t,
 276     boolean_t, uint8_t);
 277 static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...);
 278 static void *nvme_identify(nvme_t *, uint32_t);
 279 static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
 280     uint32_t *);
 281 static boolean_t nvme_get_features(nvme_t *, uint32_t, uint8_t, uint32_t *,
 282     void **, size_t *);
 283 static boolean_t nvme_write_cache_set(nvme_t *, boolean_t);
 284 static int nvme_set_nqueues(nvme_t *, uint16_t);
 285 
 286 static void nvme_free_dma(nvme_dma_t *);
 287 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
 288     nvme_dma_t **);
 289 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
 290     nvme_dma_t **);
 291 static void nvme_free_qpair(nvme_qpair_t *);
 292 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, int);
 293 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);
 294 


 704                 goto fail;
 705         }
 706 
 707         return (DDI_SUCCESS);
 708 
 709 fail:
 710         if (*dma) {
 711                 nvme_free_dma(*dma);
 712                 *dma = NULL;
 713         }
 714 
 715         return (DDI_FAILURE);
 716 }
 717 
 718 static void
 719 nvme_free_qpair(nvme_qpair_t *qp)
 720 {
 721         int i;
 722 
 723         mutex_destroy(&qp->nq_mutex);

 724 
 725         if (qp->nq_sqdma != NULL)
 726                 nvme_free_dma(qp->nq_sqdma);
 727         if (qp->nq_cqdma != NULL)
 728                 nvme_free_dma(qp->nq_cqdma);
 729 
 730         if (qp->nq_active_cmds > 0)
 731                 for (i = 0; i != qp->nq_nentry; i++)
 732                         if (qp->nq_cmd[i] != NULL)
 733                                 nvme_free_cmd(qp->nq_cmd[i]);
 734 
 735         if (qp->nq_cmd != NULL)
 736                 kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);
 737 
 738         kmem_free(qp, sizeof (nvme_qpair_t));
 739 }
 740 
 741 static int
 742 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
 743     int idx)
 744 {
 745         nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
 746 
 747         mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
 748             DDI_INTR_PRI(nvme->n_intr_pri));

 749 
 750         if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
 751             DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
 752                 goto fail;
 753 
 754         if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
 755             DDI_DMA_READ, &qp->nq_cqdma) != DDI_SUCCESS)
 756                 goto fail;
 757 
 758         qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
 759         qp->nq_cq = (nvme_cqe_t *)qp->nq_cqdma->nd_memp;
 760         qp->nq_nentry = nentry;
 761 
 762         qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
 763         qp->nq_cqhdbl = NVME_REG_CQHDBL(nvme, idx);
 764 
 765         qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
 766         qp->nq_next_cmd = 0;
 767 
 768         *nqp = qp;


 795 }
 796 
 797 static void
 798 nvme_free_cmd(nvme_cmd_t *cmd)
 799 {
 800         if (cmd->nc_dma) {
 801                 if (cmd->nc_dma->nd_cached)
 802                         kmem_cache_free(cmd->nc_nvme->n_prp_cache,
 803                             cmd->nc_dma);
 804                 else
 805                         nvme_free_dma(cmd->nc_dma);
 806                 cmd->nc_dma = NULL;
 807         }
 808 
 809         cv_destroy(&cmd->nc_cv);
 810         mutex_destroy(&cmd->nc_mutex);
 811 
 812         kmem_cache_free(nvme_cmd_cache, cmd);
 813 }
 814 







 815 static int
 816 nvme_submit_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
 817 {
 818         nvme_reg_sqtdbl_t tail = { 0 };

 819 
 820         mutex_enter(&qp->nq_mutex);


 821 
 822         if (qp->nq_active_cmds == qp->nq_nentry) {
 823                 mutex_exit(&qp->nq_mutex);
 824                 return (DDI_FAILURE);
 825         }
 826 

 827         cmd->nc_completed = B_FALSE;
 828 
 829         /*
 830          * Try to insert the cmd into the active cmd array at the nq_next_cmd
 831          * slot. If the slot is already occupied advance to the next slot and
 832          * try again. This can happen for long running commands like async event
 833          * requests.
 834          */
 835         while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 836                 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 837         qp->nq_cmd[qp->nq_next_cmd] = cmd;
 838 
 839         qp->nq_active_cmds++;
 840 
 841         cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
 842         bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
 843         (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
 844             sizeof (nvme_sqe_t) * qp->nq_sqtail,
 845             sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
 846         qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 847 
 848         tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
 849         nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
 850 
 851         mutex_exit(&qp->nq_mutex);
 852         return (DDI_SUCCESS);
 853 }
 854 
 855 static nvme_cmd_t *
 856 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
 857 {
 858         nvme_reg_cqhdbl_t head = { 0 };
 859 
 860         nvme_cqe_t *cqe;
 861         nvme_cmd_t *cmd;
 862 
 863         (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
 864             sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
 865 
 866         mutex_enter(&qp->nq_mutex);
 867         cqe = &qp->nq_cq[qp->nq_cqhead];
 868 
 869         /* Check phase tag of CQE. Hardware inverts it for new entries. */
 870         if (cqe->cqe_sf.sf_p == qp->nq_phase) {
 871                 mutex_exit(&qp->nq_mutex);
 872                 return (NULL);


 878         cmd = qp->nq_cmd[cqe->cqe_cid];
 879         qp->nq_cmd[cqe->cqe_cid] = NULL;
 880         qp->nq_active_cmds--;
 881 
 882         ASSERT(cmd != NULL);
 883         ASSERT(cmd->nc_nvme == nvme);
 884         ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
 885         ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
 886         bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
 887 
 888         qp->nq_sqhead = cqe->cqe_sqhd;
 889 
 890         head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
 891 
 892         /* Toggle phase on wrap-around. */
 893         if (qp->nq_cqhead == 0)
 894                 qp->nq_phase = qp->nq_phase ? 0 : 1;
 895 
 896         nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
 897         mutex_exit(&qp->nq_mutex);

 898 
 899         return (cmd);
 900 }
 901 
 902 static int
 903 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
 904 {
 905         nvme_cqe_t *cqe = &cmd->nc_cqe;
 906 
 907         dev_err(cmd->nc_nvme->n_dip, CE_WARN,
 908             "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
 909             "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
 910             cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
 911             cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
 912 
 913         if (cmd->nc_xfer != NULL)
 914                 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
 915 
 916         if (cmd->nc_nvme->n_strict_version) {
 917                 cmd->nc_nvme->n_dead = B_TRUE;


1346         if (cmd->nc_callback == nvme_abort_cmd_cb) {
1347                 mutex_exit(&cmd->nc_mutex);
1348                 nvme_abort_cmd_cb(cmd);
1349                 return;
1350         }
1351 
1352         cmd->nc_completed = B_TRUE;
1353         cv_signal(&cmd->nc_cv);
1354         mutex_exit(&cmd->nc_mutex);
1355 }
1356 
1357 static void
1358 nvme_async_event_task(void *arg)
1359 {
1360         nvme_cmd_t *cmd = arg;
1361         nvme_t *nvme = cmd->nc_nvme;
1362         nvme_error_log_entry_t *error_log = NULL;
1363         nvme_health_log_t *health_log = NULL;
1364         size_t logsize = 0;
1365         nvme_async_event_t event;
1366         int ret;
1367 
1368         /*
1369          * Check for errors associated with the async request itself. The only
1370          * command-specific error is "async event limit exceeded", which
1371          * indicates a programming error in the driver and causes a panic in
1372          * nvme_check_cmd_status().
1373          *
1374          * Other possible errors are various scenarios where the async request
1375          * was aborted, or internal errors in the device. Internal errors are
1376          * reported to FMA, the command aborts need no special handling here.
1377          */
1378         if (nvme_check_cmd_status(cmd)) {
1379                 dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1380                     "!async event request returned failure, sct = %x, "
1381                     "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
1382                     cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
1383                     cmd->nc_cqe.cqe_sf.sf_m);
1384 
1385                 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1386                     cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
1387                         cmd->nc_nvme->n_dead = B_TRUE;
1388                         ddi_fm_service_impact(cmd->nc_nvme->n_dip,
1389                             DDI_SERVICE_LOST);
1390                 }
1391                 nvme_free_cmd(cmd);
1392                 return;
1393         }
1394 
1395 
1396         event.r = cmd->nc_cqe.cqe_dw0;
1397 
1398         /* Clear CQE and re-submit the async request. */
1399         bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
1400         ret = nvme_submit_cmd(nvme->n_adminq, cmd);
1401 
1402         if (ret != DDI_SUCCESS) {
1403                 dev_err(nvme->n_dip, CE_WARN,
1404                     "!failed to resubmit async event request");
1405                 atomic_inc_32(&nvme->n_async_resubmit_failed);
1406                 nvme_free_cmd(cmd);
1407         }
1408 
1409         switch (event.b.ae_type) {
1410         case NVME_ASYNC_TYPE_ERROR:
1411                 if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
1412                         (void) nvme_get_logpage(nvme, (void **)&error_log,
1413                             &logsize, event.b.ae_logpage);
1414                 } else {
1415                         dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
1416                             "async event reply: %d", event.b.ae_logpage);
1417                         atomic_inc_32(&nvme->n_wrong_logpage);
1418                 }
1419 
1420                 switch (event.b.ae_info) {
1421                 case NVME_ASYNC_ERROR_INV_SQ:
1422                         dev_err(nvme->n_dip, CE_PANIC, "programming error: "
1423                             "invalid submission queue");
1424                         return;
1425 
1426                 case NVME_ASYNC_ERROR_INV_DBL:
1427                         dev_err(nvme->n_dip, CE_PANIC, "programming error: "
1428                             "invalid doorbell write value");


1500                 break;
1501 
1502         default:
1503                 dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
1504                     "type = %x, info = %x, logpage = %x", event.b.ae_type,
1505                     event.b.ae_info, event.b.ae_logpage);
1506                 atomic_inc_32(&nvme->n_unknown_event);
1507                 break;
1508         }
1509 
1510         if (error_log)
1511                 kmem_free(error_log, logsize);
1512 
1513         if (health_log)
1514                 kmem_free(health_log, logsize);
1515 }
1516 
1517 static int
1518 nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
1519 {
1520         int ret;
1521 
1522         mutex_enter(&cmd->nc_mutex);
1523         ret = nvme_submit_cmd(cmd->nc_nvme->n_adminq, cmd);
1524 
1525         if (ret != DDI_SUCCESS) {
1526                 mutex_exit(&cmd->nc_mutex);
1527                 dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1528                     "!nvme_submit_cmd failed");
1529                 atomic_inc_32(&cmd->nc_nvme->n_admin_queue_full);
1530                 nvme_free_cmd(cmd);
1531                 return (DDI_FAILURE);
1532         }
1533 
1534         if (nvme_wait_cmd(cmd, sec) == B_FALSE) {
1535                 /*
1536                  * The command timed out. An abort command was posted that
1537                  * will take care of the cleanup.
1538                  */
1539                 return (DDI_FAILURE);
1540         }
1541         mutex_exit(&cmd->nc_mutex);
1542 
1543         return (DDI_SUCCESS);
1544 }
1545 
1546 static int
1547 nvme_async_event(nvme_t *nvme)
1548 {
1549         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1550         int ret;
1551 
1552         cmd->nc_sqid = 0;
1553         cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
1554         cmd->nc_callback = nvme_async_event_task;
1555 
1556         ret = nvme_submit_cmd(nvme->n_adminq, cmd);
1557 
1558         if (ret != DDI_SUCCESS) {
1559                 dev_err(nvme->n_dip, CE_WARN,
1560                     "!nvme_submit_cmd failed for ASYNCHRONOUS EVENT");
1561                 nvme_free_cmd(cmd);
1562                 return (DDI_FAILURE);
1563         }
1564 
1565         return (DDI_SUCCESS);
1566 }
1567 
1568 static int
1569 nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms,
1570     uint8_t pi, boolean_t pil, uint8_t ses)
1571 {
1572         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1573         nvme_format_nvm_t format_nvm = { 0 };
1574         int ret;
1575 
1576         format_nvm.b.fm_lbaf = lbaf & 0xf;
1577         format_nvm.b.fm_ms = ms ? 1 : 0;
1578         format_nvm.b.fm_pi = pi & 0x7;
1579         format_nvm.b.fm_pil = pil ? 1 : 0;
1580         format_nvm.b.fm_ses = ses & 0x7;
1581 
1582         cmd->nc_sqid = 0;
1583         cmd->nc_callback = nvme_wakeup_cmd;
1584         cmd->nc_sqe.sqe_nsid = nsid;
1585         cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;


2362          */
2363         sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL);
2364 
2365         /*
2366          * Setup initial interrupt for admin queue.
2367          */
2368         if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1)
2369             != DDI_SUCCESS) &&
2370             (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1)
2371             != DDI_SUCCESS) &&
2372             (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
2373             != DDI_SUCCESS)) {
2374                 dev_err(nvme->n_dip, CE_WARN,
2375                     "!failed to setup initial interrupt");
2376                 goto fail;
2377         }
2378 
2379         /*
2380          * Post an asynchronous event command to catch errors.
2381          */
2382         if (nvme_async_event(nvme) != DDI_SUCCESS) {
2383                 dev_err(nvme->n_dip, CE_WARN,
2384                     "!failed to post async event");
2385                 goto fail;
2386         }
2387 
2388         /*
2389          * Identify Controller
2390          */
2391         nvme->n_idctl = nvme_identify(nvme, 0);
2392         if (nvme->n_idctl == NULL) {
2393                 dev_err(nvme->n_dip, CE_WARN,
2394                     "!failed to identify controller");
2395                 goto fail;
2396         }
2397 
2398         /*
2399          * Get Vendor & Product ID
2400          */
2401         bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
2402         model[sizeof (nvme->n_idctl->id_model)] = '\0';
2403         sata_split_model(model, &vendor, &product);
2404 
2405         if (vendor == NULL)
2406                 nvme->n_vendor = strdup("NVMe");


2591         for (i = 1; i != nvme->n_ioq_count + 1; i++) {
2592                 if (nvme_alloc_qpair(nvme, nvme->n_io_queue_len,
2593                     &nvme->n_ioq[i], i) != DDI_SUCCESS) {
2594                         dev_err(nvme->n_dip, CE_WARN,
2595                             "!unable to allocate I/O qpair %d", i);
2596                         goto fail;
2597                 }
2598 
2599                 if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i)
2600                     != DDI_SUCCESS) {
2601                         dev_err(nvme->n_dip, CE_WARN,
2602                             "!unable to create I/O qpair %d", i);
2603                         goto fail;
2604                 }
2605         }
2606 
2607         /*
 2608          * Post more asynchronous event commands to reduce event reporting
2609          * latency as suggested by the spec.
2610          */
2611         for (i = 1; i != nvme->n_async_event_limit; i++) {
2612                 if (nvme_async_event(nvme) != DDI_SUCCESS) {
2613                         dev_err(nvme->n_dip, CE_WARN,
2614                             "!failed to post async event %d", i);
2615                         goto fail;
2616                 }
2617         }
2618 
2619         return (DDI_SUCCESS);
2620 
2621 fail:
2622         (void) nvme_reset(nvme, B_FALSE);
2623         return (DDI_FAILURE);
2624 }
2625 
2626 static uint_t
2627 nvme_intr(caddr_t arg1, caddr_t arg2)
2628 {
2629         /*LINTED: E_PTR_BAD_CAST_ALIGN*/
2630         nvme_t *nvme = (nvme_t *)arg1;
2631         int inum = (int)(uintptr_t)arg2;
2632         int ccnt = 0;
2633         int qnum;
2634         nvme_cmd_t *cmd;
2635 
2636         if (inum >= nvme->n_intr_cnt)
2637                 return (DDI_INTR_UNCLAIMED);


3261 
3262 static int
3263 nvme_bd_mediainfo(void *arg, bd_media_t *media)
3264 {
3265         nvme_namespace_t *ns = arg;
3266 
3267         media->m_nblks = ns->ns_block_count;
3268         media->m_blksize = ns->ns_block_size;
3269         media->m_readonly = B_FALSE;
3270         media->m_solidstate = B_TRUE;
3271 
3272         media->m_pblksize = ns->ns_best_block_size;
3273 
3274         return (0);
3275 }
3276 
3277 static int
3278 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
3279 {
3280         nvme_t *nvme = ns->ns_nvme;
3281         nvme_cmd_t *cmd, *ret;
3282         nvme_qpair_t *ioq;
3283         boolean_t poll;

3284 
3285         if (nvme->n_dead)
3286                 return (EIO);
3287 
3288         cmd = nvme_create_nvm_cmd(ns, opc, xfer);
3289         if (cmd == NULL)
3290                 return (ENOMEM);
3291 
3292         cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1;
3293         ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
3294         ioq = nvme->n_ioq[cmd->nc_sqid];
3295 
3296         /*
3297          * Get the polling flag before submitting the command. The command may
3298          * complete immediately after it was submitted, which means we must
3299          * treat both cmd and xfer as if they have been freed already.
3300          */
3301         poll = (xfer->x_flags & BD_XFER_POLL) != 0;
3302 
3303         if (nvme_submit_cmd(ioq, cmd) != DDI_SUCCESS)
3304                 return (EAGAIN);
3305 



3306         if (!poll)
3307                 return (0);
3308 
3309         do {
3310                 ret = nvme_retrieve_cmd(nvme, ioq);
3311                 if (ret != NULL)
3312                         nvme_bd_xfer_done(ret);
3313                 else
3314                         drv_usecwait(10);
3315         } while (ioq->nq_active_cmds != 0);
3316 
3317         return (0);
3318 }
3319 
3320 static int
3321 nvme_bd_read(void *arg, bd_xfer_t *xfer)
3322 {
3323         nvme_namespace_t *ns = arg;
3324 
3325         return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
3326 }
3327 
3328 static int
3329 nvme_bd_write(void *arg, bd_xfer_t *xfer)
3330 {
3331         nvme_namespace_t *ns = arg;
3332 




  27  * endian systems without changes to the code accessing registers and data
  28  * structures used by the hardware.
  29  *
  30  *
  31  * Interrupt Usage:
  32  *
  33  * The driver will use a single interrupt while configuring the device as the
  34  * specification requires, but contrary to the specification it will try to use
  35  * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
  36  * will switch to multiple-message MSI(-X) if supported. The driver wants to
  37  * have one interrupt vector per CPU, but it will work correctly if fewer are
  38  * available. Interrupts can be shared by queues; the interrupt handler will
  39  * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
  40  * the admin queue will share an interrupt with one I/O queue. The interrupt
  41  * handler will retrieve completed commands from all queues sharing an interrupt
  42  * vector and will post them to a taskq for completion processing.
  43  *
  44  *
  45  * Command Processing:
  46  *
  47  * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
  48  * to 65536 I/O commands. The driver will configure one I/O queue pair per
  49  * available interrupt vector, with the queue length usually much smaller than
  50  * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
  51  * interrupt vectors will be used.
  52  *
  53  * Additionally the hardware provides a single special admin queue pair that can
  54  * hold up to 4096 admin commands.
  55  *
  56  * From the hardware perspective both queues of a queue pair are independent,
  57  * but they share some driver state: the command array (holding pointers to
  58  * commands currently being processed by the hardware) and the active command
  59  * counter. Access to the submission side of a queue pair and the shared state
  60  * is protected by nq_mutex. The completion side of a queue pair does not need
  61  * that protection apart from its access to the shared state; it is called only
  62  * in the interrupt handler which does not run concurrently for the same
  63  * interrupt vector.
  64  *
  65  * When a command is submitted to a queue pair the active command counter is
  66  * incremented and a pointer to the command is stored in the command array. The
  67  * array index is used as command identifier (CID) in the submission queue
  68  * entry. Some commands may take a very long time to complete, and if the queue
  69  * wraps around in that time a submission may find the next array slot to still
  70  * be used by a long-running command. In this case the array is sequentially
  71  * searched for the next free slot. The length of the command array is the same
  72  * as the configured queue length. Queue overrun is prevented by the semaphore,
  73  * so a command submission may block if the queue is full.
  74  *
  75  *
  76  * Polled I/O Support:
  77  *
  78  * For kernel core dump support the driver can do polled I/O. As interrupts are
  79  * turned off while dumping, the driver will just submit a command in the regular
  80  * way, and then repeatedly attempt a command retrieval until it gets the
  81  * command back.
  82  *
  83  *
  84  * Namespace Support:
  85  *
  86  * NVMe devices can have multiple namespaces, each being an independent data
  87  * store. The driver supports multiple namespaces and creates a blkdev interface
  88  * for each namespace found. Namespaces can have various attributes to support
  89  * thin provisioning and protection information. This driver does not support
  90  * any of this and ignores namespaces that have these attributes.
  91  *
  92  * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
  93  * (EUI64). This driver uses the EUI64 if present to generate the devid and


 241 
 242 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
 243 int nvme_format_cmd_timeout = 600;
 244 
 245 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
 246 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
 247 static int nvme_quiesce(dev_info_t *);
 248 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
 249 static int nvme_setup_interrupts(nvme_t *, int, int);
 250 static void nvme_release_interrupts(nvme_t *);
 251 static uint_t nvme_intr(caddr_t, caddr_t);
 252 
 253 static void nvme_shutdown(nvme_t *, int, boolean_t);
 254 static boolean_t nvme_reset(nvme_t *, boolean_t);
 255 static int nvme_init(nvme_t *);
 256 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
 257 static void nvme_free_cmd(nvme_cmd_t *);
 258 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
 259     bd_xfer_t *);
 260 static int nvme_admin_cmd(nvme_cmd_t *, int);
 261 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
 262 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
 263 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
 264 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
 265 static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
 266 static void nvme_wakeup_cmd(void *);
 267 static void nvme_async_event_task(void *);
 268 
 269 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
 270 static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
 271 static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
 272 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
 273 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
 274 static inline int nvme_check_cmd_status(nvme_cmd_t *);
 275 
 276 static void nvme_abort_cmd(nvme_cmd_t *);
 277 static void nvme_async_event(nvme_t *);
 278 static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t,
 279     boolean_t, uint8_t);
 280 static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...);
 281 static void *nvme_identify(nvme_t *, uint32_t);
 282 static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
 283     uint32_t *);
 284 static boolean_t nvme_get_features(nvme_t *, uint32_t, uint8_t, uint32_t *,
 285     void **, size_t *);
 286 static boolean_t nvme_write_cache_set(nvme_t *, boolean_t);
 287 static int nvme_set_nqueues(nvme_t *, uint16_t);
 288 
 289 static void nvme_free_dma(nvme_dma_t *);
 290 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
 291     nvme_dma_t **);
 292 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
 293     nvme_dma_t **);
 294 static void nvme_free_qpair(nvme_qpair_t *);
 295 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, int);
 296 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);
 297 


 707                 goto fail;
 708         }
 709 
 710         return (DDI_SUCCESS);
 711 
 712 fail:
 713         if (*dma) {
 714                 nvme_free_dma(*dma);
 715                 *dma = NULL;
 716         }
 717 
 718         return (DDI_FAILURE);
 719 }
 720 
 721 static void
 722 nvme_free_qpair(nvme_qpair_t *qp)
 723 {
 724         int i;
 725 
 726         mutex_destroy(&qp->nq_mutex);
 727         sema_destroy(&qp->nq_sema);
 728 
 729         if (qp->nq_sqdma != NULL)
 730                 nvme_free_dma(qp->nq_sqdma);
 731         if (qp->nq_cqdma != NULL)
 732                 nvme_free_dma(qp->nq_cqdma);
 733 
 734         if (qp->nq_active_cmds > 0)
 735                 for (i = 0; i != qp->nq_nentry; i++)
 736                         if (qp->nq_cmd[i] != NULL)
 737                                 nvme_free_cmd(qp->nq_cmd[i]);
 738 
 739         if (qp->nq_cmd != NULL)
 740                 kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);
 741 
 742         kmem_free(qp, sizeof (nvme_qpair_t));
 743 }
 744 
 745 static int
 746 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
 747     int idx)
 748 {
 749         nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
 750 
 751         mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
 752             DDI_INTR_PRI(nvme->n_intr_pri));
 753         sema_init(&qp->nq_sema, nentry, NULL, SEMA_DRIVER, NULL);
 754 
 755         if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
 756             DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
 757                 goto fail;
 758 
 759         if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
 760             DDI_DMA_READ, &qp->nq_cqdma) != DDI_SUCCESS)
 761                 goto fail;
 762 
 763         qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
 764         qp->nq_cq = (nvme_cqe_t *)qp->nq_cqdma->nd_memp;
 765         qp->nq_nentry = nentry;
 766 
 767         qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
 768         qp->nq_cqhdbl = NVME_REG_CQHDBL(nvme, idx);
 769 
 770         qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
 771         qp->nq_next_cmd = 0;
 772 
 773         *nqp = qp;


 800 }
 801 
 802 static void
 803 nvme_free_cmd(nvme_cmd_t *cmd)
 804 {
 805         if (cmd->nc_dma) {
 806                 if (cmd->nc_dma->nd_cached)
 807                         kmem_cache_free(cmd->nc_nvme->n_prp_cache,
 808                             cmd->nc_dma);
 809                 else
 810                         nvme_free_dma(cmd->nc_dma);
 811                 cmd->nc_dma = NULL;
 812         }
 813 
 814         cv_destroy(&cmd->nc_cv);
 815         mutex_destroy(&cmd->nc_mutex);
 816 
 817         kmem_cache_free(nvme_cmd_cache, cmd);
 818 }
 819 
 820 static void
 821 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
 822 {
 823         sema_p(&qp->nq_sema);
 824         nvme_submit_cmd_common(qp, cmd);
 825 }
 826 
 827 static int
 828 nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
 829 {
 830         if (sema_tryp(&qp->nq_sema) == 0)
 831                 return (EAGAIN);
 832 
 833         nvme_submit_cmd_common(qp, cmd);
 834         return (0);
 835 }
 836 
 837 static void
 838 nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
 839 {
 840         nvme_reg_sqtdbl_t tail = { 0 };
 841 
 842         mutex_enter(&qp->nq_mutex);
 843         cmd->nc_completed = B_FALSE;
 844 
 845         /*
 846          * Try to insert the cmd into the active cmd array at the nq_next_cmd
 847          * slot. If the slot is already occupied advance to the next slot and
 848          * try again. This can happen for long running commands like async event
 849          * requests.
 850          */
 851         while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 852                 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 853         qp->nq_cmd[qp->nq_next_cmd] = cmd;
 854 
 855         qp->nq_active_cmds++;
 856 
 857         cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
 858         bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
 859         (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
 860             sizeof (nvme_sqe_t) * qp->nq_sqtail,
 861             sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
 862         qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 863 
 864         tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
 865         nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
 866 
 867         mutex_exit(&qp->nq_mutex);

 868 }
 869 
 870 static nvme_cmd_t *
 871 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
 872 {
 873         nvme_reg_cqhdbl_t head = { 0 };
 874 
 875         nvme_cqe_t *cqe;
 876         nvme_cmd_t *cmd;
 877 
 878         (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
 879             sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
 880 
 881         mutex_enter(&qp->nq_mutex);
 882         cqe = &qp->nq_cq[qp->nq_cqhead];
 883 
 884         /* Check phase tag of CQE. Hardware inverts it for new entries. */
 885         if (cqe->cqe_sf.sf_p == qp->nq_phase) {
 886                 mutex_exit(&qp->nq_mutex);
 887                 return (NULL);


 893         cmd = qp->nq_cmd[cqe->cqe_cid];
 894         qp->nq_cmd[cqe->cqe_cid] = NULL;
 895         qp->nq_active_cmds--;
 896 
 897         ASSERT(cmd != NULL);
 898         ASSERT(cmd->nc_nvme == nvme);
 899         ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
 900         ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
 901         bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
 902 
 903         qp->nq_sqhead = cqe->cqe_sqhd;
 904 
 905         head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
 906 
 907         /* Toggle phase on wrap-around. */
 908         if (qp->nq_cqhead == 0)
 909                 qp->nq_phase = qp->nq_phase ? 0 : 1;
 910 
 911         nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
 912         mutex_exit(&qp->nq_mutex);
 913         sema_v(&qp->nq_sema);
 914 
 915         return (cmd);
 916 }
 917 
 918 static int
 919 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
 920 {
 921         nvme_cqe_t *cqe = &cmd->nc_cqe;
 922 
 923         dev_err(cmd->nc_nvme->n_dip, CE_WARN,
 924             "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
 925             "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
 926             cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
 927             cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
 928 
 929         if (cmd->nc_xfer != NULL)
 930                 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
 931 
 932         if (cmd->nc_nvme->n_strict_version) {
 933                 cmd->nc_nvme->n_dead = B_TRUE;


1362         if (cmd->nc_callback == nvme_abort_cmd_cb) {
1363                 mutex_exit(&cmd->nc_mutex);
1364                 nvme_abort_cmd_cb(cmd);
1365                 return;
1366         }
1367 
1368         cmd->nc_completed = B_TRUE;
1369         cv_signal(&cmd->nc_cv);
1370         mutex_exit(&cmd->nc_mutex);
1371 }
1372 
1373 static void
1374 nvme_async_event_task(void *arg)
1375 {
1376         nvme_cmd_t *cmd = arg;
1377         nvme_t *nvme = cmd->nc_nvme;
1378         nvme_error_log_entry_t *error_log = NULL;
1379         nvme_health_log_t *health_log = NULL;
1380         size_t logsize = 0;
1381         nvme_async_event_t event;

1382 
1383         /*
1384          * Check for errors associated with the async request itself. The only
1385          * command-specific error is "async event limit exceeded", which
1386          * indicates a programming error in the driver and causes a panic in
1387          * nvme_check_cmd_status().
1388          *
1389          * Other possible errors are various scenarios where the async request
1390          * was aborted, or internal errors in the device. Internal errors are
1391          * reported to FMA, the command aborts need no special handling here.
1392          */
1393         if (nvme_check_cmd_status(cmd)) {
1394                 dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1395                     "!async event request returned failure, sct = %x, "
1396                     "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
1397                     cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
1398                     cmd->nc_cqe.cqe_sf.sf_m);
1399 
1400                 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1401                     cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
1402                         cmd->nc_nvme->n_dead = B_TRUE;
1403                         ddi_fm_service_impact(cmd->nc_nvme->n_dip,
1404                             DDI_SERVICE_LOST);
1405                 }
1406                 nvme_free_cmd(cmd);
1407                 return;
1408         }
1409 
1410 
1411         event.r = cmd->nc_cqe.cqe_dw0;
1412 
1413         /* Clear CQE and re-submit the async request. */
1414         bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
1415         nvme_submit_admin_cmd(nvme->n_adminq, cmd);
1416 







1417         switch (event.b.ae_type) {
1418         case NVME_ASYNC_TYPE_ERROR:
1419                 if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
1420                         (void) nvme_get_logpage(nvme, (void **)&error_log,
1421                             &logsize, event.b.ae_logpage);
1422                 } else {
1423                         dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
1424                             "async event reply: %d", event.b.ae_logpage);
1425                         atomic_inc_32(&nvme->n_wrong_logpage);
1426                 }
1427 
1428                 switch (event.b.ae_info) {
1429                 case NVME_ASYNC_ERROR_INV_SQ:
1430                         dev_err(nvme->n_dip, CE_PANIC, "programming error: "
1431                             "invalid submission queue");
1432                         return;
1433 
1434                 case NVME_ASYNC_ERROR_INV_DBL:
1435                         dev_err(nvme->n_dip, CE_PANIC, "programming error: "
1436                             "invalid doorbell write value");


1508                 break;
1509 
1510         default:
1511                 dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
1512                     "type = %x, info = %x, logpage = %x", event.b.ae_type,
1513                     event.b.ae_info, event.b.ae_logpage);
1514                 atomic_inc_32(&nvme->n_unknown_event);
1515                 break;
1516         }
1517 
1518         if (error_log)
1519                 kmem_free(error_log, logsize);
1520 
1521         if (health_log)
1522                 kmem_free(health_log, logsize);
1523 }
1524 
1525 static int
1526 nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
1527 {


1528         mutex_enter(&cmd->nc_mutex);
1529         nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
1530 









1531         if (nvme_wait_cmd(cmd, sec) == B_FALSE) {
1532                 /*
1533                  * The command timed out. An abort command was posted that
1534                  * will take care of the cleanup.
1535                  */
1536                 return (DDI_FAILURE);
1537         }
1538         mutex_exit(&cmd->nc_mutex);
1539 
1540         return (DDI_SUCCESS);
1541 }
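
For reference, a synchronous admin command pairs nvme_alloc_cmd()/nvme_admin_cmd() with the nvme_wakeup_cmd callback, much like nvme_format_nvm() below. A schematic caller (hypothetical opcode and timeout; not driver code):

static int
sketch_sync_admin(nvme_t *nvme, uint8_t opc, uint_t timeout_sec)
{
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	int ret;

	cmd->nc_sqid = 0;			/* admin queue */
	cmd->nc_sqe.sqe_opc = opc;
	cmd->nc_callback = nvme_wakeup_cmd;	/* signals nc_cv on completion */

	if (nvme_admin_cmd(cmd, timeout_sec) != DDI_SUCCESS)
		return (EIO);	/* timed out; the abort path owns the cleanup */

	ret = nvme_check_cmd_status(cmd);
	nvme_free_cmd(cmd);
	return (ret);
}
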
1542 
1543 static void
1544 nvme_async_event(nvme_t *nvme)
1545 {
1546         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);

1547 
1548         cmd->nc_sqid = 0;
1549         cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
1550         cmd->nc_callback = nvme_async_event_task;
1551 
1552         nvme_submit_admin_cmd(nvme->n_adminq, cmd);









1553 }
1554 
1555 static int
1556 nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms,
1557     uint8_t pi, boolean_t pil, uint8_t ses)
1558 {
1559         nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1560         nvme_format_nvm_t format_nvm = { 0 };
1561         int ret;
1562 
1563         format_nvm.b.fm_lbaf = lbaf & 0xf;
1564         format_nvm.b.fm_ms = ms ? 1 : 0;
1565         format_nvm.b.fm_pi = pi & 0x7;
1566         format_nvm.b.fm_pil = pil ? 1 : 0;
1567         format_nvm.b.fm_ses = ses & 0x7;
1568 
1569         cmd->nc_sqid = 0;
1570         cmd->nc_callback = nvme_wakeup_cmd;
1571         cmd->nc_sqe.sqe_nsid = nsid;
1572         cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;


2349          */
2350         sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL);
2351 
2352         /*
2353          * Setup initial interrupt for admin queue.
2354          */
2355         if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1)
2356             != DDI_SUCCESS) &&
2357             (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1)
2358             != DDI_SUCCESS) &&
2359             (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
2360             != DDI_SUCCESS)) {
2361                 dev_err(nvme->n_dip, CE_WARN,
2362                     "!failed to setup initial interrupt");
2363                 goto fail;
2364         }
2365 
2366         /*
2367          * Post an asynchronous event command to catch errors.
2368          */
2369         nvme_async_event(nvme);




2370 
2371         /*
2372          * Identify Controller
2373          */
2374         nvme->n_idctl = nvme_identify(nvme, 0);
2375         if (nvme->n_idctl == NULL) {
2376                 dev_err(nvme->n_dip, CE_WARN,
2377                     "!failed to identify controller");
2378                 goto fail;
2379         }
2380 
2381         /*
2382          * Get Vendor & Product ID
2383          */
2384         bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
2385         model[sizeof (nvme->n_idctl->id_model)] = '\0';
2386         sata_split_model(model, &vendor, &product);
2387 
2388         if (vendor == NULL)
2389                 nvme->n_vendor = strdup("NVMe");


2574         for (i = 1; i != nvme->n_ioq_count + 1; i++) {
2575                 if (nvme_alloc_qpair(nvme, nvme->n_io_queue_len,
2576                     &nvme->n_ioq[i], i) != DDI_SUCCESS) {
2577                         dev_err(nvme->n_dip, CE_WARN,
2578                             "!unable to allocate I/O qpair %d", i);
2579                         goto fail;
2580                 }
2581 
2582                 if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i)
2583                     != DDI_SUCCESS) {
2584                         dev_err(nvme->n_dip, CE_WARN,
2585                             "!unable to create I/O qpair %d", i);
2586                         goto fail;
2587                 }
2588         }
2589 
2590         /*
 2591          * Post more asynchronous event commands to reduce event reporting
2592          * latency as suggested by the spec.
2593          */
2594         for (i = 1; i != nvme->n_async_event_limit; i++)
2595                 nvme_async_event(nvme);





2596 
2597         return (DDI_SUCCESS);
2598 
2599 fail:
2600         (void) nvme_reset(nvme, B_FALSE);
2601         return (DDI_FAILURE);
2602 }
2603 
2604 static uint_t
2605 nvme_intr(caddr_t arg1, caddr_t arg2)
2606 {
2607         /*LINTED: E_PTR_BAD_CAST_ALIGN*/
2608         nvme_t *nvme = (nvme_t *)arg1;
2609         int inum = (int)(uintptr_t)arg2;
2610         int ccnt = 0;
2611         int qnum;
2612         nvme_cmd_t *cmd;
2613 
2614         if (inum >= nvme->n_intr_cnt)
2615                 return (DDI_INTR_UNCLAIMED);
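
The rest of nvme_intr() is cut off in this excerpt. According to the theory statement at the top of the file it walks the I/O queue array in steps of n_intr_cnt and posts completed commands to a taskq; a sketch of such a loop follows, continuing the locals declared above (the n_cmd_taskq field name is an assumption, and the actual body may differ):

	/*
	 * Sketch of the elided remainder (not the verbatim driver code):
	 * walk the queue array in steps of n_intr_cnt, drain each queue
	 * sharing this vector, and defer completion work to a taskq.
	 */
	for (qnum = inum;
	    qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
	    qnum += nvme->n_intr_cnt) {
		while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum])) !=
		    NULL) {
			(void) ddi_taskq_dispatch(nvme->n_cmd_taskq,
			    cmd->nc_callback, cmd, DDI_NOSLEEP);
			ccnt++;
		}
	}

	return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
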


3239 
3240 static int
3241 nvme_bd_mediainfo(void *arg, bd_media_t *media)
3242 {
3243         nvme_namespace_t *ns = arg;
3244 
3245         media->m_nblks = ns->ns_block_count;
3246         media->m_blksize = ns->ns_block_size;
3247         media->m_readonly = B_FALSE;
3248         media->m_solidstate = B_TRUE;
3249 
3250         media->m_pblksize = ns->ns_best_block_size;
3251 
3252         return (0);
3253 }
3254 
3255 static int
3256 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
3257 {
3258         nvme_t *nvme = ns->ns_nvme;
3259         nvme_cmd_t *cmd;
3260         nvme_qpair_t *ioq;
3261         boolean_t poll;
3262         int ret;
3263 
3264         if (nvme->n_dead)
3265                 return (EIO);
3266 
3267         cmd = nvme_create_nvm_cmd(ns, opc, xfer);
3268         if (cmd == NULL)
3269                 return (ENOMEM);
3270 
3271         cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1;
3272         ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
3273         ioq = nvme->n_ioq[cmd->nc_sqid];
3274 
3275         /*
3276          * Get the polling flag before submitting the command. The command may
3277          * complete immediately after it was submitted, which means we must
3278          * treat both cmd and xfer as if they have been freed already.
3279          */
3280         poll = (xfer->x_flags & BD_XFER_POLL) != 0;
3281 
3282         ret = nvme_submit_io_cmd(ioq, cmd);

3283 
3284         if (ret != 0)
3285                 return (ret);
3286 
3287         if (!poll)
3288                 return (0);
3289 
3290         do {
3291                 cmd = nvme_retrieve_cmd(nvme, ioq);
3292                 if (cmd != NULL)
3293                         nvme_bd_xfer_done(cmd);
3294                 else
3295                         drv_usecwait(10);
3296         } while (ioq->nq_active_cmds != 0);
3297 
3298         return (0);
3299 }
3300 
3301 static int
3302 nvme_bd_read(void *arg, bd_xfer_t *xfer)
3303 {
3304         nvme_namespace_t *ns = arg;
3305 
3306         return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
3307 }
3308 
3309 static int
3310 nvme_bd_write(void *arg, bd_xfer_t *xfer)
3311 {
3312         nvme_namespace_t *ns = arg;
3313