don't block in nvme_bd_cmd
8628 nvme: use a semaphore to guard submission queue
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

          --- old/usr/src/uts/common/io/nvme/nvme.c
          +++ new/usr/src/uts/common/io/nvme/nvme.c
↓ open down ↓ 36 lines elided ↑ open up ↑
  37   37   * have one interrupt vector per CPU, but it will work correctly if fewer are
  38   38   * available. Interrupts can be shared by queues; the interrupt handler will
  39   39   * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
  40   40   * the admin queue will share an interrupt with one I/O queue. The interrupt
  41   41   * handler will retrieve completed commands from all queues sharing an interrupt
  42   42   * vector and will post them to a taskq for completion processing.
  43   43   *
  44   44   *
  45   45   * Command Processing:
  46   46   *
  47      - * NVMe devices can have up to 65536 I/O queue pairs, with each queue holding up
       47 + * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
  48   48   * to 65536 I/O commands. The driver will configure one I/O queue pair per
  49   49   * available interrupt vector, with the queue length usually much smaller than
  50   50   * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
  51   51   * interrupt vectors will be used.
  52   52   *
  53   53   * Additionally the hardware provides a single special admin queue pair that can
  54   54   * hold up to 4096 admin commands.
  55   55   *
  56   56   * From the hardware perspective both queues of a queue pair are independent,
  57   57   * but they share some driver state: the command array (holding pointers to
↓ open down ↓ 4 lines elided ↑ open up ↑
  62   62   * in the interrupt handler, which does not run concurrently for the same
  63   63   * interrupt vector.
  64   64   *
  65   65   * When a command is submitted to a queue pair the active command counter is
  66   66   * incremented and a pointer to the command is stored in the command array. The
  67   67   * array index is used as command identifier (CID) in the submission queue
  68   68   * entry. Some commands may take a very long time to complete, and if the queue
  69   69   * wraps around in that time, a submission may find the next array slot still
  70   70   * occupied by a long-running command. In this case the array is sequentially
  71   71   * searched for the next free slot. The length of the command array is the same
  72      - * as the configured queue length.
       72 + * as the configured queue length. Queue overrun is prevented by the semaphore,
       73 + * so a command submission may block if the queue is full.
  73   74   *
  74   75   *
  75   76   * Polled I/O Support:
  76   77   *
  77   78   * For kernel core dump support the driver can do polled I/O. As interrupts are
  78   79   * turned off while dumping, the driver will just submit a command in the regular
  79   80   * way, and then repeatedly attempt a command retrieval until it gets the
  80   81   * command back.
  81   82   *
  82   83   *
↓ open down ↓ 167 lines elided ↑ open up ↑
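
Note: the heart of this change is a counting semaphore whose value tracks the
number of free submission queue slots. Below is a minimal, self-contained sketch
of the pattern, using only the stock illumos primitives (sema_init(9F),
sema_p(9F), sema_tryp(9F), sema_v(9F), mutex(9F)); the guarded_queue type and
the elided submission steps are hypothetical stand-ins, not the driver's code.

    #include <sys/types.h>
    #include <sys/errno.h>
    #include <sys/ksynch.h>

    typedef struct guarded_queue {
            kmutex_t        gq_mutex;       /* serializes queue manipulation */
            ksema_t         gq_sema;        /* counts free submission slots */
    } guarded_queue_t;

    void
    guarded_queue_init(guarded_queue_t *gq, uint_t nentry)
    {
            mutex_init(&gq->gq_mutex, NULL, MUTEX_DRIVER, NULL);
            /* Start with all slots free; each sema_p() claims one slot. */
            sema_init(&gq->gq_sema, nentry, NULL, SEMA_DRIVER, NULL);
    }

    /* Admin path: block until a slot frees up; submission cannot fail. */
    void
    guarded_submit_blocking(guarded_queue_t *gq)
    {
            sema_p(&gq->gq_sema);
            mutex_enter(&gq->gq_mutex);
            /* ... write the submission queue entry, ring the doorbell ... */
            mutex_exit(&gq->gq_mutex);
    }

    /* I/O path: never sleep; report a full queue to the caller instead. */
    int
    guarded_submit_nonblocking(guarded_queue_t *gq)
    {
            if (sema_tryp(&gq->gq_sema) == 0)
                    return (EAGAIN);        /* queue full */
            mutex_enter(&gq->gq_mutex);
            /* ... write the submission queue entry, ring the doorbell ... */
            mutex_exit(&gq->gq_mutex);
            return (0);
    }

    /* Completion path: each retrieved command returns one slot. */
    void
    guarded_complete(guarded_queue_t *gq)
    {
            sema_v(&gq->gq_sema);
    }

With this invariant in place, the old "queue full" failure path in the common
submission code disappears entirely, which is why nvme_submit_cmd_common()
below returns void.
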
 250  251  static uint_t nvme_intr(caddr_t, caddr_t);
 251  252  
 252  253  static void nvme_shutdown(nvme_t *, int, boolean_t);
 253  254  static boolean_t nvme_reset(nvme_t *, boolean_t);
 254  255  static int nvme_init(nvme_t *);
 255  256  static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
 256  257  static void nvme_free_cmd(nvme_cmd_t *);
 257  258  static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
 258  259      bd_xfer_t *);
 259  260  static int nvme_admin_cmd(nvme_cmd_t *, int);
 260      -static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
      261 +static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
      262 +static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
      263 +static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
 261  264  static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
 262  265  static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
 263  266  static void nvme_wakeup_cmd(void *);
 264  267  static void nvme_async_event_task(void *);
 265  268  
 266  269  static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
 267  270  static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
 268  271  static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
 269  272  static int nvme_check_specific_cmd_status(nvme_cmd_t *);
 270  273  static int nvme_check_generic_cmd_status(nvme_cmd_t *);
 271  274  static inline int nvme_check_cmd_status(nvme_cmd_t *);
 272  275  
 273  276  static void nvme_abort_cmd(nvme_cmd_t *);
 274      -static int nvme_async_event(nvme_t *);
      277 +static void nvme_async_event(nvme_t *);
 275  278  static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t,
 276  279      boolean_t, uint8_t);
 277  280  static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...);
 278  281  static void *nvme_identify(nvme_t *, uint32_t);
 279  282  static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
 280  283      uint32_t *);
 281  284  static boolean_t nvme_get_features(nvme_t *, uint32_t, uint8_t, uint32_t *,
 282  285      void **, size_t *);
 283  286  static boolean_t nvme_write_cache_set(nvme_t *, boolean_t);
 284  287  static int nvme_set_nqueues(nvme_t *, uint16_t);
↓ open down ↓ 429 lines elided ↑ open up ↑
 714  717  
 715  718          return (DDI_FAILURE);
 716  719  }
 717  720  
 718  721  static void
 719  722  nvme_free_qpair(nvme_qpair_t *qp)
 720  723  {
 721  724          int i;
 722  725  
 723  726          mutex_destroy(&qp->nq_mutex);
      727 +        sema_destroy(&qp->nq_sema);
 724  728  
 725  729          if (qp->nq_sqdma != NULL)
 726  730                  nvme_free_dma(qp->nq_sqdma);
 727  731          if (qp->nq_cqdma != NULL)
 728  732                  nvme_free_dma(qp->nq_cqdma);
 729  733  
 730  734          if (qp->nq_active_cmds > 0)
 731  735                  for (i = 0; i != qp->nq_nentry; i++)
 732  736                          if (qp->nq_cmd[i] != NULL)
 733  737                                  nvme_free_cmd(qp->nq_cmd[i]);
↓ open down ↓ 5 lines elided ↑ open up ↑
 739  743  }
 740  744  
 741  745  static int
 742  746  nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
 743  747      int idx)
 744  748  {
 745  749          nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
 746  750  
 747  751          mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
 748  752              DDI_INTR_PRI(nvme->n_intr_pri));
      753 +        sema_init(&qp->nq_sema, nentry, NULL, SEMA_DRIVER, NULL);
 749  754  
 750  755          if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
 751  756              DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
 752  757                  goto fail;
 753  758  
 754  759          if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
 755  760              DDI_DMA_READ, &qp->nq_cqdma) != DDI_SUCCESS)
 756  761                  goto fail;
 757  762  
 758  763          qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
↓ open down ↓ 46 lines elided ↑ open up ↑
 805  810                          nvme_free_dma(cmd->nc_dma);
 806  811                  cmd->nc_dma = NULL;
 807  812          }
 808  813  
 809  814          cv_destroy(&cmd->nc_cv);
 810  815          mutex_destroy(&cmd->nc_mutex);
 811  816  
 812  817          kmem_cache_free(nvme_cmd_cache, cmd);
 813  818  }
 814  819  
      820 +static void
      821 +nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
      822 +{
      823 +        sema_p(&qp->nq_sema);
      824 +        nvme_submit_cmd_common(qp, cmd);
      825 +}
      826 +
 815  827  static int
 816      -nvme_submit_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
      828 +nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
 817  829  {
 818      -        nvme_reg_sqtdbl_t tail = { 0 };
      830 +        if (sema_tryp(&qp->nq_sema) == 0)
      831 +                return (EAGAIN);
 819  832  
 820      -        mutex_enter(&qp->nq_mutex);
      833 +        nvme_submit_cmd_common(qp, cmd);
      834 +        return (0);
      835 +}
 821  836  
 822      -        if (qp->nq_active_cmds == qp->nq_nentry) {
 823      -                mutex_exit(&qp->nq_mutex);
 824      -                return (DDI_FAILURE);
 825      -        }
      837 +static void
      838 +nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
      839 +{
      840 +        nvme_reg_sqtdbl_t tail = { 0 };
 826  841  
      842 +        mutex_enter(&qp->nq_mutex);
 827  843          cmd->nc_completed = B_FALSE;
 828  844  
 829  845          /*
 830  846           * Try to insert the cmd into the active cmd array at the nq_next_cmd
  831  847           * slot. If the slot is already occupied, advance to the next slot and
  832  848           * try again. This can happen for long-running commands like async event
 833  849           * requests.
 834  850           */
 835  851          while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 836  852                  qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
↓ open down ↓ 5 lines elided ↑ open up ↑
 842  858          bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
 843  859          (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
 844  860              sizeof (nvme_sqe_t) * qp->nq_sqtail,
 845  861              sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
 846  862          qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 847  863  
 848  864          tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
 849  865          nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
 850  866  
 851  867          mutex_exit(&qp->nq_mutex);
 852      -        return (DDI_SUCCESS);
 853  868  }
 854  869  
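Note: the slot-search loop above terminates because the semaphore guarantees at
least one free entry before nvme_submit_cmd_common() is reached. Condensed to
its essentials (names here are illustrative, not the driver's):

    /* The array index doubles as the command identifier (CID). */
    static uint16_t
    cid_alloc(nvme_cmd_t **cmds, uint_t nentry, uint_t *next, nvme_cmd_t *cmd)
    {
            /* Skip slots still held by long-running commands. */
            while (cmds[*next] != NULL)
                    *next = (*next + 1) % nentry;

            cmds[*next] = cmd;
            return ((uint16_t)*next);
    }
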
 855  870  static nvme_cmd_t *
 856  871  nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
 857  872  {
 858  873          nvme_reg_cqhdbl_t head = { 0 };
 859  874  
 860  875          nvme_cqe_t *cqe;
 861  876          nvme_cmd_t *cmd;
 862  877  
↓ open down ↓ 25 lines elided ↑ open up ↑
 888  903          qp->nq_sqhead = cqe->cqe_sqhd;
 889  904  
 890  905          head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
 891  906  
 892  907          /* Toggle phase on wrap-around. */
 893  908          if (qp->nq_cqhead == 0)
 894  909                  qp->nq_phase = qp->nq_phase ? 0 : 1;
 895  910  
 896  911          nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
 897  912          mutex_exit(&qp->nq_mutex);
      913 +        sema_v(&qp->nq_sema);
 898  914  
 899  915          return (cmd);
 900  916  }
 901  917  
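Note: two things are worth calling out in nvme_retrieve_cmd(). First, the new
sema_v() is the consumer half of the guard: retrieving a completion is what
returns a submission slot. Second, the phase toggle implements the NVMe
completion queue convention that the controller inverts the phase tag on every
pass through the ring; the elided portion of the function checks it roughly as
in this sketch (field names follow the driver's headers, shown here only for
illustration):

    /* A CQE is new only if its phase tag matches the expected phase. */
    boolean_t
    cqe_is_new(const nvme_cqe_t *cqe, uint_t expected_phase)
    {
            return (cqe->cqe_sf.sf_p == expected_phase);
    }
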
 902  918  static int
 903  919  nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
 904  920  {
 905  921          nvme_cqe_t *cqe = &cmd->nc_cqe;
 906  922  
 907  923          dev_err(cmd->nc_nvme->n_dip, CE_WARN,
↓ open down ↓ 448 lines elided ↑ open up ↑
1356 1372  
1357 1373  static void
1358 1374  nvme_async_event_task(void *arg)
1359 1375  {
1360 1376          nvme_cmd_t *cmd = arg;
1361 1377          nvme_t *nvme = cmd->nc_nvme;
1362 1378          nvme_error_log_entry_t *error_log = NULL;
1363 1379          nvme_health_log_t *health_log = NULL;
1364 1380          size_t logsize = 0;
1365 1381          nvme_async_event_t event;
1366      -        int ret;
1367 1382  
1368 1383          /*
1369 1384           * Check for errors associated with the async request itself. The only
1370 1385           * command-specific error is "async event limit exceeded", which
1371 1386           * indicates a programming error in the driver and causes a panic in
1372 1387           * nvme_check_cmd_status().
1373 1388           *
1374 1389           * Other possible errors are various scenarios where the async request
1375 1390           * was aborted, or internal errors in the device. Internal errors are
1376 1391           * reported to FMA, the command aborts need no special handling here.
↓ open down ↓ 13 lines elided ↑ open up ↑
1390 1405                  }
1391 1406                  nvme_free_cmd(cmd);
1392 1407                  return;
1393 1408          }
1394 1409  
1395 1410  
1396 1411          event.r = cmd->nc_cqe.cqe_dw0;
1397 1412  
1398 1413          /* Clear CQE and re-submit the async request. */
1399 1414          bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
1400      -        ret = nvme_submit_cmd(nvme->n_adminq, cmd);
     1415 +        nvme_submit_admin_cmd(nvme->n_adminq, cmd);
1401 1416  
1402      -        if (ret != DDI_SUCCESS) {
1403      -                dev_err(nvme->n_dip, CE_WARN,
1404      -                    "!failed to resubmit async event request");
1405      -                atomic_inc_32(&nvme->n_async_resubmit_failed);
1406      -                nvme_free_cmd(cmd);
1407      -        }
1408      -
1409 1417          switch (event.b.ae_type) {
1410 1418          case NVME_ASYNC_TYPE_ERROR:
1411 1419                  if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
1412 1420                          (void) nvme_get_logpage(nvme, (void **)&error_log,
1413 1421                              &logsize, event.b.ae_logpage);
1414 1422                  } else {
1415 1423                          dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
1416 1424                              "async event reply: %d", event.b.ae_logpage);
1417 1425                          atomic_inc_32(&nvme->n_wrong_logpage);
1418 1426                  }
↓ open down ↓ 91 lines elided ↑ open up ↑
1510 1518          if (error_log)
1511 1519                  kmem_free(error_log, logsize);
1512 1520  
1513 1521          if (health_log)
1514 1522                  kmem_free(health_log, logsize);
1515 1523  }
1516 1524  
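Note: nvme_async_event_task() follows the usual self-rearming pattern for
asynchronous event requests: save the event details from Dword 0, clear the
stale CQE, and re-post the same command before dispatching on the event.
Because the re-submission now blocks on the semaphore instead of failing, the
n_async_resubmit_failed error path could be deleted. The skeleton, sketched
with handle_event() as a hypothetical dispatcher:

    static void
    async_event_task_sketch(void *arg)
    {
            nvme_cmd_t *cmd = arg;
            nvme_async_event_t event;

            /* The event details arrive in Dword 0 of the completion. */
            event.r = cmd->nc_cqe.cqe_dw0;

            /* Rearm first: clear the stale CQE, re-post the same command. */
            bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
            nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);

            handle_event(&event);           /* hypothetical */
    }
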
1517 1525  static int
1518 1526  nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
1519 1527  {
1520      -        int ret;
1521      -
1522 1528          mutex_enter(&cmd->nc_mutex);
1523      -        ret = nvme_submit_cmd(cmd->nc_nvme->n_adminq, cmd);
     1529 +        nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
1524 1530  
1525      -        if (ret != DDI_SUCCESS) {
1526      -                mutex_exit(&cmd->nc_mutex);
1527      -                dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1528      -                    "!nvme_submit_cmd failed");
1529      -                atomic_inc_32(&cmd->nc_nvme->n_admin_queue_full);
1530      -                nvme_free_cmd(cmd);
1531      -                return (DDI_FAILURE);
1532      -        }
1533      -
1534 1531          if (nvme_wait_cmd(cmd, sec) == B_FALSE) {
1535 1532                  /*
1536 1533                   * The command timed out. An abort command was posted that
1537 1534                   * will take care of the cleanup.
1538 1535                   */
1539 1536                  return (DDI_FAILURE);
1540 1537          }
1541 1538          mutex_exit(&cmd->nc_mutex);
1542 1539  
1543 1540          return (DDI_SUCCESS);
1544 1541  }
1545 1542  
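Note: with the blocking submit, the only way nvme_admin_cmd() can fail is a
timeout in nvme_wait_cmd(). Assuming that function is built on the usual
cv_timedwait(9F) idiom, the wait looks roughly like the sketch below; the real
function also posts an abort for the timed-out command, which is omitted here.

    static boolean_t
    wait_cmd_sketch(nvme_cmd_t *cmd, uint_t sec)
    {
            clock_t timeout = ddi_get_lbolt() +
                drv_usectohz((clock_t)sec * MICROSEC);

            ASSERT(MUTEX_HELD(&cmd->nc_mutex));

            while (!cmd->nc_completed) {
                    if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex,
                        timeout) == -1)
                            return (B_FALSE);       /* timed out */
            }
            return (B_TRUE);
    }
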
1546      -static int
     1543 +static void
1547 1544  nvme_async_event(nvme_t *nvme)
1548 1545  {
1549 1546          nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1550      -        int ret;
1551 1547  
1552 1548          cmd->nc_sqid = 0;
1553 1549          cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
1554 1550          cmd->nc_callback = nvme_async_event_task;
1555 1551  
1556      -        ret = nvme_submit_cmd(nvme->n_adminq, cmd);
1557      -
1558      -        if (ret != DDI_SUCCESS) {
1559      -                dev_err(nvme->n_dip, CE_WARN,
1560      -                    "!nvme_submit_cmd failed for ASYNCHRONOUS EVENT");
1561      -                nvme_free_cmd(cmd);
1562      -                return (DDI_FAILURE);
1563      -        }
1564      -
1565      -        return (DDI_SUCCESS);
     1552 +        nvme_submit_admin_cmd(nvme->n_adminq, cmd);
1566 1553  }
1567 1554  
1568 1555  static int
1569 1556  nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms,
1570 1557      uint8_t pi, boolean_t pil, uint8_t ses)
1571 1558  {
1572 1559          nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1573 1560          nvme_format_nvm_t format_nvm = { 0 };
1574 1561          int ret;
1575 1562  
↓ open down ↓ 796 lines elided ↑ open up ↑
2372 2359              (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
2373 2360              != DDI_SUCCESS)) {
2374 2361                  dev_err(nvme->n_dip, CE_WARN,
2375 2362                      "!failed to setup initial interrupt");
2376 2363                  goto fail;
2377 2364          }
2378 2365  
2379 2366          /*
2380 2367           * Post an asynchronous event command to catch errors.
2381 2368           */
2382      -        if (nvme_async_event(nvme) != DDI_SUCCESS) {
2383      -                dev_err(nvme->n_dip, CE_WARN,
2384      -                    "!failed to post async event");
2385      -                goto fail;
2386      -        }
     2369 +        nvme_async_event(nvme);
2387 2370  
2388 2371          /*
2389 2372           * Identify Controller
2390 2373           */
2391 2374          nvme->n_idctl = nvme_identify(nvme, 0);
2392 2375          if (nvme->n_idctl == NULL) {
2393 2376                  dev_err(nvme->n_dip, CE_WARN,
2394 2377                      "!failed to identify controller");
2395 2378                  goto fail;
2396 2379          }
↓ open down ↓ 204 lines elided ↑ open up ↑
2601 2584                          dev_err(nvme->n_dip, CE_WARN,
2602 2585                              "!unable to create I/O qpair %d", i);
2603 2586                          goto fail;
2604 2587                  }
2605 2588          }
2606 2589  
2607 2590          /*
 2608 2591           * Post more asynchronous event commands to reduce event reporting
2609 2592           * latency as suggested by the spec.
2610 2593           */
2611      -        for (i = 1; i != nvme->n_async_event_limit; i++) {
2612      -                if (nvme_async_event(nvme) != DDI_SUCCESS) {
2613      -                        dev_err(nvme->n_dip, CE_WARN,
2614      -                            "!failed to post async event %d", i);
2615      -                        goto fail;
2616      -                }
2617      -        }
     2594 +        for (i = 1; i != nvme->n_async_event_limit; i++)
     2595 +                nvme_async_event(nvme);
2618 2596  
2619 2597          return (DDI_SUCCESS);
2620 2598  
2621 2599  fail:
2622 2600          (void) nvme_reset(nvme, B_FALSE);
2623 2601          return (DDI_FAILURE);
2624 2602  }
2625 2603  
2626 2604  static uint_t
2627 2605  nvme_intr(caddr_t arg1, caddr_t arg2)
↓ open down ↓ 643 lines elided ↑ open up ↑
3271 3249  
3272 3250          media->m_pblksize = ns->ns_best_block_size;
3273 3251  
3274 3252          return (0);
3275 3253  }
3276 3254  
3277 3255  static int
3278 3256  nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
3279 3257  {
3280 3258          nvme_t *nvme = ns->ns_nvme;
3281      -        nvme_cmd_t *cmd, *ret;
     3259 +        nvme_cmd_t *cmd;
3282 3260          nvme_qpair_t *ioq;
3283 3261          boolean_t poll;
     3262 +        int ret;
3284 3263  
3285 3264          if (nvme->n_dead)
3286 3265                  return (EIO);
3287 3266  
3288 3267          cmd = nvme_create_nvm_cmd(ns, opc, xfer);
3289 3268          if (cmd == NULL)
3290 3269                  return (ENOMEM);
3291 3270  
3292 3271          cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1;
3293 3272          ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
3294 3273          ioq = nvme->n_ioq[cmd->nc_sqid];
3295 3274  
3296 3275          /*
3297 3276           * Get the polling flag before submitting the command. The command may
3298 3277           * complete immediately after it was submitted, which means we must
3299 3278           * treat both cmd and xfer as if they have been freed already.
3300 3279           */
3301 3280          poll = (xfer->x_flags & BD_XFER_POLL) != 0;
3302 3281  
3303      -        if (nvme_submit_cmd(ioq, cmd) != DDI_SUCCESS)
3304      -                return (EAGAIN);
     3282 +        ret = nvme_submit_io_cmd(ioq, cmd);
3305 3283  
     3284 +        if (ret != 0)
     3285 +                return (ret);
     3286 +
3306 3287          if (!poll)
3307 3288                  return (0);
3308 3289  
3309 3290          do {
3310      -                ret = nvme_retrieve_cmd(nvme, ioq);
3311      -                if (ret != NULL)
3312      -                        nvme_bd_xfer_done(ret);
     3291 +                cmd = nvme_retrieve_cmd(nvme, ioq);
     3292 +                if (cmd != NULL)
     3293 +                        nvme_bd_xfer_done(cmd);
3313 3294                  else
3314 3295                          drv_usecwait(10);
3315 3296          } while (ioq->nq_active_cmds != 0);
3316 3297  
3317 3298          return (0);
3318 3299  }
3319 3300  
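Note: the poll-flag comment in nvme_bd_cmd() describes a general submission
race: a fast device may complete the command from interrupt context before the
submitting thread executes another instruction, at which point cmd and xfer may
already be freed. Anything needed after submission must therefore be copied out
beforehand. A generic sketch (pkt_t, submit_fast(), and poll_until_drained()
are hypothetical):

    int
    submit_and_maybe_poll(pkt_t *pkt)
    {
            /* Copy out everything needed after submission... */
            boolean_t poll = pkt->p_poll;

            submit_fast(pkt);
            /* ...pkt may already be completed and freed at this point. */

            if (poll)
                    poll_until_drained();
            return (0);
    }
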
3320 3301  static int
3321 3302  nvme_bd_read(void *arg, bd_xfer_t *xfer)
3322 3303  {
↓ open down ↓ 580 lines elided ↑ open up ↑