/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2016 Tegile Systems, Inc. All rights reserved.
 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 */

/*
 * blkdev driver for NVMe compliant storage devices
 *
 * This driver was written to conform to version 1.2.1 of the NVMe
 * specification. It may work with newer versions, but that is completely
 * untested and disabled by default.
 *
 * The driver has only been tested on x86 systems and will not work on big-
 * endian systems without changes to the code accessing registers and data
 * structures used by the hardware.
 *
 *
 * Interrupt Usage:
 *
 * The driver will use a single interrupt while configuring the device as the
 * specification requires, but contrary to the specification it will try to use
 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
 * will switch to multiple-message MSI(-X) if supported. The driver wants to
 * have one interrupt vector per CPU, but it will work correctly if fewer are
 * available. Interrupts can be shared by queues; the interrupt handler will
 * iterate through the I/O queue array in steps of n_intr_cnt. Usually only
 * the admin queue will share an interrupt with one I/O queue. The interrupt
 * handler will retrieve completed commands from all queues sharing an
 * interrupt vector and will post them to a taskq for completion processing.
 *
 *
 * Command Processing:
 *
 * NVMe devices can have up to 65536 I/O queue pairs, with each queue holding
 * up to 65536 I/O commands. The driver will configure one I/O queue pair per
 * available interrupt vector, with the queue length usually much smaller than
 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
 * interrupt vectors will be used.
 *
 * Additionally the hardware provides a single special admin queue pair that
 * can hold up to 4096 admin commands.
 *
 * From the hardware perspective both queues of a queue pair are independent,
 * but they share some driver state: the command array (holding pointers to
 * commands currently being processed by the hardware) and the active command
 * counter. Access to the submission side of a queue pair and the shared state
 * is protected by nq_mutex. The completion side of a queue pair does not need
 * that protection apart from its access to the shared state; it is called only
 * in the interrupt handler, which does not run concurrently for the same
 * interrupt vector.
 *
 * When a command is submitted to a queue pair the active command counter is
 * incremented and a pointer to the command is stored in the command array. The
 * array index is used as command identifier (CID) in the submission queue
 * entry. Some commands may take a very long time to complete, and if the queue
 * wraps around in that time a submission may find the next array slot still in
 * use by a long-running command. In this case the array is sequentially
 * searched for the next free slot. The length of the command array is the same
 * as the configured queue length.
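 *
 * As an illustration (queue length and slot numbers are hypothetical): with a
 * command array of 16 entries the 17th submission wraps back to index 0. If
 * slot 0 is still held by a long-running asynchronous event request, the
 * search advances and the command is stored in slot 1, so its CID is 1.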
 *
 *
 * Polled I/O Support:
 *
 * For kernel core dump support the driver can do polled I/O. As interrupts are
 * turned off while dumping, the driver will just submit a command in the
 * regular way and then repeatedly attempt a command retrieval until it gets
 * the command back.
 *
 *
 * Namespace Support:
 *
 * NVMe devices can have multiple namespaces, each being an independent data
 * store. The driver supports multiple namespaces and creates a blkdev
 * interface for each namespace found. Namespaces can have various attributes
 * to support thin provisioning and protection information. This driver does
 * not support any of this and ignores namespaces that have these attributes.
 *
 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
 * (EUI64). This driver uses the EUI64, if present, to generate the devid and
 * passes it to blkdev for use in the device node names. As this is currently
 * untested, namespaces with an EUI64 are ignored by default.
 *
 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
 * single controller. This is an artificial limit imposed by the driver to be
 * able to address a reasonable number of controllers and namespaces using a
 * 32bit minor node number.
 *
 *
 * Minor nodes:
 *
 * For each NVMe device the driver exposes one minor node for the controller
 * and one minor node for each namespace. The only operations supported by
 * those minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
 * interface for the nvmeadm(1M) utility.
 *
 *
 * Blkdev Interface:
 *
 * This driver uses blkdev to do all the heavy lifting involved with presenting
 * a disk device to the system. As a result, the processing of I/O requests is
 * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
 * setup, and splitting of transfers into manageable chunks.
 *
 * I/O requests coming in from blkdev are turned into NVM commands and posted
 * to an I/O queue. The queue is selected by taking the CPU id modulo the
 * number of queues. There is currently no timeout handling of I/O commands.
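 *
 * For illustration (the numbers are hypothetical): on a system with eight I/O
 * queues, a request issued from CPU 11 is posted to queue 11 % 8 = 3, and a
 * request issued from CPU 3 lands on the same queue.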
 *
 * Blkdev also supports querying device/media information and generating a
 * devid. The driver reports the best block size as determined by the namespace
 * format back to blkdev as physical block size to support partition and block
 * alignment. The devid is either based on the namespace EUI64, if present, or
 * composed using the device vendor ID, model number, serial number, and the
 * namespace ID.
 *
 *
 * Error Handling:
 *
 * Error handling is currently limited to detecting fatal hardware errors,
 * either by asynchronous events or synchronously through command status or
 * admin command timeouts. In case of severe errors the device is fenced off
 * and all further requests will return EIO. FMA is then called to fault the
 * device.
 *
 * The hardware has a limit for outstanding asynchronous event requests. Before
 * this limit is known the driver assumes it is at least 1 and posts a single
 * asynchronous request. Later, when the limit is known, more asynchronous
 * event requests are posted to allow quicker reception of error information.
 * When an asynchronous event is posted by the hardware the driver will parse
 * the error status fields and log information or fault the device, depending
 * on the severity of the asynchronous event. The asynchronous event request is
 * then reused and posted to the admin queue again.
 *
 * On command completion the command status is checked for errors. In case of
 * errors indicating a driver bug the driver panics. Almost all other error
 * status values just cause EIO to be returned.
 *
 * Command timeouts are currently detected for all admin commands except
 * asynchronous event requests. If a command times out and the hardware appears
 * to be healthy the driver attempts to abort the command. If this fails the
 * driver assumes the device to be dead, fences it off, and calls FMA to retire
 * it. In general admin commands are issued at attach time only. No timeout
 * handling of normal I/O commands is presently done.
 *
 * In some cases it may be possible that the ABORT command times out, too. In
 * that case the device is also declared dead and fenced off.
 *
 *
 * Quiesce / Fast Reboot:
 *
 * The driver currently does not support fast reboot. A quiesce(9E) entry point
 * is still provided which is used to send a shutdown notification to the
 * device.
 *
 *
 * Driver Configuration:
 *
 * The following driver properties can be changed to control some aspects of
 * the driver's operation (an illustrative nvme.conf fragment follows this
 * block comment):
 * - strict-version: can be set to 0 to allow devices conforming to newer
 *   versions or namespaces with EUI64 to be used
 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
 *   specific command status as a fatal error leading to device faulting
 * - admin-queue-len: the maximum length of the admin queue (16-4096)
 * - io-queue-len: the maximum length of the I/O queues (16-65536)
 * - async-event-limit: the maximum number of asynchronous event requests to be
 *   posted by the driver
 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 *   cache
 * - min-phys-block-size: the minimum physical block size to report to blkdev,
 *   which is among other things the basis for ZFS vdev ashift
 *
 *
 * TODO:
 * - figure out sane default for I/O queue depth reported to blkdev
 * - FMA handling of media errors
 * - support for devices supporting very large I/O requests using chained PRPs
 * - support for configuring hardware parameters like interrupt coalescing
 * - support for media formatting and hard partitioning into namespaces
 * - support for big-endian systems
 * - support for fast reboot
 * - support for firmware updates
 * - support for NVMe Subsystem Reset (1.1)
 * - support for Scatter/Gather lists (1.1)
 * - support for Reservations (1.1)
 * - support for power management
 */
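
/*
 * Illustrative /kernel/drv/nvme.conf fragment showing how the properties
 * listed above can be set; the values below are examples only, not the
 * driver's defaults:
 *
 *	strict-version=1;
 *	admin-queue-len=256;
 *	io-queue-len=1024;
 *	async-event-limit=10;
 *	volatile-write-cache-enable=1;
 *	min-phys-block-size=4096;
 */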

#include <sys/byteorder.h>
#ifdef _BIG_ENDIAN
#error nvme driver needs porting for big-endian platforms
#endif

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/varargs.h>
#include <sys/cpuvar.h>
#include <sys/disp.h>
#include <sys/blkdev.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/sata/sata_hba.h>
#include <sys/stat.h>
#include <sys/policy.h>

#include <sys/nvme.h>

#ifdef __x86
#include <sys/x86_archext.h>
#endif

#include "nvme_reg.h"
#include "nvme_var.h"


/* NVMe spec version supported */
static const int nvme_version_major = 1;
static const int nvme_version_minor = 2;

/* tunable for admin command timeout in seconds, default is 1s */
int nvme_admin_cmd_timeout = 1;

/* tunable for FORMAT NVM command timeout in seconds, default is 600s */
int nvme_format_cmd_timeout = 600;

static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
static int nvme_quiesce(dev_info_t *);
static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
static int nvme_setup_interrupts(nvme_t *, int, int);
static void nvme_release_interrupts(nvme_t *);
static uint_t nvme_intr(caddr_t, caddr_t);

static void nvme_shutdown(nvme_t *, int, boolean_t);
static boolean_t nvme_reset(nvme_t *, boolean_t);
static int nvme_init(nvme_t *);
static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
static void nvme_free_cmd(nvme_cmd_t *);
static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
    bd_xfer_t *);
static int nvme_admin_cmd(nvme_cmd_t *, int);
static int nvme_submit_cmd(nvme_qpair_t *, nvme_cmd_t *);
static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
static void nvme_wakeup_cmd(void *);
static void nvme_async_event_task(void *);

static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
static int nvme_check_specific_cmd_status(nvme_cmd_t *);
static int nvme_check_generic_cmd_status(nvme_cmd_t *);
static inline int nvme_check_cmd_status(nvme_cmd_t *);

static void nvme_abort_cmd(nvme_cmd_t *);
static int nvme_async_event(nvme_t *);
static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t,
    boolean_t, uint8_t);
static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...);
static void *nvme_identify(nvme_t *, uint32_t);
static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
    uint32_t *);
static boolean_t nvme_get_features(nvme_t *, uint32_t, uint8_t, uint32_t *,
    void **, size_t *);
static boolean_t nvme_write_cache_set(nvme_t *, boolean_t);
static int nvme_set_nqueues(nvme_t *, uint16_t);

static void nvme_free_dma(nvme_dma_t *);
static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
    nvme_dma_t **);
static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
    nvme_dma_t **);
static void nvme_free_qpair(nvme_qpair_t *);
static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, int);
static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);

static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
static inline uint32_t nvme_get32(nvme_t *, uintptr_t);

static boolean_t nvme_check_regs_hdl(nvme_t *);
static boolean_t nvme_check_dma_hdl(nvme_dma_t *);

static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *);

static void nvme_bd_xfer_done(void *);
static void nvme_bd_driveinfo(void *, bd_drive_t *);
static int nvme_bd_mediainfo(void *, bd_media_t *);
static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
static int nvme_bd_read(void *, bd_xfer_t *);
static int nvme_bd_write(void *, bd_xfer_t *);
static int nvme_bd_sync(void *, bd_xfer_t *);
static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);

static int nvme_prp_dma_constructor(void *, void *, int);
static void nvme_prp_dma_destructor(void *, void *);

static void nvme_prepare_devid(nvme_t *, uint32_t);

static int nvme_open(dev_t *, int, int, cred_t *);
static int nvme_close(dev_t, int, int, cred_t *);
static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

#define	NVME_MINOR_INST_SHIFT	9
#define	NVME_MINOR(inst, nsid)	(((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
#define	NVME_MINOR_INST(minor)	((minor) >> NVME_MINOR_INST_SHIFT)
#define	NVME_MINOR_NSID(minor)	((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
#define	NVME_MINOR_MAX		(NVME_MINOR(1, 0) - 2)

static void *nvme_state;
static kmem_cache_t *nvme_cmd_cache;

/*
 * DMA attributes for queue DMA memory
 *
 * Queue DMA memory must be page aligned. The maximum length of a queue is
 * 65536 entries, and an entry can be 64 bytes long.
 */
static ddi_dma_attr_t nvme_queue_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	.dma_attr_count_max	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
	.dma_attr_align		= 0x1000,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x1000,
	.dma_attr_maxxfer	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
	.dma_attr_seg		= 0xffffffffffffffffULL,
	.dma_attr_sgllen	= 1,
	.dma_attr_granular	= 1,
	.dma_attr_flags		= 0,
};

/*
 * DMA attributes for transfers using Physical Region Page (PRP) entries
 *
 * A PRP entry describes one page of DMA memory using the page size specified
 * in the controller configuration's memory page size register (CC.MPS). It
 * uses a 64bit base address aligned to this page size. There is no limitation
 * on chaining PRPs together for arbitrarily large DMA transfers.
 */
static ddi_dma_attr_t nvme_prp_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	.dma_attr_count_max	= 0xfff,
	.dma_attr_align		= 0x1000,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x1000,
	.dma_attr_maxxfer	= 0x1000,
	.dma_attr_seg		= 0xfff,
	.dma_attr_sgllen	= -1,
	.dma_attr_granular	= 1,
	.dma_attr_flags		= 0,
};
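
/*
 * To give a sense of scale (the transfer size below is just an example): with
 * the common 4096 byte CC.MPS page size each PRP entry maps one page, so a
 * 64k transfer needs 16 PRP entries. The submission queue entry itself only
 * carries two PRP pointers; larger transfers place the remaining entries in a
 * PRP list page allocated from the PRP DMA cache (n_prp_cache). Chaining
 * several PRP list pages for very large requests is not implemented yet, see
 * the TODO list above.
 */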

/*
 * DMA attributes for transfers using scatter/gather lists
 *
 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
 * 32bit length field. SGL Segment and SGL Last Segment entries require the
 * length to be a multiple of 16 bytes.
 */
static ddi_dma_attr_t nvme_sgl_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	.dma_attr_count_max	= 0xffffffffUL,
	.dma_attr_align		= 1,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x10,
	.dma_attr_maxxfer	= 0xfffffffffULL,
	.dma_attr_seg		= 0xffffffffffffffffULL,
	.dma_attr_sgllen	= -1,
	.dma_attr_granular	= 0x10,
	.dma_attr_flags		= 0
};

static ddi_device_acc_attr_t nvme_reg_acc_attr = {
	.devacc_attr_version	= DDI_DEVICE_ATTR_V0,
	.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
	.devacc_attr_dataorder	= DDI_STRICTORDER_ACC
};

static struct cb_ops nvme_cb_ops = {
	.cb_open	= nvme_open,
	.cb_close	= nvme_close,
	.cb_strategy	= nodev,
	.cb_print	= nodev,
	.cb_dump	= nodev,
	.cb_read	= nodev,
	.cb_write	= nodev,
	.cb_ioctl	= nvme_ioctl,
	.cb_devmap	= nodev,
	.cb_mmap	= nodev,
	.cb_segmap	= nodev,
	.cb_chpoll	= nochpoll,
	.cb_prop_op	= ddi_prop_op,
	.cb_str		= 0,
	.cb_flag	= D_NEW | D_MP,
	.cb_rev		= CB_REV,
	.cb_aread	= nodev,
	.cb_awrite	= nodev
};

static struct dev_ops nvme_dev_ops = {
	.devo_rev	= DEVO_REV,
	.devo_refcnt	= 0,
	.devo_getinfo	= ddi_no_info,
	.devo_identify	= nulldev,
	.devo_probe	= nulldev,
	.devo_attach	= nvme_attach,
	.devo_detach	= nvme_detach,
	.devo_reset	= nodev,
	.devo_cb_ops	= &nvme_cb_ops,
	.devo_bus_ops	= NULL,
	.devo_power	= NULL,
	.devo_quiesce	= nvme_quiesce,
};

static struct modldrv nvme_modldrv = {
	.drv_modops	= &mod_driverops,
	.drv_linkinfo	= "NVMe v1.1b",
	.drv_dev_ops	= &nvme_dev_ops
};

static struct modlinkage nvme_modlinkage = {
	.ml_rev		= MODREV_1,
	.ml_linkage	= { &nvme_modldrv, NULL }
};

static bd_ops_t nvme_bd_ops = {
	.o_version	= BD_OPS_VERSION_0,
	.o_drive_info	= nvme_bd_driveinfo,
	.o_media_info	= nvme_bd_mediainfo,
	.o_devid_init	= nvme_bd_devid,
	.o_sync_cache	= nvme_bd_sync,
	.o_read		= nvme_bd_read,
	.o_write	= nvme_bd_write,
};

int
_init(void)
{
	int error;

	error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
	if (error != DDI_SUCCESS)
		return (error);

	nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
	    sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

	bd_mod_init(&nvme_dev_ops);

	error = mod_install(&nvme_modlinkage);
	if (error != DDI_SUCCESS) {
		ddi_soft_state_fini(&nvme_state);
		bd_mod_fini(&nvme_dev_ops);
	}

	return (error);
}

int
_fini(void)
{
	int error;

	error = mod_remove(&nvme_modlinkage);
	if (error == DDI_SUCCESS) {
		ddi_soft_state_fini(&nvme_state);
		kmem_cache_destroy(nvme_cmd_cache);
		bd_mod_fini(&nvme_dev_ops);
	}

	return (error);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&nvme_modlinkage, modinfop));
}

static inline void
nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
{
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
}

static inline void
nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
{
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
}

static inline uint64_t
nvme_get64(nvme_t *nvme, uintptr_t reg)
{
	uint64_t val;

	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));

	return (val);
}

static inline uint32_t
nvme_get32(nvme_t *nvme, uintptr_t reg)
{
	uint32_t val;

	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));

	return (val);
}

static boolean_t
nvme_check_regs_hdl(nvme_t *nvme)
{
	ddi_fm_error_t error;

	ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);

	if (error.fme_status != DDI_FM_OK)
		return (B_TRUE);

	return (B_FALSE);
}

static boolean_t
nvme_check_dma_hdl(nvme_dma_t *dma)
{
	ddi_fm_error_t error;

	if (dma == NULL)
		return (B_FALSE);

	ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);

	if (error.fme_status != DDI_FM_OK)
		return (B_TRUE);

	return (B_FALSE);
}

static void
nvme_free_dma_common(nvme_dma_t *dma)
{
	if (dma->nd_dmah != NULL)
		(void) ddi_dma_unbind_handle(dma->nd_dmah);
	if (dma->nd_acch != NULL)
		ddi_dma_mem_free(&dma->nd_acch);
	if (dma->nd_dmah != NULL)
		ddi_dma_free_handle(&dma->nd_dmah);
}

static void
nvme_free_dma(nvme_dma_t *dma)
{
	nvme_free_dma_common(dma);
	kmem_free(dma, sizeof (*dma));
}

/* ARGSUSED */
static void
nvme_prp_dma_destructor(void *buf, void *private)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;

	nvme_free_dma_common(dma);
}

static int
nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
    size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
{
	if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
	    &dma->nd_dmah) != DDI_SUCCESS) {
		/*
		 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
		 * the only other possible error is DDI_DMA_BADATTR which
		 * indicates a driver bug which should cause a panic.
		 */
		dev_err(nvme->n_dip, CE_PANIC,
		    "!failed to get DMA handle, check DMA attributes");
		return (DDI_FAILURE);
	}

	/*
	 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
	 * or the flags are conflicting, which isn't the case here.
	 */
	(void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
	    &dma->nd_len, &dma->nd_acch);

	if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
	    dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
	    &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to bind DMA memory");
		atomic_inc_32(&nvme->n_dma_bind_err);
		nvme_free_dma_common(dma);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

static int
nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
    ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
{
	nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);

	if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
	    DDI_SUCCESS) {
		*ret = NULL;
		kmem_free(dma, sizeof (nvme_dma_t));
		return (DDI_FAILURE);
	}

	bzero(dma->nd_memp, dma->nd_len);

	*ret = dma;
	return (DDI_SUCCESS);
}

/* ARGSUSED */
static int
nvme_prp_dma_constructor(void *buf, void *private, int flags)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;
	nvme_t *nvme = (nvme_t *)private;

	dma->nd_dmah = NULL;
	dma->nd_acch = NULL;

	if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
	    DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
		return (-1);
	}

	ASSERT(dma->nd_ncookie == 1);

	dma->nd_cached = B_TRUE;

	return (0);
}

static int
nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
    uint_t flags, nvme_dma_t **dma)
{
	uint32_t len = nentry * qe_len;
	ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;

	len = roundup(len, nvme->n_pagesize);

	q_dma_attr.dma_attr_minxfer = len;

	if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
	    != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to get DMA memory for queue");
		goto fail;
	}

	if ((*dma)->nd_ncookie != 1) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!got too many cookies for queue DMA");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	if (*dma) {
		nvme_free_dma(*dma);
		*dma = NULL;
	}

	return (DDI_FAILURE);
}

static void
nvme_free_qpair(nvme_qpair_t *qp)
{
	int i;

	mutex_destroy(&qp->nq_mutex);

	if (qp->nq_sqdma != NULL)
		nvme_free_dma(qp->nq_sqdma);
	if (qp->nq_cqdma != NULL)
		nvme_free_dma(qp->nq_cqdma);

	if (qp->nq_active_cmds > 0)
		for (i = 0; i != qp->nq_nentry; i++)
			if (qp->nq_cmd[i] != NULL)
				nvme_free_cmd(qp->nq_cmd[i]);

	if (qp->nq_cmd != NULL)
		kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);

	kmem_free(qp, sizeof (nvme_qpair_t));
}

static int
nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
    int idx)
{
	nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);

	mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(nvme->n_intr_pri));

	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
	    DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
		goto fail;

	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
	    DDI_DMA_READ, &qp->nq_cqdma) != DDI_SUCCESS)
		goto fail;

	qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
	qp->nq_cq = (nvme_cqe_t *)qp->nq_cqdma->nd_memp;
	qp->nq_nentry = nentry;

	qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
	qp->nq_cqhdbl = NVME_REG_CQHDBL(nvme, idx);

	qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
	qp->nq_next_cmd = 0;

	*nqp = qp;
	return (DDI_SUCCESS);

fail:
	nvme_free_qpair(qp);
	*nqp = NULL;

	return (DDI_FAILURE);
}

static nvme_cmd_t *
nvme_alloc_cmd(nvme_t *nvme, int kmflag)
{
	nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);

	if (cmd == NULL)
		return (cmd);

	bzero(cmd, sizeof (nvme_cmd_t));

	cmd->nc_nvme = nvme;

	mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(nvme->n_intr_pri));
	cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);

	return (cmd);
}

static void
nvme_free_cmd(nvme_cmd_t *cmd)
{
	if (cmd->nc_dma) {
		if (cmd->nc_dma->nd_cached)
			kmem_cache_free(cmd->nc_nvme->n_prp_cache,
			    cmd->nc_dma);
		else
			nvme_free_dma(cmd->nc_dma);
		cmd->nc_dma = NULL;
	}

	cv_destroy(&cmd->nc_cv);
	mutex_destroy(&cmd->nc_mutex);

	kmem_cache_free(nvme_cmd_cache, cmd);
}

static int
nvme_submit_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
{
	nvme_reg_sqtdbl_t tail = { 0 };

	mutex_enter(&qp->nq_mutex);

	if (qp->nq_active_cmds == qp->nq_nentry) {
		mutex_exit(&qp->nq_mutex);
		return (DDI_FAILURE);
	}

	cmd->nc_completed = B_FALSE;

	/*
	 * Try to insert the cmd into the active cmd array at the nq_next_cmd
	 * slot. If the slot is already occupied advance to the next slot and
	 * try again. This can happen for long running commands like async
	 * event requests.
	 */
	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
	qp->nq_cmd[qp->nq_next_cmd] = cmd;

	qp->nq_active_cmds++;

	cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
	bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
	(void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
	    sizeof (nvme_sqe_t) * qp->nq_sqtail,
	    sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
	qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;

	tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
	nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);

	mutex_exit(&qp->nq_mutex);
	return (DDI_SUCCESS);
}

static nvme_cmd_t *
nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
{
	nvme_reg_cqhdbl_t head = { 0 };

	nvme_cqe_t *cqe;
	nvme_cmd_t *cmd;

	(void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
	    sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);

	mutex_enter(&qp->nq_mutex);
	cqe = &qp->nq_cq[qp->nq_cqhead];

	/* Check phase tag of CQE. Hardware inverts it for new entries. */
	if (cqe->cqe_sf.sf_p == qp->nq_phase) {
		mutex_exit(&qp->nq_mutex);
		return (NULL);
	}

	ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
	ASSERT(cqe->cqe_cid < qp->nq_nentry);

	cmd = qp->nq_cmd[cqe->cqe_cid];
	qp->nq_cmd[cqe->cqe_cid] = NULL;
	qp->nq_active_cmds--;

	ASSERT(cmd != NULL);
	ASSERT(cmd->nc_nvme == nvme);
	ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
	ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
	bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));

	qp->nq_sqhead = cqe->cqe_sqhd;

	head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;

	/* Toggle phase on wrap-around. */
	if (qp->nq_cqhead == 0)
		qp->nq_phase = qp->nq_phase ? 0 : 1;

	nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
	mutex_exit(&qp->nq_mutex);

	return (cmd);
}
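
/*
 * An illustrative walk-through of the phase tag handling above, using a
 * hypothetical 4-entry completion queue: the queue memory starts out zeroed
 * and nq_phase is 0 (the qpair is kmem_zalloc'd), so an entry whose phase bit
 * still equals nq_phase is stale. The controller writes its first pass of
 * completions with the phase bit set to 1, which makes them visible to
 * nvme_retrieve_cmd(). Once nq_cqhead wraps from entry 3 back to 0, nq_phase
 * toggles to 1 and the controller's second pass, written with the phase bit
 * cleared again, remains distinguishable from the stale first-pass entries.
 */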

static int
nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	dev_err(cmd->nc_nvme->n_dip, CE_WARN,
	    "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
	    "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
	    cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
	    cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);

	if (cmd->nc_xfer != NULL)
		bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);

	if (cmd->nc_nvme->n_strict_version) {
		cmd->nc_nvme->n_dead = B_TRUE;
		ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
	}

	return (EIO);
}

static int
nvme_check_vendor_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	dev_err(cmd->nc_nvme->n_dip, CE_WARN,
	    "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
	    "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
	    cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
	    cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
	if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) {
		cmd->nc_nvme->n_dead = B_TRUE;
		ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
	}

	return (EIO);
}

static int
nvme_check_integrity_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	switch (cqe->cqe_sf.sf_sc) {
	case NVME_CQE_SC_INT_NVM_WRITE:
		/* write fail */
		/* TODO: post ereport */
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
		return (EIO);

	case NVME_CQE_SC_INT_NVM_READ:
		/* read fail */
		/* TODO: post ereport */
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
		return (EIO);

	default:
		return (nvme_check_unknown_cmd_status(cmd));
	}
}

static int
nvme_check_generic_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	switch (cqe->cqe_sf.sf_sc) {
	case NVME_CQE_SC_GEN_SUCCESS:
		return (0);

	/*
	 * Errors indicating a bug in the driver should cause a panic.
	 */
	case NVME_CQE_SC_GEN_INV_OPC:
		/* Invalid Command Opcode */
		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
		    "invalid opcode in cmd %p", (void *)cmd);
		return (0);

	case NVME_CQE_SC_GEN_INV_FLD:
		/* Invalid Field in Command */
		if (!cmd->nc_dontpanic)
			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
			    "programming error: invalid field in cmd %p",
			    (void *)cmd);
		return (EIO);

	case NVME_CQE_SC_GEN_ID_CNFL:
		/* Command ID Conflict */
		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
		    "cmd ID conflict in cmd %p", (void *)cmd);
		return (0);

	case NVME_CQE_SC_GEN_INV_NS:
		/* Invalid Namespace or Format */
		if (!cmd->nc_dontpanic)
			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
			    "programming error: invalid NS/format in cmd %p",
			    (void *)cmd);
		return (EINVAL);

	case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
		/* LBA Out Of Range */
		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
		    "LBA out of range in cmd %p", (void *)cmd);
		return (0);

	/*
	 * Non-fatal errors, handle gracefully.
	 */
	case NVME_CQE_SC_GEN_DATA_XFR_ERR:
		/* Data Transfer Error (DMA) */
		/* TODO: post ereport */
		atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
		return (EIO);

	case NVME_CQE_SC_GEN_INTERNAL_ERR:
		/*
		 * Internal Error. The spec (v1.0, section 4.5.1.2) says
		 * detailed error information is returned as async event,
		 * so we pretty much ignore the error here and handle it
		 * in the async event handler.
		 */
		atomic_inc_32(&cmd->nc_nvme->n_internal_err);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
		return (EIO);

	case NVME_CQE_SC_GEN_ABORT_REQUEST:
		/*
		 * Command Abort Requested. This normally happens only when a
		 * command times out.
		 */
		/* TODO: post ereport or change blkdev to handle this? */
		atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err);
		return (ECANCELED);

	case NVME_CQE_SC_GEN_ABORT_PWRLOSS:
		/* Command Aborted due to Power Loss Notification */
		ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
		cmd->nc_nvme->n_dead = B_TRUE;
		return (EIO);

	case NVME_CQE_SC_GEN_ABORT_SQ_DEL:
		/* Command Aborted due to SQ Deletion */
		atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del);
		return (EIO);

	case NVME_CQE_SC_GEN_NVM_CAP_EXC:
		/* Capacity Exceeded */
		atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
		return (EIO);

	case NVME_CQE_SC_GEN_NVM_NS_NOTRDY:
		/* Namespace Not Ready */
		atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
		return (EIO);

	default:
		return (nvme_check_unknown_cmd_status(cmd));
	}
}

static int
nvme_check_specific_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	switch (cqe->cqe_sf.sf_sc) {
	case NVME_CQE_SC_SPC_INV_CQ:
		/* Completion Queue Invalid */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE);
		atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err);
		return (EINVAL);

	case NVME_CQE_SC_SPC_INV_QID:
		/* Invalid Queue Identifier */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
		atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err);
		return (EINVAL);

	case NVME_CQE_SC_SPC_MAX_QSZ_EXC:
		/* Max Queue Size Exceeded */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
		atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc);
		return (EINVAL);

	case NVME_CQE_SC_SPC_ABRT_CMD_EXC:
		/* Abort Command Limit Exceeded */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT);
		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
		    "abort command limit exceeded in cmd %p", (void *)cmd);
		return (0);

	case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC:
		/* Async Event Request Limit Exceeded */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT);
		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
		    "async event request limit exceeded in cmd %p",
		    (void *)cmd);
		return (0);

	case NVME_CQE_SC_SPC_INV_INT_VECT:
		/* Invalid Interrupt Vector */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
		atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect);
		return (EINVAL);

	case NVME_CQE_SC_SPC_INV_LOG_PAGE:
		/* Invalid Log Page */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE);
		atomic_inc_32(&cmd->nc_nvme->n_inv_log_page);
		return (EINVAL);

	case NVME_CQE_SC_SPC_INV_FORMAT:
		/* Invalid Format */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT);
		atomic_inc_32(&cmd->nc_nvme->n_inv_format);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
		return (EINVAL);

	case NVME_CQE_SC_SPC_INV_Q_DEL:
		/* Invalid Queue Deletion */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
		atomic_inc_32(&cmd->nc_nvme->n_inv_q_del);
		return (EINVAL);

	case NVME_CQE_SC_SPC_NVM_CNFL_ATTR:
		/* Conflicting Attributes */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
		atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
		return (EINVAL);

	case NVME_CQE_SC_SPC_NVM_INV_PROT:
		/* Invalid Protection Information */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
		atomic_inc_32(&cmd->nc_nvme->n_inv_prot);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
		return (EINVAL);

	case NVME_CQE_SC_SPC_NVM_READONLY:
		/* Write to Read Only Range */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
		atomic_inc_32(&cmd->nc_nvme->n_readonly);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
		return (EROFS);

	default:
		return (nvme_check_unknown_cmd_status(cmd));
	}
}

static inline int
nvme_check_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	/* take a shortcut if everything is alright */
	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
	    cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
		return (0);

	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
		return (nvme_check_generic_cmd_status(cmd));
	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
		return (nvme_check_specific_cmd_status(cmd));
	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
		return (nvme_check_integrity_cmd_status(cmd));
	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
		return (nvme_check_vendor_cmd_status(cmd));

	return (nvme_check_unknown_cmd_status(cmd));
}

/*
 * nvme_abort_cmd_cb -- replaces nc_callback of aborted commands
 *
 * This function takes care of cleaning up aborted commands. The command
 * status is checked to catch any fatal errors.
 */
static void
nvme_abort_cmd_cb(void *arg)
{
	nvme_cmd_t *cmd = arg;

	/*
	 * Grab the command mutex. Once we have it we hold the last reference
	 * to the command and can safely free it.
	 */
	mutex_enter(&cmd->nc_mutex);
	(void) nvme_check_cmd_status(cmd);
	mutex_exit(&cmd->nc_mutex);

	nvme_free_cmd(cmd);
}

static void
nvme_abort_cmd(nvme_cmd_t *abort_cmd)
{
	nvme_t *nvme = abort_cmd->nc_nvme;
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	nvme_abort_cmd_t ac = { 0 };

	sema_p(&nvme->n_abort_sema);

	ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
	ac.b.ac_sqid = abort_cmd->nc_sqid;

	/*
	 * Drop the mutex of the aborted command. From this point on
	 * we must assume that the abort callback has freed the command.
	 */
	mutex_exit(&abort_cmd->nc_mutex);

	cmd->nc_sqid = 0;
	cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_cdw10 = ac.r;

	/*
	 * Send the ABORT to the hardware. The ABORT command will return
	 * _after_ the aborted command has completed (aborted or otherwise).
	 */
	if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
		sema_v(&nvme->n_abort_sema);
		dev_err(nvme->n_dip, CE_WARN,
		    "!nvme_admin_cmd failed for ABORT");
		atomic_inc_32(&nvme->n_abort_failed);
		return;
	}
	sema_v(&nvme->n_abort_sema);

	if (nvme_check_cmd_status(cmd)) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!ABORT failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
		atomic_inc_32(&nvme->n_abort_failed);
	} else {
		atomic_inc_32(&nvme->n_cmd_aborted);
	}

	nvme_free_cmd(cmd);
}

/*
 * nvme_wait_cmd -- wait for command completion or timeout
 *
 * Returns B_TRUE if the command completed normally.
 *
 * Returns B_FALSE if the command timed out and an abort was attempted. The
 * command mutex will be dropped and the command must be considered freed. The
 * freeing of the command is normally done by the abort command callback.
 *
 * In case of a serious error or a timeout of the abort command the hardware
 * will be declared dead and FMA will be notified.
 */
static boolean_t
nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec)
{
	clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC);
	nvme_t *nvme = cmd->nc_nvme;
	nvme_reg_csts_t csts;

	ASSERT(mutex_owned(&cmd->nc_mutex));

	while (!cmd->nc_completed) {
		if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)
			break;
	}

	if (cmd->nc_completed)
		return (B_TRUE);

	/*
	 * The command timed out. Change the callback to the cleanup function.
	 */
	cmd->nc_callback = nvme_abort_cmd_cb;

	/*
	 * Check controller for fatal status, any errors associated with the
	 * register or DMA handle, or for a double timeout (abort command timed
	 * out). If necessary log a warning and call FMA.
	 */
	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
	dev_err(nvme->n_dip, CE_WARN, "!command timeout, "
	    "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
	atomic_inc_32(&nvme->n_cmd_timeout);

	if (csts.b.csts_cfs ||
	    nvme_check_regs_hdl(nvme) ||
	    nvme_check_dma_hdl(cmd->nc_dma) ||
	    cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
		nvme->n_dead = B_TRUE;
		mutex_exit(&cmd->nc_mutex);
	} else {
		/*
		 * Try to abort the command. The command mutex is released by
		 * nvme_abort_cmd().
		 * If the abort succeeds it will have freed the aborted
		 * command. If the abort fails for other reasons we must
		 * assume that the command may complete at any time, and the
		 * callback will free it for us.
		 */
		nvme_abort_cmd(cmd);
	}

	return (B_FALSE);
}

static void
nvme_wakeup_cmd(void *arg)
{
	nvme_cmd_t *cmd = arg;

	mutex_enter(&cmd->nc_mutex);
	/*
	 * There is a slight chance that this command completed shortly after
	 * the timeout was hit in nvme_wait_cmd() but before the callback was
	 * changed. Catch that case here and clean up accordingly.
	 */
	if (cmd->nc_callback == nvme_abort_cmd_cb) {
		mutex_exit(&cmd->nc_mutex);
		nvme_abort_cmd_cb(cmd);
		return;
	}

	cmd->nc_completed = B_TRUE;
	cv_signal(&cmd->nc_cv);
	mutex_exit(&cmd->nc_mutex);
}

static void
nvme_async_event_task(void *arg)
{
	nvme_cmd_t *cmd = arg;
	nvme_t *nvme = cmd->nc_nvme;
	nvme_error_log_entry_t *error_log = NULL;
	nvme_health_log_t *health_log = NULL;
	size_t logsize = 0;
	nvme_async_event_t event;
	int ret;

	/*
	 * Check for errors associated with the async request itself. The only
	 * command-specific error is "async event limit exceeded", which
	 * indicates a programming error in the driver and causes a panic in
	 * nvme_check_cmd_status().
	 *
	 * Other possible errors are various scenarios where the async request
	 * was aborted, or internal errors in the device. Internal errors are
	 * reported to FMA, the command aborts need no special handling here.
	 */
	if (nvme_check_cmd_status(cmd)) {
		dev_err(cmd->nc_nvme->n_dip, CE_WARN,
		    "!async event request returned failure, sct = %x, "
		    "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
		    cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
		    cmd->nc_cqe.cqe_sf.sf_m);

		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
			cmd->nc_nvme->n_dead = B_TRUE;
			ddi_fm_service_impact(cmd->nc_nvme->n_dip,
			    DDI_SERVICE_LOST);
		}
		nvme_free_cmd(cmd);
		return;
	}

	event.r = cmd->nc_cqe.cqe_dw0;

	/* Clear CQE and re-submit the async request. */
	bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
	ret = nvme_submit_cmd(nvme->n_adminq, cmd);

	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to resubmit async event request");
		atomic_inc_32(&nvme->n_async_resubmit_failed);
		nvme_free_cmd(cmd);
	}

	switch (event.b.ae_type) {
	case NVME_ASYNC_TYPE_ERROR:
		if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
			(void) nvme_get_logpage(nvme, (void **)&error_log,
			    &logsize, event.b.ae_logpage);
		} else {
			dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
			    "async event reply: %d", event.b.ae_logpage);
			atomic_inc_32(&nvme->n_wrong_logpage);
		}

		switch (event.b.ae_info) {
		case NVME_ASYNC_ERROR_INV_SQ:
			dev_err(nvme->n_dip, CE_PANIC, "programming error: "
			    "invalid submission queue");
			return;

		case NVME_ASYNC_ERROR_INV_DBL:
			dev_err(nvme->n_dip, CE_PANIC, "programming error: "
			    "invalid doorbell write value");
			return;

		case NVME_ASYNC_ERROR_DIAGFAIL:
			dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure");
			ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
			nvme->n_dead = B_TRUE;
			atomic_inc_32(&nvme->n_diagfail_event);
			break;

		case NVME_ASYNC_ERROR_PERSISTENT:
			dev_err(nvme->n_dip, CE_WARN, "!persistent internal "
			    "device error");
			ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
			nvme->n_dead = B_TRUE;
			atomic_inc_32(&nvme->n_persistent_event);
			break;

		case NVME_ASYNC_ERROR_TRANSIENT:
			dev_err(nvme->n_dip, CE_WARN, "!transient internal "
			    "device error");
			/* TODO: send ereport */
			atomic_inc_32(&nvme->n_transient_event);
			break;

		case NVME_ASYNC_ERROR_FW_LOAD:
			dev_err(nvme->n_dip, CE_WARN,
			    "!firmware image load error");
			atomic_inc_32(&nvme->n_fw_load_event);
			break;
		}
		break;

	case NVME_ASYNC_TYPE_HEALTH:
		if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) {
			(void) nvme_get_logpage(nvme, (void **)&health_log,
			    &logsize, event.b.ae_logpage, -1);
		} else {
			dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
			    "async event reply: %d", event.b.ae_logpage);
			atomic_inc_32(&nvme->n_wrong_logpage);
		}

		switch (event.b.ae_info) {
		case NVME_ASYNC_HEALTH_RELIABILITY:
			dev_err(nvme->n_dip, CE_WARN,
			    "!device reliability compromised");
			/* TODO: send ereport */
			atomic_inc_32(&nvme->n_reliability_event);
			break;

		case NVME_ASYNC_HEALTH_TEMPERATURE:
			dev_err(nvme->n_dip, CE_WARN,
			    "!temperature above threshold");
			/* TODO: send ereport */
			atomic_inc_32(&nvme->n_temperature_event);
			break;

		case NVME_ASYNC_HEALTH_SPARE:
			dev_err(nvme->n_dip, CE_WARN,
			    "!spare space below threshold");
			/* TODO: send ereport */
			atomic_inc_32(&nvme->n_spare_event);
			break;
		}
		break;

	case NVME_ASYNC_TYPE_VENDOR:
		dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event "
		    "received, info = %x, logpage = %x", event.b.ae_info,
		    event.b.ae_logpage);
		atomic_inc_32(&nvme->n_vendor_event);
		break;

	default:
		dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
		    "type = %x, info = %x, logpage = %x", event.b.ae_type,
		    event.b.ae_info, event.b.ae_logpage);
		atomic_inc_32(&nvme->n_unknown_event);
		break;
	}

	if (error_log)
		kmem_free(error_log, logsize);

	if (health_log)
		kmem_free(health_log, logsize);
}

static int
nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
{
	int ret;

	mutex_enter(&cmd->nc_mutex);
	ret = nvme_submit_cmd(cmd->nc_nvme->n_adminq, cmd);

	if (ret != DDI_SUCCESS) {
		mutex_exit(&cmd->nc_mutex);
		dev_err(cmd->nc_nvme->n_dip, CE_WARN,
		    "!nvme_submit_cmd failed");
		atomic_inc_32(&cmd->nc_nvme->n_admin_queue_full);
		nvme_free_cmd(cmd);
		return (DDI_FAILURE);
	}

	if (nvme_wait_cmd(cmd, sec) == B_FALSE) {
		/*
		 * The command timed out. An abort command was posted that
		 * will take care of the cleanup.
		 */
		return (DDI_FAILURE);
	}
	mutex_exit(&cmd->nc_mutex);

	return (DDI_SUCCESS);
}

static int
nvme_async_event(nvme_t *nvme)
{
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	int ret;

	cmd->nc_sqid = 0;
	cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
	cmd->nc_callback = nvme_async_event_task;

	ret = nvme_submit_cmd(nvme->n_adminq, cmd);

	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!nvme_submit_cmd failed for ASYNCHRONOUS EVENT");
		nvme_free_cmd(cmd);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

static int
nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms,
    uint8_t pi, boolean_t pil, uint8_t ses)
{
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	nvme_format_nvm_t format_nvm = { 0 };
	int ret;

	format_nvm.b.fm_lbaf = lbaf & 0xf;
	format_nvm.b.fm_ms = ms ? 1 : 0;
	format_nvm.b.fm_pi = pi & 0x7;
	format_nvm.b.fm_pil = pil ? 1 : 0;
	format_nvm.b.fm_ses = ses & 0x7;

	cmd->nc_sqid = 0;
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_nsid = nsid;
	cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;
	cmd->nc_sqe.sqe_cdw10 = format_nvm.r;

	/*
	 * Some devices like Samsung SM951 don't allow formatting of all
	 * namespaces in one command. Handle that gracefully.
	 */
	if (nsid == (uint32_t)-1)
		cmd->nc_dontpanic = B_TRUE;

	if ((ret = nvme_admin_cmd(cmd, nvme_format_cmd_timeout))
	    != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!nvme_admin_cmd failed for FORMAT NVM");
		return (EIO);
	}

	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!FORMAT failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
	}

	nvme_free_cmd(cmd);
	return (ret);
}

static int
nvme_get_logpage(nvme_t *nvme, void **buf, size_t *bufsize, uint8_t logpage,
    ...)
{
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	nvme_getlogpage_t getlogpage = { 0 };
	va_list ap;
	int ret = DDI_FAILURE;

	va_start(ap, logpage);

	cmd->nc_sqid = 0;
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;

	getlogpage.b.lp_lid = logpage;

	switch (logpage) {
	case NVME_LOGPAGE_ERROR:
		cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
		/*
		 * The GET LOG PAGE command can use at most 2 pages to return
		 * data, PRP lists are not supported.
		 */
		*bufsize = MIN(2 * nvme->n_pagesize,
		    nvme->n_error_log_len * sizeof (nvme_error_log_entry_t));
		break;

	case NVME_LOGPAGE_HEALTH:
		cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t);
		*bufsize = sizeof (nvme_health_log_t);
		break;

	case NVME_LOGPAGE_FWSLOT:
		cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
		*bufsize = sizeof (nvme_fwslot_log_t);
		break;

	default:
		dev_err(nvme->n_dip, CE_WARN, "!unknown log page requested: %d",
		    logpage);
		atomic_inc_32(&nvme->n_unknown_logpage);
		goto fail;
	}

	va_end(ap);

	getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1;

	cmd->nc_sqe.sqe_cdw10 = getlogpage.r;

	if (nvme_zalloc_dma(nvme, getlogpage.b.lp_numd * sizeof (uint32_t),
	    DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!nvme_zalloc_dma failed for GET LOG PAGE");
		goto fail;
	}

	if (cmd->nc_dma->nd_ncookie > 2) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!too many DMA cookies for GET LOG PAGE");
		atomic_inc_32(&nvme->n_too_many_cookies);
		goto fail;
	}

	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
	if (cmd->nc_dma->nd_ncookie > 1) {
		ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
		    &cmd->nc_dma->nd_cookie);
		cmd->nc_sqe.sqe_dptr.d_prp[1] =
		    cmd->nc_dma->nd_cookie.dmac_laddress;
	}

	if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!nvme_admin_cmd failed for GET LOG PAGE");
		return (ret);
	}

	if (nvme_check_cmd_status(cmd)) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!GET LOG PAGE failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
		goto fail;
	}

	*buf = kmem_alloc(*bufsize, KM_SLEEP);
	bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);

	ret = DDI_SUCCESS;

fail:
	nvme_free_cmd(cmd);

	return (ret);
}

static void *
nvme_identify(nvme_t *nvme, uint32_t nsid)
{
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	void *buf = NULL;

	cmd->nc_sqid = 0;
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
	cmd->nc_sqe.sqe_nsid = nsid;
	cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL;
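	/*
	 * Note on the CNS selection above: a zero nsid requests the controller
	 * identify data, a non-zero nsid the identify data of that namespace;
	 * both variants fill a NVME_IDENTIFY_BUFSIZE sized buffer set up
	 * below.
	 */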

	if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
	    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!nvme_zalloc_dma failed for IDENTIFY");
		goto fail;
	}

	if (cmd->nc_dma->nd_ncookie > 2) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!too many DMA cookies for IDENTIFY");
		atomic_inc_32(&nvme->n_too_many_cookies);
		goto fail;
	}

	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
	if (cmd->nc_dma->nd_ncookie > 1) {
		ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
		    &cmd->nc_dma->nd_cookie);
		cmd->nc_sqe.sqe_dptr.d_prp[1] =
		    cmd->nc_dma->nd_cookie.dmac_laddress;
	}

	if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!nvme_admin_cmd failed for IDENTIFY");
		return (NULL);
	}

	if (nvme_check_cmd_status(cmd)) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!IDENTIFY failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
		goto fail;
	}

	buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
	bcopy(cmd->nc_dma->nd_memp, buf, NVME_IDENTIFY_BUFSIZE);

fail:
	nvme_free_cmd(cmd);

	return (buf);
}

static boolean_t
nvme_set_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t val,
    uint32_t *res)
{
	_NOTE(ARGUNUSED(nsid));
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	boolean_t ret = B_FALSE;

	ASSERT(res != NULL);

	cmd->nc_sqid = 0;
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
	cmd->nc_sqe.sqe_cdw10 = feature;
	cmd->nc_sqe.sqe_cdw11 = val;

	switch (feature) {
	case NVME_FEAT_WRITE_CACHE:
		if (!nvme->n_write_cache_present)
			goto fail;
		break;

	case NVME_FEAT_NQUEUES:
		break;

	default:
		goto fail;
	}

	if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!nvme_admin_cmd failed for SET FEATURES");
		return (ret);
	}

	if (nvme_check_cmd_status(cmd)) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!SET FEATURES %d failed with sct = %x, sc = %x",
		    feature, cmd->nc_cqe.cqe_sf.sf_sct,
		    cmd->nc_cqe.cqe_sf.sf_sc);
		goto fail;
	}

	*res = cmd->nc_cqe.cqe_dw0;
	ret = B_TRUE;

fail:
	nvme_free_cmd(cmd);
	return (ret);
}

static boolean_t
nvme_get_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t *res,
    void **buf, size_t *bufsize)
{
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	boolean_t ret = B_FALSE;

	ASSERT(res != NULL);

	if (bufsize != NULL)
		*bufsize = 0;

	cmd->nc_sqid = 0;
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES;
	cmd->nc_sqe.sqe_cdw10 = feature;
	cmd->nc_sqe.sqe_cdw11 = *res;

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
	case NVME_FEAT_POWER_MGMT:
	case NVME_FEAT_TEMPERATURE:
	case NVME_FEAT_ERROR:
	case NVME_FEAT_NQUEUES:
	case NVME_FEAT_INTR_COAL:
	case NVME_FEAT_INTR_VECT:
	case NVME_FEAT_WRITE_ATOM:
	case NVME_FEAT_ASYNC_EVENT:
	case NVME_FEAT_PROGRESS:
		break;

	case NVME_FEAT_WRITE_CACHE:
		if (!nvme->n_write_cache_present)
			goto fail;
		break;

NVME_FEAT_LBA_RANGE: 1853 if (!nvme->n_lba_range_supported) 1854 goto fail; 1855 1856 /* 1857 * The LBA Range Type feature is optional. There doesn't seem 1858 * to be a method of detecting whether it is supported other than 1859 * using it. This will cause an "invalid field in command" error, 1860 * which is normally considered a programming error and causes 1861 * a panic in nvme_check_generic_cmd_status(). 1862 */ 1863 cmd->nc_dontpanic = B_TRUE; 1864 cmd->nc_sqe.sqe_nsid = nsid; 1865 ASSERT(bufsize != NULL); 1866 *bufsize = NVME_LBA_RANGE_BUFSIZE; 1867 1868 break; 1869 1870 case NVME_FEAT_AUTO_PST: 1871 if (!nvme->n_auto_pst_supported) 1872 goto fail; 1873 1874 ASSERT(bufsize != NULL); 1875 *bufsize = NVME_AUTO_PST_BUFSIZE; 1876 break; 1877 1878 default: 1879 goto fail; 1880 } 1881 1882 if (bufsize != NULL && *bufsize != 0) { 1883 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ, 1884 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { 1885 dev_err(nvme->n_dip, CE_WARN, 1886 "!nvme_zalloc_dma failed for GET FEATURES"); 1887 goto fail; 1888 } 1889 1890 if (cmd->nc_dma->nd_ncookie > 2) { 1891 dev_err(nvme->n_dip, CE_WARN, 1892 "!too many DMA cookies for GET FEATURES"); 1893 atomic_inc_32(&nvme->n_too_many_cookies); 1894 goto fail; 1895 } 1896 1897 cmd->nc_sqe.sqe_dptr.d_prp[0] = 1898 cmd->nc_dma->nd_cookie.dmac_laddress; 1899 if (cmd->nc_dma->nd_ncookie > 1) { 1900 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, 1901 &cmd->nc_dma->nd_cookie); 1902 cmd->nc_sqe.sqe_dptr.d_prp[1] = 1903 cmd->nc_dma->nd_cookie.dmac_laddress; 1904 } 1905 } 1906 1907 if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) { 1908 dev_err(nvme->n_dip, CE_WARN, 1909 "!nvme_admin_cmd failed for GET FEATURES"); 1910 return (ret); 1911 } 1912 1913 if (nvme_check_cmd_status(cmd)) { 1914 if (feature == NVME_FEAT_LBA_RANGE && 1915 cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && 1916 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) 1917 nvme->n_lba_range_supported = B_FALSE; 1918 else 1919 dev_err(nvme->n_dip, CE_WARN, 1920 "!GET FEATURES %d failed with sct = %x, sc = %x", 1921 feature, cmd->nc_cqe.cqe_sf.sf_sct, 1922 cmd->nc_cqe.cqe_sf.sf_sc); 1923 goto fail; 1924 } 1925 1926 if (bufsize != NULL && *bufsize != 0) { 1927 ASSERT(buf != NULL); 1928 *buf = kmem_alloc(*bufsize, KM_SLEEP); 1929 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); 1930 } 1931 1932 *res = cmd->nc_cqe.cqe_dw0; 1933 ret = B_TRUE; 1934 1935 fail: 1936 nvme_free_cmd(cmd); 1937 return (ret); 1938 } 1939 1940 static boolean_t 1941 nvme_write_cache_set(nvme_t *nvme, boolean_t enable) 1942 { 1943 nvme_write_cache_t nwc = { 0 }; 1944 1945 if (enable) 1946 nwc.b.wc_wce = 1; 1947 1948 if (!nvme_set_features(nvme, 0, NVME_FEAT_WRITE_CACHE, nwc.r, &nwc.r)) 1949 return (B_FALSE); 1950 1951 return (B_TRUE); 1952 } 1953 1954 static int 1955 nvme_set_nqueues(nvme_t *nvme, uint16_t nqueues) 1956 { 1957 nvme_nqueues_t nq = { 0 }; 1958 1959 nq.b.nq_nsq = nq.b.nq_ncq = nqueues - 1; 1960 1961 if (!nvme_set_features(nvme, 0, NVME_FEAT_NQUEUES, nq.r, &nq.r)) { 1962 return (0); 1963 } 1964 1965 /* 1966 * Always use the same number of submission and completion queues, and 1967 * never use more than the requested number of queues. 
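 * For example, if we ask for 8 queue pairs and the controller reports
 * NSQA = NCQA = 3 in the Set Features completion (both values are
 * zero-based), the expression below yields MIN(8, 3 + 1) = 4 usable
 * I/O queue pairs.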
1968 */ 1969 return (MIN(nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq) + 1)); 1970 } 1971 1972 static int 1973 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx) 1974 { 1975 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 1976 nvme_create_queue_dw10_t dw10 = { 0 }; 1977 nvme_create_cq_dw11_t c_dw11 = { 0 }; 1978 nvme_create_sq_dw11_t s_dw11 = { 0 }; 1979 1980 dw10.b.q_qid = idx; 1981 dw10.b.q_qsize = qp->nq_nentry - 1; 1982 1983 c_dw11.b.cq_pc = 1; 1984 c_dw11.b.cq_ien = 1; 1985 c_dw11.b.cq_iv = idx % nvme->n_intr_cnt; 1986 1987 cmd->nc_sqid = 0; 1988 cmd->nc_callback = nvme_wakeup_cmd; 1989 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE; 1990 cmd->nc_sqe.sqe_cdw10 = dw10.r; 1991 cmd->nc_sqe.sqe_cdw11 = c_dw11.r; 1992 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_cqdma->nd_cookie.dmac_laddress; 1993 1994 if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) { 1995 dev_err(nvme->n_dip, CE_WARN, 1996 "!nvme_admin_cmd failed for CREATE CQUEUE"); 1997 return (DDI_FAILURE); 1998 } 1999 2000 if (nvme_check_cmd_status(cmd)) { 2001 dev_err(nvme->n_dip, CE_WARN, 2002 "!CREATE CQUEUE failed with sct = %x, sc = %x", 2003 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2004 nvme_free_cmd(cmd); 2005 return (DDI_FAILURE); 2006 } 2007 2008 nvme_free_cmd(cmd); 2009 2010 s_dw11.b.sq_pc = 1; 2011 s_dw11.b.sq_cqid = idx; 2012 2013 cmd = nvme_alloc_cmd(nvme, KM_SLEEP); 2014 cmd->nc_sqid = 0; 2015 cmd->nc_callback = nvme_wakeup_cmd; 2016 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE; 2017 cmd->nc_sqe.sqe_cdw10 = dw10.r; 2018 cmd->nc_sqe.sqe_cdw11 = s_dw11.r; 2019 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress; 2020 2021 if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) { 2022 dev_err(nvme->n_dip, CE_WARN, 2023 "!nvme_admin_cmd failed for CREATE SQUEUE"); 2024 return (DDI_FAILURE); 2025 } 2026 2027 if (nvme_check_cmd_status(cmd)) { 2028 dev_err(nvme->n_dip, CE_WARN, 2029 "!CREATE SQUEUE failed with sct = %x, sc = %x", 2030 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); 2031 nvme_free_cmd(cmd); 2032 return (DDI_FAILURE); 2033 } 2034 2035 nvme_free_cmd(cmd); 2036 2037 return (DDI_SUCCESS); 2038 } 2039 2040 static boolean_t 2041 nvme_reset(nvme_t *nvme, boolean_t quiesce) 2042 { 2043 nvme_reg_csts_t csts; 2044 int i; 2045 2046 nvme_put32(nvme, NVME_REG_CC, 0); 2047 2048 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2049 if (csts.b.csts_rdy == 1) { 2050 nvme_put32(nvme, NVME_REG_CC, 0); 2051 for (i = 0; i != nvme->n_timeout * 10; i++) { 2052 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2053 if (csts.b.csts_rdy == 0) 2054 break; 2055 2056 if (quiesce) 2057 drv_usecwait(50000); 2058 else 2059 delay(drv_usectohz(50000)); 2060 } 2061 } 2062 2063 nvme_put32(nvme, NVME_REG_AQA, 0); 2064 nvme_put32(nvme, NVME_REG_ASQ, 0); 2065 nvme_put32(nvme, NVME_REG_ACQ, 0); 2066 2067 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2068 return (csts.b.csts_rdy == 0 ? 
B_TRUE : B_FALSE); 2069 } 2070 2071 static void 2072 nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce) 2073 { 2074 nvme_reg_cc_t cc; 2075 nvme_reg_csts_t csts; 2076 int i; 2077 2078 ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT); 2079 2080 cc.r = nvme_get32(nvme, NVME_REG_CC); 2081 cc.b.cc_shn = mode & 0x3; 2082 nvme_put32(nvme, NVME_REG_CC, cc.r); 2083 2084 for (i = 0; i != 10; i++) { 2085 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2086 if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE) 2087 break; 2088 2089 if (quiesce) 2090 drv_usecwait(100000); 2091 else 2092 delay(drv_usectohz(100000)); 2093 } 2094 } 2095 2096 2097 static void 2098 nvme_prepare_devid(nvme_t *nvme, uint32_t nsid) 2099 { 2100 /* 2101 * Section 7.7 of the spec describes how to get a unique ID for 2102 * the controller: the vendor ID, the model name and the serial 2103 * number shall be unique when combined. 2104 * 2105 * If a namespace has no EUI64 we use the above and add the hex 2106 * namespace ID to get a unique ID for the namespace. 2107 */ 2108 char model[sizeof (nvme->n_idctl->id_model) + 1]; 2109 char serial[sizeof (nvme->n_idctl->id_serial) + 1]; 2110 2111 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 2112 bcopy(nvme->n_idctl->id_serial, serial, 2113 sizeof (nvme->n_idctl->id_serial)); 2114 2115 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 2116 serial[sizeof (nvme->n_idctl->id_serial)] = '\0'; 2117 2118 nvme->n_ns[nsid - 1].ns_devid = kmem_asprintf("%4X-%s-%s-%X", 2119 nvme->n_idctl->id_vid, model, serial, nsid); 2120 } 2121 2122 static int 2123 nvme_init_ns(nvme_t *nvme, int nsid) 2124 { 2125 nvme_namespace_t *ns = &nvme->n_ns[nsid - 1]; 2126 nvme_identify_nsid_t *idns; 2127 int last_rp; 2128 2129 ns->ns_nvme = nvme; 2130 idns = nvme_identify(nvme, nsid); 2131 2132 if (idns == NULL) { 2133 dev_err(nvme->n_dip, CE_WARN, 2134 "!failed to identify namespace %d", nsid); 2135 return (DDI_FAILURE); 2136 } 2137 2138 ns->ns_idns = idns; 2139 ns->ns_id = nsid; 2140 ns->ns_block_count = idns->id_nsize; 2141 ns->ns_block_size = 2142 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; 2143 ns->ns_best_block_size = ns->ns_block_size; 2144 2145 /* 2146 * Get the EUI64 if present. Use it for devid and device node names. 2147 */ 2148 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 2149 bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64)); 2150 2151 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 2152 if (*(uint64_t *)ns->ns_eui64 != 0) { 2153 uint8_t *eui64 = ns->ns_eui64; 2154 2155 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), 2156 "%02x%02x%02x%02x%02x%02x%02x%02x", 2157 eui64[0], eui64[1], eui64[2], eui64[3], 2158 eui64[4], eui64[5], eui64[6], eui64[7]); 2159 } else { 2160 (void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%d", 2161 ns->ns_id); 2162 2163 nvme_prepare_devid(nvme, ns->ns_id); 2164 } 2165 2166 /* 2167 * Find the LBA format with no metadata and the best relative 2168 * performance. A value of 3 means "degraded", 0 is best. 
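 * For example, a namespace offering a 512B format with RP 1, a 4kB
 * format with RP 0, and a 4kB format with 8 bytes of metadata and RP 0
 * ends up with ns_best_block_size = 4096: the metadata format is
 * skipped and the plain 4kB format has the better (lower) RP value.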
2169 */ 2170 last_rp = 3; 2171 for (int j = 0; j <= idns->id_nlbaf; j++) { 2172 if (idns->id_lbaf[j].lbaf_lbads == 0) 2173 break; 2174 if (idns->id_lbaf[j].lbaf_ms != 0) 2175 continue; 2176 if (idns->id_lbaf[j].lbaf_rp >= last_rp) 2177 continue; 2178 last_rp = idns->id_lbaf[j].lbaf_rp; 2179 ns->ns_best_block_size = 2180 1 << idns->id_lbaf[j].lbaf_lbads; 2181 } 2182 2183 if (ns->ns_best_block_size < nvme->n_min_block_size) 2184 ns->ns_best_block_size = nvme->n_min_block_size; 2185 2186 /* 2187 * We currently don't support namespaces that use either: 2188 * - thin provisioning 2189 * - protection information 2190 * - illegal block size (< 512) 2191 */ 2192 if (idns->id_nsfeat.f_thin || 2193 idns->id_dps.dp_pinfo) { 2194 dev_err(nvme->n_dip, CE_WARN, 2195 "!ignoring namespace %d, unsupported features: " 2196 "thin = %d, pinfo = %d", nsid, 2197 idns->id_nsfeat.f_thin, idns->id_dps.dp_pinfo); 2198 ns->ns_ignore = B_TRUE; 2199 } else if (ns->ns_block_size < 512) { 2200 dev_err(nvme->n_dip, CE_WARN, 2201 "!ignoring namespace %d, unsupported block size %"PRIu64, 2202 nsid, (uint64_t)ns->ns_block_size); 2203 ns->ns_ignore = B_TRUE; 2204 } else { 2205 ns->ns_ignore = B_FALSE; 2206 } 2207 2208 return (DDI_SUCCESS); 2209 } 2210 2211 static int 2212 nvme_init(nvme_t *nvme) 2213 { 2214 nvme_reg_cc_t cc = { 0 }; 2215 nvme_reg_aqa_t aqa = { 0 }; 2216 nvme_reg_asq_t asq = { 0 }; 2217 nvme_reg_acq_t acq = { 0 }; 2218 nvme_reg_cap_t cap; 2219 nvme_reg_vs_t vs; 2220 nvme_reg_csts_t csts; 2221 int i = 0; 2222 int nqueues; 2223 char model[sizeof (nvme->n_idctl->id_model) + 1]; 2224 char *vendor, *product; 2225 2226 /* Check controller version */ 2227 vs.r = nvme_get32(nvme, NVME_REG_VS); 2228 nvme->n_version.v_major = vs.b.vs_mjr; 2229 nvme->n_version.v_minor = vs.b.vs_mnr; 2230 dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d", 2231 nvme->n_version.v_major, nvme->n_version.v_minor); 2232 2233 if (NVME_VERSION_HIGHER(&nvme->n_version, 2234 nvme_version_major, nvme_version_minor)) { 2235 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.%d", 2236 nvme_version_major, nvme_version_minor); 2237 if (nvme->n_strict_version) 2238 goto fail; 2239 } 2240 2241 /* retrieve controller configuration */ 2242 cap.r = nvme_get64(nvme, NVME_REG_CAP); 2243 2244 if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) { 2245 dev_err(nvme->n_dip, CE_WARN, 2246 "!NVM command set not supported by hardware"); 2247 goto fail; 2248 } 2249 2250 nvme->n_nssr_supported = cap.b.cap_nssrs; 2251 nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd; 2252 nvme->n_timeout = cap.b.cap_to; 2253 nvme->n_arbitration_mechanisms = cap.b.cap_ams; 2254 nvme->n_cont_queues_reqd = cap.b.cap_cqr; 2255 nvme->n_max_queue_entries = cap.b.cap_mqes + 1; 2256 2257 /* 2258 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify 2259 * the base page size of 4k (1<<12), so add 12 here to get the real 2260 * page size value. 2261 */ 2262 nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT), 2263 cap.b.cap_mpsmax + 12); 2264 nvme->n_pagesize = 1UL << (nvme->n_pageshift); 2265 2266 /* 2267 * Set up Queue DMA to transfer at least 1 page-aligned page at a time. 2268 */ 2269 nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize; 2270 nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 2271 2272 /* 2273 * Set up PRP DMA to transfer 1 page-aligned page at a time. 2274 * Maxxfer may be increased after we identified the controller limits. 
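 * (The maxxfer adjustment happens further down in nvme_init(), once
 * MDTS is known from the Identify Controller data.)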
2275 */ 2276 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize; 2277 nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize; 2278 nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize; 2279 nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1; 2280 2281 /* 2282 * Reset controller if it's still in ready state. 2283 */ 2284 if (nvme_reset(nvme, B_FALSE) == B_FALSE) { 2285 dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller"); 2286 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 2287 nvme->n_dead = B_TRUE; 2288 goto fail; 2289 } 2290 2291 /* 2292 * Create the admin queue pair. 2293 */ 2294 if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0) 2295 != DDI_SUCCESS) { 2296 dev_err(nvme->n_dip, CE_WARN, 2297 "!unable to allocate admin qpair"); 2298 goto fail; 2299 } 2300 nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP); 2301 nvme->n_ioq[0] = nvme->n_adminq; 2302 2303 nvme->n_progress |= NVME_ADMIN_QUEUE; 2304 2305 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2306 "admin-queue-len", nvme->n_admin_queue_len); 2307 2308 aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1; 2309 asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress; 2310 acq = nvme->n_adminq->nq_cqdma->nd_cookie.dmac_laddress; 2311 2312 ASSERT((asq & (nvme->n_pagesize - 1)) == 0); 2313 ASSERT((acq & (nvme->n_pagesize - 1)) == 0); 2314 2315 nvme_put32(nvme, NVME_REG_AQA, aqa.r); 2316 nvme_put64(nvme, NVME_REG_ASQ, asq); 2317 nvme_put64(nvme, NVME_REG_ACQ, acq); 2318 2319 cc.b.cc_ams = 0; /* use Round-Robin arbitration */ 2320 cc.b.cc_css = 0; /* use NVM command set */ 2321 cc.b.cc_mps = nvme->n_pageshift - 12; 2322 cc.b.cc_shn = 0; /* no shutdown in progress */ 2323 cc.b.cc_en = 1; /* enable controller */ 2324 cc.b.cc_iosqes = 6; /* submission queue entry is 2^6 bytes long */ 2325 cc.b.cc_iocqes = 4; /* completion queue entry is 2^4 bytes long */ 2326 2327 nvme_put32(nvme, NVME_REG_CC, cc.r); 2328 2329 /* 2330 * Wait for the controller to become ready. 2331 */ 2332 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2333 if (csts.b.csts_rdy == 0) { 2334 for (i = 0; i != nvme->n_timeout * 10; i++) { 2335 delay(drv_usectohz(50000)); 2336 csts.r = nvme_get32(nvme, NVME_REG_CSTS); 2337 2338 if (csts.b.csts_cfs == 1) { 2339 dev_err(nvme->n_dip, CE_WARN, 2340 "!controller fatal status at init"); 2341 ddi_fm_service_impact(nvme->n_dip, 2342 DDI_SERVICE_LOST); 2343 nvme->n_dead = B_TRUE; 2344 goto fail; 2345 } 2346 2347 if (csts.b.csts_rdy == 1) 2348 break; 2349 } 2350 } 2351 2352 if (csts.b.csts_rdy == 0) { 2353 dev_err(nvme->n_dip, CE_WARN, "!controller not ready"); 2354 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST); 2355 nvme->n_dead = B_TRUE; 2356 goto fail; 2357 } 2358 2359 /* 2360 * Assume an abort command limit of 1. We'll destroy and re-init 2361 * that later when we know the true abort command limit. 2362 */ 2363 sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL); 2364 2365 /* 2366 * Setup initial interrupt for admin queue. 2367 */ 2368 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1) 2369 != DDI_SUCCESS) && 2370 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1) 2371 != DDI_SUCCESS) && 2372 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1) 2373 != DDI_SUCCESS)) { 2374 dev_err(nvme->n_dip, CE_WARN, 2375 "!failed to setup initial interrupt"); 2376 goto fail; 2377 } 2378 2379 /* 2380 * Post an asynchronous event command to catch errors. 
2381 */ 2382 if (nvme_async_event(nvme) != DDI_SUCCESS) { 2383 dev_err(nvme->n_dip, CE_WARN, 2384 "!failed to post async event"); 2385 goto fail; 2386 } 2387 2388 /* 2389 * Identify Controller 2390 */ 2391 nvme->n_idctl = nvme_identify(nvme, 0); 2392 if (nvme->n_idctl == NULL) { 2393 dev_err(nvme->n_dip, CE_WARN, 2394 "!failed to identify controller"); 2395 goto fail; 2396 } 2397 2398 /* 2399 * Get Vendor & Product ID 2400 */ 2401 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model)); 2402 model[sizeof (nvme->n_idctl->id_model)] = '\0'; 2403 sata_split_model(model, &vendor, &product); 2404 2405 if (vendor == NULL) 2406 nvme->n_vendor = strdup("NVMe"); 2407 else 2408 nvme->n_vendor = strdup(vendor); 2409 2410 nvme->n_product = strdup(product); 2411 2412 /* 2413 * Get controller limits. 2414 */ 2415 nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT, 2416 MIN(nvme->n_admin_queue_len / 10, 2417 MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit))); 2418 2419 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2420 "async-event-limit", nvme->n_async_event_limit); 2421 2422 nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1; 2423 2424 /* 2425 * Reinitialize the semaphore with the true abort command limit 2426 * supported by the hardware. It's not necessary to disable interrupts 2427 * as only command aborts use the semaphore, and no commands are 2428 * executed or aborted while we're here. 2429 */ 2430 sema_destroy(&nvme->n_abort_sema); 2431 sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL, 2432 SEMA_DRIVER, NULL); 2433 2434 nvme->n_progress |= NVME_CTRL_LIMITS; 2435 2436 if (nvme->n_idctl->id_mdts == 0) 2437 nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536; 2438 else 2439 nvme->n_max_data_transfer_size = 2440 1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts); 2441 2442 nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1; 2443 2444 /* 2445 * Limit n_max_data_transfer_size to what we can handle in one PRP. 2446 * Chained PRPs are currently unsupported. 2447 * 2448 * This is a no-op on hardware which doesn't support a transfer size 2449 * big enough to require chained PRPs. 2450 */ 2451 nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size, 2452 (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize)); 2453 2454 nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size; 2455 2456 /* 2457 * Make sure the minimum/maximum queue entry sizes are not 2458 * larger/smaller than the default. 2459 */ 2460 2461 if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) || 2462 ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) || 2463 ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) || 2464 ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t))) 2465 goto fail; 2466 2467 /* 2468 * Check for the presence of a Volatile Write Cache. If present, 2469 * enable or disable based on the value of the property 2470 * volatile-write-cache-enable (default is enabled). 2471 */ 2472 nvme->n_write_cache_present = 2473 nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE; 2474 2475 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2476 "volatile-write-cache-present", 2477 nvme->n_write_cache_present ? 
1 : 0); 2478 2479 if (!nvme->n_write_cache_present) { 2480 nvme->n_write_cache_enabled = B_FALSE; 2481 } else if (!nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)) { 2482 dev_err(nvme->n_dip, CE_WARN, 2483 "!failed to %sable volatile write cache", 2484 nvme->n_write_cache_enabled ? "en" : "dis"); 2485 /* 2486 * Assume the cache is (still) enabled. 2487 */ 2488 nvme->n_write_cache_enabled = B_TRUE; 2489 } 2490 2491 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, 2492 "volatile-write-cache-enable", 2493 nvme->n_write_cache_enabled ? 1 : 0); 2494 2495 /* 2496 * Assume LBA Range Type feature is supported. If it isn't this 2497 * will be set to B_FALSE by nvme_get_features(). 2498 */ 2499 nvme->n_lba_range_supported = B_TRUE; 2500 2501 /* 2502 * Check support for Autonomous Power State Transition. 2503 */ 2504 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) 2505 nvme->n_auto_pst_supported = 2506 nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE; 2507 2508 /* 2509 * Identify Namespaces 2510 */ 2511 nvme->n_namespace_count = nvme->n_idctl->id_nn; 2512 if (nvme->n_namespace_count > NVME_MINOR_MAX) { 2513 dev_err(nvme->n_dip, CE_WARN, 2514 "!too many namespaces: %d, limiting to %d\n", 2515 nvme->n_namespace_count, NVME_MINOR_MAX); 2516 nvme->n_namespace_count = NVME_MINOR_MAX; 2517 } 2518 2519 nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) * 2520 nvme->n_namespace_count, KM_SLEEP); 2521 2522 for (i = 0; i != nvme->n_namespace_count; i++) { 2523 mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER, 2524 NULL); 2525 if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS) 2526 goto fail; 2527 } 2528 2529 /* 2530 * Try to set up MSI/MSI-X interrupts. 2531 */ 2532 if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX)) 2533 != 0) { 2534 nvme_release_interrupts(nvme); 2535 2536 nqueues = MIN(UINT16_MAX, ncpus); 2537 2538 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 2539 nqueues) != DDI_SUCCESS) && 2540 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 2541 nqueues) != DDI_SUCCESS)) { 2542 dev_err(nvme->n_dip, CE_WARN, 2543 "!failed to setup MSI/MSI-X interrupts"); 2544 goto fail; 2545 } 2546 } 2547 2548 nqueues = nvme->n_intr_cnt; 2549 2550 /* 2551 * Create I/O queue pairs. 2552 */ 2553 nvme->n_ioq_count = nvme_set_nqueues(nvme, nqueues); 2554 if (nvme->n_ioq_count == 0) { 2555 dev_err(nvme->n_dip, CE_WARN, 2556 "!failed to set number of I/O queues to %d", nqueues); 2557 goto fail; 2558 } 2559 2560 /* 2561 * Reallocate I/O queue array 2562 */ 2563 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *)); 2564 nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) * 2565 (nvme->n_ioq_count + 1), KM_SLEEP); 2566 nvme->n_ioq[0] = nvme->n_adminq; 2567 2568 /* 2569 * If we got less queues than we asked for we might as well give 2570 * some of the interrupt vectors back to the system. 
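 * For example, if 8 interrupt vectors were allocated but the controller
 * only granted 4 I/O queue pairs, we redo the interrupt setup with 4
 * vectors of the same interrupt type.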
2571 */ 2572 if (nvme->n_ioq_count < nqueues) { 2573 nvme_release_interrupts(nvme); 2574 2575 if (nvme_setup_interrupts(nvme, nvme->n_intr_type, 2576 nvme->n_ioq_count) != DDI_SUCCESS) { 2577 dev_err(nvme->n_dip, CE_WARN, 2578 "!failed to reduce number of interrupts"); 2579 goto fail; 2580 } 2581 } 2582 2583 /* 2584 * Alloc & register I/O queue pairs 2585 */ 2586 nvme->n_io_queue_len = 2587 MIN(nvme->n_io_queue_len, nvme->n_max_queue_entries); 2588 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-queue-len", 2589 nvme->n_io_queue_len); 2590 2591 for (i = 1; i != nvme->n_ioq_count + 1; i++) { 2592 if (nvme_alloc_qpair(nvme, nvme->n_io_queue_len, 2593 &nvme->n_ioq[i], i) != DDI_SUCCESS) { 2594 dev_err(nvme->n_dip, CE_WARN, 2595 "!unable to allocate I/O qpair %d", i); 2596 goto fail; 2597 } 2598 2599 if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) 2600 != DDI_SUCCESS) { 2601 dev_err(nvme->n_dip, CE_WARN, 2602 "!unable to create I/O qpair %d", i); 2603 goto fail; 2604 } 2605 } 2606 2607 /* 2608 * Post more asynchronous events commands to reduce event reporting 2609 * latency as suggested by the spec. 2610 */ 2611 for (i = 1; i != nvme->n_async_event_limit; i++) { 2612 if (nvme_async_event(nvme) != DDI_SUCCESS) { 2613 dev_err(nvme->n_dip, CE_WARN, 2614 "!failed to post async event %d", i); 2615 goto fail; 2616 } 2617 } 2618 2619 return (DDI_SUCCESS); 2620 2621 fail: 2622 (void) nvme_reset(nvme, B_FALSE); 2623 return (DDI_FAILURE); 2624 } 2625 2626 static uint_t 2627 nvme_intr(caddr_t arg1, caddr_t arg2) 2628 { 2629 /*LINTED: E_PTR_BAD_CAST_ALIGN*/ 2630 nvme_t *nvme = (nvme_t *)arg1; 2631 int inum = (int)(uintptr_t)arg2; 2632 int ccnt = 0; 2633 int qnum; 2634 nvme_cmd_t *cmd; 2635 2636 if (inum >= nvme->n_intr_cnt) 2637 return (DDI_INTR_UNCLAIMED); 2638 2639 /* 2640 * The interrupt vector a queue uses is calculated as queue_idx % 2641 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array 2642 * in steps of n_intr_cnt to process all queues using this vector. 2643 */ 2644 for (qnum = inum; 2645 qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL; 2646 qnum += nvme->n_intr_cnt) { 2647 while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) { 2648 taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq, 2649 cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent); 2650 ccnt++; 2651 } 2652 } 2653 2654 return (ccnt > 0 ? 
DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED); 2655 } 2656 2657 static void 2658 nvme_release_interrupts(nvme_t *nvme) 2659 { 2660 int i; 2661 2662 for (i = 0; i < nvme->n_intr_cnt; i++) { 2663 if (nvme->n_inth[i] == NULL) 2664 break; 2665 2666 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 2667 (void) ddi_intr_block_disable(&nvme->n_inth[i], 1); 2668 else 2669 (void) ddi_intr_disable(nvme->n_inth[i]); 2670 2671 (void) ddi_intr_remove_handler(nvme->n_inth[i]); 2672 (void) ddi_intr_free(nvme->n_inth[i]); 2673 } 2674 2675 kmem_free(nvme->n_inth, nvme->n_inth_sz); 2676 nvme->n_inth = NULL; 2677 nvme->n_inth_sz = 0; 2678 2679 nvme->n_progress &= ~NVME_INTERRUPTS; 2680 } 2681 2682 static int 2683 nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs) 2684 { 2685 int nintrs, navail, count; 2686 int ret; 2687 int i; 2688 2689 if (nvme->n_intr_types == 0) { 2690 ret = ddi_intr_get_supported_types(nvme->n_dip, 2691 &nvme->n_intr_types); 2692 if (ret != DDI_SUCCESS) { 2693 dev_err(nvme->n_dip, CE_WARN, 2694 "!%s: ddi_intr_get_supported types failed", 2695 __func__); 2696 return (ret); 2697 } 2698 #ifdef __x86 2699 if (get_hwenv() == HW_VMWARE) 2700 nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX; 2701 #endif 2702 } 2703 2704 if ((nvme->n_intr_types & intr_type) == 0) 2705 return (DDI_FAILURE); 2706 2707 ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs); 2708 if (ret != DDI_SUCCESS) { 2709 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed", 2710 __func__); 2711 return (ret); 2712 } 2713 2714 ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail); 2715 if (ret != DDI_SUCCESS) { 2716 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed", 2717 __func__); 2718 return (ret); 2719 } 2720 2721 /* We want at most one interrupt per queue pair. 
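 * ddi_intr_alloc() may still return fewer vectors than that, in which
 * case queues end up sharing vectors as handled in nvme_intr().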
*/ 2722 if (navail > nqpairs) 2723 navail = nqpairs; 2724 2725 nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail; 2726 nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP); 2727 2728 ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail, 2729 &count, 0); 2730 if (ret != DDI_SUCCESS) { 2731 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed", 2732 __func__); 2733 goto fail; 2734 } 2735 2736 nvme->n_intr_cnt = count; 2737 2738 ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri); 2739 if (ret != DDI_SUCCESS) { 2740 dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed", 2741 __func__); 2742 goto fail; 2743 } 2744 2745 for (i = 0; i < count; i++) { 2746 ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr, 2747 (void *)nvme, (void *)(uintptr_t)i); 2748 if (ret != DDI_SUCCESS) { 2749 dev_err(nvme->n_dip, CE_WARN, 2750 "!%s: ddi_intr_add_handler failed", __func__); 2751 goto fail; 2752 } 2753 } 2754 2755 (void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap); 2756 2757 for (i = 0; i < count; i++) { 2758 if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK) 2759 ret = ddi_intr_block_enable(&nvme->n_inth[i], 1); 2760 else 2761 ret = ddi_intr_enable(nvme->n_inth[i]); 2762 2763 if (ret != DDI_SUCCESS) { 2764 dev_err(nvme->n_dip, CE_WARN, 2765 "!%s: enabling interrupt %d failed", __func__, i); 2766 goto fail; 2767 } 2768 } 2769 2770 nvme->n_intr_type = intr_type; 2771 2772 nvme->n_progress |= NVME_INTERRUPTS; 2773 2774 return (DDI_SUCCESS); 2775 2776 fail: 2777 nvme_release_interrupts(nvme); 2778 2779 return (ret); 2780 } 2781 2782 static int 2783 nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg) 2784 { 2785 _NOTE(ARGUNUSED(arg)); 2786 2787 pci_ereport_post(dip, fm_error, NULL); 2788 return (fm_error->fme_status); 2789 } 2790 2791 static int 2792 nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2793 { 2794 nvme_t *nvme; 2795 int instance; 2796 int nregs; 2797 off_t regsize; 2798 int i; 2799 char name[32]; 2800 2801 if (cmd != DDI_ATTACH) 2802 return (DDI_FAILURE); 2803 2804 instance = ddi_get_instance(dip); 2805 2806 if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS) 2807 return (DDI_FAILURE); 2808 2809 nvme = ddi_get_soft_state(nvme_state, instance); 2810 ddi_set_driver_private(dip, nvme); 2811 nvme->n_dip = dip; 2812 2813 mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL); 2814 2815 nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2816 DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE; 2817 nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY, 2818 dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ? 2819 B_TRUE : B_FALSE; 2820 nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2821 DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN); 2822 nvme->n_io_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2823 DDI_PROP_DONTPASS, "io-queue-len", NVME_DEFAULT_IO_QUEUE_LEN); 2824 nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2825 DDI_PROP_DONTPASS, "async-event-limit", 2826 NVME_DEFAULT_ASYNC_EVENT_LIMIT); 2827 nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2828 DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ? 
2829 B_TRUE : B_FALSE; 2830 nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2831 DDI_PROP_DONTPASS, "min-phys-block-size", 2832 NVME_DEFAULT_MIN_BLOCK_SIZE); 2833 2834 if (!ISP2(nvme->n_min_block_size) || 2835 (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) { 2836 dev_err(dip, CE_WARN, "!min-phys-block-size %s, " 2837 "using default %d", ISP2(nvme->n_min_block_size) ? 2838 "too low" : "not a power of 2", 2839 NVME_DEFAULT_MIN_BLOCK_SIZE); 2840 nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE; 2841 } 2842 2843 if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN) 2844 nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN; 2845 else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN) 2846 nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN; 2847 2848 if (nvme->n_io_queue_len < NVME_MIN_IO_QUEUE_LEN) 2849 nvme->n_io_queue_len = NVME_MIN_IO_QUEUE_LEN; 2850 2851 if (nvme->n_async_event_limit < 1) 2852 nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT; 2853 2854 nvme->n_reg_acc_attr = nvme_reg_acc_attr; 2855 nvme->n_queue_dma_attr = nvme_queue_dma_attr; 2856 nvme->n_prp_dma_attr = nvme_prp_dma_attr; 2857 nvme->n_sgl_dma_attr = nvme_sgl_dma_attr; 2858 2859 /* 2860 * Setup FMA support. 2861 */ 2862 nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip, 2863 DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable", 2864 DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE | 2865 DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE); 2866 2867 ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc); 2868 2869 if (nvme->n_fm_cap) { 2870 if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE) 2871 nvme->n_reg_acc_attr.devacc_attr_access = 2872 DDI_FLAGERR_ACC; 2873 2874 if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) { 2875 nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 2876 nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR; 2877 } 2878 2879 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 2880 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 2881 pci_ereport_setup(dip); 2882 2883 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 2884 ddi_fm_handler_register(dip, nvme_fm_errcb, 2885 (void *)nvme); 2886 } 2887 2888 nvme->n_progress |= NVME_FMA_INIT; 2889 2890 /* 2891 * The spec defines several register sets. Only the controller 2892 * registers (set 1) are currently used. 2893 */ 2894 if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE || 2895 nregs < 2 || 2896 ddi_dev_regsize(dip, 1, ®size) == DDI_FAILURE) 2897 goto fail; 2898 2899 if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize, 2900 &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) { 2901 dev_err(dip, CE_WARN, "!failed to map regset 1"); 2902 goto fail; 2903 } 2904 2905 nvme->n_progress |= NVME_REGS_MAPPED; 2906 2907 /* 2908 * Create taskq for command completion. 2909 */ 2910 (void) snprintf(name, sizeof (name), "%s%d_cmd_taskq", 2911 ddi_driver_name(dip), ddi_get_instance(dip)); 2912 nvme->n_cmd_taskq = ddi_taskq_create(dip, name, MIN(UINT16_MAX, ncpus), 2913 TASKQ_DEFAULTPRI, 0); 2914 if (nvme->n_cmd_taskq == NULL) { 2915 dev_err(dip, CE_WARN, "!failed to create cmd taskq"); 2916 goto fail; 2917 } 2918 2919 /* 2920 * Create PRP DMA cache 2921 */ 2922 (void) snprintf(name, sizeof (name), "%s%d_prp_cache", 2923 ddi_driver_name(dip), ddi_get_instance(dip)); 2924 nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t), 2925 0, nvme_prp_dma_constructor, nvme_prp_dma_destructor, 2926 NULL, (void *)nvme, NULL, 0); 2927 2928 if (nvme_init(nvme) != DDI_SUCCESS) 2929 goto fail; 2930 2931 /* 2932 * Attach the blkdev driver for each namespace. 
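 * Each namespace also gets its own character minor node here; namespaces
 * flagged ns_ignore get only the minor node and are not exposed through
 * blkdev.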
2933 */ 2934 for (i = 0; i != nvme->n_namespace_count; i++) { 2935 if (ddi_create_minor_node(nvme->n_dip, nvme->n_ns[i].ns_name, 2936 S_IFCHR, NVME_MINOR(ddi_get_instance(nvme->n_dip), i + 1), 2937 DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) { 2938 dev_err(dip, CE_WARN, 2939 "!failed to create minor node for namespace %d", i); 2940 goto fail; 2941 } 2942 2943 if (nvme->n_ns[i].ns_ignore) 2944 continue; 2945 2946 nvme->n_ns[i].ns_bd_hdl = bd_alloc_handle(&nvme->n_ns[i], 2947 &nvme_bd_ops, &nvme->n_prp_dma_attr, KM_SLEEP); 2948 2949 if (nvme->n_ns[i].ns_bd_hdl == NULL) { 2950 dev_err(dip, CE_WARN, 2951 "!failed to get blkdev handle for namespace %d", i); 2952 goto fail; 2953 } 2954 2955 if (bd_attach_handle(dip, nvme->n_ns[i].ns_bd_hdl) 2956 != DDI_SUCCESS) { 2957 dev_err(dip, CE_WARN, 2958 "!failed to attach blkdev handle for namespace %d", 2959 i); 2960 goto fail; 2961 } 2962 } 2963 2964 if (ddi_create_minor_node(dip, "devctl", S_IFCHR, 2965 NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0) 2966 != DDI_SUCCESS) { 2967 dev_err(dip, CE_WARN, "nvme_attach: " 2968 "cannot create devctl minor node"); 2969 goto fail; 2970 } 2971 2972 return (DDI_SUCCESS); 2973 2974 fail: 2975 /* attach successful anyway so that FMA can retire the device */ 2976 if (nvme->n_dead) 2977 return (DDI_SUCCESS); 2978 2979 (void) nvme_detach(dip, DDI_DETACH); 2980 2981 return (DDI_FAILURE); 2982 } 2983 2984 static int 2985 nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2986 { 2987 int instance, i; 2988 nvme_t *nvme; 2989 2990 if (cmd != DDI_DETACH) 2991 return (DDI_FAILURE); 2992 2993 instance = ddi_get_instance(dip); 2994 2995 nvme = ddi_get_soft_state(nvme_state, instance); 2996 2997 if (nvme == NULL) 2998 return (DDI_FAILURE); 2999 3000 ddi_remove_minor_node(dip, "devctl"); 3001 mutex_destroy(&nvme->n_minor.nm_mutex); 3002 3003 if (nvme->n_ns) { 3004 for (i = 0; i != nvme->n_namespace_count; i++) { 3005 ddi_remove_minor_node(dip, nvme->n_ns[i].ns_name); 3006 mutex_destroy(&nvme->n_ns[i].ns_minor.nm_mutex); 3007 3008 if (nvme->n_ns[i].ns_bd_hdl) { 3009 (void) bd_detach_handle( 3010 nvme->n_ns[i].ns_bd_hdl); 3011 bd_free_handle(nvme->n_ns[i].ns_bd_hdl); 3012 } 3013 3014 if (nvme->n_ns[i].ns_idns) 3015 kmem_free(nvme->n_ns[i].ns_idns, 3016 sizeof (nvme_identify_nsid_t)); 3017 if (nvme->n_ns[i].ns_devid) 3018 strfree(nvme->n_ns[i].ns_devid); 3019 } 3020 3021 kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) * 3022 nvme->n_namespace_count); 3023 } 3024 3025 if (nvme->n_progress & NVME_INTERRUPTS) 3026 nvme_release_interrupts(nvme); 3027 3028 if (nvme->n_cmd_taskq) 3029 ddi_taskq_wait(nvme->n_cmd_taskq); 3030 3031 if (nvme->n_ioq_count > 0) { 3032 for (i = 1; i != nvme->n_ioq_count + 1; i++) { 3033 if (nvme->n_ioq[i] != NULL) { 3034 /* TODO: send destroy queue commands */ 3035 nvme_free_qpair(nvme->n_ioq[i]); 3036 } 3037 } 3038 3039 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) * 3040 (nvme->n_ioq_count + 1)); 3041 } 3042 3043 if (nvme->n_prp_cache != NULL) { 3044 kmem_cache_destroy(nvme->n_prp_cache); 3045 } 3046 3047 if (nvme->n_progress & NVME_REGS_MAPPED) { 3048 nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE); 3049 (void) nvme_reset(nvme, B_FALSE); 3050 } 3051 3052 if (nvme->n_cmd_taskq) 3053 ddi_taskq_destroy(nvme->n_cmd_taskq); 3054 3055 if (nvme->n_progress & NVME_CTRL_LIMITS) 3056 sema_destroy(&nvme->n_abort_sema); 3057 3058 if (nvme->n_progress & NVME_ADMIN_QUEUE) 3059 nvme_free_qpair(nvme->n_adminq); 3060 3061 if (nvme->n_idctl) 3062 kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE); 3063 3064 if 
(nvme->n_progress & NVME_REGS_MAPPED) 3065 ddi_regs_map_free(&nvme->n_regh); 3066 3067 if (nvme->n_progress & NVME_FMA_INIT) { 3068 if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3069 ddi_fm_handler_unregister(nvme->n_dip); 3070 3071 if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) || 3072 DDI_FM_ERRCB_CAP(nvme->n_fm_cap)) 3073 pci_ereport_teardown(nvme->n_dip); 3074 3075 ddi_fm_fini(nvme->n_dip); 3076 } 3077 3078 if (nvme->n_vendor != NULL) 3079 strfree(nvme->n_vendor); 3080 3081 if (nvme->n_product != NULL) 3082 strfree(nvme->n_product); 3083 3084 ddi_soft_state_free(nvme_state, instance); 3085 3086 return (DDI_SUCCESS); 3087 } 3088 3089 static int 3090 nvme_quiesce(dev_info_t *dip) 3091 { 3092 int instance; 3093 nvme_t *nvme; 3094 3095 instance = ddi_get_instance(dip); 3096 3097 nvme = ddi_get_soft_state(nvme_state, instance); 3098 3099 if (nvme == NULL) 3100 return (DDI_FAILURE); 3101 3102 nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE); 3103 3104 (void) nvme_reset(nvme, B_TRUE); 3105 3106 return (DDI_FAILURE); 3107 } 3108 3109 static int 3110 nvme_fill_prp(nvme_cmd_t *cmd, bd_xfer_t *xfer) 3111 { 3112 nvme_t *nvme = cmd->nc_nvme; 3113 int nprp_page, nprp; 3114 uint64_t *prp; 3115 3116 if (xfer->x_ndmac == 0) 3117 return (DDI_FAILURE); 3118 3119 cmd->nc_sqe.sqe_dptr.d_prp[0] = xfer->x_dmac.dmac_laddress; 3120 ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac); 3121 3122 if (xfer->x_ndmac == 1) { 3123 cmd->nc_sqe.sqe_dptr.d_prp[1] = 0; 3124 return (DDI_SUCCESS); 3125 } else if (xfer->x_ndmac == 2) { 3126 cmd->nc_sqe.sqe_dptr.d_prp[1] = xfer->x_dmac.dmac_laddress; 3127 return (DDI_SUCCESS); 3128 } 3129 3130 xfer->x_ndmac--; 3131 3132 nprp_page = nvme->n_pagesize / sizeof (uint64_t) - 1; 3133 ASSERT(nprp_page > 0); 3134 nprp = (xfer->x_ndmac + nprp_page - 1) / nprp_page; 3135 3136 /* 3137 * We currently don't support chained PRPs and set up our DMA 3138 * attributes to reflect that. If we still get an I/O request 3139 * that needs a chained PRP something is very wrong. 3140 */ 3141 VERIFY(nprp == 1); 3142 3143 cmd->nc_dma = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP); 3144 bzero(cmd->nc_dma->nd_memp, cmd->nc_dma->nd_len); 3145 3146 cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_dma->nd_cookie.dmac_laddress; 3147 3148 /*LINTED: E_PTR_BAD_CAST_ALIGN*/ 3149 for (prp = (uint64_t *)cmd->nc_dma->nd_memp; 3150 xfer->x_ndmac > 0; 3151 prp++, xfer->x_ndmac--) { 3152 *prp = xfer->x_dmac.dmac_laddress; 3153 ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac); 3154 } 3155 3156 (void) ddi_dma_sync(cmd->nc_dma->nd_dmah, 0, cmd->nc_dma->nd_len, 3157 DDI_DMA_SYNC_FORDEV); 3158 return (DDI_SUCCESS); 3159 } 3160 3161 static nvme_cmd_t * 3162 nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer) 3163 { 3164 nvme_t *nvme = ns->ns_nvme; 3165 nvme_cmd_t *cmd; 3166 3167 /* 3168 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep. 3169 */ 3170 cmd = nvme_alloc_cmd(nvme, (xfer->x_flags & BD_XFER_POLL) ? 
3171 KM_NOSLEEP : KM_SLEEP); 3172 3173 if (cmd == NULL) 3174 return (NULL); 3175 3176 cmd->nc_sqe.sqe_opc = opc; 3177 cmd->nc_callback = nvme_bd_xfer_done; 3178 cmd->nc_xfer = xfer; 3179 3180 switch (opc) { 3181 case NVME_OPC_NVM_WRITE: 3182 case NVME_OPC_NVM_READ: 3183 VERIFY(xfer->x_nblks <= 0x10000); 3184 3185 cmd->nc_sqe.sqe_nsid = ns->ns_id; 3186 3187 cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu; 3188 cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32); 3189 cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1); 3190 3191 if (nvme_fill_prp(cmd, xfer) != DDI_SUCCESS) 3192 goto fail; 3193 break; 3194 3195 case NVME_OPC_NVM_FLUSH: 3196 cmd->nc_sqe.sqe_nsid = ns->ns_id; 3197 break; 3198 3199 default: 3200 goto fail; 3201 } 3202 3203 return (cmd); 3204 3205 fail: 3206 nvme_free_cmd(cmd); 3207 return (NULL); 3208 } 3209 3210 static void 3211 nvme_bd_xfer_done(void *arg) 3212 { 3213 nvme_cmd_t *cmd = arg; 3214 bd_xfer_t *xfer = cmd->nc_xfer; 3215 int error = 0; 3216 3217 error = nvme_check_cmd_status(cmd); 3218 nvme_free_cmd(cmd); 3219 3220 bd_xfer_done(xfer, error); 3221 } 3222 3223 static void 3224 nvme_bd_driveinfo(void *arg, bd_drive_t *drive) 3225 { 3226 nvme_namespace_t *ns = arg; 3227 nvme_t *nvme = ns->ns_nvme; 3228 3229 /* 3230 * blkdev maintains one queue size per instance (namespace), 3231 * but all namespace share the I/O queues. 3232 * TODO: need to figure out a sane default, or use per-NS I/O queues, 3233 * or change blkdev to handle EAGAIN 3234 */ 3235 drive->d_qsize = nvme->n_ioq_count * nvme->n_io_queue_len 3236 / nvme->n_namespace_count; 3237 3238 /* 3239 * d_maxxfer is not set, which means the value is taken from the DMA 3240 * attributes specified to bd_alloc_handle. 3241 */ 3242 3243 drive->d_removable = B_FALSE; 3244 drive->d_hotpluggable = B_FALSE; 3245 3246 bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64)); 3247 drive->d_target = ns->ns_id; 3248 drive->d_lun = 0; 3249 3250 drive->d_model = nvme->n_idctl->id_model; 3251 drive->d_model_len = sizeof (nvme->n_idctl->id_model); 3252 drive->d_vendor = nvme->n_vendor; 3253 drive->d_vendor_len = strlen(nvme->n_vendor); 3254 drive->d_product = nvme->n_product; 3255 drive->d_product_len = strlen(nvme->n_product); 3256 drive->d_serial = nvme->n_idctl->id_serial; 3257 drive->d_serial_len = sizeof (nvme->n_idctl->id_serial); 3258 drive->d_revision = nvme->n_idctl->id_fwrev; 3259 drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev); 3260 } 3261 3262 static int 3263 nvme_bd_mediainfo(void *arg, bd_media_t *media) 3264 { 3265 nvme_namespace_t *ns = arg; 3266 3267 media->m_nblks = ns->ns_block_count; 3268 media->m_blksize = ns->ns_block_size; 3269 media->m_readonly = B_FALSE; 3270 media->m_solidstate = B_TRUE; 3271 3272 media->m_pblksize = ns->ns_best_block_size; 3273 3274 return (0); 3275 } 3276 3277 static int 3278 nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc) 3279 { 3280 nvme_t *nvme = ns->ns_nvme; 3281 nvme_cmd_t *cmd, *ret; 3282 nvme_qpair_t *ioq; 3283 boolean_t poll; 3284 3285 if (nvme->n_dead) 3286 return (EIO); 3287 3288 cmd = nvme_create_nvm_cmd(ns, opc, xfer); 3289 if (cmd == NULL) 3290 return (ENOMEM); 3291 3292 cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1; 3293 ASSERT(cmd->nc_sqid <= nvme->n_ioq_count); 3294 ioq = nvme->n_ioq[cmd->nc_sqid]; 3295 3296 /* 3297 * Get the polling flag before submitting the command. The command may 3298 * complete immediately after it was submitted, which means we must 3299 * treat both cmd and xfer as if they have been freed already. 
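 * In the BD_XFER_POLL case the loop below retrieves completions directly
 * until the queue has no active commands left.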
3300 */ 3301 poll = (xfer->x_flags & BD_XFER_POLL) != 0; 3302 3303 if (nvme_submit_cmd(ioq, cmd) != DDI_SUCCESS) 3304 return (EAGAIN); 3305 3306 if (!poll) 3307 return (0); 3308 3309 do { 3310 ret = nvme_retrieve_cmd(nvme, ioq); 3311 if (ret != NULL) 3312 nvme_bd_xfer_done(ret); 3313 else 3314 drv_usecwait(10); 3315 } while (ioq->nq_active_cmds != 0); 3316 3317 return (0); 3318 } 3319 3320 static int 3321 nvme_bd_read(void *arg, bd_xfer_t *xfer) 3322 { 3323 nvme_namespace_t *ns = arg; 3324 3325 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ)); 3326 } 3327 3328 static int 3329 nvme_bd_write(void *arg, bd_xfer_t *xfer) 3330 { 3331 nvme_namespace_t *ns = arg; 3332 3333 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE)); 3334 } 3335 3336 static int 3337 nvme_bd_sync(void *arg, bd_xfer_t *xfer) 3338 { 3339 nvme_namespace_t *ns = arg; 3340 3341 if (ns->ns_nvme->n_dead) 3342 return (EIO); 3343 3344 /* 3345 * If the volatile write cache is not present or not enabled the FLUSH 3346 * command is a no-op, so we can take a shortcut here. 3347 */ 3348 if (!ns->ns_nvme->n_write_cache_present) { 3349 bd_xfer_done(xfer, ENOTSUP); 3350 return (0); 3351 } 3352 3353 if (!ns->ns_nvme->n_write_cache_enabled) { 3354 bd_xfer_done(xfer, 0); 3355 return (0); 3356 } 3357 3358 return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH)); 3359 } 3360 3361 static int 3362 nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) 3363 { 3364 nvme_namespace_t *ns = arg; 3365 3366 /*LINTED: E_BAD_PTR_CAST_ALIGN*/ 3367 if (*(uint64_t *)ns->ns_eui64 != 0) { 3368 return (ddi_devid_init(devinfo, DEVID_SCSI3_WWN, 3369 sizeof (ns->ns_eui64), ns->ns_eui64, devid)); 3370 } else { 3371 return (ddi_devid_init(devinfo, DEVID_ENCAP, 3372 strlen(ns->ns_devid), ns->ns_devid, devid)); 3373 } 3374 } 3375 3376 static int 3377 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) 3378 { 3379 #ifndef __lock_lint 3380 _NOTE(ARGUNUSED(cred_p)); 3381 #endif 3382 minor_t minor = getminor(*devp); 3383 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 3384 int nsid = NVME_MINOR_NSID(minor); 3385 nvme_minor_state_t *nm; 3386 int rv = 0; 3387 3388 if (otyp != OTYP_CHR) 3389 return (EINVAL); 3390 3391 if (nvme == NULL) 3392 return (ENXIO); 3393 3394 if (nsid > nvme->n_namespace_count) 3395 return (ENXIO); 3396 3397 nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; 3398 3399 mutex_enter(&nm->nm_mutex); 3400 if (nm->nm_oexcl) { 3401 rv = EBUSY; 3402 goto out; 3403 } 3404 3405 if (flag & FEXCL) { 3406 if (nm->nm_ocnt != 0) { 3407 rv = EBUSY; 3408 goto out; 3409 } 3410 nm->nm_oexcl = B_TRUE; 3411 } 3412 3413 nm->nm_ocnt++; 3414 3415 out: 3416 mutex_exit(&nm->nm_mutex); 3417 return (rv); 3418 3419 } 3420 3421 static int 3422 nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p) 3423 { 3424 #ifndef __lock_lint 3425 _NOTE(ARGUNUSED(cred_p)); 3426 _NOTE(ARGUNUSED(flag)); 3427 #endif 3428 minor_t minor = getminor(dev); 3429 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 3430 int nsid = NVME_MINOR_NSID(minor); 3431 nvme_minor_state_t *nm; 3432 3433 if (otyp != OTYP_CHR) 3434 return (ENXIO); 3435 3436 if (nvme == NULL) 3437 return (ENXIO); 3438 3439 if (nsid > nvme->n_namespace_count) 3440 return (ENXIO); 3441 3442 nm = nsid == 0 ? 
&nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; 3443 3444 mutex_enter(&nm->nm_mutex); 3445 if (nm->nm_oexcl) 3446 nm->nm_oexcl = B_FALSE; 3447 3448 ASSERT(nm->nm_ocnt > 0); 3449 nm->nm_ocnt--; 3450 mutex_exit(&nm->nm_mutex); 3451 3452 return (0); 3453 } 3454 3455 static int 3456 nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 3457 cred_t *cred_p) 3458 { 3459 _NOTE(ARGUNUSED(cred_p)); 3460 int rv = 0; 3461 void *idctl; 3462 3463 if ((mode & FREAD) == 0) 3464 return (EPERM); 3465 3466 if (nioc->n_len < NVME_IDENTIFY_BUFSIZE) 3467 return (EINVAL); 3468 3469 idctl = nvme_identify(nvme, nsid); 3470 if (idctl == NULL) 3471 return (EIO); 3472 3473 if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode) 3474 != 0) 3475 rv = EFAULT; 3476 3477 kmem_free(idctl, NVME_IDENTIFY_BUFSIZE); 3478 3479 return (rv); 3480 } 3481 3482 static int 3483 nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 3484 int mode, cred_t *cred_p) 3485 { 3486 _NOTE(ARGUNUSED(nsid, cred_p)); 3487 int rv = 0; 3488 nvme_reg_cap_t cap = { 0 }; 3489 nvme_capabilities_t nc; 3490 3491 if ((mode & FREAD) == 0) 3492 return (EPERM); 3493 3494 if (nioc->n_len < sizeof (nc)) 3495 return (EINVAL); 3496 3497 cap.r = nvme_get64(nvme, NVME_REG_CAP); 3498 3499 /* 3500 * The MPSMIN and MPSMAX fields in the CAP register use 0 to 3501 * specify the base page size of 4k (1<<12), so add 12 here to 3502 * get the real page size value. 3503 */ 3504 nc.mpsmax = 1 << (12 + cap.b.cap_mpsmax); 3505 nc.mpsmin = 1 << (12 + cap.b.cap_mpsmin); 3506 3507 if (ddi_copyout(&nc, (void *)nioc->n_buf, sizeof (nc), mode) != 0) 3508 rv = EFAULT; 3509 3510 return (rv); 3511 } 3512 3513 static int 3514 nvme_ioctl_get_logpage(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 3515 int mode, cred_t *cred_p) 3516 { 3517 _NOTE(ARGUNUSED(cred_p)); 3518 void *log = NULL; 3519 size_t bufsize = 0; 3520 int rv = 0; 3521 3522 if ((mode & FREAD) == 0) 3523 return (EPERM); 3524 3525 switch (nioc->n_arg) { 3526 case NVME_LOGPAGE_ERROR: 3527 if (nsid != 0) 3528 return (EINVAL); 3529 break; 3530 case NVME_LOGPAGE_HEALTH: 3531 if (nsid != 0 && nvme->n_idctl->id_lpa.lp_smart == 0) 3532 return (EINVAL); 3533 3534 if (nsid == 0) 3535 nsid = (uint32_t)-1; 3536 3537 break; 3538 case NVME_LOGPAGE_FWSLOT: 3539 if (nsid != 0) 3540 return (EINVAL); 3541 break; 3542 default: 3543 return (EINVAL); 3544 } 3545 3546 if (nvme_get_logpage(nvme, &log, &bufsize, nioc->n_arg, nsid) 3547 != DDI_SUCCESS) 3548 return (EIO); 3549 3550 if (nioc->n_len < bufsize) { 3551 kmem_free(log, bufsize); 3552 return (EINVAL); 3553 } 3554 3555 if (ddi_copyout(log, (void *)nioc->n_buf, bufsize, mode) != 0) 3556 rv = EFAULT; 3557 3558 nioc->n_len = bufsize; 3559 kmem_free(log, bufsize); 3560 3561 return (rv); 3562 } 3563 3564 static int 3565 nvme_ioctl_get_features(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, 3566 int mode, cred_t *cred_p) 3567 { 3568 _NOTE(ARGUNUSED(cred_p)); 3569 void *buf = NULL; 3570 size_t bufsize = 0; 3571 uint32_t res = 0; 3572 uint8_t feature; 3573 int rv = 0; 3574 3575 if ((mode & FREAD) == 0) 3576 return (EPERM); 3577 3578 if ((nioc->n_arg >> 32) > 0xff) 3579 return (EINVAL); 3580 3581 feature = (uint8_t)(nioc->n_arg >> 32); 3582 3583 switch (feature) { 3584 case NVME_FEAT_ARBITRATION: 3585 case NVME_FEAT_POWER_MGMT: 3586 case NVME_FEAT_TEMPERATURE: 3587 case NVME_FEAT_ERROR: 3588 case NVME_FEAT_NQUEUES: 3589 case NVME_FEAT_INTR_COAL: 3590 case NVME_FEAT_WRITE_ATOM: 3591 case NVME_FEAT_ASYNC_EVENT: 3592 case NVME_FEAT_PROGRESS: 3593 if (nsid 
!= 0) 3594 return (EINVAL); 3595 break; 3596 3597 case NVME_FEAT_INTR_VECT: 3598 if (nsid != 0) 3599 return (EINVAL); 3600 3601 res = nioc->n_arg & 0xffffffffUL; 3602 if (res >= nvme->n_intr_cnt) 3603 return (EINVAL); 3604 break; 3605 3606 case NVME_FEAT_LBA_RANGE: 3607 if (nvme->n_lba_range_supported == B_FALSE) 3608 return (EINVAL); 3609 3610 if (nsid == 0 || 3611 nsid > nvme->n_namespace_count) 3612 return (EINVAL); 3613 3614 break; 3615 3616 case NVME_FEAT_WRITE_CACHE: 3617 if (nsid != 0) 3618 return (EINVAL); 3619 3620 if (!nvme->n_write_cache_present) 3621 return (EINVAL); 3622 3623 break; 3624 3625 case NVME_FEAT_AUTO_PST: 3626 if (nsid != 0) 3627 return (EINVAL); 3628 3629 if (!nvme->n_auto_pst_supported) 3630 return (EINVAL); 3631 3632 break; 3633 3634 default: 3635 return (EINVAL); 3636 } 3637 3638 if (nvme_get_features(nvme, nsid, feature, &res, &buf, &bufsize) == 3639 B_FALSE) 3640 return (EIO); 3641 3642 if (nioc->n_len < bufsize) { 3643 kmem_free(buf, bufsize); 3644 return (EINVAL); 3645 } 3646 3647 if (buf && ddi_copyout(buf, (void*)nioc->n_buf, bufsize, mode) != 0) 3648 rv = EFAULT; 3649 3650 kmem_free(buf, bufsize); 3651 nioc->n_arg = res; 3652 nioc->n_len = bufsize; 3653 3654 return (rv); 3655 } 3656 3657 static int 3658 nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 3659 cred_t *cred_p) 3660 { 3661 _NOTE(ARGUNUSED(nsid, mode, cred_p)); 3662 3663 if ((mode & FREAD) == 0) 3664 return (EPERM); 3665 3666 nioc->n_arg = nvme->n_intr_cnt; 3667 return (0); 3668 } 3669 3670 static int 3671 nvme_ioctl_version(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 3672 cred_t *cred_p) 3673 { 3674 _NOTE(ARGUNUSED(nsid, cred_p)); 3675 int rv = 0; 3676 3677 if ((mode & FREAD) == 0) 3678 return (EPERM); 3679 3680 if (nioc->n_len < sizeof (nvme->n_version)) 3681 return (ENOMEM); 3682 3683 if (ddi_copyout(&nvme->n_version, (void *)nioc->n_buf, 3684 sizeof (nvme->n_version), mode) != 0) 3685 rv = EFAULT; 3686 3687 return (rv); 3688 } 3689 3690 static int 3691 nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 3692 cred_t *cred_p) 3693 { 3694 _NOTE(ARGUNUSED(mode)); 3695 nvme_format_nvm_t frmt = { 0 }; 3696 int c_nsid = nsid != 0 ? nsid - 1 : 0; 3697 3698 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 3699 return (EPERM); 3700 3701 frmt.r = nioc->n_arg & 0xffffffff; 3702 3703 /* 3704 * Check whether the FORMAT NVM command is supported. 3705 */ 3706 if (nvme->n_idctl->id_oacs.oa_format == 0) 3707 return (EINVAL); 3708 3709 /* 3710 * Don't allow format or secure erase of individual namespace if that 3711 * would cause a format or secure erase of all namespaces. 3712 */ 3713 if (nsid != 0 && nvme->n_idctl->id_fna.fn_format != 0) 3714 return (EINVAL); 3715 3716 if (nsid != 0 && frmt.b.fm_ses != NVME_FRMT_SES_NONE && 3717 nvme->n_idctl->id_fna.fn_sec_erase != 0) 3718 return (EINVAL); 3719 3720 /* 3721 * Don't allow formatting with Protection Information. 3722 */ 3723 if (frmt.b.fm_pi != 0 || frmt.b.fm_pil != 0 || frmt.b.fm_ms != 0) 3724 return (EINVAL); 3725 3726 /* 3727 * Don't allow formatting using an illegal LBA format, or any LBA format 3728 * that uses metadata. 3729 */ 3730 if (frmt.b.fm_lbaf > nvme->n_ns[c_nsid].ns_idns->id_nlbaf || 3731 nvme->n_ns[c_nsid].ns_idns->id_lbaf[frmt.b.fm_lbaf].lbaf_ms != 0) 3732 return (EINVAL); 3733 3734 /* 3735 * Don't allow formatting using an illegal Secure Erase setting. 
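 * In particular, cryptographic erase is only allowed if the controller
 * advertises support for it in Identify Controller (id_fna).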
3736 */ 3737 if (frmt.b.fm_ses > NVME_FRMT_MAX_SES || 3738 (frmt.b.fm_ses == NVME_FRMT_SES_CRYPTO && 3739 nvme->n_idctl->id_fna.fn_crypt_erase == 0)) 3740 return (EINVAL); 3741 3742 if (nsid == 0) 3743 nsid = (uint32_t)-1; 3744 3745 return (nvme_format_nvm(nvme, nsid, frmt.b.fm_lbaf, B_FALSE, 0, B_FALSE, 3746 frmt.b.fm_ses)); 3747 } 3748 3749 static int 3750 nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 3751 cred_t *cred_p) 3752 { 3753 _NOTE(ARGUNUSED(nioc, mode)); 3754 int rv = 0; 3755 3756 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 3757 return (EPERM); 3758 3759 if (nsid == 0) 3760 return (EINVAL); 3761 3762 rv = bd_detach_handle(nvme->n_ns[nsid - 1].ns_bd_hdl); 3763 if (rv != DDI_SUCCESS) 3764 rv = EBUSY; 3765 3766 return (rv); 3767 } 3768 3769 static int 3770 nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, 3771 cred_t *cred_p) 3772 { 3773 _NOTE(ARGUNUSED(nioc, mode)); 3774 nvme_identify_nsid_t *idns; 3775 int rv = 0; 3776 3777 if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) 3778 return (EPERM); 3779 3780 if (nsid == 0) 3781 return (EINVAL); 3782 3783 /* 3784 * Identify namespace again, free old identify data. 3785 */ 3786 idns = nvme->n_ns[nsid - 1].ns_idns; 3787 if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS) 3788 return (EIO); 3789 3790 kmem_free(idns, sizeof (nvme_identify_nsid_t)); 3791 3792 rv = bd_attach_handle(nvme->n_dip, nvme->n_ns[nsid - 1].ns_bd_hdl); 3793 if (rv != DDI_SUCCESS) 3794 rv = EBUSY; 3795 3796 return (rv); 3797 } 3798 3799 static int 3800 nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, 3801 int *rval_p) 3802 { 3803 #ifndef __lock_lint 3804 _NOTE(ARGUNUSED(rval_p)); 3805 #endif 3806 minor_t minor = getminor(dev); 3807 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); 3808 int nsid = NVME_MINOR_NSID(minor); 3809 int rv = 0; 3810 nvme_ioctl_t nioc; 3811 3812 int (*nvme_ioctl[])(nvme_t *, int, nvme_ioctl_t *, int, cred_t *) = { 3813 NULL, 3814 nvme_ioctl_identify, 3815 nvme_ioctl_identify, 3816 nvme_ioctl_capabilities, 3817 nvme_ioctl_get_logpage, 3818 nvme_ioctl_get_features, 3819 nvme_ioctl_intr_cnt, 3820 nvme_ioctl_version, 3821 nvme_ioctl_format, 3822 nvme_ioctl_detach, 3823 nvme_ioctl_attach 3824 }; 3825 3826 if (nvme == NULL) 3827 return (ENXIO); 3828 3829 if (nsid > nvme->n_namespace_count) 3830 return (ENXIO); 3831 3832 if (IS_DEVCTL(cmd)) 3833 return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0)); 3834 3835 #ifdef _MULTI_DATAMODEL 3836 switch (ddi_model_convert_from(mode & FMODELS)) { 3837 case DDI_MODEL_ILP32: { 3838 nvme_ioctl32_t nioc32; 3839 if (ddi_copyin((void*)arg, &nioc32, sizeof (nvme_ioctl32_t), 3840 mode) != 0) 3841 return (EFAULT); 3842 nioc.n_len = nioc32.n_len; 3843 nioc.n_buf = nioc32.n_buf; 3844 nioc.n_arg = nioc32.n_arg; 3845 break; 3846 } 3847 case DDI_MODEL_NONE: 3848 #endif 3849 if (ddi_copyin((void*)arg, &nioc, sizeof (nvme_ioctl_t), mode) 3850 != 0) 3851 return (EFAULT); 3852 #ifdef _MULTI_DATAMODEL 3853 break; 3854 } 3855 #endif 3856 3857 if (cmd == NVME_IOC_IDENTIFY_CTRL) { 3858 /* 3859 * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and 3860 * attachment point nodes. 3861 */ 3862 nsid = 0; 3863 } else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) { 3864 /* 3865 * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node, it 3866 * will always return identify data for namespace 1. 
3867 */ 3868 nsid = 1; 3869 } 3870 3871 if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL) 3872 rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode, 3873 cred_p); 3874 else 3875 rv = EINVAL; 3876 3877 #ifdef _MULTI_DATAMODEL 3878 switch (ddi_model_convert_from(mode & FMODELS)) { 3879 case DDI_MODEL_ILP32: { 3880 nvme_ioctl32_t nioc32; 3881 3882 nioc32.n_len = (size32_t)nioc.n_len; 3883 nioc32.n_buf = (uintptr32_t)nioc.n_buf; 3884 nioc32.n_arg = nioc.n_arg; 3885 3886 if (ddi_copyout(&nioc32, (void *)arg, sizeof (nvme_ioctl32_t), 3887 mode) != 0) 3888 return (EFAULT); 3889 break; 3890 } 3891 case DDI_MODEL_NONE: 3892 #endif 3893 if (ddi_copyout(&nioc, (void *)arg, sizeof (nvme_ioctl_t), mode) 3894 != 0) 3895 return (EFAULT); 3896 #ifdef _MULTI_DATAMODEL 3897 break; 3898 } 3899 #endif 3900 3901 return (rv); 3902 }
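/*
 * Illustrative sketch (not part of the driver): a userland consumer could
 * fetch the Identify Controller data through the devctl minor node roughly
 * as shown below. The device path is elided, and it is assumed that the
 * nvme_ioctl_t structure and the NVME_IOC_IDENTIFY_CTRL and
 * NVME_IDENTIFY_BUFSIZE definitions are visible to userland.
 *
 *	int fd = open("/devices/...:devctl", O_RDONLY);
 *	void *buf = malloc(NVME_IDENTIFY_BUFSIZE);
 *	nvme_ioctl_t nioc = { 0 };
 *
 *	nioc.n_len = NVME_IDENTIFY_BUFSIZE;
 *	nioc.n_buf = (uintptr_t)buf;
 *	int error = ioctl(fd, NVME_IOC_IDENTIFY_CTRL, &nioc);
 *
 * On success buf holds NVME_IDENTIFY_BUFSIZE bytes of identify data as
 * copied out by nvme_ioctl_identify(). Opening the node read-only is
 * sufficient because the handler only checks FREAD; an n_len smaller than
 * NVME_IDENTIFY_BUFSIZE is rejected with EINVAL.
 */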