39 * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
40 * the admin queue will share an interrupt with one I/O queue. The interrupt
41 * handler will retrieve completed commands from all queues sharing an interrupt
42 * vector and will post them to a taskq for completion processing.
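 *
 * Condensed from nvme_intr() further below (error handling omitted), the
 * per-vector completion walk roughly amounts to:
 *
 *	for (qnum = inum; qnum < nvme->n_ioq_count + 1;
 *	    qnum += nvme->n_intr_cnt) {
 *		while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum])) != NULL)
 *			taskq_dispatch_ent((taskq_t *)nvme->n_cmd_taskq,
 *			    cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
 *	}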
43 *
44 *
45 * Command Processing:
46 *
47 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
48 * to 65536 I/O commands. The driver will configure one I/O queue pair per
49 * available interrupt vector, with the queue length usually much smaller than
50 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
51 * interrupt vectors will be used.
52 *
53 * Additionally the hardware provides a single special admin queue pair that can
54 * hold up to 4096 admin commands.
55 *
56 * From the hardware perspective both queues of a queue pair are independent,
57 * but they share some driver state: the command array (holding pointers to
58 * commands currently being processed by the hardware) and the active command
59 * counter. Access to the submission side of a queue pair and the shared state
60 * is protected by nq_mutex. The completion side of a queue pair does not need
61 * that protection apart from its access to the shared state; it is called only
62 * in the interrupt handler which does not run concurrently for the same
63 * interrupt vector.
64 *
65 * When a command is submitted to a queue pair the active command counter is
66 * incremented and a pointer to the command is stored in the command array. The
67 * array index is used as command identifier (CID) in the submission queue
68 * entry. Some commands may take a very long time to complete, and if the queue
69 * wraps around in that time a submission may find the next array slot to still
70 * be used by a long-running command. In this case the array is sequentially
71 * searched for the next free slot. The length of the command array is the same
72 * as the configured queue length. Queue overrun is prevented by the semaphore,
73 * so a command submission may block if the queue is full.
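 *
 * As a rough sketch (simplified from the submit paths shown further below;
 * the SQE copy, DMA sync, doorbell write and unlock follow as in
 * nvme_submit_cmd_common()), a submission does:
 *
 *	sema_p(&qp->nq_sema);
 *	mutex_enter(&qp->nq_mutex);
 *	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 *		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 *	qp->nq_cmd[qp->nq_next_cmd] = cmd;
 *	qp->nq_active_cmds++;
 *	cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;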
74 *
75 *
76 * Polled I/O Support:
77 *
78 * For kernel core dump support the driver can do polled I/O. As interrupts are
79 * turned off while dumping, the driver will just submit a command in the regular
80 * way, and then repeatedly attempt a command retrieval until it gets the
81 * command back.
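 *
 * After submitting the command in the regular way, the dump path effectively
 * spins on the completion queue; a minimal sketch (the wait interval shown is
 * illustrative, not the exact value the driver uses):
 *
 *	while ((cmd = nvme_retrieve_cmd(nvme, qp)) == NULL)
 *		drv_usecwait(10);
 *	cmd->nc_callback(cmd);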
82 *
83 *
132 * Error handling is currently limited to detecting fatal hardware errors,
133 * either by asynchronous events, or synchronously through command status or
134 * admin command timeouts. In case of severe errors the device is fenced off;
135 * all further requests will return EIO. FMA is then called to fault the device.
136 *
137 * The hardware has a limit for outstanding asynchronous event requests. Before
138 * this limit is known the driver assumes it is at least 1 and posts a single
139 * asynchronous request. Later when the limit is known more asynchronous event
140 * requests are posted to allow quicker reception of error information. When an
141 * asynchronous event is posted by the hardware the driver will parse the error
142 * status fields and log information or fault the device, depending on the
143 * severity of the asynchronous event. The asynchronous event request is then
144 * reused and posted to the admin queue again.
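 *
 * Re-posting the request amounts to saving the completion data, clearing the
 * stored CQE, and submitting the very same command again, roughly as sketched
 * here (see nvme_async_event_task() below):
 *
 *	event.r = cmd->nc_cqe.cqe_dw0;
 *	bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
 *	nvme_submit_admin_cmd(nvme->n_adminq, cmd);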
145 *
146 * On command completion the command status is checked for errors. In case of
147 * errors indicating a driver bug the driver panics. Almost all other error
148 * status values just cause EIO to be returned.
149 *
150 * Command timeouts are currently detected for all admin commands except
151 * asynchronous event requests. If a command times out and the hardware appears
152 * to be healthy the driver attempts to abort the command. If this fails the
153 * driver assumes the device to be dead, fences it off, and calls FMA to retire
154 * it. In general admin commands are issued at attach time only. No timeout
155 * handling of normal I/O commands is presently done.
156 *
157 * In some cases it may be possible that the ABORT command times out, too. In
158 * that case the device is also declared dead and fenced off.
159 *
160 *
161 * Quiesce / Fast Reboot:
162 *
163 * The driver currently does not support fast reboot. A quiesce(9E) entry point
164 * is still provided which is used to send a shutdown notification to the
165 * device.
166 *
167 *
168 * Driver Configuration:
169 *
170 * The following driver properties can be changed to control some aspects of the
171 * driver's operation:
172 * - strict-version: can be set to 0 to allow devices conforming to newer
173 * versions or namespaces with EUI64 to be used
174 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
175 * specific command status as a fatal error leading to device faulting
176 * - admin-queue-len: the maximum length of the admin queue (16-4096)
177 * - io-queue-len: the maximum length of the I/O queues (16-65536)
178 * - async-event-limit: the maximum number of asynchronous event requests to be
179 * posted by the driver
180 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
204 #endif
205
206 #include <sys/modctl.h>
207 #include <sys/conf.h>
208 #include <sys/devops.h>
209 #include <sys/ddi.h>
210 #include <sys/sunddi.h>
211 #include <sys/sunndi.h>
212 #include <sys/bitmap.h>
213 #include <sys/sysmacros.h>
214 #include <sys/param.h>
215 #include <sys/varargs.h>
216 #include <sys/cpuvar.h>
217 #include <sys/disp.h>
218 #include <sys/blkdev.h>
219 #include <sys/atomic.h>
220 #include <sys/archsystm.h>
221 #include <sys/sata/sata_hba.h>
222 #include <sys/stat.h>
223 #include <sys/policy.h>
224
225 #include <sys/nvme.h>
226
227 #ifdef __x86
228 #include <sys/x86_archext.h>
229 #endif
230
231 #include "nvme_reg.h"
232 #include "nvme_var.h"
233
234
235 /* NVMe spec version supported */
236 static const int nvme_version_major = 1;
237 static const int nvme_version_minor = 2;
238
239 /* tunable for admin command timeout in seconds, default is 1s */
240 int nvme_admin_cmd_timeout = 1;
241
242 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
243 int nvme_format_cmd_timeout = 600;
244
245 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
246 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
247 static int nvme_quiesce(dev_info_t *);
248 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
249 static int nvme_setup_interrupts(nvme_t *, int, int);
250 static void nvme_release_interrupts(nvme_t *);
251 static uint_t nvme_intr(caddr_t, caddr_t);
252
253 static void nvme_shutdown(nvme_t *, int, boolean_t);
254 static boolean_t nvme_reset(nvme_t *, boolean_t);
255 static int nvme_init(nvme_t *);
256 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
257 static void nvme_free_cmd(nvme_cmd_t *);
258 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
259 bd_xfer_t *);
260 static int nvme_admin_cmd(nvme_cmd_t *, int);
261 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
262 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
263 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
264 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
265 static boolean_t nvme_wait_cmd(nvme_cmd_t *, uint_t);
266 static void nvme_wakeup_cmd(void *);
267 static void nvme_async_event_task(void *);
268
269 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
270 static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
271 static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
272 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
273 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
274 static inline int nvme_check_cmd_status(nvme_cmd_t *);
275
276 static void nvme_abort_cmd(nvme_cmd_t *);
277 static void nvme_async_event(nvme_t *);
278 static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t,
279 boolean_t, uint8_t);
280 static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...);
281 static void *nvme_identify(nvme_t *, uint32_t);
282 static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
283 uint32_t *);
284 static boolean_t nvme_get_features(nvme_t *, uint32_t, uint8_t, uint32_t *,
285 void **, size_t *);
286 static boolean_t nvme_write_cache_set(nvme_t *, boolean_t);
287 static int nvme_set_nqueues(nvme_t *, uint16_t);
288
289 static void nvme_free_dma(nvme_dma_t *);
290 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
291 nvme_dma_t **);
292 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
293 nvme_dma_t **);
294 static void nvme_free_qpair(nvme_qpair_t *);
295 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, int);
296 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);
297
298 static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
299 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
300 static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
301 static inline uint32_t nvme_get32(nvme_t *, uintptr_t);
302
303 static boolean_t nvme_check_regs_hdl(nvme_t *);
304 static boolean_t nvme_check_dma_hdl(nvme_dma_t *);
305
306 static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *);
307
444 .drv_modops = &mod_driverops,
445 .drv_linkinfo = "NVMe v1.1b",
446 .drv_dev_ops = &nvme_dev_ops
447 };
448
449 static struct modlinkage nvme_modlinkage = {
450 .ml_rev = MODREV_1,
451 .ml_linkage = { &nvme_modldrv, NULL }
452 };
453
454 static bd_ops_t nvme_bd_ops = {
455 .o_version = BD_OPS_VERSION_0,
456 .o_drive_info = nvme_bd_driveinfo,
457 .o_media_info = nvme_bd_mediainfo,
458 .o_devid_init = nvme_bd_devid,
459 .o_sync_cache = nvme_bd_sync,
460 .o_read = nvme_bd_read,
461 .o_write = nvme_bd_write,
462 };
463
464 int
465 _init(void)
466 {
467 int error;
468
469 error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
470 if (error != DDI_SUCCESS)
471 return (error);
472
473 nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
474 sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
475
476 bd_mod_init(&nvme_dev_ops);
477
478 error = mod_install(&nvme_modlinkage);
479 if (error != DDI_SUCCESS) {
480 ddi_soft_state_fini(&nvme_state);
481 bd_mod_fini(&nvme_dev_ops);
482 }
483
484 return (error);
485 }
486
487 int
488 _fini(void)
489 {
490 int error;
491
492 error = mod_remove(&nvme_modlinkage);
493 if (error == DDI_SUCCESS) {
494 ddi_soft_state_fini(&nvme_state);
495 kmem_cache_destroy(nvme_cmd_cache);
496 bd_mod_fini(&nvme_dev_ops);
497 }
498
499 return (error);
500 }
501
502 int
503 _info(struct modinfo *modinfop)
504 {
505 return (mod_info(&nvme_modlinkage, modinfop));
506 }
507
508 static inline void
509 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
510 {
511 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
512
513 /*LINTED: E_BAD_PTR_CAST_ALIGN*/
514 ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
515 }
785 {
786 nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);
787
788 if (cmd == NULL)
789 return (cmd);
790
791 bzero(cmd, sizeof (nvme_cmd_t));
792
793 cmd->nc_nvme = nvme;
794
795 mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
796 DDI_INTR_PRI(nvme->n_intr_pri));
797 cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);
798
799 return (cmd);
800 }
801
802 static void
803 nvme_free_cmd(nvme_cmd_t *cmd)
804 {
805 if (cmd->nc_dma) {
806 if (cmd->nc_dma->nd_cached)
807 kmem_cache_free(cmd->nc_nvme->n_prp_cache,
808 cmd->nc_dma);
809 else
810 nvme_free_dma(cmd->nc_dma);
811 cmd->nc_dma = NULL;
812 }
813
814 cv_destroy(&cmd->nc_cv);
815 mutex_destroy(&cmd->nc_mutex);
816
817 kmem_cache_free(nvme_cmd_cache, cmd);
818 }
819
820 static void
821 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
822 {
823 sema_p(&qp->nq_sema);
824 nvme_submit_cmd_common(qp, cmd);
851 while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
852 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
853 qp->nq_cmd[qp->nq_next_cmd] = cmd;
854
855 qp->nq_active_cmds++;
856
857 cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
858 bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
859 (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
860 sizeof (nvme_sqe_t) * qp->nq_sqtail,
861 sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
862 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
863
864 tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
865 nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
866
867 mutex_exit(&qp->nq_mutex);
868 }
869
870 static nvme_cmd_t *
871 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
872 {
873 nvme_reg_cqhdbl_t head = { 0 };
874
875 nvme_cqe_t *cqe;
876 nvme_cmd_t *cmd;
877
878 (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
879 sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
880
881 mutex_enter(&qp->nq_mutex);
882 cqe = &qp->nq_cq[qp->nq_cqhead];
883
884 /* Check phase tag of CQE. Hardware inverts it for new entries. */
885 if (cqe->cqe_sf.sf_p == qp->nq_phase) {
886 mutex_exit(&qp->nq_mutex);
887 return (NULL);
888 }
889
890 ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
891 ASSERT(cqe->cqe_cid < qp->nq_nentry);
892
893 cmd = qp->nq_cmd[cqe->cqe_cid];
894 qp->nq_cmd[cqe->cqe_cid] = NULL;
895 qp->nq_active_cmds--;
896
897 ASSERT(cmd != NULL);
898 ASSERT(cmd->nc_nvme == nvme);
899 ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
900 ASSERT(cmd->nc_sqe.sqe_cid == cqe->cqe_cid);
901 bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
902
903 qp->nq_sqhead = cqe->cqe_sqhd;
904
905 head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
906
907 /* Toggle phase on wrap-around. */
908 if (qp->nq_cqhead == 0)
909 qp->nq_phase = qp->nq_phase ? 0 : 1;
910
911 nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
912 mutex_exit(&qp->nq_mutex);
913 sema_v(&qp->nq_sema);
914
915 return (cmd);
916 }
917
918 static int
919 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
920 {
921 nvme_cqe_t *cqe = &cmd->nc_cqe;
922
923 dev_err(cmd->nc_nvme->n_dip, CE_WARN,
924 "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
925 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
926 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
927 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
928
929 if (cmd->nc_xfer != NULL)
930 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
931
932 if (cmd->nc_nvme->n_strict_version) {
933 cmd->nc_nvme->n_dead = B_TRUE;
1178 return (EINVAL);
1179
1180 case NVME_CQE_SC_SPC_NVM_READONLY:
1181 /* Write to Read Only Range */
1182 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1183 atomic_inc_32(&cmd->nc_nvme->n_readonly);
1184 if (cmd->nc_xfer != NULL)
1185 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1186 return (EROFS);
1187
1188 default:
1189 return (nvme_check_unknown_cmd_status(cmd));
1190 }
1191 }
1192
1193 static inline int
1194 nvme_check_cmd_status(nvme_cmd_t *cmd)
1195 {
1196 nvme_cqe_t *cqe = &cmd->nc_cqe;
1197
1198 /* take a shortcut if everything is alright */
1199 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1200 cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
1201 return (0);
1202
1203 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
1204 return (nvme_check_generic_cmd_status(cmd));
1205 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
1206 return (nvme_check_specific_cmd_status(cmd));
1207 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
1208 return (nvme_check_integrity_cmd_status(cmd));
1209 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
1210 return (nvme_check_vendor_cmd_status(cmd));
1211
1212 return (nvme_check_unknown_cmd_status(cmd));
1213 }
1214
1215 /*
1216 * nvme_abort_cmd_cb -- replaces nc_callback of aborted commands
1217 *
1218 * This function takes care of cleaning up aborted commands. The command
1219 * status is checked to catch any fatal errors.
1220 */
1221 static void
1222 nvme_abort_cmd_cb(void *arg)
1223 {
1224 nvme_cmd_t *cmd = arg;
1225
1226 /*
1227 * Grab the command mutex. Once we have it we hold the last reference
1228 * to the command and can safely free it.
1229 */
1230 mutex_enter(&cmd->nc_mutex);
1231 (void) nvme_check_cmd_status(cmd);
1232 mutex_exit(&cmd->nc_mutex);
1233
1234 nvme_free_cmd(cmd);
1235 }
1236
1237 static void
1238 nvme_abort_cmd(nvme_cmd_t *abort_cmd)
1239 {
1240 nvme_t *nvme = abort_cmd->nc_nvme;
1241 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1242 nvme_abort_cmd_t ac = { 0 };
1243
1244 sema_p(&nvme->n_abort_sema);
1245
1246 ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
1247 ac.b.ac_sqid = abort_cmd->nc_sqid;
1248
1249 /*
1250 * Drop the mutex of the aborted command. From this point on
1251 * we must assume that the abort callback has freed the command.
1252 */
1253 mutex_exit(&abort_cmd->nc_mutex);
1254
1255 cmd->nc_sqid = 0;
1256 cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
1257 cmd->nc_callback = nvme_wakeup_cmd;
1258 cmd->nc_sqe.sqe_cdw10 = ac.r;
1259
1260 /*
1261 * Send the ABORT to the hardware. The ABORT command will return _after_
1262 * the aborted command has completed (aborted or otherwise).
1263 */
1264 if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
1265 sema_v(&nvme->n_abort_sema);
1266 dev_err(nvme->n_dip, CE_WARN,
1267 "!nvme_admin_cmd failed for ABORT");
1268 atomic_inc_32(&nvme->n_abort_failed);
1269 return;
1270 }
1271 sema_v(&nvme->n_abort_sema);
1272
1273 if (nvme_check_cmd_status(cmd)) {
1274 dev_err(nvme->n_dip, CE_WARN,
1275 "!ABORT failed with sct = %x, sc = %x",
1276 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1277 atomic_inc_32(&nvme->n_abort_failed);
1278 } else {
1279 atomic_inc_32(&nvme->n_cmd_aborted);
1280 }
1281
1282 nvme_free_cmd(cmd);
1283 }
1284
1285 /*
1286 * nvme_wait_cmd -- wait for command completion or timeout
1287 *
1288 * Returns B_TRUE if the command completed normally.
1289 *
1290 * Returns B_FALSE if the command timed out and an abort was attempted. The
1291 * command mutex will be dropped and the command must be considered freed. The
1292 * freeing of the command is normally done by the abort command callback.
1293 *
1294 * In case of a serious error or a timeout of the abort command the hardware
1295 * will be declared dead and FMA will be notified.
1296 */
1297 static boolean_t
1298 nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec)
1299 {
1300 clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC);
1301 nvme_t *nvme = cmd->nc_nvme;
1302 nvme_reg_csts_t csts;
1303
1304 ASSERT(mutex_owned(&cmd->nc_mutex));
1305
1306 while (!cmd->nc_completed) {
1307 if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)
1308 break;
1309 }
1310
1311 if (cmd->nc_completed)
1312 return (B_TRUE);
1313
1314 /*
1315 * The command timed out. Change the callback to the cleanup function.
1316 */
1317 cmd->nc_callback = nvme_abort_cmd_cb;
1318
1319 /*
1320 * Check controller for fatal status, any errors associated with the
1321 * register or DMA handle, or for a double timeout (abort command timed
1322 * out). If necessary log a warning and call FMA.
1323 */
1324 csts.r = nvme_get32(nvme, NVME_REG_CSTS);
1325 dev_err(nvme->n_dip, CE_WARN, "!command timeout, "
1326 "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
1327 atomic_inc_32(&nvme->n_cmd_timeout);
1328
1329 if (csts.b.csts_cfs ||
1330 nvme_check_regs_hdl(nvme) ||
1331 nvme_check_dma_hdl(cmd->nc_dma) ||
1332 cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
1333 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
1334 nvme->n_dead = B_TRUE;
1335 mutex_exit(&cmd->nc_mutex);
1336 } else {
1337 /*
1338 * Try to abort the command. The command mutex is released by
1339 * nvme_abort_cmd().
1340 * If the abort succeeds it will have freed the aborted command.
1341 * If the abort fails for other reasons we must assume that the
1342 * command may complete at any time, and the callback will free
1343 * it for us.
1344 */
1345 nvme_abort_cmd(cmd);
1346 }
1347
1348 return (B_FALSE);
1349 }
1350
1351 static void
1352 nvme_wakeup_cmd(void *arg)
1353 {
1354 nvme_cmd_t *cmd = arg;
1355
1356 mutex_enter(&cmd->nc_mutex);
1357 /*
1358 * There is a slight chance that this command completed shortly after
1359 * the timeout was hit in nvme_wait_cmd() but before the callback was
1360 * changed. Catch that case here and clean up accordingly.
1361 */
1362 if (cmd->nc_callback == nvme_abort_cmd_cb) {
1363 mutex_exit(&cmd->nc_mutex);
1364 nvme_abort_cmd_cb(cmd);
1365 return;
1366 }
1367
1368 cmd->nc_completed = B_TRUE;
1369 cv_signal(&cmd->nc_cv);
1370 mutex_exit(&cmd->nc_mutex);
1371 }
1372
1373 static void
1374 nvme_async_event_task(void *arg)
1375 {
1376 nvme_cmd_t *cmd = arg;
1377 nvme_t *nvme = cmd->nc_nvme;
1378 nvme_error_log_entry_t *error_log = NULL;
1379 nvme_health_log_t *health_log = NULL;
1380 size_t logsize = 0;
1381 nvme_async_event_t event;
1382
1383 /*
1384 * Check for errors associated with the async request itself. The only
1385 * command-specific error is "async event limit exceeded", which
1386 * indicates a programming error in the driver and causes a panic in
1387 * nvme_check_cmd_status().
1388 *
1389 * Other possible errors are various scenarios where the async request
1390 * was aborted, or internal errors in the device. Internal errors are
1391 * reported to FMA, the command aborts need no special handling here.
1392 */
1393 if (nvme_check_cmd_status(cmd)) {
1394 dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1395 "!async event request returned failure, sct = %x, "
1396 "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
1397 cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
1398 cmd->nc_cqe.cqe_sf.sf_m);
1399
1400 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1401 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
1402 cmd->nc_nvme->n_dead = B_TRUE;
1403 ddi_fm_service_impact(cmd->nc_nvme->n_dip,
1404 DDI_SERVICE_LOST);
1405 }
1406 nvme_free_cmd(cmd);
1407 return;
1408 }
1409
1410
1411 event.r = cmd->nc_cqe.cqe_dw0;
1412
1413 /* Clear CQE and re-submit the async request. */
1505 "received, info = %x, logpage = %x", event.b.ae_info,
1506 event.b.ae_logpage);
1507 atomic_inc_32(&nvme->n_vendor_event);
1508 break;
1509
1510 default:
1511 dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
1512 "type = %x, info = %x, logpage = %x", event.b.ae_type,
1513 event.b.ae_info, event.b.ae_logpage);
1514 atomic_inc_32(&nvme->n_unknown_event);
1515 break;
1516 }
1517
1518 if (error_log)
1519 kmem_free(error_log, logsize);
1520
1521 if (health_log)
1522 kmem_free(health_log, logsize);
1523 }
1524
1525 static int
1526 nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
1527 {
1528 mutex_enter(&cmd->nc_mutex);
1529 nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
1530
1531 if (nvme_wait_cmd(cmd, sec) == B_FALSE) {
1532 /*
1533 * The command timed out. An abort command was posted that
1534 * will take care of the cleanup.
1535 */
1536 return (DDI_FAILURE);
1537 }
1538 mutex_exit(&cmd->nc_mutex);
1539
1540 return (DDI_SUCCESS);
1541 }
1542
1543 static void
1544 nvme_async_event(nvme_t *nvme)
1545 {
1546 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1547
1548 cmd->nc_sqid = 0;
1549 cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
1550 cmd->nc_callback = nvme_async_event_task;
1551
1552 nvme_submit_admin_cmd(nvme->n_adminq, cmd);
1553 }
1554
1555 static int
1556 nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms,
1557 uint8_t pi, boolean_t pil, uint8_t ses)
1558 {
1559 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1560 nvme_format_nvm_t format_nvm = { 0 };
1561 int ret;
1562
1563 format_nvm.b.fm_lbaf = lbaf & 0xf;
1564 format_nvm.b.fm_ms = ms ? 1 : 0;
1565 format_nvm.b.fm_pi = pi & 0x7;
1566 format_nvm.b.fm_pil = pil ? 1 : 0;
1567 format_nvm.b.fm_ses = ses & 0x7;
1568
1569 cmd->nc_sqid = 0;
1570 cmd->nc_callback = nvme_wakeup_cmd;
1571 cmd->nc_sqe.sqe_nsid = nsid;
1572 cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;
1573 cmd->nc_sqe.sqe_cdw10 = format_nvm.r;
1574
1575 /*
1576 * Some devices like Samsung SM951 don't allow formatting of all
1577 * namespaces in one command. Handle that gracefully.
1578 */
1579 if (nsid == (uint32_t)-1)
1580 cmd->nc_dontpanic = B_TRUE;
1581
1582 if ((ret = nvme_admin_cmd(cmd, nvme_format_cmd_timeout))
1583 != DDI_SUCCESS) {
1584 dev_err(nvme->n_dip, CE_WARN,
1585 "!nvme_admin_cmd failed for FORMAT NVM");
1586 return (EIO);
1587 }
1588
1589 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1590 dev_err(nvme->n_dip, CE_WARN,
1591 "!FORMAT failed with sct = %x, sc = %x",
1592 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1593 }
1594
1595 nvme_free_cmd(cmd);
1596 return (ret);
1597 }
1598
1599 static int
1600 nvme_get_logpage(nvme_t *nvme, void **buf, size_t *bufsize, uint8_t logpage,
1601 ...)
1602 {
1603 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1604 nvme_getlogpage_t getlogpage = { 0 };
1605 va_list ap;
1606 int ret = DDI_FAILURE;
1607
1608 va_start(ap, logpage);
1609
1610 cmd->nc_sqid = 0;
1611 cmd->nc_callback = nvme_wakeup_cmd;
1612 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;
1613
1614 getlogpage.b.lp_lid = logpage;
1615
1616 switch (logpage) {
1617 case NVME_LOGPAGE_ERROR:
1618 cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
1619 /*
1620 * The GET LOG PAGE command can use at most 2 pages to return
1621 * data; PRP lists are not supported.
1622 */
1623 *bufsize = MIN(2 * nvme->n_pagesize,
1624 nvme->n_error_log_len * sizeof (nvme_error_log_entry_t));
1625 break;
1626
1627 case NVME_LOGPAGE_HEALTH:
1628 cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t);
1629 *bufsize = sizeof (nvme_health_log_t);
1630 break;
1631
1632 case NVME_LOGPAGE_FWSLOT:
1633 cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
1634 *bufsize = sizeof (nvme_fwslot_log_t);
1635 break;
1636
1637 default:
1638 dev_err(nvme->n_dip, CE_WARN, "!unknown log page requested: %d",
1639 logpage);
1640 atomic_inc_32(&nvme->n_unknown_logpage);
1641 goto fail;
1642 }
1643
1644 va_end(ap);
1645
1646 getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1;
1647
1648 cmd->nc_sqe.sqe_cdw10 = getlogpage.r;
1649
1650 if (nvme_zalloc_dma(nvme, getlogpage.b.lp_numd * sizeof (uint32_t),
1651 DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
1652 dev_err(nvme->n_dip, CE_WARN,
1653 "!nvme_zalloc_dma failed for GET LOG PAGE");
1654 goto fail;
1655 }
1656
1657 if (cmd->nc_dma->nd_ncookie > 2) {
1658 dev_err(nvme->n_dip, CE_WARN,
1659 "!too many DMA cookies for GET LOG PAGE");
1660 atomic_inc_32(&nvme->n_too_many_cookies);
1661 goto fail;
1662 }
1663
1664 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
1665 if (cmd->nc_dma->nd_ncookie > 1) {
1666 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
1667 &cmd->nc_dma->nd_cookie);
1668 cmd->nc_sqe.sqe_dptr.d_prp[1] =
1669 cmd->nc_dma->nd_cookie.dmac_laddress;
1670 }
1671
1672 if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
1673 dev_err(nvme->n_dip, CE_WARN,
1674 "!nvme_admin_cmd failed for GET LOG PAGE");
1675 return (ret);
1676 }
1677
1678 if (nvme_check_cmd_status(cmd)) {
1679 dev_err(nvme->n_dip, CE_WARN,
1680 "!GET LOG PAGE failed with sct = %x, sc = %x",
1681 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1682 goto fail;
1683 }
1684
1685 *buf = kmem_alloc(*bufsize, KM_SLEEP);
1686 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
1687
1688 ret = DDI_SUCCESS;
1689
1690 fail:
1691 nvme_free_cmd(cmd);
1692
1693 return (ret);
1694 }
1695
1696 static void *
1697 nvme_identify(nvme_t *nvme, uint32_t nsid)
1698 {
1699 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1700 void *buf = NULL;
1701
1702 cmd->nc_sqid = 0;
1703 cmd->nc_callback = nvme_wakeup_cmd;
1704 cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
1705 cmd->nc_sqe.sqe_nsid = nsid;
1706 cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL;
1707
1708 if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
1709 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
1710 dev_err(nvme->n_dip, CE_WARN,
1711 "!nvme_zalloc_dma failed for IDENTIFY");
1712 goto fail;
1713 }
1714
1715 if (cmd->nc_dma->nd_ncookie > 2) {
1716 dev_err(nvme->n_dip, CE_WARN,
1717 "!too many DMA cookies for IDENTIFY");
1718 atomic_inc_32(&nvme->n_too_many_cookies);
1719 goto fail;
1720 }
1721
1722 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
1723 if (cmd->nc_dma->nd_ncookie > 1) {
1724 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
1725 &cmd->nc_dma->nd_cookie);
1726 cmd->nc_sqe.sqe_dptr.d_prp[1] =
1727 cmd->nc_dma->nd_cookie.dmac_laddress;
1728 }
1729
1730 if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
1731 dev_err(nvme->n_dip, CE_WARN,
1732 "!nvme_admin_cmd failed for IDENTIFY");
1733 return (NULL);
1734 }
1735
1736 if (nvme_check_cmd_status(cmd)) {
1737 dev_err(nvme->n_dip, CE_WARN,
1738 "!IDENTIFY failed with sct = %x, sc = %x",
1739 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1740 goto fail;
1741 }
1742
1743 buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
1744 bcopy(cmd->nc_dma->nd_memp, buf, NVME_IDENTIFY_BUFSIZE);
1745
1746 fail:
1747 nvme_free_cmd(cmd);
1748
1749 return (buf);
1750 }
1751
1752 static boolean_t
1753 nvme_set_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t val,
1754 uint32_t *res)
1755 {
1756 _NOTE(ARGUNUSED(nsid));
1757 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1758 boolean_t ret = B_FALSE;
1759
1760 ASSERT(res != NULL);
1761
1762 cmd->nc_sqid = 0;
1763 cmd->nc_callback = nvme_wakeup_cmd;
1764 cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
1765 cmd->nc_sqe.sqe_cdw10 = feature;
1766 cmd->nc_sqe.sqe_cdw11 = val;
1767
1768 switch (feature) {
1769 case NVME_FEAT_WRITE_CACHE:
1770 if (!nvme->n_write_cache_present)
1771 goto fail;
1772 break;
1773
1774 case NVME_FEAT_NQUEUES:
1775 break;
1776
1777 default:
1778 goto fail;
1779 }
1780
1781 if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
1782 dev_err(nvme->n_dip, CE_WARN,
1783 "!nvme_admin_cmd failed for SET FEATURES");
1784 return (ret);
1785 }
1786
1787 if (nvme_check_cmd_status(cmd)) {
1788 dev_err(nvme->n_dip, CE_WARN,
1789 "!SET FEATURES %d failed with sct = %x, sc = %x",
1790 feature, cmd->nc_cqe.cqe_sf.sf_sct,
1791 cmd->nc_cqe.cqe_sf.sf_sc);
1792 goto fail;
1793 }
1794
1795 *res = cmd->nc_cqe.cqe_dw0;
1796 ret = B_TRUE;
1797
1798 fail:
1799 nvme_free_cmd(cmd);
1800 return (ret);
1801 }
1802
1803 static boolean_t
1804 nvme_get_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t *res,
1805 void **buf, size_t *bufsize)
1806 {
1807 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1808 boolean_t ret = B_FALSE;
1809
1810 ASSERT(res != NULL);
1811
1812 if (bufsize != NULL)
1813 *bufsize = 0;
1814
1815 cmd->nc_sqid = 0;
1816 cmd->nc_callback = nvme_wakeup_cmd;
1817 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES;
1818 cmd->nc_sqe.sqe_cdw10 = feature;
1819 cmd->nc_sqe.sqe_cdw11 = *res;
1820
1821 switch (feature) {
1822 case NVME_FEAT_ARBITRATION:
1823 case NVME_FEAT_POWER_MGMT:
1824 case NVME_FEAT_TEMPERATURE:
1825 case NVME_FEAT_ERROR:
1826 case NVME_FEAT_NQUEUES:
1827 case NVME_FEAT_INTR_COAL:
1828 case NVME_FEAT_INTR_VECT:
1854
1855 break;
1856
1857 case NVME_FEAT_AUTO_PST:
1858 if (!nvme->n_auto_pst_supported)
1859 goto fail;
1860
1861 ASSERT(bufsize != NULL);
1862 *bufsize = NVME_AUTO_PST_BUFSIZE;
1863 break;
1864
1865 default:
1866 goto fail;
1867 }
1868
1869 if (bufsize != NULL && *bufsize != 0) {
1870 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ,
1871 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
1872 dev_err(nvme->n_dip, CE_WARN,
1873 "!nvme_zalloc_dma failed for GET FEATURES");
1874 goto fail;
1875 }
1876
1877 if (cmd->nc_dma->nd_ncookie > 2) {
1878 dev_err(nvme->n_dip, CE_WARN,
1879 "!too many DMA cookies for GET FEATURES");
1880 atomic_inc_32(&nvme->n_too_many_cookies);
1881 goto fail;
1882 }
1883
1884 cmd->nc_sqe.sqe_dptr.d_prp[0] =
1885 cmd->nc_dma->nd_cookie.dmac_laddress;
1886 if (cmd->nc_dma->nd_ncookie > 1) {
1887 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
1888 &cmd->nc_dma->nd_cookie);
1889 cmd->nc_sqe.sqe_dptr.d_prp[1] =
1890 cmd->nc_dma->nd_cookie.dmac_laddress;
1891 }
1892 }
1893
1894 if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
1895 dev_err(nvme->n_dip, CE_WARN,
1896 "!nvme_admin_cmd failed for GET FEATURES");
1897 return (ret);
1898 }
1899
1900 if (nvme_check_cmd_status(cmd)) {
1901 if (feature == NVME_FEAT_LBA_RANGE &&
1902 cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1903 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD)
1904 nvme->n_lba_range_supported = B_FALSE;
1905 else
1906 dev_err(nvme->n_dip, CE_WARN,
1907 "!GET FEATURES %d failed with sct = %x, sc = %x",
1908 feature, cmd->nc_cqe.cqe_sf.sf_sct,
1909 cmd->nc_cqe.cqe_sf.sf_sc);
1910 goto fail;
1911 }
1912
1913 if (bufsize != NULL && *bufsize != 0) {
1914 ASSERT(buf != NULL);
1915 *buf = kmem_alloc(*bufsize, KM_SLEEP);
1916 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
1917 }
1918
1919 *res = cmd->nc_cqe.cqe_dw0;
1920 ret = B_TRUE;
1921
1922 fail:
1923 nvme_free_cmd(cmd);
1924 return (ret);
1925 }
1926
1927 static boolean_t
1928 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
1929 {
1930 nvme_write_cache_t nwc = { 0 };
1931
1932 if (enable)
1933 nwc.b.wc_wce = 1;
1934
1935 if (!nvme_set_features(nvme, 0, NVME_FEAT_WRITE_CACHE, nwc.r, &nwc.r))
1936 return (B_FALSE);
1937
1938 return (B_TRUE);
1939 }
1940
1941 static int
1942 nvme_set_nqueues(nvme_t *nvme, uint16_t nqueues)
1943 {
1944 nvme_nqueues_t nq = { 0 };
1945
1946 nq.b.nq_nsq = nq.b.nq_ncq = nqueues - 1;
1947
1948 if (!nvme_set_features(nvme, 0, NVME_FEAT_NQUEUES, nq.r, &nq.r)) {
1949 return (0);
1950 }
1951
1952 /*
1953 * Always use the same number of submission and completion queues, and
1954 * never use more than the requested number of queues.
1955 */
1956 return (MIN(nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq) + 1));
1957 }
1958
1959 static int
1960 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
1961 {
1962 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1963 nvme_create_queue_dw10_t dw10 = { 0 };
1964 nvme_create_cq_dw11_t c_dw11 = { 0 };
1965 nvme_create_sq_dw11_t s_dw11 = { 0 };
1966
1967 dw10.b.q_qid = idx;
1968 dw10.b.q_qsize = qp->nq_nentry - 1;
1969
1970 c_dw11.b.cq_pc = 1;
1971 c_dw11.b.cq_ien = 1;
1972 c_dw11.b.cq_iv = idx % nvme->n_intr_cnt;
1973
1974 cmd->nc_sqid = 0;
1975 cmd->nc_callback = nvme_wakeup_cmd;
1976 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
1977 cmd->nc_sqe.sqe_cdw10 = dw10.r;
1978 cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
1979 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_cqdma->nd_cookie.dmac_laddress;
1980
1981 if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
1982 dev_err(nvme->n_dip, CE_WARN,
1983 "!nvme_admin_cmd failed for CREATE CQUEUE");
1984 return (DDI_FAILURE);
1985 }
1986
1987 if (nvme_check_cmd_status(cmd)) {
1988 dev_err(nvme->n_dip, CE_WARN,
1989 "!CREATE CQUEUE failed with sct = %x, sc = %x",
1990 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1991 nvme_free_cmd(cmd);
1992 return (DDI_FAILURE);
1993 }
1994
1995 nvme_free_cmd(cmd);
1996
1997 s_dw11.b.sq_pc = 1;
1998 s_dw11.b.sq_cqid = idx;
1999
2000 cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2001 cmd->nc_sqid = 0;
2002 cmd->nc_callback = nvme_wakeup_cmd;
2003 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE;
2004 cmd->nc_sqe.sqe_cdw10 = dw10.r;
2005 cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
2006 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;
2007
2008 if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) {
2009 dev_err(nvme->n_dip, CE_WARN,
2010 "!nvme_admin_cmd failed for CREATE SQUEUE");
2011 return (DDI_FAILURE);
2012 }
2013
2014 if (nvme_check_cmd_status(cmd)) {
2015 dev_err(nvme->n_dip, CE_WARN,
2016 "!CREATE SQUEUE failed with sct = %x, sc = %x",
2017 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2018 nvme_free_cmd(cmd);
2019 return (DDI_FAILURE);
2020 }
2021
2022 nvme_free_cmd(cmd);
2023
2024 return (DDI_SUCCESS);
2025 }
2026
2027 static boolean_t
2028 nvme_reset(nvme_t *nvme, boolean_t quiesce)
2029 {
2030 nvme_reg_csts_t csts;
2031 int i;
2032
2033 nvme_put32(nvme, NVME_REG_CC, 0);
2034
2035 csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2036 if (csts.b.csts_rdy == 1) {
2037 nvme_put32(nvme, NVME_REG_CC, 0);
2038 for (i = 0; i != nvme->n_timeout * 10; i++) {
2039 csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2040 if (csts.b.csts_rdy == 0)
2041 break;
2042
2043 if (quiesce)
2044 drv_usecwait(50000);
2097
2098 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
2099 bcopy(nvme->n_idctl->id_serial, serial,
2100 sizeof (nvme->n_idctl->id_serial));
2101
2102 model[sizeof (nvme->n_idctl->id_model)] = '\0';
2103 serial[sizeof (nvme->n_idctl->id_serial)] = '\0';
2104
2105 nvme->n_ns[nsid - 1].ns_devid = kmem_asprintf("%4X-%s-%s-%X",
2106 nvme->n_idctl->id_vid, model, serial, nsid);
2107 }
2108
2109 static int
2110 nvme_init_ns(nvme_t *nvme, int nsid)
2111 {
2112 nvme_namespace_t *ns = &nvme->n_ns[nsid - 1];
2113 nvme_identify_nsid_t *idns;
2114 int last_rp;
2115
2116 ns->ns_nvme = nvme;
2117 idns = nvme_identify(nvme, nsid);
2118
2119 if (idns == NULL) {
2120 dev_err(nvme->n_dip, CE_WARN,
2121 "!failed to identify namespace %d", nsid);
2122 return (DDI_FAILURE);
2123 }
2124
2125 ns->ns_idns = idns;
2126 ns->ns_id = nsid;
2127 ns->ns_block_count = idns->id_nsize;
2128 ns->ns_block_size =
2129 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads;
2130 ns->ns_best_block_size = ns->ns_block_size;
2131
2132 /*
2133 * Get the EUI64 if present. Use it for devid and device node names.
2134 */
2135 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
2136 bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64));
2137
2138 /*LINTED: E_BAD_PTR_CAST_ALIGN*/
2139 if (*(uint64_t *)ns->ns_eui64 != 0) {
2189 nsid, (uint64_t)ns->ns_block_size);
2190 ns->ns_ignore = B_TRUE;
2191 } else {
2192 ns->ns_ignore = B_FALSE;
2193 }
2194
2195 return (DDI_SUCCESS);
2196 }
2197
2198 static int
2199 nvme_init(nvme_t *nvme)
2200 {
2201 nvme_reg_cc_t cc = { 0 };
2202 nvme_reg_aqa_t aqa = { 0 };
2203 nvme_reg_asq_t asq = { 0 };
2204 nvme_reg_acq_t acq = { 0 };
2205 nvme_reg_cap_t cap;
2206 nvme_reg_vs_t vs;
2207 nvme_reg_csts_t csts;
2208 int i = 0;
2209 int nqueues;
2210 char model[sizeof (nvme->n_idctl->id_model) + 1];
2211 char *vendor, *product;
2212
2213 /* Check controller version */
2214 vs.r = nvme_get32(nvme, NVME_REG_VS);
2215 nvme->n_version.v_major = vs.b.vs_mjr;
2216 nvme->n_version.v_minor = vs.b.vs_mnr;
2217 dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
2218 nvme->n_version.v_major, nvme->n_version.v_minor);
2219
2220 if (NVME_VERSION_HIGHER(&nvme->n_version,
2221 nvme_version_major, nvme_version_minor)) {
2222 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.%d",
2223 nvme_version_major, nvme_version_minor);
2224 if (nvme->n_strict_version)
2225 goto fail;
2226 }
2227
2228 /* retrieve controller configuration */
2229 cap.r = nvme_get64(nvme, NVME_REG_CAP);
2354 */
2355 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1)
2356 != DDI_SUCCESS) &&
2357 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1)
2358 != DDI_SUCCESS) &&
2359 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
2360 != DDI_SUCCESS)) {
2361 dev_err(nvme->n_dip, CE_WARN,
2362 "!failed to setup initial interrupt");
2363 goto fail;
2364 }
2365
2366 /*
2367 * Post an asynchronous event command to catch errors.
2368 */
2369 nvme_async_event(nvme);
2370
2371 /*
2372 * Identify Controller
2373 */
2374 nvme->n_idctl = nvme_identify(nvme, 0);
2375 if (nvme->n_idctl == NULL) {
2376 dev_err(nvme->n_dip, CE_WARN,
2377 "!failed to identify controller");
2378 goto fail;
2379 }
2380
2381 /*
2382 * Get Vendor & Product ID
2383 */
2384 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
2385 model[sizeof (nvme->n_idctl->id_model)] = '\0';
2386 sata_split_model(model, &vendor, &product);
2387
2388 if (vendor == NULL)
2389 nvme->n_vendor = strdup("NVMe");
2390 else
2391 nvme->n_vendor = strdup(vendor);
2392
2393 nvme->n_product = strdup(product);
2394
2395 /*
2444 if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) ||
2445 ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) ||
2446 ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) ||
2447 ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t)))
2448 goto fail;
2449
2450 /*
2451 * Check for the presence of a Volatile Write Cache. If present,
2452 * enable or disable based on the value of the property
2453 * volatile-write-cache-enable (default is enabled).
2454 */
2455 nvme->n_write_cache_present =
2456 nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE;
2457
2458 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2459 "volatile-write-cache-present",
2460 nvme->n_write_cache_present ? 1 : 0);
2461
2462 if (!nvme->n_write_cache_present) {
2463 nvme->n_write_cache_enabled = B_FALSE;
2464 } else if (!nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)) {
2465 dev_err(nvme->n_dip, CE_WARN,
2466 "!failed to %sable volatile write cache",
2467 nvme->n_write_cache_enabled ? "en" : "dis");
2468 /*
2469 * Assume the cache is (still) enabled.
2470 */
2471 nvme->n_write_cache_enabled = B_TRUE;
2472 }
2473
2474 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2475 "volatile-write-cache-enable",
2476 nvme->n_write_cache_enabled ? 1 : 0);
2477
2478 /*
2479 * Assume LBA Range Type feature is supported. If it isn't this
2480 * will be set to B_FALSE by nvme_get_features().
2481 */
2482 nvme->n_lba_range_supported = B_TRUE;
2483
2484 /*
2516 != 0) {
2517 nvme_release_interrupts(nvme);
2518
2519 nqueues = MIN(UINT16_MAX, ncpus);
2520
2521 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX,
2522 nqueues) != DDI_SUCCESS) &&
2523 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI,
2524 nqueues) != DDI_SUCCESS)) {
2525 dev_err(nvme->n_dip, CE_WARN,
2526 "!failed to setup MSI/MSI-X interrupts");
2527 goto fail;
2528 }
2529 }
2530
2531 nqueues = nvme->n_intr_cnt;
2532
2533 /*
2534 * Create I/O queue pairs.
2535 */
2536 nvme->n_ioq_count = nvme_set_nqueues(nvme, nqueues);
2537 if (nvme->n_ioq_count == 0) {
2538 dev_err(nvme->n_dip, CE_WARN,
2539 "!failed to set number of I/O queues to %d", nqueues);
2540 goto fail;
2541 }
2542
2543 /*
2544 * Reallocate I/O queue array
2545 */
2546 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
2547 nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
2548 (nvme->n_ioq_count + 1), KM_SLEEP);
2549 nvme->n_ioq[0] = nvme->n_adminq;
2550
2551 /*
2552 * If we got fewer queues than we asked for we might as well give
2553 * some of the interrupt vectors back to the system.
2554 */
2555 if (nvme->n_ioq_count < nqueues) {
2556 nvme_release_interrupts(nvme);
2557
2558 if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
2559 nvme->n_ioq_count) != DDI_SUCCESS) {
2560 dev_err(nvme->n_dip, CE_WARN,
2561 "!failed to reduce number of interrupts");
2562 goto fail;
2563 }
2564 }
2565
2566 /*
2567 * Alloc & register I/O queue pairs
2568 */
2569 nvme->n_io_queue_len =
2570 MIN(nvme->n_io_queue_len, nvme->n_max_queue_entries);
2571 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-queue-len",
2572 nvme->n_io_queue_len);
2573
2574 for (i = 1; i != nvme->n_ioq_count + 1; i++) {
2575 if (nvme_alloc_qpair(nvme, nvme->n_io_queue_len,
2576 &nvme->n_ioq[i], i) != DDI_SUCCESS) {
2577 dev_err(nvme->n_dip, CE_WARN,
2578 "!unable to allocate I/O qpair %d", i);
2579 goto fail;
2580 }
2581
2582 if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i)
2583 != DDI_SUCCESS) {
2584 dev_err(nvme->n_dip, CE_WARN,
2585 "!unable to create I/O qpair %d", i);
2586 goto fail;
2587 }
2588 }
2589
2590 /*
2591 * Post more asynchronous event commands to reduce event reporting
2592 * latency as suggested by the spec.
2593 */
2594 for (i = 1; i != nvme->n_async_event_limit; i++)
2595 nvme_async_event(nvme);
2596
2597 return (DDI_SUCCESS);
2598
2599 fail:
2600 (void) nvme_reset(nvme, B_FALSE);
2601 return (DDI_FAILURE);
2602 }
2603
2604 static uint_t
2605 nvme_intr(caddr_t arg1, caddr_t arg2)
2606 {
2607 /*LINTED: E_PTR_BAD_CAST_ALIGN*/
2608 nvme_t *nvme = (nvme_t *)arg1;
2609 int inum = (int)(uintptr_t)arg2;
2610 int ccnt = 0;
2611 int qnum;
2612 nvme_cmd_t *cmd;
2613
2614 if (inum >= nvme->n_intr_cnt)
2615 return (DDI_INTR_UNCLAIMED);
2616
2617 /*
2618 * The interrupt vector a queue uses is calculated as queue_idx %
2619 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
2620 * in steps of n_intr_cnt to process all queues using this vector.
2621 */
2622 for (qnum = inum;
2623 qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
2624 qnum += nvme->n_intr_cnt) {
2625 while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) {
2626 taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
2627 cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
2628 ccnt++;
2629 }
2630 }
2631
2632 return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
2633 }
2634
2635 static void
2636 nvme_release_interrupts(nvme_t *nvme)
3358 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
3359 {
3360 #ifndef __lock_lint
3361 _NOTE(ARGUNUSED(cred_p));
3362 #endif
3363 minor_t minor = getminor(*devp);
3364 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
3365 int nsid = NVME_MINOR_NSID(minor);
3366 nvme_minor_state_t *nm;
3367 int rv = 0;
3368
3369 if (otyp != OTYP_CHR)
3370 return (EINVAL);
3371
3372 if (nvme == NULL)
3373 return (ENXIO);
3374
3375 if (nsid > nvme->n_namespace_count)
3376 return (ENXIO);
3377
3378 nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;
3379
3380 mutex_enter(&nm->nm_mutex);
3381 if (nm->nm_oexcl) {
3382 rv = EBUSY;
3383 goto out;
3384 }
3385
3386 if (flag & FEXCL) {
3387 if (nm->nm_ocnt != 0) {
3388 rv = EBUSY;
3389 goto out;
3390 }
3391 nm->nm_oexcl = B_TRUE;
3392 }
3393
3394 nm->nm_ocnt++;
3395
3396 out:
3397 mutex_exit(&nm->nm_mutex);
3430 nm->nm_ocnt--;
3431 mutex_exit(&nm->nm_mutex);
3432
3433 return (0);
3434 }
3435
3436 static int
3437 nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
3438 cred_t *cred_p)
3439 {
3440 _NOTE(ARGUNUSED(cred_p));
3441 int rv = 0;
3442 void *idctl;
3443
3444 if ((mode & FREAD) == 0)
3445 return (EPERM);
3446
3447 if (nioc->n_len < NVME_IDENTIFY_BUFSIZE)
3448 return (EINVAL);
3449
3450 idctl = nvme_identify(nvme, nsid);
3451 if (idctl == NULL)
3452 return (EIO);
3453
3454 if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode)
3455 != 0)
3456 rv = EFAULT;
3457
3458 kmem_free(idctl, NVME_IDENTIFY_BUFSIZE);
3459
3460 return (rv);
3461 }
3462
3463 static int
3464 nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
3465 int mode, cred_t *cred_p)
3466 {
3467 _NOTE(ARGUNUSED(nsid, cred_p));
3468 int rv = 0;
3469 nvme_reg_cap_t cap = { 0 };
3470 nvme_capabilities_t nc;
3471
3472 if ((mode & FREAD) == 0)
3599 return (EINVAL);
3600
3601 if (!nvme->n_write_cache_present)
3602 return (EINVAL);
3603
3604 break;
3605
3606 case NVME_FEAT_AUTO_PST:
3607 if (nsid != 0)
3608 return (EINVAL);
3609
3610 if (!nvme->n_auto_pst_supported)
3611 return (EINVAL);
3612
3613 break;
3614
3615 default:
3616 return (EINVAL);
3617 }
3618
3619 if (nvme_get_features(nvme, nsid, feature, &res, &buf, &bufsize) ==
3620 B_FALSE)
3621 return (EIO);
3622
3623 if (nioc->n_len < bufsize) {
3624 kmem_free(buf, bufsize);
3625 return (EINVAL);
3626 }
3627
3628 if (buf && ddi_copyout(buf, (void*)nioc->n_buf, bufsize, mode) != 0)
3629 rv = EFAULT;
3630
3631 kmem_free(buf, bufsize);
3632 nioc->n_arg = res;
3633 nioc->n_len = bufsize;
3634
3635 return (rv);
3636 }
3637
3638 static int
3639 nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
3640 cred_t *cred_p)
3641 {
3818 case DDI_MODEL_ILP32: {
3819 nvme_ioctl32_t nioc32;
3820 if (ddi_copyin((void*)arg, &nioc32, sizeof (nvme_ioctl32_t),
3821 mode) != 0)
3822 return (EFAULT);
3823 nioc.n_len = nioc32.n_len;
3824 nioc.n_buf = nioc32.n_buf;
3825 nioc.n_arg = nioc32.n_arg;
3826 break;
3827 }
3828 case DDI_MODEL_NONE:
3829 #endif
3830 if (ddi_copyin((void*)arg, &nioc, sizeof (nvme_ioctl_t), mode)
3831 != 0)
3832 return (EFAULT);
3833 #ifdef _MULTI_DATAMODEL
3834 break;
3835 }
3836 #endif
3837
3838 if (cmd == NVME_IOC_IDENTIFY_CTRL) {
3839 /*
3840 * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and
3841 * attachment point nodes.
3842 */
3843 nsid = 0;
3844 } else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) {
3845 /*
3846 * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node; it
3847 * will always return identify data for namespace 1.
3848 */
3849 nsid = 1;
3850 }
3851
3852 if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL)
3853 rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode,
3854 cred_p);
3855 else
3856 rv = EINVAL;
3857
|
39 * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
40 * the admin queue will share an interrupt with one I/O queue. The interrupt
41 * handler will retrieve completed commands from all queues sharing an interrupt
42 * vector and will post them to a taskq for completion processing.
43 *
44 *
45 * Command Processing:
46 *
47 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
48 * to 65536 I/O commands. The driver will configure one I/O queue pair per
49 * available interrupt vector, with the queue length usually much smaller than
50 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
51 * interrupt vectors will be used.
52 *
53 * Additionally the hardware provides a single special admin queue pair that can
54 * hold up to 4096 admin commands.
55 *
56 * From the hardware perspective both queues of a queue pair are independent,
57 * but they share some driver state: the command array (holding pointers to
58 * commands currently being processed by the hardware) and the active command
59 * counter. Access to a queue pair and the shared state is protected by
60 * nq_mutex.
61 *
62 * When a command is submitted to a queue pair the active command counter is
63 * incremented and a pointer to the command is stored in the command array. The
64 * array index is used as command identifier (CID) in the submission queue
65 * entry. Some commands may take a very long time to complete, and if the queue
66 * wraps around in that time a submission may find the next array slot to still
67 * be used by a long-running command. In this case the array is sequentially
68 * searched for the next free slot. The length of the command array is the same
69 * as the configured queue length. Queue overrun is prevented by the semaphore,
70 * so a command submission may block if the queue is full.
71 *
72 *
73 * Polled I/O Support:
74 *
75 * For kernel core dump support the driver can do polled I/O. As interrupts are
76 * turned off while dumping, the driver will just submit a command in the regular
77 * way, and then repeatedly attempt a command retrieval until it gets the
78 * command back.
79 *
80 *
129 * Error handling is currently limited to detecting fatal hardware errors,
130 * either by asynchronous events, or synchronously through command status or
131 * admin command timeouts. In case of severe errors the device is fenced off;
132 * all further requests will return EIO. FMA is then called to fault the device.
133 *
134 * The hardware has a limit for outstanding asynchronous event requests. Before
135 * this limit is known the driver assumes it is at least 1 and posts a single
136 * asynchronous request. Later when the limit is known more asynchronous event
137 * requests are posted to allow quicker reception of error information. When an
138 * asynchronous event is posted by the hardware the driver will parse the error
139 * status fields and log information or fault the device, depending on the
140 * severity of the asynchronous event. The asynchronous event request is then
141 * reused and posted to the admin queue again.
142 *
143 * On command completion the command status is checked for errors. In case of
144 * errors indicating a driver bug the driver panics. Almost all other error
145 * status values just cause EIO to be returned.
146 *
147 * Command timeouts are currently detected for all admin commands except
148 * asynchronous event requests. If a command times out and the hardware appears
149 * to be healthy the driver attempts to abort the command. The original command
150 * timeout is also applied to the abort command. If the abort times out too, the
151 * driver assumes the device to be dead, fences it off, and calls FMA to retire
152 * it. In all other cases the aborted command should return immediately with a
153 * status indicating it was aborted, and the driver will wait indefinitely for
154 * that to happen. No timeout handling of normal I/O commands is presently done.
155 *
156 * Any command that times out due to the controller dropping dead will be put
157 * on the nvme_lost_cmds list if it references DMA memory. This will prevent
158 * the DMA memory from being reused by the system and later being written to
159 * by a "dead" NVMe controller.
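 *
 * A sketch of how such a command ends up on that list (simplified; the actual
 * bookkeeping is done by the timeout handling code):
 *
 *	if (cmd->nc_dma != NULL) {
 *		mutex_enter(&nvme_lc_mutex);
 *		list_insert_head(&nvme_lost_cmds, cmd);
 *		mutex_exit(&nvme_lc_mutex);
 *	}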
160 *
161 *
162 * Locking:
163 *
164 * Each queue pair has its own nq_mutex, which must be held when accessing the
165 * associated queue registers or the shared state of the queue pair. Callers of
166 * nvme_unqueue_cmd() must make sure that nq_mutex is held, while
167 * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of this
168 * themselves.
169 *
170 * Each command also has its own nc_mutex, which is associated with the
171 * condition variable nc_cv. It is only used on admin commands which are run
172 * synchronously. In that case it must be held across calls to
173 * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
174 * nvme_admin_cmd(). It must also be held whenever the completion state of the
175 * command is changed or while an admin command timeout is handled.
176 *
177 * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
178 * More than one nc_mutex may only be held when aborting commands. In this case,
179 * the nc_mutex of the command to be aborted must be held across the call to
180 * nvme_abort_cmd() to prevent the command from completing while the abort is in
181 * progress.
182 *
183 * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
184 * and exclusive-open flag nm_oexcl.
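 *
 * To illustrate the ordering rule, a (hypothetical) caller aborting a
 * timed-out command would do something like the following; nvme_abort_cmd()
 * acquires nq_mutex internally when it submits the ABORT admin command, while
 * nc_mutex of the victim stays held:
 *
 *	mutex_enter(&cmd->nc_mutex);
 *	if (!cmd->nc_completed)
 *		(void) nvme_abort_cmd(cmd, nvme_admin_cmd_timeout);
 *	mutex_exit(&cmd->nc_mutex);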
185 *
186 *
187 * Quiesce / Fast Reboot:
188 *
189 * The driver currently does not support fast reboot. A quiesce(9E) entry point
190 * is still provided which is used to send a shutdown notification to the
191 * device.
192 *
193 *
194 * Driver Configuration:
195 *
196 * The following driver properties can be changed to control some aspects of the
197 * driver's operation:
198 * - strict-version: can be set to 0 to allow devices conforming to newer
199 * versions or namespaces with EUI64 to be used
200 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
201 * specific command status as a fatal error leading to device faulting
202 * - admin-queue-len: the maximum length of the admin queue (16-4096)
203 * - io-queue-len: the maximum length of the I/O queues (16-65536)
204 * - async-event-limit: the maximum number of asynchronous event requests to be
205 * posted by the driver
206 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
230 #endif
231
232 #include <sys/modctl.h>
233 #include <sys/conf.h>
234 #include <sys/devops.h>
235 #include <sys/ddi.h>
236 #include <sys/sunddi.h>
237 #include <sys/sunndi.h>
238 #include <sys/bitmap.h>
239 #include <sys/sysmacros.h>
240 #include <sys/param.h>
241 #include <sys/varargs.h>
242 #include <sys/cpuvar.h>
243 #include <sys/disp.h>
244 #include <sys/blkdev.h>
245 #include <sys/atomic.h>
246 #include <sys/archsystm.h>
247 #include <sys/sata/sata_hba.h>
248 #include <sys/stat.h>
249 #include <sys/policy.h>
250 #include <sys/list.h>
251
252 #include <sys/nvme.h>
253
254 #ifdef __x86
255 #include <sys/x86_archext.h>
256 #endif
257
258 #include "nvme_reg.h"
259 #include "nvme_var.h"
260
261
262 /* NVMe spec version supported */
263 static const int nvme_version_major = 1;
264 static const int nvme_version_minor = 2;
265
266 /* tunable for admin command timeout in seconds, default is 1s */
267 int nvme_admin_cmd_timeout = 1;
268
269 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
270 int nvme_format_cmd_timeout = 600;
271
272 static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
273 static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
274 static int nvme_quiesce(dev_info_t *);
275 static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
276 static int nvme_setup_interrupts(nvme_t *, int, int);
277 static void nvme_release_interrupts(nvme_t *);
278 static uint_t nvme_intr(caddr_t, caddr_t);
279
280 static void nvme_shutdown(nvme_t *, int, boolean_t);
281 static boolean_t nvme_reset(nvme_t *, boolean_t);
282 static int nvme_init(nvme_t *);
283 static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
284 static void nvme_free_cmd(nvme_cmd_t *);
285 static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
286 bd_xfer_t *);
287 static void nvme_admin_cmd(nvme_cmd_t *, int);
288 static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
289 static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
290 static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
291 static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
292 static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
293 static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
294 static void nvme_wakeup_cmd(void *);
295 static void nvme_async_event_task(void *);
296
297 static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
298 static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
299 static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
300 static int nvme_check_specific_cmd_status(nvme_cmd_t *);
301 static int nvme_check_generic_cmd_status(nvme_cmd_t *);
302 static inline int nvme_check_cmd_status(nvme_cmd_t *);
303
304 static int nvme_abort_cmd(nvme_cmd_t *, uint_t);
305 static void nvme_async_event(nvme_t *);
306 static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t,
307 boolean_t, uint8_t);
308 static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...);
309 static int nvme_identify(nvme_t *, uint32_t, void **);
310 static int nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
311 uint32_t *);
312 static int nvme_get_features(nvme_t *, uint32_t, uint8_t, uint32_t *,
313 void **, size_t *);
314 static int nvme_write_cache_set(nvme_t *, boolean_t);
315 static int nvme_set_nqueues(nvme_t *, uint16_t *);
316
317 static void nvme_free_dma(nvme_dma_t *);
318 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
319 nvme_dma_t **);
320 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
321 nvme_dma_t **);
322 static void nvme_free_qpair(nvme_qpair_t *);
323 static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, int);
324 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);
325
326 static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
327 static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
328 static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
329 static inline uint32_t nvme_get32(nvme_t *, uintptr_t);
330
331 static boolean_t nvme_check_regs_hdl(nvme_t *);
332 static boolean_t nvme_check_dma_hdl(nvme_dma_t *);
333
334 static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *);
335
472 .drv_modops = &mod_driverops,
473 .drv_linkinfo = "NVMe v1.1b",
474 .drv_dev_ops = &nvme_dev_ops
475 };
476
477 static struct modlinkage nvme_modlinkage = {
478 .ml_rev = MODREV_1,
479 .ml_linkage = { &nvme_modldrv, NULL }
480 };
481
482 static bd_ops_t nvme_bd_ops = {
483 .o_version = BD_OPS_VERSION_0,
484 .o_drive_info = nvme_bd_driveinfo,
485 .o_media_info = nvme_bd_mediainfo,
486 .o_devid_init = nvme_bd_devid,
487 .o_sync_cache = nvme_bd_sync,
488 .o_read = nvme_bd_read,
489 .o_write = nvme_bd_write,
490 };
491
492 /*
493 * This list will hold commands that have timed out and couldn't be aborted.
494 * As we don't know what the hardware may still do with the DMA memory we can't
495 * free them, so we'll keep them forever on this list where we can easily look
496 * at them with mdb.
497 */
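/*
 * For example, assuming the generic "list" walker from genunix, the lost
 * commands could be examined from mdb roughly like this (a sketch only):
 *
 *	> nvme_lost_cmds::walk list | ::print nvme_cmd_t nc_sqid nc_sqe.sqe_opc
 */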
498 static struct list nvme_lost_cmds;
499 static kmutex_t nvme_lc_mutex;
500
501 int
502 _init(void)
503 {
504 int error;
505
506 error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
507 if (error != DDI_SUCCESS)
508 return (error);
509
510 nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
511 sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
512
513 mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
514 list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
515 offsetof(nvme_cmd_t, nc_list));
516
517 bd_mod_init(&nvme_dev_ops);
518
519 error = mod_install(&nvme_modlinkage);
520 if (error != DDI_SUCCESS) {
521 ddi_soft_state_fini(&nvme_state);
522 mutex_destroy(&nvme_lc_mutex);
523 list_destroy(&nvme_lost_cmds);
524 bd_mod_fini(&nvme_dev_ops);
525 }
526
527 return (error);
528 }
529
530 int
531 _fini(void)
532 {
533 int error;
534
535 if (!list_is_empty(&nvme_lost_cmds))
536 return (DDI_FAILURE);
537
538 error = mod_remove(&nvme_modlinkage);
539 if (error == DDI_SUCCESS) {
540 ddi_soft_state_fini(&nvme_state);
541 kmem_cache_destroy(nvme_cmd_cache);
542 mutex_destroy(&nvme_lc_mutex);
543 list_destroy(&nvme_lost_cmds);
544 bd_mod_fini(&nvme_dev_ops);
545 }
546
547 return (error);
548 }
549
550 int
551 _info(struct modinfo *modinfop)
552 {
553 return (mod_info(&nvme_modlinkage, modinfop));
554 }
555
556 static inline void
557 nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
558 {
559 ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);
560
561 /*LINTED: E_BAD_PTR_CAST_ALIGN*/
562 ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
563 }
833 {
834 nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);
835
836 if (cmd == NULL)
837 return (cmd);
838
839 bzero(cmd, sizeof (nvme_cmd_t));
840
841 cmd->nc_nvme = nvme;
842
843 mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
844 DDI_INTR_PRI(nvme->n_intr_pri));
845 cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);
846
847 return (cmd);
848 }
849
850 static void
851 nvme_free_cmd(nvme_cmd_t *cmd)
852 {
853 /* Don't free commands on the lost commands list. */
854 if (list_link_active(&cmd->nc_list))
855 return;
856
857 if (cmd->nc_dma) {
858 if (cmd->nc_dma->nd_cached)
859 kmem_cache_free(cmd->nc_nvme->n_prp_cache,
860 cmd->nc_dma);
861 else
862 nvme_free_dma(cmd->nc_dma);
863 cmd->nc_dma = NULL;
864 }
865
866 cv_destroy(&cmd->nc_cv);
867 mutex_destroy(&cmd->nc_mutex);
868
869 kmem_cache_free(nvme_cmd_cache, cmd);
870 }
871
872 static void
873 nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
874 {
875 sema_p(&qp->nq_sema);
876 nvme_submit_cmd_common(qp, cmd);
903 while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
904 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
905 qp->nq_cmd[qp->nq_next_cmd] = cmd;
906
907 qp->nq_active_cmds++;
908
909 cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
910 bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
911 (void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
912 sizeof (nvme_sqe_t) * qp->nq_sqtail,
913 sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
914 qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
915
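	/*
	 * Advance the submission queue tail and write it to the doorbell
	 * register to notify the controller of the new entry.
	 */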
916 tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
917 nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);
918
919 mutex_exit(&qp->nq_mutex);
920 }
921
922 static nvme_cmd_t *
923 nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
924 {
925 nvme_cmd_t *cmd;
926
927 ASSERT(mutex_owned(&qp->nq_mutex));
928 ASSERT3S(cid, <, qp->nq_nentry);
929
930 cmd = qp->nq_cmd[cid];
931 qp->nq_cmd[cid] = NULL;
932 ASSERT3U(qp->nq_active_cmds, >, 0);
933 qp->nq_active_cmds--;
934 sema_v(&qp->nq_sema);
935
936 ASSERT3P(cmd, !=, NULL);
937 ASSERT3P(cmd->nc_nvme, ==, nvme);
938 ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid);
939
940 return (cmd);
941 }
942
943 static nvme_cmd_t *
944 nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
945 {
946 nvme_reg_cqhdbl_t head = { 0 };
947
948 nvme_cqe_t *cqe;
949 nvme_cmd_t *cmd;
950
951 (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
952 sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
953
954 mutex_enter(&qp->nq_mutex);
955 cqe = &qp->nq_cq[qp->nq_cqhead];
956
957 /* Check phase tag of CQE. Hardware inverts it for new entries. */
958 if (cqe->cqe_sf.sf_p == qp->nq_phase) {
959 mutex_exit(&qp->nq_mutex);
960 return (NULL);
961 }
962
963 ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
964
965 cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
966
967 ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
968 bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
969
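	/* The CQE carries the current submission queue head pointer. */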
970 qp->nq_sqhead = cqe->cqe_sqhd;
971
972 head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
973
974 /* Toggle phase on wrap-around. */
975 if (qp->nq_cqhead == 0)
976 qp->nq_phase = qp->nq_phase ? 0 : 1;
977
978 nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
979 mutex_exit(&qp->nq_mutex);
980
981 return (cmd);
982 }
983
984 static int
985 nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
986 {
987 nvme_cqe_t *cqe = &cmd->nc_cqe;
988
989 dev_err(cmd->nc_nvme->n_dip, CE_WARN,
990 "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
991 "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
992 cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
993 cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);
994
995 if (cmd->nc_xfer != NULL)
996 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
997
998 if (cmd->nc_nvme->n_strict_version) {
999 cmd->nc_nvme->n_dead = B_TRUE;
1244 return (EINVAL);
1245
1246 case NVME_CQE_SC_SPC_NVM_READONLY:
1247 /* Write to Read Only Range */
1248 ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
1249 atomic_inc_32(&cmd->nc_nvme->n_readonly);
1250 if (cmd->nc_xfer != NULL)
1251 bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
1252 return (EROFS);
1253
1254 default:
1255 return (nvme_check_unknown_cmd_status(cmd));
1256 }
1257 }
1258
1259 static inline int
1260 nvme_check_cmd_status(nvme_cmd_t *cmd)
1261 {
1262 nvme_cqe_t *cqe = &cmd->nc_cqe;
1263
1264 /*
1265 * Take a shortcut if the controller is dead, or if
1266 * command status indicates no error.
1267 */
1268 if (cmd->nc_nvme->n_dead)
1269 return (EIO);
1270
1271 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1272 cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
1273 return (0);
1274
1275 if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
1276 return (nvme_check_generic_cmd_status(cmd));
1277 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
1278 return (nvme_check_specific_cmd_status(cmd));
1279 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
1280 return (nvme_check_integrity_cmd_status(cmd));
1281 else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
1282 return (nvme_check_vendor_cmd_status(cmd));
1283
1284 return (nvme_check_unknown_cmd_status(cmd));
1285 }
1286
1287 static int
1288 nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec)
1289 {
1290 nvme_t *nvme = abort_cmd->nc_nvme;
1291 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1292 nvme_abort_cmd_t ac = { 0 };
1293 int ret = 0;
1294
1295 sema_p(&nvme->n_abort_sema);
1296
1297 ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
1298 ac.b.ac_sqid = abort_cmd->nc_sqid;
1299
1300 cmd->nc_sqid = 0;
1301 cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
1302 cmd->nc_callback = nvme_wakeup_cmd;
1303 cmd->nc_sqe.sqe_cdw10 = ac.r;
1304
1305 /*
1306 * Send the ABORT to the hardware. The ABORT command will return _after_
1307 * the aborted command has completed (aborted or otherwise), but since
1308 * we still hold the aborted command's mutex its callback hasn't been
1309 * processed yet.
1310 */
1311 nvme_admin_cmd(cmd, sec);
1312 sema_v(&nvme->n_abort_sema);
1313
1314 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1315 dev_err(nvme->n_dip, CE_WARN,
1316 "!ABORT failed with sct = %x, sc = %x",
1317 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1318 atomic_inc_32(&nvme->n_abort_failed);
1319 } else {
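		/*
		 * Dword 0 bit 0 of the ABORT completion is set if the command
		 * was not aborted.
		 */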
1320 dev_err(nvme->n_dip, CE_WARN,
1321 "!ABORT of command %d/%d %ssuccessful",
1322 abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid,
1323 cmd->nc_cqe.cqe_dw0 & 1 ? "un" : "");
1324 if ((cmd->nc_cqe.cqe_dw0 & 1) == 0)
1325 atomic_inc_32(&nvme->n_cmd_aborted);
1326 }
1327
1328 nvme_free_cmd(cmd);
1329 return (ret);
1330 }
1331
1332 /*
1333 * nvme_wait_cmd -- wait for command completion or timeout
1334 *
1335 * In case of a serious error or a timeout of the abort command the hardware
1336 * will be declared dead and FMA will be notified.
1337 */
1338 static void
1339 nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec)
1340 {
1341 clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC);
1342 nvme_t *nvme = cmd->nc_nvme;
1343 nvme_reg_csts_t csts;
1344 nvme_qpair_t *qp;
1345
1346 ASSERT(mutex_owned(&cmd->nc_mutex));
1347
1348 while (!cmd->nc_completed) {
1349 if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)
1350 break;
1351 }
1352
1353 if (cmd->nc_completed)
1354 return;
1355
1356 /*
1357 * The command timed out.
1358 *
1359 * Check controller for fatal status, any errors associated with the
1360 * register or DMA handle, or for a double timeout (abort command timed
1361 * out). If necessary log a warning and call FMA.
1362 */
1363 csts.r = nvme_get32(nvme, NVME_REG_CSTS);
1364 dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, "
1365 "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid,
1366 cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
1367 atomic_inc_32(&nvme->n_cmd_timeout);
1368
1369 if (csts.b.csts_cfs ||
1370 nvme_check_regs_hdl(nvme) ||
1371 nvme_check_dma_hdl(cmd->nc_dma) ||
1372 cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
1373 ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
1374 nvme->n_dead = B_TRUE;
1375 } else if (nvme_abort_cmd(cmd, sec) == 0) {
1376 /*
1377 * If the abort succeeded the command should complete
1378 * immediately with an appropriate status.
1379 */
1380 while (!cmd->nc_completed)
1381 cv_wait(&cmd->nc_cv, &cmd->nc_mutex);
1382
1383 return;
1384 }
1385
1386 qp = nvme->n_ioq[cmd->nc_sqid];
1387
1388 mutex_enter(&qp->nq_mutex);
1389 (void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
1390 mutex_exit(&qp->nq_mutex);
1391
1392 /*
1393 * As we don't know what the presumed dead hardware might still do with
1394 * the DMA memory, we'll put the command on the lost commands list if it
1395 * has any DMA memory.
1396 */
1397 if (cmd->nc_dma != NULL) {
1398 mutex_enter(&nvme_lc_mutex);
1399 list_insert_head(&nvme_lost_cmds, cmd);
1400 mutex_exit(&nvme_lc_mutex);
1401 }
1402 }
1403
1404 static void
1405 nvme_wakeup_cmd(void *arg)
1406 {
1407 nvme_cmd_t *cmd = arg;
1408
1409 mutex_enter(&cmd->nc_mutex);
1410 cmd->nc_completed = B_TRUE;
1411 cv_signal(&cmd->nc_cv);
1412 mutex_exit(&cmd->nc_mutex);
1413 }
1414
1415 static void
1416 nvme_async_event_task(void *arg)
1417 {
1418 nvme_cmd_t *cmd = arg;
1419 nvme_t *nvme = cmd->nc_nvme;
1420 nvme_error_log_entry_t *error_log = NULL;
1421 nvme_health_log_t *health_log = NULL;
1422 size_t logsize = 0;
1423 nvme_async_event_t event;
1424
1425 /*
1426 * Check for errors associated with the async request itself. The only
1427 * command-specific error is "async event limit exceeded", which
1428 * indicates a programming error in the driver and causes a panic in
1429 * nvme_check_cmd_status().
1430 *
1431 * Other possible errors are various scenarios where the async request
1432 * was aborted, or internal errors in the device. Internal errors are
1433 * reported to FMA, the command aborts need no special handling here.
1434 */
1435 if (nvme_check_cmd_status(cmd) != 0) {
1436 dev_err(cmd->nc_nvme->n_dip, CE_WARN,
1437 "!async event request returned failure, sct = %x, "
1438 "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
1439 cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
1440 cmd->nc_cqe.cqe_sf.sf_m);
1441
1442 if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1443 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
1444 cmd->nc_nvme->n_dead = B_TRUE;
1445 ddi_fm_service_impact(cmd->nc_nvme->n_dip,
1446 DDI_SERVICE_LOST);
1447 }
1448 nvme_free_cmd(cmd);
1449 return;
1450 }
1451
1452
1453 event.r = cmd->nc_cqe.cqe_dw0;
1454
1455 /* Clear CQE and re-submit the async request. */
1547 "received, info = %x, logpage = %x", event.b.ae_info,
1548 event.b.ae_logpage);
1549 atomic_inc_32(&nvme->n_vendor_event);
1550 break;
1551
1552 default:
1553 dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
1554 "type = %x, info = %x, logpage = %x", event.b.ae_type,
1555 event.b.ae_info, event.b.ae_logpage);
1556 atomic_inc_32(&nvme->n_unknown_event);
1557 break;
1558 }
1559
1560 if (error_log)
1561 kmem_free(error_log, logsize);
1562
1563 if (health_log)
1564 kmem_free(health_log, logsize);
1565 }
1566
1567 static void
1568 nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
1569 {
1570 mutex_enter(&cmd->nc_mutex);
1571 nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
1572 nvme_wait_cmd(cmd, sec);
1573 mutex_exit(&cmd->nc_mutex);
1574 }
1575
1576 static void
1577 nvme_async_event(nvme_t *nvme)
1578 {
1579 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1580
1581 cmd->nc_sqid = 0;
1582 cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
1583 cmd->nc_callback = nvme_async_event_task;
1584
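	/*
	 * Asynchronous event requests only complete when an event occurs, so
	 * submit without waiting; nvme_async_event_task() runs on completion.
	 */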
1585 nvme_submit_admin_cmd(nvme->n_adminq, cmd);
1586 }
1587
1588 static int
1589 nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms,
1590 uint8_t pi, boolean_t pil, uint8_t ses)
1591 {
1592 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1593 nvme_format_nvm_t format_nvm = { 0 };
1594 	int ret;
1595 
1596 format_nvm.b.fm_lbaf = lbaf & 0xf;
1597 format_nvm.b.fm_ms = ms ? 1 : 0;
1598 format_nvm.b.fm_pi = pi & 0x7;
1599 format_nvm.b.fm_pil = pil ? 1 : 0;
1600 format_nvm.b.fm_ses = ses & 0x7;
1601
1602 cmd->nc_sqid = 0;
1603 cmd->nc_callback = nvme_wakeup_cmd;
1604 cmd->nc_sqe.sqe_nsid = nsid;
1605 cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;
1606 cmd->nc_sqe.sqe_cdw10 = format_nvm.r;
1607
1608 /*
1609 * Some devices like Samsung SM951 don't allow formatting of all
1610 * namespaces in one command. Handle that gracefully.
1611 */
1612 if (nsid == (uint32_t)-1)
1613 cmd->nc_dontpanic = B_TRUE;
1614
1615 nvme_admin_cmd(cmd, nvme_format_cmd_timeout);
1616
1617 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1618 dev_err(nvme->n_dip, CE_WARN,
1619 "!FORMAT failed with sct = %x, sc = %x",
1620 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1621 }
1622
1623 nvme_free_cmd(cmd);
1624 return (ret);
1625 }
1626
1627 static int
1628 nvme_get_logpage(nvme_t *nvme, void **buf, size_t *bufsize, uint8_t logpage,
1629 ...)
1630 {
1631 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1632 nvme_getlogpage_t getlogpage = { 0 };
1633 va_list ap;
1634 int ret;
1635
1636 va_start(ap, logpage);
1637
1638 cmd->nc_sqid = 0;
1639 cmd->nc_callback = nvme_wakeup_cmd;
1640 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;
1641
1642 getlogpage.b.lp_lid = logpage;
1643
1644 switch (logpage) {
1645 case NVME_LOGPAGE_ERROR:
1646 cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
1647 /*
1648 * The GET LOG PAGE command can use at most 2 pages to return
1649 		 * data; PRP lists are not supported.
1650 */
1651 *bufsize = MIN(2 * nvme->n_pagesize,
1652 nvme->n_error_log_len * sizeof (nvme_error_log_entry_t));
1653 break;
1654
1655 case NVME_LOGPAGE_HEALTH:
1656 cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t);
1657 *bufsize = sizeof (nvme_health_log_t);
1658 break;
1659
1660 case NVME_LOGPAGE_FWSLOT:
1661 cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
1662 *bufsize = sizeof (nvme_fwslot_log_t);
1663 break;
1664
1665 default:
1666 dev_err(nvme->n_dip, CE_WARN, "!unknown log page requested: %d",
1667 logpage);
1668 atomic_inc_32(&nvme->n_unknown_logpage);
1669 ret = EINVAL;
1670 goto fail;
1671 }
1672
1673 va_end(ap);
1674
1675 getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1;
1676
1677 cmd->nc_sqe.sqe_cdw10 = getlogpage.r;
1678
1679 if (nvme_zalloc_dma(nvme, getlogpage.b.lp_numd * sizeof (uint32_t),
1680 DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
1681 dev_err(nvme->n_dip, CE_WARN,
1682 "!nvme_zalloc_dma failed for GET LOG PAGE");
1683 ret = ENOMEM;
1684 goto fail;
1685 }
1686
1687 if (cmd->nc_dma->nd_ncookie > 2) {
1688 dev_err(nvme->n_dip, CE_WARN,
1689 "!too many DMA cookies for GET LOG PAGE");
1690 atomic_inc_32(&nvme->n_too_many_cookies);
1691 ret = ENOMEM;
1692 goto fail;
1693 }
1694
1695 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
1696 if (cmd->nc_dma->nd_ncookie > 1) {
1697 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
1698 &cmd->nc_dma->nd_cookie);
1699 cmd->nc_sqe.sqe_dptr.d_prp[1] =
1700 cmd->nc_dma->nd_cookie.dmac_laddress;
1701 }
1702
1703 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
1704
1705 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1706 dev_err(nvme->n_dip, CE_WARN,
1707 "!GET LOG PAGE failed with sct = %x, sc = %x",
1708 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1709 goto fail;
1710 }
1711
1712 *buf = kmem_alloc(*bufsize, KM_SLEEP);
1713 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
1714
1715 fail:
1716 nvme_free_cmd(cmd);
1717
1718 return (ret);
1719 }
1720
1721 static int
1722 nvme_identify(nvme_t *nvme, uint32_t nsid, void **buf)
1723 {
1724 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1725 int ret;
1726
1727 if (buf == NULL)
1728 return (EINVAL);
1729
1730 cmd->nc_sqid = 0;
1731 cmd->nc_callback = nvme_wakeup_cmd;
1732 cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
1733 cmd->nc_sqe.sqe_nsid = nsid;
1734 cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL;
1735
1736 if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
1737 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
1738 dev_err(nvme->n_dip, CE_WARN,
1739 "!nvme_zalloc_dma failed for IDENTIFY");
1740 ret = ENOMEM;
1741 goto fail;
1742 }
1743
1744 if (cmd->nc_dma->nd_ncookie > 2) {
1745 dev_err(nvme->n_dip, CE_WARN,
1746 "!too many DMA cookies for IDENTIFY");
1747 atomic_inc_32(&nvme->n_too_many_cookies);
1748 ret = ENOMEM;
1749 goto fail;
1750 }
1751
1752 cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
1753 if (cmd->nc_dma->nd_ncookie > 1) {
1754 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
1755 &cmd->nc_dma->nd_cookie);
1756 cmd->nc_sqe.sqe_dptr.d_prp[1] =
1757 cmd->nc_dma->nd_cookie.dmac_laddress;
1758 }
1759
1760 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
1761
1762 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1763 dev_err(nvme->n_dip, CE_WARN,
1764 "!IDENTIFY failed with sct = %x, sc = %x",
1765 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
1766 goto fail;
1767 }
1768
1769 *buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
1770 bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE);
1771
1772 fail:
1773 nvme_free_cmd(cmd);
1774
1775 return (ret);
1776 }
1777
1778 static int
1779 nvme_set_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t val,
1780 uint32_t *res)
1781 {
1782 _NOTE(ARGUNUSED(nsid));
1783 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1784 int ret = EINVAL;
1785
1786 ASSERT(res != NULL);
1787
1788 cmd->nc_sqid = 0;
1789 cmd->nc_callback = nvme_wakeup_cmd;
1790 cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
1791 cmd->nc_sqe.sqe_cdw10 = feature;
1792 cmd->nc_sqe.sqe_cdw11 = val;
1793
1794 switch (feature) {
1795 case NVME_FEAT_WRITE_CACHE:
1796 if (!nvme->n_write_cache_present)
1797 goto fail;
1798 break;
1799
1800 case NVME_FEAT_NQUEUES:
1801 break;
1802
1803 default:
1804 goto fail;
1805 }
1806
1807 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
1808
1809 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1810 dev_err(nvme->n_dip, CE_WARN,
1811 "!SET FEATURES %d failed with sct = %x, sc = %x",
1812 feature, cmd->nc_cqe.cqe_sf.sf_sct,
1813 cmd->nc_cqe.cqe_sf.sf_sc);
1814 goto fail;
1815 }
1816
1817 *res = cmd->nc_cqe.cqe_dw0;
1818
1819 fail:
1820 nvme_free_cmd(cmd);
1821 return (ret);
1822 }
1823
1824 static int
1825 nvme_get_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t *res,
1826 void **buf, size_t *bufsize)
1827 {
1828 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1829 int ret = EINVAL;
1830
1831 ASSERT(res != NULL);
1832
1833 if (bufsize != NULL)
1834 *bufsize = 0;
1835
1836 cmd->nc_sqid = 0;
1837 cmd->nc_callback = nvme_wakeup_cmd;
1838 cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES;
1839 cmd->nc_sqe.sqe_cdw10 = feature;
1840 cmd->nc_sqe.sqe_cdw11 = *res;
1841
1842 switch (feature) {
1843 case NVME_FEAT_ARBITRATION:
1844 case NVME_FEAT_POWER_MGMT:
1845 case NVME_FEAT_TEMPERATURE:
1846 case NVME_FEAT_ERROR:
1847 case NVME_FEAT_NQUEUES:
1848 case NVME_FEAT_INTR_COAL:
1849 case NVME_FEAT_INTR_VECT:
1875
1876 break;
1877
1878 case NVME_FEAT_AUTO_PST:
1879 if (!nvme->n_auto_pst_supported)
1880 goto fail;
1881
1882 ASSERT(bufsize != NULL);
1883 *bufsize = NVME_AUTO_PST_BUFSIZE;
1884 break;
1885
1886 default:
1887 goto fail;
1888 }
1889
1890 if (bufsize != NULL && *bufsize != 0) {
1891 if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ,
1892 &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
1893 dev_err(nvme->n_dip, CE_WARN,
1894 "!nvme_zalloc_dma failed for GET FEATURES");
1895 ret = ENOMEM;
1896 goto fail;
1897 }
1898
1899 if (cmd->nc_dma->nd_ncookie > 2) {
1900 dev_err(nvme->n_dip, CE_WARN,
1901 "!too many DMA cookies for GET FEATURES");
1902 atomic_inc_32(&nvme->n_too_many_cookies);
1903 ret = ENOMEM;
1904 goto fail;
1905 }
1906
1907 cmd->nc_sqe.sqe_dptr.d_prp[0] =
1908 cmd->nc_dma->nd_cookie.dmac_laddress;
1909 if (cmd->nc_dma->nd_ncookie > 1) {
1910 ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
1911 &cmd->nc_dma->nd_cookie);
1912 cmd->nc_sqe.sqe_dptr.d_prp[1] =
1913 cmd->nc_dma->nd_cookie.dmac_laddress;
1914 }
1915 }
1916
1917 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
1918
1919 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
1920 if (feature == NVME_FEAT_LBA_RANGE &&
1921 cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
1922 cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD)
1923 nvme->n_lba_range_supported = B_FALSE;
1924 else
1925 dev_err(nvme->n_dip, CE_WARN,
1926 "!GET FEATURES %d failed with sct = %x, sc = %x",
1927 feature, cmd->nc_cqe.cqe_sf.sf_sct,
1928 cmd->nc_cqe.cqe_sf.sf_sc);
1929 goto fail;
1930 }
1931
1932 if (bufsize != NULL && *bufsize != 0) {
1933 ASSERT(buf != NULL);
1934 *buf = kmem_alloc(*bufsize, KM_SLEEP);
1935 bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
1936 }
1937
1938 *res = cmd->nc_cqe.cqe_dw0;
1939
1940 fail:
1941 nvme_free_cmd(cmd);
1942 return (ret);
1943 }
1944
1945 static int
1946 nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
1947 {
1948 nvme_write_cache_t nwc = { 0 };
1949
1950 if (enable)
1951 nwc.b.wc_wce = 1;
1952
1953 return (nvme_set_features(nvme, 0, NVME_FEAT_WRITE_CACHE, nwc.r,
1954 &nwc.r));
1955 }
1956
1957 static int
1958 nvme_set_nqueues(nvme_t *nvme, uint16_t *nqueues)
1959 {
1960 nvme_nqueues_t nq = { 0 };
1961 int ret;
1962
1963 nq.b.nq_nsq = nq.b.nq_ncq = *nqueues - 1;
1964
1965 ret = nvme_set_features(nvme, 0, NVME_FEAT_NQUEUES, nq.r, &nq.r);
1966
1967 if (ret == 0) {
1968 /*
1969 * Always use the same number of submission and completion
1970 * queues, and never use more than the requested number of
1971 * queues.
1972 */
1973 *nqueues = MIN(*nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq) + 1);
1974 }
1975
1976 return (ret);
1977 }
1978
1979 static int
1980 nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
1981 {
1982 nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
1983 nvme_create_queue_dw10_t dw10 = { 0 };
1984 nvme_create_cq_dw11_t c_dw11 = { 0 };
1985 nvme_create_sq_dw11_t s_dw11 = { 0 };
1986 int ret;
1987
1988 dw10.b.q_qid = idx;
1989 dw10.b.q_qsize = qp->nq_nentry - 1;
1990
1991 c_dw11.b.cq_pc = 1;
1992 c_dw11.b.cq_ien = 1;
1993 c_dw11.b.cq_iv = idx % nvme->n_intr_cnt;
1994
1995 cmd->nc_sqid = 0;
1996 cmd->nc_callback = nvme_wakeup_cmd;
1997 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
1998 cmd->nc_sqe.sqe_cdw10 = dw10.r;
1999 cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
2000 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_cqdma->nd_cookie.dmac_laddress;
2001
2002 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2003
2004 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2005 dev_err(nvme->n_dip, CE_WARN,
2006 "!CREATE CQUEUE failed with sct = %x, sc = %x",
2007 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2008 goto fail;
2009 }
2010
2011 nvme_free_cmd(cmd);
2012
2013 s_dw11.b.sq_pc = 1;
2014 s_dw11.b.sq_cqid = idx;
2015
2016 cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
2017 cmd->nc_sqid = 0;
2018 cmd->nc_callback = nvme_wakeup_cmd;
2019 cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE;
2020 cmd->nc_sqe.sqe_cdw10 = dw10.r;
2021 cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
2022 cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;
2023
2024 nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
2025
2026 if ((ret = nvme_check_cmd_status(cmd)) != 0) {
2027 dev_err(nvme->n_dip, CE_WARN,
2028 "!CREATE SQUEUE failed with sct = %x, sc = %x",
2029 cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
2030 goto fail;
2031 }
2032
2033 fail:
2034 nvme_free_cmd(cmd);
2035
2036 return (ret);
2037 }
2038
2039 static boolean_t
2040 nvme_reset(nvme_t *nvme, boolean_t quiesce)
2041 {
2042 nvme_reg_csts_t csts;
2043 int i;
2044
2045 nvme_put32(nvme, NVME_REG_CC, 0);
2046
2047 csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2048 if (csts.b.csts_rdy == 1) {
2049 nvme_put32(nvme, NVME_REG_CC, 0);
2050 for (i = 0; i != nvme->n_timeout * 10; i++) {
2051 csts.r = nvme_get32(nvme, NVME_REG_CSTS);
2052 if (csts.b.csts_rdy == 0)
2053 break;
2054
2055 if (quiesce)
2056 drv_usecwait(50000);
2109
2110 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
2111 bcopy(nvme->n_idctl->id_serial, serial,
2112 sizeof (nvme->n_idctl->id_serial));
2113
2114 model[sizeof (nvme->n_idctl->id_model)] = '\0';
2115 serial[sizeof (nvme->n_idctl->id_serial)] = '\0';
2116
2117 nvme->n_ns[nsid - 1].ns_devid = kmem_asprintf("%4X-%s-%s-%X",
2118 nvme->n_idctl->id_vid, model, serial, nsid);
2119 }
2120
2121 static int
2122 nvme_init_ns(nvme_t *nvme, int nsid)
2123 {
2124 nvme_namespace_t *ns = &nvme->n_ns[nsid - 1];
2125 nvme_identify_nsid_t *idns;
2126 int last_rp;
2127
2128 ns->ns_nvme = nvme;
2129
2130 if (nvme_identify(nvme, nsid, (void **)&idns) != 0) {
2131 dev_err(nvme->n_dip, CE_WARN,
2132 "!failed to identify namespace %d", nsid);
2133 return (DDI_FAILURE);
2134 }
2135
2136 ns->ns_idns = idns;
2137 ns->ns_id = nsid;
2138 ns->ns_block_count = idns->id_nsize;
2139 ns->ns_block_size =
2140 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads;
2141 ns->ns_best_block_size = ns->ns_block_size;
2142
2143 /*
2144 * Get the EUI64 if present. Use it for devid and device node names.
2145 */
2146 if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
2147 bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64));
2148
2149 /*LINTED: E_BAD_PTR_CAST_ALIGN*/
2150 if (*(uint64_t *)ns->ns_eui64 != 0) {
2200 nsid, (uint64_t)ns->ns_block_size);
2201 ns->ns_ignore = B_TRUE;
2202 } else {
2203 ns->ns_ignore = B_FALSE;
2204 }
2205
2206 return (DDI_SUCCESS);
2207 }
2208
2209 static int
2210 nvme_init(nvme_t *nvme)
2211 {
2212 nvme_reg_cc_t cc = { 0 };
2213 nvme_reg_aqa_t aqa = { 0 };
2214 nvme_reg_asq_t asq = { 0 };
2215 nvme_reg_acq_t acq = { 0 };
2216 nvme_reg_cap_t cap;
2217 nvme_reg_vs_t vs;
2218 nvme_reg_csts_t csts;
2219 int i = 0;
2220 uint16_t nqueues;
2221 char model[sizeof (nvme->n_idctl->id_model) + 1];
2222 char *vendor, *product;
2223
2224 /* Check controller version */
2225 vs.r = nvme_get32(nvme, NVME_REG_VS);
2226 nvme->n_version.v_major = vs.b.vs_mjr;
2227 nvme->n_version.v_minor = vs.b.vs_mnr;
2228 dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
2229 nvme->n_version.v_major, nvme->n_version.v_minor);
2230
2231 if (NVME_VERSION_HIGHER(&nvme->n_version,
2232 nvme_version_major, nvme_version_minor)) {
2233 dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.%d",
2234 nvme_version_major, nvme_version_minor);
2235 if (nvme->n_strict_version)
2236 goto fail;
2237 }
2238
2239 /* retrieve controller configuration */
2240 cap.r = nvme_get64(nvme, NVME_REG_CAP);
2365 */
2366 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1)
2367 != DDI_SUCCESS) &&
2368 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1)
2369 != DDI_SUCCESS) &&
2370 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
2371 != DDI_SUCCESS)) {
2372 dev_err(nvme->n_dip, CE_WARN,
2373 "!failed to setup initial interrupt");
2374 goto fail;
2375 }
2376
2377 /*
2378 * Post an asynchronous event command to catch errors.
2379 */
2380 nvme_async_event(nvme);
2381
2382 /*
2383 * Identify Controller
2384 */
2385 if (nvme_identify(nvme, 0, (void **)&nvme->n_idctl) != 0) {
2386 dev_err(nvme->n_dip, CE_WARN,
2387 "!failed to identify controller");
2388 goto fail;
2389 }
2390
2391 /*
2392 * Get Vendor & Product ID
2393 */
2394 bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
2395 model[sizeof (nvme->n_idctl->id_model)] = '\0';
2396 sata_split_model(model, &vendor, &product);
2397
2398 if (vendor == NULL)
2399 nvme->n_vendor = strdup("NVMe");
2400 else
2401 nvme->n_vendor = strdup(vendor);
2402
2403 nvme->n_product = strdup(product);
2404
2405 /*
2454 if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) ||
2455 ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) ||
2456 ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) ||
2457 ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t)))
2458 goto fail;
2459
2460 /*
2461 * Check for the presence of a Volatile Write Cache. If present,
2462 * enable or disable based on the value of the property
2463 * volatile-write-cache-enable (default is enabled).
2464 */
2465 nvme->n_write_cache_present =
2466 nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE;
2467
2468 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2469 "volatile-write-cache-present",
2470 nvme->n_write_cache_present ? 1 : 0);
2471
2472 if (!nvme->n_write_cache_present) {
2473 nvme->n_write_cache_enabled = B_FALSE;
2474 } else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)
2475 != 0) {
2476 dev_err(nvme->n_dip, CE_WARN,
2477 "!failed to %sable volatile write cache",
2478 nvme->n_write_cache_enabled ? "en" : "dis");
2479 /*
2480 * Assume the cache is (still) enabled.
2481 */
2482 nvme->n_write_cache_enabled = B_TRUE;
2483 }
2484
2485 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
2486 "volatile-write-cache-enable",
2487 nvme->n_write_cache_enabled ? 1 : 0);
2488
2489 /*
2490 * Assume LBA Range Type feature is supported. If it isn't this
2491 * will be set to B_FALSE by nvme_get_features().
2492 */
2493 nvme->n_lba_range_supported = B_TRUE;
2494
2495 /*
2527 != 0) {
2528 nvme_release_interrupts(nvme);
2529
2530 nqueues = MIN(UINT16_MAX, ncpus);
2531
2532 if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX,
2533 nqueues) != DDI_SUCCESS) &&
2534 (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI,
2535 nqueues) != DDI_SUCCESS)) {
2536 dev_err(nvme->n_dip, CE_WARN,
2537 "!failed to setup MSI/MSI-X interrupts");
2538 goto fail;
2539 }
2540 }
2541
2542 nqueues = nvme->n_intr_cnt;
2543
2544 /*
2545 * Create I/O queue pairs.
2546 */
2547
2548 if (nvme_set_nqueues(nvme, &nqueues) != 0) {
2549 dev_err(nvme->n_dip, CE_WARN,
2550 "!failed to set number of I/O queues to %d",
2551 nvme->n_intr_cnt);
2552 goto fail;
2553 }
2554
2555 /*
2556 * Reallocate I/O queue array
2557 */
2558 kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
2559 nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
2560 (nqueues + 1), KM_SLEEP);
2561 nvme->n_ioq[0] = nvme->n_adminq;
2562
2563 nvme->n_ioq_count = nqueues;
2564
2565 /*
2566 	 * If we got fewer queues than we asked for, we might as well give
2567 * some of the interrupt vectors back to the system.
2568 */
2569 if (nvme->n_ioq_count < nvme->n_intr_cnt) {
2570 nvme_release_interrupts(nvme);
2571
2572 if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
2573 nvme->n_ioq_count) != DDI_SUCCESS) {
2574 dev_err(nvme->n_dip, CE_WARN,
2575 "!failed to reduce number of interrupts");
2576 goto fail;
2577 }
2578 }
2579
2580 /*
2581 * Alloc & register I/O queue pairs
2582 */
2583 nvme->n_io_queue_len =
2584 MIN(nvme->n_io_queue_len, nvme->n_max_queue_entries);
2585 (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-queue-len",
2586 nvme->n_io_queue_len);
2587
2588 for (i = 1; i != nvme->n_ioq_count + 1; i++) {
2589 if (nvme_alloc_qpair(nvme, nvme->n_io_queue_len,
2590 &nvme->n_ioq[i], i) != DDI_SUCCESS) {
2591 dev_err(nvme->n_dip, CE_WARN,
2592 "!unable to allocate I/O qpair %d", i);
2593 goto fail;
2594 }
2595
2596 if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) {
2597 dev_err(nvme->n_dip, CE_WARN,
2598 "!unable to create I/O qpair %d", i);
2599 goto fail;
2600 }
2601 }
2602
2603 /*
2604 	 * Post more asynchronous event commands to reduce event reporting
2605 * latency as suggested by the spec.
2606 */
2607 for (i = 1; i != nvme->n_async_event_limit; i++)
2608 nvme_async_event(nvme);
2609
2610 return (DDI_SUCCESS);
2611
2612 fail:
2613 (void) nvme_reset(nvme, B_FALSE);
2614 return (DDI_FAILURE);
2615 }
2616
2617 static uint_t
2618 nvme_intr(caddr_t arg1, caddr_t arg2)
2619 {
2620 /*LINTED: E_PTR_BAD_CAST_ALIGN*/
2621 nvme_t *nvme = (nvme_t *)arg1;
2622 int inum = (int)(uintptr_t)arg2;
2623 int ccnt = 0;
2624 int qnum;
2625 nvme_cmd_t *cmd;
2626
2627 if (inum >= nvme->n_intr_cnt)
2628 return (DDI_INTR_UNCLAIMED);
2629
2630 if (nvme->n_dead)
2631 return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ?
2632 DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);
2633
2634 /*
2635 * The interrupt vector a queue uses is calculated as queue_idx %
2636 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
2637 * in steps of n_intr_cnt to process all queues using this vector.
2638 */
2639 for (qnum = inum;
2640 qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
2641 qnum += nvme->n_intr_cnt) {
2642 while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) {
2643 taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
2644 cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
2645 ccnt++;
2646 }
2647 }
2648
2649 return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
2650 }
2651
2652 static void
2653 nvme_release_interrupts(nvme_t *nvme)
3375 nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
3376 {
3377 #ifndef __lock_lint
3378 _NOTE(ARGUNUSED(cred_p));
3379 #endif
3380 minor_t minor = getminor(*devp);
3381 nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
3382 int nsid = NVME_MINOR_NSID(minor);
3383 nvme_minor_state_t *nm;
3384 int rv = 0;
3385
3386 if (otyp != OTYP_CHR)
3387 return (EINVAL);
3388
3389 if (nvme == NULL)
3390 return (ENXIO);
3391
3392 if (nsid > nvme->n_namespace_count)
3393 return (ENXIO);
3394
3395 if (nvme->n_dead)
3396 return (EIO);
3397
3398 nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;
3399
3400 mutex_enter(&nm->nm_mutex);
3401 if (nm->nm_oexcl) {
3402 rv = EBUSY;
3403 goto out;
3404 }
3405
3406 if (flag & FEXCL) {
3407 if (nm->nm_ocnt != 0) {
3408 rv = EBUSY;
3409 goto out;
3410 }
3411 nm->nm_oexcl = B_TRUE;
3412 }
3413
3414 nm->nm_ocnt++;
3415
3416 out:
3417 mutex_exit(&nm->nm_mutex);
3450 nm->nm_ocnt--;
3451 mutex_exit(&nm->nm_mutex);
3452
3453 return (0);
3454 }
3455
3456 static int
3457 nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
3458 cred_t *cred_p)
3459 {
3460 _NOTE(ARGUNUSED(cred_p));
3461 int rv = 0;
3462 void *idctl;
3463
3464 if ((mode & FREAD) == 0)
3465 return (EPERM);
3466
3467 if (nioc->n_len < NVME_IDENTIFY_BUFSIZE)
3468 return (EINVAL);
3469
3470 if ((rv = nvme_identify(nvme, nsid, (void **)&idctl)) != 0)
3471 return (rv);
3472
3473 if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode)
3474 != 0)
3475 rv = EFAULT;
3476
3477 kmem_free(idctl, NVME_IDENTIFY_BUFSIZE);
3478
3479 return (rv);
3480 }
3481
3482 static int
3483 nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
3484 int mode, cred_t *cred_p)
3485 {
3486 _NOTE(ARGUNUSED(nsid, cred_p));
3487 int rv = 0;
3488 nvme_reg_cap_t cap = { 0 };
3489 nvme_capabilities_t nc;
3490
3491 if ((mode & FREAD) == 0)
3618 return (EINVAL);
3619
3620 if (!nvme->n_write_cache_present)
3621 return (EINVAL);
3622
3623 break;
3624
3625 case NVME_FEAT_AUTO_PST:
3626 if (nsid != 0)
3627 return (EINVAL);
3628
3629 if (!nvme->n_auto_pst_supported)
3630 return (EINVAL);
3631
3632 break;
3633
3634 default:
3635 return (EINVAL);
3636 }
3637
3638 rv = nvme_get_features(nvme, nsid, feature, &res, &buf, &bufsize);
3639 if (rv != 0)
3640 return (rv);
3641
3642 if (nioc->n_len < bufsize) {
3643 kmem_free(buf, bufsize);
3644 return (EINVAL);
3645 }
3646
3647 if (buf && ddi_copyout(buf, (void*)nioc->n_buf, bufsize, mode) != 0)
3648 rv = EFAULT;
3649
3650 kmem_free(buf, bufsize);
3651 nioc->n_arg = res;
3652 nioc->n_len = bufsize;
3653
3654 return (rv);
3655 }
3656
3657 static int
3658 nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
3659 cred_t *cred_p)
3660 {
3837 case DDI_MODEL_ILP32: {
3838 nvme_ioctl32_t nioc32;
3839 if (ddi_copyin((void*)arg, &nioc32, sizeof (nvme_ioctl32_t),
3840 mode) != 0)
3841 return (EFAULT);
3842 nioc.n_len = nioc32.n_len;
3843 nioc.n_buf = nioc32.n_buf;
3844 nioc.n_arg = nioc32.n_arg;
3845 break;
3846 }
3847 case DDI_MODEL_NONE:
3848 #endif
3849 if (ddi_copyin((void*)arg, &nioc, sizeof (nvme_ioctl_t), mode)
3850 != 0)
3851 return (EFAULT);
3852 #ifdef _MULTI_DATAMODEL
3853 break;
3854 }
3855 #endif
3856
3857 if (nvme->n_dead && cmd != NVME_IOC_DETACH)
3858 return (EIO);
3859
3860
3861 if (cmd == NVME_IOC_IDENTIFY_CTRL) {
3862 /*
3863 * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and
3864 * attachment point nodes.
3865 */
3866 nsid = 0;
3867 } else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) {
3868 /*
3869 * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node, it
3870 * will always return identify data for namespace 1.
3871 */
3872 nsid = 1;
3873 }
3874
3875 if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL)
3876 rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode,
3877 cred_p);
3878 else
3879 rv = EINVAL;
3880