Print this page
8622 panic in PTE_set_all()
8623 IMMU_CONTIG_PADDR is broken for cookies with more than one page
8625 nvme causes bad free panic in IOMMU
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/i86pc/io/immu_dvma.c
+++ new/usr/src/uts/i86pc/io/immu_dvma.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
↓ open down ↓ |
20 lines elided |
↑ open up ↑ |
21 21 /*
22 22 * Portions Copyright (c) 2010, Oracle and/or its affiliates.
23 23 * All rights reserved.
24 24 */
25 25 /*
26 26 * Copyright (c) 2009, Intel Corporation.
27 27 * All rights reserved.
28 28 */
29 29 /*
30 30 * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
31 + * Copyright 2017 Joyent, Inc.
31 32 */
32 33
33 34 /*
34 35 * DVMA code
35 36 * This file contains Intel IOMMU code that deals with DVMA
36 37 * i.e. DMA remapping.
37 38 */
38 39
39 40 #include <sys/sysmacros.h>
40 41 #include <sys/pcie.h>
41 42 #include <sys/pci_cfgspace.h>
42 43 #include <vm/hat_i86.h>
43 44 #include <sys/memlist.h>
44 45 #include <sys/acpi/acpi.h>
45 46 #include <sys/acpica.h>
46 47 #include <sys/modhash.h>
47 48 #include <sys/immu.h>
48 49 #include <sys/x86_archext.h>
49 50 #include <sys/archsystm.h>
50 51
↓ open down ↓ |
10 lines elided |
↑ open up ↑ |
51 52 #undef TEST
52 53
53 54 /*
54 55 * Macros based on PCI spec
55 56 */
/*
 * Decode helpers for the 32-bit value read at PCI_CONF_REVID:
 * the class code sits above the 8-bit revision id, and is itself
 * made up of base class (bits 23:16) and sub-class (bits 15:8).
 * Each macro expands to a plain expression (no trailing semicolon)
 * so it can be used inside larger expressions.
 */
#define	IMMU_PCI_REV2CLASS(r)	((r) >> 8)	/* classcode from revid */
#define	IMMU_PCI_CLASS2BASE(c)	((c) >> 16)	/* baseclass from classcode */
#define	IMMU_PCI_CLASS2SUB(c)	(((c) >> 8) & 0xff)	/* subclass from classcode */
59 60
60 61 #define IMMU_CONTIG_PADDR(d, p) \
61 - ((d).dck_paddr && ((d).dck_paddr + IMMU_PAGESIZE) == (p))
62 + ((d).dck_paddr && ((d).dck_paddr + (d).dck_npages * IMMU_PAGESIZE) \
63 + == (p))
62 64
63 65 typedef struct dvma_arg { /* argument block passed to the devinfo-walk callbacks in this file */
64 66 immu_t *dva_immu;
65 67 dev_info_t *dva_rdip; /* device the ancestor walk was started for */
66 68 dev_info_t *dva_ddip; /* result: domain-dip selected by the walk callback */
67 69 domain_t *dva_domain; /* result: domain found during the walk, if any */
68 70 int dva_level;
69 71 immu_flags_t dva_flags; /* immu flags forwarded to helpers such as immu_devi_set() */
70 72 list_t *dva_list; /* list of immu_devi_t entries searched by match_lpc() */
71 73 int dva_error; /* DDI_SUCCESS / DDI_FAILURE outcome reported by the callback */
72 74 } dvma_arg_t;
73 75
74 76 static domain_t *domain_create(immu_t *immu, dev_info_t *ddip,
75 77 dev_info_t *rdip, immu_flags_t immu_flags);
76 78 static immu_devi_t *create_immu_devi(dev_info_t *rdip, int bus,
77 79 int dev, int func, immu_flags_t immu_flags);
78 80 static void destroy_immu_devi(immu_devi_t *immu_devi);
79 81 static boolean_t dvma_map(domain_t *domain, uint64_t sdvma,
80 82 uint64_t nvpages, immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip,
81 83 immu_flags_t immu_flags);
82 84
83 85 /* Extern globals */
84 86 extern struct memlist *phys_install;
85 87
86 88 /*
87 89 * iommulib interface functions.
88 90 */
89 91 static int immu_probe(iommulib_handle_t unitp, dev_info_t *dip);
90 92 static int immu_allochdl(iommulib_handle_t handle,
91 93 dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr,
92 94 int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep);
93 95 static int immu_freehdl(iommulib_handle_t handle,
94 96 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle);
95 97 static int immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip,
96 98 dev_info_t *rdip, ddi_dma_handle_t dma_handle, struct ddi_dma_req *dma_req,
97 99 ddi_dma_cookie_t *cookiep, uint_t *ccountp);
98 100 static int immu_unbindhdl(iommulib_handle_t handle,
99 101 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle);
100 102 static int immu_sync(iommulib_handle_t handle, dev_info_t *dip,
101 103 dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off, size_t len,
102 104 uint_t cachefl);
103 105 static int immu_win(iommulib_handle_t handle, dev_info_t *dip,
104 106 dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win,
105 107 off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep, uint_t *ccountp);
106 108 static int immu_mapobject(iommulib_handle_t handle, dev_info_t *dip,
107 109 dev_info_t *rdip, ddi_dma_handle_t dma_handle,
108 110 struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao);
109 111 static int immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip,
110 112 dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao);
111 113
112 114 /* static Globals */
113 115
114 116 /*
115 117 * Used to setup DMA objects (memory regions)
116 118 * for DMA reads by IOMMU units
117 119 */
118 120 static ddi_dma_attr_t immu_dma_attr = { /* NOTE(review): member names below follow the ddi_dma_attr(9S) order; confirm against the header */
119 121 DMA_ATTR_V0, /* dma_attr_version */
120 122 0U, /* dma_attr_addr_lo */
121 123 0xffffffffffffffffULL, /* dma_attr_addr_hi */
122 124 0xffffffffU, /* dma_attr_count_max */
123 125 MMU_PAGESIZE, /* dma_attr_align: MMU page aligned */
124 126 0x1, /* dma_attr_burstsizes */
125 127 0x1, /* dma_attr_minxfer */
126 128 0xffffffffU, /* dma_attr_maxxfer */
127 129 0xffffffffffffffffULL, /* dma_attr_seg */
128 130 1, /* dma_attr_sgllen */
129 131 4, /* dma_attr_granular */
130 132 0 /* dma_attr_flags */
131 133 };
132 134
133 135 static ddi_device_acc_attr_t immu_acc_attr = {
134 136 DDI_DEVICE_ATTR_V0,
135 137 DDI_NEVERSWAP_ACC,
136 138 DDI_STRICTORDER_ACC
137 139 };
138 140
139 141 struct iommulib_ops immulib_ops = {
140 142 IOMMU_OPS_VERSION,
141 143 INTEL_IOMMU,
142 144 "Intel IOMMU",
143 145 NULL,
144 146 immu_probe,
145 147 immu_allochdl,
146 148 immu_freehdl,
147 149 immu_bindhdl,
148 150 immu_unbindhdl,
149 151 immu_sync,
150 152 immu_win,
151 153 immu_mapobject,
152 154 immu_unmapobject,
153 155 };
154 156
155 157 /*
156 158 * Fake physical address range used to set up initial prealloc mappings.
157 159 * This memory is never actually accessed. It is mapped read-only,
158 160 * and is overwritten as soon as the first DMA bind operation is
159 161 * performed. Since 0 is a special case, just start at the 2nd
160 162 * physical page.
161 163 */
162 164
163 165 static immu_dcookie_t immu_precookie = { MMU_PAGESIZE, IMMU_NPREPTES };
164 166
165 167 /* globals private to this file */
166 168 static kmutex_t immu_domain_lock;
167 169 static list_t immu_unity_domain_list;
168 170 static list_t immu_xlate_domain_list;
169 171
170 172 /* structure used to store idx into each level of the page tables */
171 173 typedef struct xlate { /* one entry per page-table level */
172 174 int xlt_level; /* page-table level this entry refers to */
173 175 uint_t xlt_idx; /* index within the page table at that level */
174 176 pgtable_t *xlt_pgtable; /* presumably the page table for that level - TODO confirm against users */
175 177 } xlate_t;
176 178
177 179 /* 0 is reserved by Vt-d spec. Solaris reserves 1 */
178 180 #define IMMU_UNITY_DID 1
179 181
180 182 static mod_hash_t *bdf_domain_hash;
181 183
182 184 int immu_use_alh;
183 185 int immu_use_tm;
184 186
185 187 static domain_t *
186 188 bdf_domain_lookup(immu_devi_t *immu_devi)
187 189 {
188 190 domain_t *domain;
189 191 int16_t seg = immu_devi->imd_seg;
190 192 int16_t bus = immu_devi->imd_bus;
191 193 int16_t devfunc = immu_devi->imd_devfunc;
192 194 uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
193 195
194 196 if (seg < 0 || bus < 0 || devfunc < 0) {
195 197 return (NULL);
196 198 }
197 199
198 200 domain = NULL;
199 201 if (mod_hash_find(bdf_domain_hash,
200 202 (void *)bdf, (void *)&domain) == 0) {
201 203 ASSERT(domain);
202 204 ASSERT(domain->dom_did > 0);
203 205 return (domain);
204 206 } else {
205 207 return (NULL);
206 208 }
207 209 }
208 210
209 211 static void
210 212 bdf_domain_insert(immu_devi_t *immu_devi, domain_t *domain)
211 213 {
212 214 int16_t seg = immu_devi->imd_seg;
213 215 int16_t bus = immu_devi->imd_bus;
214 216 int16_t devfunc = immu_devi->imd_devfunc;
215 217 uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
216 218
217 219 if (seg < 0 || bus < 0 || devfunc < 0) {
218 220 return;
219 221 }
220 222
221 223 (void) mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain);
222 224 }
223 225
224 226 static int
225 227 match_lpc(dev_info_t *pdip, void *arg)
226 228 {
227 229 immu_devi_t *immu_devi;
228 230 dvma_arg_t *dvap = (dvma_arg_t *)arg;
229 231
230 232 if (list_is_empty(dvap->dva_list)) {
231 233 return (DDI_WALK_TERMINATE);
232 234 }
233 235
234 236 immu_devi = list_head(dvap->dva_list);
235 237 for (; immu_devi; immu_devi = list_next(dvap->dva_list,
236 238 immu_devi)) {
237 239 if (immu_devi->imd_dip == pdip) {
238 240 dvap->dva_ddip = pdip;
239 241 dvap->dva_error = DDI_SUCCESS;
240 242 return (DDI_WALK_TERMINATE);
241 243 }
242 244 }
243 245
244 246 return (DDI_WALK_CONTINUE);
245 247 }
246 248
247 249 static void
248 250 immu_devi_set_spclist(dev_info_t *dip, immu_t *immu)
249 251 {
250 252 list_t *spclist = NULL;
251 253 immu_devi_t *immu_devi;
252 254
253 255 immu_devi = IMMU_DEVI(dip);
254 256 if (immu_devi->imd_display == B_TRUE) {
255 257 spclist = &(immu->immu_dvma_gfx_list);
256 258 } else if (immu_devi->imd_lpc == B_TRUE) {
257 259 spclist = &(immu->immu_dvma_lpc_list);
258 260 }
259 261
260 262 if (spclist) {
261 263 mutex_enter(&(immu->immu_lock));
262 264 list_insert_head(spclist, immu_devi);
263 265 mutex_exit(&(immu->immu_lock));
264 266 }
265 267 }
266 268
267 269 /*
268 270 * Set the immu_devi struct in the immu_devi field of a devinfo node
269 271 */
270 272 int
271 273 immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags)
272 274 {
273 275 int bus, dev, func;
274 276 immu_devi_t *new_imd;
275 277 immu_devi_t *immu_devi;
276 278
277 279 immu_devi = immu_devi_get(dip);
278 280 if (immu_devi != NULL) {
279 281 return (DDI_SUCCESS);
280 282 }
281 283
282 284 bus = dev = func = -1;
283 285
284 286 /*
285 287 * Assume a new immu_devi struct is needed
286 288 */
287 289 if (!DEVI_IS_PCI(dip) || acpica_get_bdf(dip, &bus, &dev, &func) != 0) {
288 290 /*
289 291 * No BDF. Set bus = -1 to indicate this.
290 292 * We still need to create a immu_devi struct
291 293 * though
292 294 */
293 295 bus = -1;
294 296 dev = 0;
295 297 func = 0;
296 298 }
297 299
298 300 new_imd = create_immu_devi(dip, bus, dev, func, immu_flags);
299 301 if (new_imd == NULL) {
300 302 ddi_err(DER_WARN, dip, "Failed to create immu_devi "
301 303 "structure");
302 304 return (DDI_FAILURE);
303 305 }
304 306
305 307 /*
306 308 * Check if some other thread allocated a immu_devi while we
307 309 * didn't own the lock.
308 310 */
309 311 mutex_enter(&(DEVI(dip)->devi_lock));
310 312 if (IMMU_DEVI(dip) == NULL) {
311 313 IMMU_DEVI_SET(dip, new_imd);
312 314 } else {
313 315 destroy_immu_devi(new_imd);
314 316 }
315 317 mutex_exit(&(DEVI(dip)->devi_lock));
316 318
317 319 return (DDI_SUCCESS);
318 320 }
319 321
320 322 static dev_info_t *
321 323 get_lpc_devinfo(immu_t *immu, dev_info_t *rdip, immu_flags_t immu_flags)
322 324 {
323 325 dvma_arg_t dvarg = {0};
324 326 dvarg.dva_list = &(immu->immu_dvma_lpc_list);
325 327 dvarg.dva_rdip = rdip;
326 328 dvarg.dva_error = DDI_FAILURE;
327 329
328 330 if (immu_walk_ancestor(rdip, NULL, match_lpc,
329 331 &dvarg, NULL, immu_flags) != DDI_SUCCESS) {
330 332 ddi_err(DER_MODE, rdip, "Could not walk ancestors to "
331 333 "find lpc_devinfo for ISA device");
332 334 return (NULL);
333 335 }
334 336
335 337 if (dvarg.dva_error != DDI_SUCCESS || dvarg.dva_ddip == NULL) {
336 338 ddi_err(DER_MODE, rdip, "Could not find lpc_devinfo for "
337 339 "ISA device");
338 340 return (NULL);
339 341 }
340 342
341 343 return (dvarg.dva_ddip);
342 344 }
343 345
344 346 static dev_info_t *
345 347 get_gfx_devinfo(dev_info_t *rdip)
346 348 {
347 349 immu_t *immu;
348 350 immu_devi_t *immu_devi;
349 351 list_t *list_gfx;
350 352
351 353 /*
352 354 * The GFX device may not be on the same iommu unit as "agpgart"
353 355 * so search globally
354 356 */
355 357 immu_devi = NULL;
356 358 immu = list_head(&immu_list);
357 359 for (; immu; immu = list_next(&immu_list, immu)) {
358 360 list_gfx = &(immu->immu_dvma_gfx_list);
359 361 if (!list_is_empty(list_gfx)) {
360 362 immu_devi = list_head(list_gfx);
361 363 break;
362 364 }
363 365 }
364 366
365 367 if (immu_devi == NULL) {
366 368 ddi_err(DER_WARN, rdip, "iommu: No GFX device. "
367 369 "Cannot redirect agpgart");
368 370 return (NULL);
369 371 }
370 372
371 373 ddi_err(DER_LOG, rdip, "iommu: GFX redirect to %s",
372 374 ddi_node_name(immu_devi->imd_dip));
373 375
374 376 return (immu_devi->imd_dip);
375 377 }
376 378
377 379 static immu_flags_t
378 380 dma_to_immu_flags(struct ddi_dma_req *dmareq)
379 381 {
380 382 immu_flags_t flags = 0;
381 383
382 384 if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
383 385 flags |= IMMU_FLAGS_SLEEP;
384 386 } else {
385 387 flags |= IMMU_FLAGS_NOSLEEP;
386 388 }
387 389
388 390 #ifdef BUGGY_DRIVERS
389 391
390 392 flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
391 393
392 394 #else
393 395 /*
394 396 * Read and write flags need to be reversed.
395 397 * DMA_READ means read from device and write
396 398 * to memory. So DMA read means DVMA write.
397 399 */
398 400 if (dmareq->dmar_flags & DDI_DMA_READ)
399 401 flags |= IMMU_FLAGS_WRITE;
400 402
401 403 if (dmareq->dmar_flags & DDI_DMA_WRITE)
402 404 flags |= IMMU_FLAGS_READ;
403 405
404 406 /*
405 407 * Some buggy drivers specify neither READ or WRITE
406 408 * For such drivers set both read and write permissions
407 409 */
408 410 if ((dmareq->dmar_flags & (DDI_DMA_READ | DDI_DMA_WRITE)) == 0) {
409 411 flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
410 412 }
411 413 #endif
412 414
413 415 return (flags);
414 416 }
415 417
416 418 /*ARGSUSED*/
417 419 int
418 420 pgtable_ctor(void *buf, void *arg, int kmflag)
419 421 {
420 422 size_t actual_size = 0;
421 423 pgtable_t *pgtable;
422 424 int (*dmafp)(caddr_t);
423 425 caddr_t vaddr;
424 426 void *next;
425 427 uint_t flags;
426 428 immu_t *immu = arg;
427 429
428 430 pgtable = (pgtable_t *)buf;
429 431
430 432 dmafp = (kmflag & KM_NOSLEEP) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP;
431 433
432 434 next = kmem_zalloc(IMMU_PAGESIZE, kmflag);
433 435 if (next == NULL) {
434 436 return (-1);
435 437 }
436 438
437 439 if (ddi_dma_alloc_handle(root_devinfo, &immu_dma_attr,
438 440 dmafp, NULL, &pgtable->hwpg_dmahdl) != DDI_SUCCESS) {
439 441 kmem_free(next, IMMU_PAGESIZE);
440 442 return (-1);
441 443 }
442 444
443 445 flags = DDI_DMA_CONSISTENT;
444 446 if (!immu->immu_dvma_coherent)
445 447 flags |= IOMEM_DATA_UC_WR_COMBINE;
446 448
447 449 if (ddi_dma_mem_alloc(pgtable->hwpg_dmahdl, IMMU_PAGESIZE,
448 450 &immu_acc_attr, flags,
449 451 dmafp, NULL, &vaddr, &actual_size,
450 452 &pgtable->hwpg_memhdl) != DDI_SUCCESS) {
451 453 ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
452 454 kmem_free(next, IMMU_PAGESIZE);
453 455 return (-1);
454 456 }
455 457
456 458 /*
457 459 * Memory allocation failure. Maybe a temporary condition
458 460 * so return error rather than panic, so we can try again
459 461 */
460 462 if (actual_size < IMMU_PAGESIZE) {
461 463 ddi_dma_mem_free(&pgtable->hwpg_memhdl);
462 464 ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
463 465 kmem_free(next, IMMU_PAGESIZE);
464 466 return (-1);
465 467 }
466 468
467 469 pgtable->hwpg_paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr));
468 470 pgtable->hwpg_vaddr = vaddr;
469 471 pgtable->swpg_next_array = next;
470 472
471 473 rw_init(&(pgtable->swpg_rwlock), NULL, RW_DEFAULT, NULL);
472 474
473 475 return (0);
474 476 }
475 477
476 478 /*ARGSUSED*/
477 479 void
478 480 pgtable_dtor(void *buf, void *arg)
479 481 {
480 482 pgtable_t *pgtable;
481 483
482 484 pgtable = (pgtable_t *)buf;
483 485
484 486 /* destroy will panic if lock is held. */
485 487 rw_destroy(&(pgtable->swpg_rwlock));
486 488
487 489 ddi_dma_mem_free(&pgtable->hwpg_memhdl);
488 490 ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
489 491 kmem_free(pgtable->swpg_next_array, IMMU_PAGESIZE);
490 492 }
491 493
492 494 /*
493 495 * pgtable_alloc()
494 496 * alloc a IOMMU pgtable structure.
495 497 * This same struct is used for root and context tables as well.
496 498 * This routine allocs the f/ollowing:
497 499 * - a pgtable_t struct
498 500 * - a HW page which holds PTEs/entries which is accesssed by HW
499 501 * so we set up DMA for this page
500 502 * - a SW page which is only for our bookeeping
501 503 * (for example to hold pointers to the next level pgtable).
502 504 * So a simple kmem_alloc suffices
503 505 */
504 506 static pgtable_t *
505 507 pgtable_alloc(immu_t *immu, immu_flags_t immu_flags)
506 508 {
507 509 pgtable_t *pgtable;
508 510 int kmflags;
509 511
510 512 kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
511 513
512 514 pgtable = kmem_cache_alloc(immu->immu_pgtable_cache, kmflags);
513 515 if (pgtable == NULL) {
514 516 return (NULL);
515 517 }
516 518 return (pgtable);
517 519 }
518 520
519 521 static void
520 522 pgtable_zero(pgtable_t *pgtable)
521 523 {
522 524 bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE);
523 525 bzero(pgtable->swpg_next_array, IMMU_PAGESIZE);
524 526 }
525 527
526 528 static void
527 529 pgtable_free(immu_t *immu, pgtable_t *pgtable)
528 530 {
529 531 kmem_cache_free(immu->immu_pgtable_cache, pgtable);
530 532 }
531 533
532 534 /*
533 535 * Function to identify a display device from the PCI class code
534 536 */
535 537 static boolean_t
536 538 device_is_display(uint_t classcode)
537 539 {
538 540 static uint_t disp_classes[] = {
539 541 0x000100,
540 542 0x030000,
541 543 0x030001
542 544 };
543 545 int i, nclasses = sizeof (disp_classes) / sizeof (uint_t);
544 546
545 547 for (i = 0; i < nclasses; i++) {
546 548 if (classcode == disp_classes[i])
547 549 return (B_TRUE);
548 550 }
549 551 return (B_FALSE);
550 552 }
551 553
552 554 /*
553 555 * Function that determines if device is PCIEX and/or PCIEX bridge
554 556 */
555 557 static boolean_t
556 558 device_is_pciex(
557 559 uchar_t bus, uchar_t dev, uchar_t func, boolean_t *is_pcib)
558 560 {
559 561 ushort_t cap;
560 562 ushort_t capsp;
561 563 ushort_t cap_count = PCI_CAP_MAX_PTR;
562 564 ushort_t status;
563 565 boolean_t is_pciex = B_FALSE;
564 566
565 567 *is_pcib = B_FALSE;
566 568
567 569 status = pci_getw_func(bus, dev, func, PCI_CONF_STAT);
568 570 if (!(status & PCI_STAT_CAP))
569 571 return (B_FALSE);
570 572
571 573 capsp = pci_getb_func(bus, dev, func, PCI_CONF_CAP_PTR);
572 574 while (cap_count-- && capsp >= PCI_CAP_PTR_OFF) {
573 575 capsp &= PCI_CAP_PTR_MASK;
574 576 cap = pci_getb_func(bus, dev, func, capsp);
575 577
576 578 if (cap == PCI_CAP_ID_PCI_E) {
577 579 status = pci_getw_func(bus, dev, func, capsp + 2);
578 580 /*
579 581 * See section 7.8.2 of PCI-Express Base Spec v1.0a
580 582 * for Device/Port Type.
581 583 * PCIE_PCIECAP_DEV_TYPE_PCIE2PCI implies that the
582 584 * device is a PCIE2PCI bridge
583 585 */
584 586 *is_pcib =
585 587 ((status & PCIE_PCIECAP_DEV_TYPE_MASK) ==
586 588 PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) ? B_TRUE : B_FALSE;
587 589 is_pciex = B_TRUE;
588 590 }
589 591
590 592 capsp = (*pci_getb_func)(bus, dev, func,
591 593 capsp + PCI_CAP_NEXT_PTR);
592 594 }
593 595
594 596 return (is_pciex);
595 597 }
596 598
597 599 static boolean_t
598 600 device_use_premap(uint_t classcode)
599 601 {
600 602 if (IMMU_PCI_CLASS2BASE(classcode) == PCI_CLASS_NET)
601 603 return (B_TRUE);
602 604 return (B_FALSE);
603 605 }
604 606
605 607
606 608 /*
607 609 * immu_dvma_get_immu()
608 610 * get the immu unit structure for a dev_info node
609 611 */
610 612 immu_t *
611 613 immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags)
612 614 {
613 615 immu_devi_t *immu_devi;
614 616 immu_t *immu;
615 617
616 618 /*
617 619 * check if immu unit was already found earlier.
618 620 * If yes, then it will be stashed in immu_devi struct.
619 621 */
620 622 immu_devi = immu_devi_get(dip);
621 623 if (immu_devi == NULL) {
622 624 if (immu_devi_set(dip, immu_flags) != DDI_SUCCESS) {
623 625 /*
624 626 * May fail because of low memory. Return error rather
625 627 * than panic as we want driver to rey again later
626 628 */
627 629 ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
628 630 "No immu_devi structure");
629 631 /*NOTREACHED*/
630 632 }
631 633 immu_devi = immu_devi_get(dip);
632 634 }
633 635
634 636 mutex_enter(&(DEVI(dip)->devi_lock));
635 637 if (immu_devi->imd_immu) {
636 638 immu = immu_devi->imd_immu;
637 639 mutex_exit(&(DEVI(dip)->devi_lock));
638 640 return (immu);
639 641 }
640 642 mutex_exit(&(DEVI(dip)->devi_lock));
641 643
642 644 immu = immu_dmar_get_immu(dip);
643 645 if (immu == NULL) {
644 646 ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
645 647 "Cannot find immu_t for device");
646 648 /*NOTREACHED*/
647 649 }
648 650
649 651 /*
650 652 * Check if some other thread found immu
651 653 * while lock was not held
652 654 */
653 655 immu_devi = immu_devi_get(dip);
654 656 /* immu_devi should be present as we found it earlier */
655 657 if (immu_devi == NULL) {
656 658 ddi_err(DER_PANIC, dip,
657 659 "immu_dvma_get_immu: No immu_devi structure");
658 660 /*NOTREACHED*/
659 661 }
660 662
661 663 mutex_enter(&(DEVI(dip)->devi_lock));
662 664 if (immu_devi->imd_immu == NULL) {
663 665 /* nobody else set it, so we should do it */
664 666 immu_devi->imd_immu = immu;
665 667 immu_devi_set_spclist(dip, immu);
666 668 } else {
667 669 /*
668 670 * if some other thread got immu before
669 671 * us, it should get the same results
670 672 */
671 673 if (immu_devi->imd_immu != immu) {
672 674 ddi_err(DER_PANIC, dip, "Multiple "
673 675 "immu units found for device. Expected (%p), "
674 676 "actual (%p)", (void *)immu,
675 677 (void *)immu_devi->imd_immu);
676 678 mutex_exit(&(DEVI(dip)->devi_lock));
677 679 /*NOTREACHED*/
678 680 }
679 681 }
680 682 mutex_exit(&(DEVI(dip)->devi_lock));
681 683
682 684 return (immu);
683 685 }
684 686
685 687
686 688 /* ############################# IMMU_DEVI code ############################ */
687 689
688 690 /*
689 691 * Allocate a immu_devi structure and initialize it
690 692 */
691 693 static immu_devi_t *
692 694 create_immu_devi(dev_info_t *rdip, int bus, int dev, int func,
693 695 immu_flags_t immu_flags)
694 696 {
695 697 uchar_t baseclass, subclass;
696 698 uint_t classcode, revclass;
697 699 immu_devi_t *immu_devi;
698 700 boolean_t pciex = B_FALSE;
699 701 int kmflags;
700 702 boolean_t is_pcib = B_FALSE;
701 703
702 704 /* bus == -1 indicate non-PCI device (no BDF) */
703 705 ASSERT(bus == -1 || bus >= 0);
704 706 ASSERT(dev >= 0);
705 707 ASSERT(func >= 0);
706 708
707 709 kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
708 710 immu_devi = kmem_zalloc(sizeof (immu_devi_t), kmflags);
709 711 if (immu_devi == NULL) {
710 712 ddi_err(DER_WARN, rdip, "Failed to allocate memory for "
711 713 "Intel IOMMU immu_devi structure");
712 714 return (NULL);
713 715 }
714 716 immu_devi->imd_dip = rdip;
715 717 immu_devi->imd_seg = 0; /* Currently seg can only be 0 */
716 718 immu_devi->imd_bus = bus;
717 719 immu_devi->imd_pcib_type = IMMU_PCIB_BAD;
718 720
719 721 if (bus == -1) {
720 722 immu_devi->imd_pcib_type = IMMU_PCIB_NOBDF;
721 723 return (immu_devi);
722 724 }
723 725
724 726 immu_devi->imd_devfunc = IMMU_PCI_DEVFUNC(dev, func);
725 727 immu_devi->imd_sec = 0;
726 728 immu_devi->imd_sub = 0;
727 729
728 730 revclass = pci_getl_func(bus, dev, func, PCI_CONF_REVID);
729 731
730 732 classcode = IMMU_PCI_REV2CLASS(revclass);
731 733 baseclass = IMMU_PCI_CLASS2BASE(classcode);
732 734 subclass = IMMU_PCI_CLASS2SUB(classcode);
733 735
734 736 if (baseclass == PCI_CLASS_BRIDGE && subclass == PCI_BRIDGE_PCI) {
735 737
736 738 immu_devi->imd_sec = pci_getb_func(bus, dev, func,
737 739 PCI_BCNF_SECBUS);
738 740 immu_devi->imd_sub = pci_getb_func(bus, dev, func,
739 741 PCI_BCNF_SUBBUS);
740 742
741 743 pciex = device_is_pciex(bus, dev, func, &is_pcib);
742 744 if (pciex == B_TRUE && is_pcib == B_TRUE) {
743 745 immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCI;
744 746 } else if (pciex == B_TRUE) {
745 747 immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCIE;
746 748 } else {
747 749 immu_devi->imd_pcib_type = IMMU_PCIB_PCI_PCI;
748 750 }
749 751 } else {
750 752 immu_devi->imd_pcib_type = IMMU_PCIB_ENDPOINT;
751 753 }
752 754
753 755 /* check for certain special devices */
754 756 immu_devi->imd_display = device_is_display(classcode);
755 757 immu_devi->imd_lpc = ((baseclass == PCI_CLASS_BRIDGE) &&
756 758 (subclass == PCI_BRIDGE_ISA)) ? B_TRUE : B_FALSE;
757 759 immu_devi->imd_use_premap = device_use_premap(classcode);
758 760
759 761 immu_devi->imd_domain = NULL;
760 762
761 763 immu_devi->imd_dvma_flags = immu_global_dvma_flags;
762 764
763 765 return (immu_devi);
764 766 }
765 767
766 768 static void
767 769 destroy_immu_devi(immu_devi_t *immu_devi)
768 770 {
769 771 kmem_free(immu_devi, sizeof (immu_devi_t));
770 772 }
771 773
772 774 static domain_t *
773 775 immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp)
774 776 {
775 777 immu_devi_t *immu_devi;
776 778 domain_t *domain;
777 779 dev_info_t *ddip;
778 780
779 781 *ddipp = NULL;
780 782
781 783 immu_devi = immu_devi_get(rdip);
782 784 if (immu_devi == NULL) {
783 785 return (NULL);
784 786 }
785 787
786 788 mutex_enter(&(DEVI(rdip)->devi_lock));
787 789 domain = immu_devi->imd_domain;
788 790 ddip = immu_devi->imd_ddip;
789 791 mutex_exit(&(DEVI(rdip)->devi_lock));
790 792
791 793 if (domain)
792 794 *ddipp = ddip;
793 795
794 796 return (domain);
795 797
796 798 }
797 799
798 800 /* ############################# END IMMU_DEVI code ######################## */
799 801 /* ############################# DOMAIN code ############################### */
800 802
801 803 /*
802 804 * This routine always succeeds
803 805 */
804 806 static int
805 807 did_alloc(immu_t *immu, dev_info_t *rdip,
806 808 dev_info_t *ddip, immu_flags_t immu_flags)
807 809 {
808 810 int did;
809 811
810 812 did = (uintptr_t)vmem_alloc(immu->immu_did_arena, 1,
811 813 (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP);
812 814
813 815 if (did == 0) {
814 816 ddi_err(DER_WARN, rdip, "device domain-id alloc error"
815 817 " domain-device: %s%d. immu unit is %s. Using "
816 818 "unity domain with domain-id (%d)",
817 819 ddi_driver_name(ddip), ddi_get_instance(ddip),
818 820 immu->immu_name, immu->immu_unity_domain->dom_did);
819 821 did = immu->immu_unity_domain->dom_did;
820 822 }
821 823
822 824 return (did);
823 825 }
824 826
825 827 static int
826 828 get_branch_domain(dev_info_t *pdip, void *arg)
827 829 {
828 830 immu_devi_t *immu_devi;
829 831 domain_t *domain;
830 832 dev_info_t *ddip;
831 833 immu_t *immu;
832 834 dvma_arg_t *dvp = (dvma_arg_t *)arg;
833 835
834 836 /*
835 837 * The field dvp->dva_rdip is a work-in-progress
836 838 * and gets updated as we walk up the ancestor
837 839 * tree. The final ddip is set only when we reach
838 840 * the top of the tree. So the dvp->dva_ddip field cannot
839 841 * be relied on until we reach the top of the field.
840 842 */
841 843
842 844 /* immu_devi may not be set. */
843 845 immu_devi = immu_devi_get(pdip);
844 846 if (immu_devi == NULL) {
845 847 if (immu_devi_set(pdip, dvp->dva_flags) != DDI_SUCCESS) {
846 848 dvp->dva_error = DDI_FAILURE;
847 849 return (DDI_WALK_TERMINATE);
848 850 }
849 851 }
850 852
851 853 immu_devi = immu_devi_get(pdip);
852 854 immu = immu_devi->imd_immu;
853 855 if (immu == NULL)
854 856 immu = immu_dvma_get_immu(pdip, dvp->dva_flags);
855 857
856 858 /*
857 859 * If we encounter a PCIE_PCIE bridge *ANCESTOR* we need to
858 860 * terminate the walk (since the device under the PCIE bridge
859 861 * is a PCIE device and has an independent entry in the
860 862 * root/context table)
861 863 */
862 864 if (dvp->dva_rdip != pdip &&
863 865 immu_devi->imd_pcib_type == IMMU_PCIB_PCIE_PCIE) {
864 866 return (DDI_WALK_TERMINATE);
865 867 }
866 868
867 869 /*
868 870 * In order to be a domain-dim, it must be a PCI device i.e.
869 871 * must have valid BDF. This also eliminates the root complex.
870 872 */
871 873 if (immu_devi->imd_pcib_type != IMMU_PCIB_BAD &&
872 874 immu_devi->imd_pcib_type != IMMU_PCIB_NOBDF) {
873 875 ASSERT(immu_devi->imd_bus >= 0);
874 876 ASSERT(immu_devi->imd_devfunc >= 0);
875 877 dvp->dva_ddip = pdip;
876 878 }
877 879
878 880 if (immu_devi->imd_display == B_TRUE ||
879 881 (dvp->dva_flags & IMMU_FLAGS_UNITY)) {
880 882 dvp->dva_domain = immu->immu_unity_domain;
881 883 /* continue walking to find ddip */
882 884 return (DDI_WALK_CONTINUE);
883 885 }
884 886
885 887 mutex_enter(&(DEVI(pdip)->devi_lock));
886 888 domain = immu_devi->imd_domain;
887 889 ddip = immu_devi->imd_ddip;
888 890 mutex_exit(&(DEVI(pdip)->devi_lock));
889 891
890 892 if (domain && ddip) {
891 893 /* if domain is set, it must be the same */
892 894 if (dvp->dva_domain) {
893 895 ASSERT(domain == dvp->dva_domain);
894 896 }
895 897 dvp->dva_domain = domain;
896 898 dvp->dva_ddip = ddip;
897 899 return (DDI_WALK_TERMINATE);
898 900 }
899 901
900 902 /* Domain may already be set, continue walking so that ddip gets set */
901 903 if (dvp->dva_domain) {
902 904 return (DDI_WALK_CONTINUE);
903 905 }
904 906
905 907 /* domain is not set in either immu_devi or dvp */
906 908 domain = bdf_domain_lookup(immu_devi);
907 909 if (domain == NULL) {
908 910 return (DDI_WALK_CONTINUE);
909 911 }
910 912
911 913 /* ok, the BDF hash had a domain for this BDF. */
912 914
913 915 /* Grab lock again to check if something else set immu_devi fields */
914 916 mutex_enter(&(DEVI(pdip)->devi_lock));
915 917 if (immu_devi->imd_domain != NULL) {
916 918 dvp->dva_domain = domain;
917 919 } else {
918 920 dvp->dva_domain = domain;
919 921 }
920 922 mutex_exit(&(DEVI(pdip)->devi_lock));
921 923
922 924 /*
923 925 * walk upwards until the topmost PCI bridge is found
924 926 */
925 927 return (DDI_WALK_CONTINUE);
926 928
927 929 }
928 930
929 931 static void
930 932 map_unity_domain(domain_t *domain)
931 933 {
932 934 struct memlist *mp;
933 935 uint64_t start;
934 936 uint64_t npages;
935 937 immu_dcookie_t dcookies[1] = {0};
936 938 int dcount = 0;
937 939
938 940 /*
939 941 * UNITY arenas are a mirror of the physical memory
940 942 * installed on the system.
941 943 */
942 944
943 945 #ifdef BUGGY_DRIVERS
944 946 /*
945 947 * Dont skip page0. Some broken HW/FW access it.
946 948 */
947 949 dcookies[0].dck_paddr = 0;
948 950 dcookies[0].dck_npages = 1;
949 951 dcount = 1;
950 952 (void) dvma_map(domain, 0, 1, dcookies, dcount, NULL,
951 953 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
952 954 #endif
953 955
954 956 memlist_read_lock();
955 957
956 958 mp = phys_install;
957 959
958 960 if (mp->ml_address == 0) {
959 961 /* since we already mapped page1 above */
960 962 start = IMMU_PAGESIZE;
961 963 } else {
962 964 start = mp->ml_address;
963 965 }
964 966 npages = mp->ml_size/IMMU_PAGESIZE + 1;
965 967
966 968 dcookies[0].dck_paddr = start;
967 969 dcookies[0].dck_npages = npages;
968 970 dcount = 1;
969 971 (void) dvma_map(domain, start, npages, dcookies,
970 972 dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
971 973
972 974 ddi_err(DER_LOG, domain->dom_dip, "iommu: mapping PHYS span [0x%" PRIx64
973 975 " - 0x%" PRIx64 "]", start, start + mp->ml_size);
974 976
975 977 mp = mp->ml_next;
976 978 while (mp) {
977 979 ddi_err(DER_LOG, domain->dom_dip,
978 980 "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]",
979 981 mp->ml_address, mp->ml_address + mp->ml_size);
980 982
981 983 start = mp->ml_address;
982 984 npages = mp->ml_size/IMMU_PAGESIZE + 1;
983 985
984 986 dcookies[0].dck_paddr = start;
985 987 dcookies[0].dck_npages = npages;
986 988 dcount = 1;
987 989 (void) dvma_map(domain, start, npages,
988 990 dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
989 991 mp = mp->ml_next;
990 992 }
991 993
992 994 mp = bios_rsvd;
993 995 while (mp) {
994 996 ddi_err(DER_LOG, domain->dom_dip,
995 997 "iommu: mapping PHYS span [0x%" PRIx64 " - 0x%" PRIx64 "]",
996 998 mp->ml_address, mp->ml_address + mp->ml_size);
997 999
998 1000 start = mp->ml_address;
999 1001 npages = mp->ml_size/IMMU_PAGESIZE + 1;
1000 1002
1001 1003 dcookies[0].dck_paddr = start;
1002 1004 dcookies[0].dck_npages = npages;
1003 1005 dcount = 1;
1004 1006 (void) dvma_map(domain, start, npages,
1005 1007 dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
1006 1008
1007 1009 mp = mp->ml_next;
1008 1010 }
1009 1011
1010 1012 memlist_read_unlock();
1011 1013 }
1012 1014
1013 1015 /*
1014 1016 * create_xlate_arena()
1015 1017 * Create the dvma arena for a domain with translation
1016 1018 * mapping
1017 1019 */
1018 1020 static void
1019 1021 create_xlate_arena(immu_t *immu, domain_t *domain,
1020 1022 dev_info_t *rdip, immu_flags_t immu_flags)
1021 1023 {
1022 1024 char *arena_name;
1023 1025 struct memlist *mp;
1024 1026 int vmem_flags;
1025 1027 uint64_t start;
1026 1028 uint_t mgaw;
1027 1029 uint64_t size;
1028 1030 uint64_t maxaddr;
1029 1031 void *vmem_ret;
1030 1032
1031 1033 arena_name = domain->dom_dvma_arena_name;
1032 1034
1033 1035 /* Note, don't do sizeof (arena_name) - it is just a pointer */
1034 1036 (void) snprintf(arena_name,
1035 1037 sizeof (domain->dom_dvma_arena_name),
1036 1038 "%s-domain-%d-xlate-DVMA-arena", immu->immu_name,
1037 1039 domain->dom_did);
1038 1040
1039 1041 vmem_flags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP;
1040 1042
1041 1043 /* Restrict mgaddr (max guest addr) to MGAW */
1042 1044 mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap);
1043 1045
1044 1046 /*
1045 1047 * To ensure we avoid ioapic and PCI MMIO ranges we just
1046 1048 * use the physical memory address range of the system as the
1047 1049 * range
1048 1050 */
1049 1051 maxaddr = ((uint64_t)1 << mgaw);
1050 1052
1051 1053 memlist_read_lock();
1052 1054
1053 1055 mp = phys_install;
1054 1056
1055 1057 if (mp->ml_address == 0)
1056 1058 start = MMU_PAGESIZE;
1057 1059 else
1058 1060 start = mp->ml_address;
1059 1061
1060 1062 if (start + mp->ml_size > maxaddr)
1061 1063 size = maxaddr - start;
1062 1064 else
1063 1065 size = mp->ml_size;
1064 1066
1065 1067 ddi_err(DER_VERB, rdip,
1066 1068 "iommu: %s: Creating dvma vmem arena [0x%" PRIx64
1067 1069 " - 0x%" PRIx64 "]", arena_name, start, start + size);
1068 1070
1069 1071 /*
1070 1072 * We always allocate in quanta of IMMU_PAGESIZE
1071 1073 */
1072 1074 domain->dom_dvma_arena = vmem_create(arena_name,
1073 1075 (void *)(uintptr_t)start, /* start addr */
1074 1076 size, /* size */
1075 1077 IMMU_PAGESIZE, /* quantum */
1076 1078 NULL, /* afunc */
1077 1079 NULL, /* ffunc */
1078 1080 NULL, /* source */
1079 1081 0, /* qcache_max */
1080 1082 vmem_flags);
1081 1083
1082 1084 if (domain->dom_dvma_arena == NULL) {
1083 1085 ddi_err(DER_PANIC, rdip,
1084 1086 "Failed to allocate DVMA arena(%s) "
1085 1087 "for domain ID (%d)", arena_name, domain->dom_did);
1086 1088 /*NOTREACHED*/
1087 1089 }
1088 1090
1089 1091 mp = mp->ml_next;
1090 1092 while (mp) {
1091 1093
1092 1094 if (mp->ml_address == 0)
1093 1095 start = MMU_PAGESIZE;
1094 1096 else
1095 1097 start = mp->ml_address;
1096 1098
1097 1099 if (start + mp->ml_size > maxaddr)
1098 1100 size = maxaddr - start;
1099 1101 else
1100 1102 size = mp->ml_size;
1101 1103
1102 1104 ddi_err(DER_VERB, rdip,
1103 1105 "iommu: %s: Adding dvma vmem span [0x%" PRIx64
1104 1106 " - 0x%" PRIx64 "]", arena_name, start,
1105 1107 start + size);
1106 1108
1107 1109 vmem_ret = vmem_add(domain->dom_dvma_arena,
1108 1110 (void *)(uintptr_t)start, size, vmem_flags);
1109 1111
1110 1112 if (vmem_ret == NULL) {
1111 1113 ddi_err(DER_PANIC, rdip,
1112 1114 "Failed to allocate DVMA arena(%s) "
1113 1115 "for domain ID (%d)",
1114 1116 arena_name, domain->dom_did);
1115 1117 /*NOTREACHED*/
1116 1118 }
1117 1119 mp = mp->ml_next;
1118 1120 }
1119 1121 memlist_read_unlock();
1120 1122 }
1121 1123
1122 1124 /* ################################### DOMAIN CODE ######################### */
1123 1125
1124 1126 /*
1125 1127 * Set the domain and domain-dip for a dip
1126 1128 */
1127 1129 static void
1128 1130 set_domain(
1129 1131 dev_info_t *dip,
1130 1132 dev_info_t *ddip,
1131 1133 domain_t *domain)
1132 1134 {
1133 1135 immu_devi_t *immu_devi;
1134 1136 domain_t *fdomain;
1135 1137 dev_info_t *fddip;
1136 1138
1137 1139 immu_devi = immu_devi_get(dip);
1138 1140
1139 1141 mutex_enter(&(DEVI(dip)->devi_lock));
1140 1142 fddip = immu_devi->imd_ddip;
1141 1143 fdomain = immu_devi->imd_domain;
1142 1144
1143 1145 if (fddip) {
1144 1146 ASSERT(fddip == ddip);
1145 1147 } else {
1146 1148 immu_devi->imd_ddip = ddip;
1147 1149 }
1148 1150
1149 1151 if (fdomain) {
1150 1152 ASSERT(fdomain == domain);
1151 1153 } else {
1152 1154 immu_devi->imd_domain = domain;
1153 1155 }
1154 1156 mutex_exit(&(DEVI(dip)->devi_lock));
1155 1157 }
1156 1158
1157 1159 /*
1158 1160 * device_domain()
1159 1161 * Get domain for a device. The domain may be global in which case it
1160 1162 * is shared between all IOMMU units. Due to potential AGAW differences
1161 1163 * between IOMMU units, such global domains *have to be* UNITY mapping
1162 1164 * domains. Alternatively, the domain may be local to a IOMMU unit.
1163 1165 * Local domains may be shared or immu_devi, although the
1164 1166 * scope of sharing
1165 1167 * is restricted to devices controlled by the IOMMU unit to
1166 1168 * which the domain
1167 1169 * belongs. If shared, they (currently) have to be UNITY domains. If
1168 1170 * immu_devi a domain may be either UNITY or translation (XLATE) domain.
1169 1171 */
1170 1172 static domain_t *
1171 1173 device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags)
1172 1174 {
1173 1175 dev_info_t *ddip; /* topmost dip in domain i.e. domain owner */
1174 1176 immu_t *immu;
1175 1177 domain_t *domain;
1176 1178 dvma_arg_t dvarg = {0};
1177 1179 int level;
1178 1180
1179 1181 *ddipp = NULL;
1180 1182
1181 1183 /*
1182 1184 * Check if the domain is already set. This is usually true
1183 1185 * if this is not the first DVMA transaction.
1184 1186 */
1185 1187 ddip = NULL;
1186 1188 domain = immu_devi_domain(rdip, &ddip);
1187 1189 if (domain) {
1188 1190 *ddipp = ddip;
1189 1191 return (domain);
1190 1192 }
1191 1193
1192 1194 immu = immu_dvma_get_immu(rdip, immu_flags);
1193 1195 if (immu == NULL) {
1194 1196 /*
1195 1197 * possible that there is no IOMMU unit for this device
1196 1198 * - BIOS bugs are one example.
1197 1199 */
1198 1200 ddi_err(DER_WARN, rdip, "No iommu unit found for device");
1199 1201 return (NULL);
1200 1202 }
1201 1203
1202 1204 immu_flags |= immu_devi_get(rdip)->imd_dvma_flags;
1203 1205
1204 1206 dvarg.dva_rdip = rdip;
1205 1207 dvarg.dva_ddip = NULL;
1206 1208 dvarg.dva_domain = NULL;
1207 1209 dvarg.dva_flags = immu_flags;
1208 1210 level = 0;
1209 1211 if (immu_walk_ancestor(rdip, NULL, get_branch_domain,
1210 1212 &dvarg, &level, immu_flags) != DDI_SUCCESS) {
1211 1213 /*
1212 1214 * maybe low memory. return error,
1213 1215 * so driver tries again later
1214 1216 */
1215 1217 return (NULL);
1216 1218 }
1217 1219
1218 1220 /* should have walked at least 1 dip (i.e. edip) */
1219 1221 ASSERT(level > 0);
1220 1222
1221 1223 ddip = dvarg.dva_ddip; /* must be present */
1222 1224 domain = dvarg.dva_domain; /* may be NULL */
1223 1225
1224 1226 /*
1225 1227 * We may find the domain during our ancestor walk on any one of our
1226 1228 * ancestor dips, If the domain is found then the domain-dip
1227 1229 * (i.e. ddip) will also be found in the same immu_devi struct.
1228 1230 * The domain-dip is the highest ancestor dip which shares the
1229 1231 * same domain with edip.
1230 1232 * The domain may or may not be found, but the domain dip must
1231 1233 * be found.
1232 1234 */
1233 1235 if (ddip == NULL) {
1234 1236 ddi_err(DER_MODE, rdip, "Cannot find domain dip for device.");
1235 1237 return (NULL);
1236 1238 }
1237 1239
1238 1240 /*
1239 1241 * Did we find a domain ?
1240 1242 */
1241 1243 if (domain) {
1242 1244 goto found;
1243 1245 }
1244 1246
1245 1247 /* nope, so allocate */
1246 1248 domain = domain_create(immu, ddip, rdip, immu_flags);
1247 1249 if (domain == NULL) {
1248 1250 return (NULL);
1249 1251 }
1250 1252
1251 1253 /*FALLTHROUGH*/
1252 1254 found:
1253 1255 /*
1254 1256 * We know *domain *is* the right domain, so panic if
1255 1257 * another domain is set for either the request-dip or
1256 1258 * effective dip.
1257 1259 */
1258 1260 set_domain(ddip, ddip, domain);
1259 1261 set_domain(rdip, ddip, domain);
1260 1262
1261 1263 *ddipp = ddip;
1262 1264 return (domain);
1263 1265 }
1264 1266
1265 1267 static void
1266 1268 create_unity_domain(immu_t *immu)
1267 1269 {
1268 1270 domain_t *domain;
1269 1271
1270 1272 /* domain created during boot and always use sleep flag */
1271 1273 domain = kmem_zalloc(sizeof (domain_t), KM_SLEEP);
1272 1274
1273 1275 rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
1274 1276
1275 1277 domain->dom_did = IMMU_UNITY_DID;
1276 1278 domain->dom_maptype = IMMU_MAPTYPE_UNITY;
1277 1279
1278 1280 domain->dom_immu = immu;
1279 1281 immu->immu_unity_domain = domain;
1280 1282
1281 1283 /*
1282 1284 * Setup the domain's initial page table
1283 1285 * should never fail.
1284 1286 */
1285 1287 domain->dom_pgtable_root = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1286 1288 pgtable_zero(domain->dom_pgtable_root);
1287 1289
1288 1290 /*
1289 1291 * Only map all physical memory in to the unity domain
1290 1292 * if passthrough is not supported. If it is supported,
1291 1293 * passthrough is set in the context entry instead.
1292 1294 */
1293 1295 if (!IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1294 1296 map_unity_domain(domain);
1295 1297
1296 1298
1297 1299 /*
1298 1300 * put it on the system-wide UNITY domain list
1299 1301 */
1300 1302 mutex_enter(&(immu_domain_lock));
1301 1303 list_insert_tail(&immu_unity_domain_list, domain);
1302 1304 mutex_exit(&(immu_domain_lock));
1303 1305 }
1304 1306
1305 1307 /*
1306 1308 * ddip is the domain-dip - the topmost dip in a domain
1307 1309 * rdip is the requesting-dip - the device which is
1308 1310 * requesting DVMA setup
1309 1311 * if domain is a non-shared domain rdip == ddip
1310 1312 */
1311 1313 static domain_t *
1312 1314 domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip,
1313 1315 immu_flags_t immu_flags)
1314 1316 {
1315 1317 int kmflags;
1316 1318 domain_t *domain;
1317 1319 char mod_hash_name[128];
1318 1320 immu_devi_t *immu_devi;
1319 1321 int did;
1320 1322 immu_dcookie_t dcookies[1] = {0};
1321 1323 int dcount = 0;
1322 1324
1323 1325 immu_devi = immu_devi_get(rdip);
1324 1326
1325 1327 /*
1326 1328 * First allocate a domainid.
1327 1329 * This routine will never fail, since if we run out
1328 1330 * of domains the unity domain will be allocated.
1329 1331 */
1330 1332 did = did_alloc(immu, rdip, ddip, immu_flags);
1331 1333 if (did == IMMU_UNITY_DID) {
1332 1334 /* domain overflow */
1333 1335 ASSERT(immu->immu_unity_domain);
1334 1336 return (immu->immu_unity_domain);
1335 1337 }
1336 1338
1337 1339 kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
1338 1340 domain = kmem_zalloc(sizeof (domain_t), kmflags);
1339 1341 if (domain == NULL) {
1340 1342 ddi_err(DER_PANIC, rdip, "Failed to alloc DVMA domain "
1341 1343 "structure for device. IOMMU unit: %s", immu->immu_name);
1342 1344 /*NOTREACHED*/
1343 1345 }
1344 1346
1345 1347 rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
1346 1348
1347 1349 (void) snprintf(mod_hash_name, sizeof (mod_hash_name),
1348 1350 "immu%s-domain%d-pava-hash", immu->immu_name, did);
1349 1351
1350 1352 domain->dom_did = did;
1351 1353 domain->dom_immu = immu;
1352 1354 domain->dom_maptype = IMMU_MAPTYPE_XLATE;
1353 1355 domain->dom_dip = ddip;
1354 1356
1355 1357 /*
1356 1358 * Create xlate DVMA arena for this domain.
1357 1359 */
1358 1360 create_xlate_arena(immu, domain, rdip, immu_flags);
1359 1361
1360 1362 /*
1361 1363 * Setup the domain's initial page table
1362 1364 */
1363 1365 domain->dom_pgtable_root = pgtable_alloc(immu, immu_flags);
1364 1366 if (domain->dom_pgtable_root == NULL) {
1365 1367 ddi_err(DER_PANIC, rdip, "Failed to alloc root "
1366 1368 "pgtable for domain (%d). IOMMU unit: %s",
1367 1369 domain->dom_did, immu->immu_name);
1368 1370 /*NOTREACHED*/
1369 1371 }
1370 1372 pgtable_zero(domain->dom_pgtable_root);
1371 1373
1372 1374 /*
1373 1375 * Since this is a immu unit-specific domain, put it on
1374 1376 * the per-immu domain list.
1375 1377 */
1376 1378 mutex_enter(&(immu->immu_lock));
1377 1379 list_insert_head(&immu->immu_domain_list, domain);
1378 1380 mutex_exit(&(immu->immu_lock));
1379 1381
1380 1382 /*
1381 1383 * Also put it on the system-wide xlate domain list
1382 1384 */
1383 1385 mutex_enter(&(immu_domain_lock));
1384 1386 list_insert_head(&immu_xlate_domain_list, domain);
1385 1387 mutex_exit(&(immu_domain_lock));
1386 1388
1387 1389 bdf_domain_insert(immu_devi, domain);
1388 1390
1389 1391 #ifdef BUGGY_DRIVERS
1390 1392 /*
1391 1393 * Map page0. Some broken HW/FW access it.
1392 1394 */
1393 1395 dcookies[0].dck_paddr = 0;
1394 1396 dcookies[0].dck_npages = 1;
1395 1397 dcount = 1;
1396 1398 (void) dvma_map(domain, 0, 1, dcookies, dcount, NULL,
1397 1399 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
1398 1400 #endif
1399 1401 return (domain);
1400 1402 }
1401 1403
1402 1404 /*
1403 1405 * Create domainid arena.
1404 1406 * Domainid 0 is reserved by Vt-d spec and cannot be used by
1405 1407 * system software.
1406 1408 * Domainid 1 is reserved by solaris and used for *all* of the following:
1407 1409 * as the "uninitialized" domain - For devices not yet controlled
1408 1410 * by Solaris
1409 1411 * as the "unity" domain - For devices that will always belong
1410 1412 * to the unity domain
1411 1413 * as the "overflow" domain - Used for any new device after we
1412 1414 * run out of domains
1413 1415 * All of the above domains map into a single domain with
1414 1416 * domainid 1 and UNITY DVMA mapping
1415 1417 * Each IMMU unity has its own unity/uninit/overflow domain
1416 1418 */
1417 1419 static void
1418 1420 did_init(immu_t *immu)
1419 1421 {
1420 1422 (void) snprintf(immu->immu_did_arena_name,
1421 1423 sizeof (immu->immu_did_arena_name),
1422 1424 "%s_domainid_arena", immu->immu_name);
1423 1425
1424 1426 ddi_err(DER_VERB, immu->immu_dip, "creating domainid arena %s",
1425 1427 immu->immu_did_arena_name);
1426 1428
1427 1429 immu->immu_did_arena = vmem_create(
1428 1430 immu->immu_did_arena_name,
1429 1431 (void *)(uintptr_t)(IMMU_UNITY_DID + 1), /* start addr */
1430 1432 immu->immu_max_domains - IMMU_UNITY_DID,
1431 1433 1, /* quantum */
1432 1434 NULL, /* afunc */
1433 1435 NULL, /* ffunc */
1434 1436 NULL, /* source */
1435 1437 0, /* qcache_max */
1436 1438 VM_SLEEP);
1437 1439
1438 1440 /* Even with SLEEP flag, vmem_create() can fail */
1439 1441 if (immu->immu_did_arena == NULL) {
1440 1442 ddi_err(DER_PANIC, NULL, "%s: Failed to create Intel "
1441 1443 "IOMMU domainid allocator: %s", immu->immu_name,
1442 1444 immu->immu_did_arena_name);
1443 1445 }
1444 1446 }
1445 1447
1446 1448 /* ######################### CONTEXT CODE ################################# */
1447 1449
1448 1450 static void
1449 1451 context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table,
1450 1452 int bus, int devfunc)
1451 1453 {
1452 1454 pgtable_t *context;
1453 1455 pgtable_t *pgtable_root;
1454 1456 hw_rce_t *hw_rent;
1455 1457 hw_rce_t *hw_cent;
1456 1458 hw_rce_t *ctxp;
1457 1459 int sid;
1458 1460 krw_t rwtype;
1459 1461 boolean_t fill_root;
1460 1462 boolean_t fill_ctx;
1461 1463
1462 1464 pgtable_root = domain->dom_pgtable_root;
1463 1465
1464 1466 ctxp = (hw_rce_t *)(root_table->swpg_next_array);
1465 1467 context = *(pgtable_t **)(ctxp + bus);
1466 1468 hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr) + bus;
1467 1469
1468 1470 fill_root = B_FALSE;
1469 1471 fill_ctx = B_FALSE;
1470 1472
1471 1473 /* Check the most common case first with reader lock */
1472 1474 rw_enter(&(immu->immu_ctx_rwlock), RW_READER);
1473 1475 rwtype = RW_READER;
1474 1476 again:
1475 1477 if (ROOT_GET_P(hw_rent)) {
1476 1478 hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
1477 1479 if (CONT_GET_AVAIL(hw_cent) == IMMU_CONT_INITED) {
1478 1480 rw_exit(&(immu->immu_ctx_rwlock));
1479 1481 return;
1480 1482 } else {
1481 1483 fill_ctx = B_TRUE;
1482 1484 }
1483 1485 } else {
1484 1486 fill_root = B_TRUE;
1485 1487 fill_ctx = B_TRUE;
1486 1488 }
1487 1489
1488 1490 if (rwtype == RW_READER &&
1489 1491 rw_tryupgrade(&(immu->immu_ctx_rwlock)) == 0) {
1490 1492 rw_exit(&(immu->immu_ctx_rwlock));
1491 1493 rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
1492 1494 rwtype = RW_WRITER;
1493 1495 goto again;
1494 1496 }
1495 1497 rwtype = RW_WRITER;
1496 1498
1497 1499 if (fill_root == B_TRUE) {
1498 1500 ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
1499 1501 ROOT_SET_P(hw_rent);
1500 1502 immu_regs_cpu_flush(immu, (caddr_t)hw_rent, sizeof (hw_rce_t));
1501 1503 }
1502 1504
1503 1505 if (fill_ctx == B_TRUE) {
1504 1506 hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
1505 1507 /* need to disable context entry before reprogramming it */
1506 1508 bzero(hw_cent, sizeof (hw_rce_t));
1507 1509
1508 1510 /* flush caches */
1509 1511 immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
1510 1512
1511 1513 sid = ((bus << 8) | devfunc);
1512 1514 immu_flush_context_fsi(immu, 0, sid, domain->dom_did,
1513 1515 &immu->immu_ctx_inv_wait);
1514 1516
1515 1517 CONT_SET_AVAIL(hw_cent, IMMU_CONT_INITED);
1516 1518 CONT_SET_DID(hw_cent, domain->dom_did);
1517 1519 CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
1518 1520 CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
1519 1521 if (domain->dom_did == IMMU_UNITY_DID &&
1520 1522 IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1521 1523 CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU);
1522 1524 else
1523 1525 /*LINTED*/
1524 1526 CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
1525 1527 CONT_SET_P(hw_cent);
1526 1528 if (IMMU_ECAP_GET_CH(immu->immu_regs_excap)) {
1527 1529 CONT_SET_EH(hw_cent);
1528 1530 if (immu_use_alh)
1529 1531 CONT_SET_ALH(hw_cent);
1530 1532 }
1531 1533 immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
1532 1534 }
1533 1535 rw_exit(&(immu->immu_ctx_rwlock));
1534 1536 }
1535 1537
1536 1538 static pgtable_t *
1537 1539 context_create(immu_t *immu)
1538 1540 {
1539 1541 int bus;
1540 1542 int devfunc;
1541 1543 pgtable_t *root_table;
1542 1544 pgtable_t *context;
1543 1545 pgtable_t *pgtable_root;
1544 1546 hw_rce_t *ctxp;
1545 1547 hw_rce_t *hw_rent;
1546 1548 hw_rce_t *hw_cent;
1547 1549
1548 1550 /* Allocate a zeroed root table (4K 256b entries) */
1549 1551 root_table = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1550 1552 pgtable_zero(root_table);
1551 1553
1552 1554 /*
1553 1555 * Setup context tables for all possible root table entries.
1554 1556 * Start out with unity domains for all entries.
1555 1557 */
1556 1558 ctxp = (hw_rce_t *)(root_table->swpg_next_array);
1557 1559 hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr);
1558 1560 for (bus = 0; bus < IMMU_ROOT_NUM; bus++, ctxp++, hw_rent++) {
1559 1561 context = pgtable_alloc(immu, IMMU_FLAGS_SLEEP);
1560 1562 pgtable_zero(context);
1561 1563 ROOT_SET_P(hw_rent);
1562 1564 ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
1563 1565 hw_cent = (hw_rce_t *)(context->hwpg_vaddr);
1564 1566 for (devfunc = 0; devfunc < IMMU_CONT_NUM;
1565 1567 devfunc++, hw_cent++) {
1566 1568 pgtable_root =
1567 1569 immu->immu_unity_domain->dom_pgtable_root;
1568 1570 CONT_SET_DID(hw_cent,
1569 1571 immu->immu_unity_domain->dom_did);
1570 1572 CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
1571 1573 CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
1572 1574 if (IMMU_ECAP_GET_PT(immu->immu_regs_excap))
1573 1575 CONT_SET_TTYPE(hw_cent, TTYPE_PASSTHRU);
1574 1576 else
1575 1577 /*LINTED*/
1576 1578 CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
1577 1579 CONT_SET_AVAIL(hw_cent, IMMU_CONT_UNINITED);
1578 1580 CONT_SET_P(hw_cent);
1579 1581 }
1580 1582 immu_regs_cpu_flush(immu, context->hwpg_vaddr, IMMU_PAGESIZE);
1581 1583 *((pgtable_t **)ctxp) = context;
1582 1584 }
1583 1585
1584 1586 return (root_table);
1585 1587 }
1586 1588
1587 1589 /*
1588 1590 * Called during rootnex attach, so no locks needed
1589 1591 */
1590 1592 static void
1591 1593 context_init(immu_t *immu)
1592 1594 {
1593 1595 rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL);
1594 1596
1595 1597 immu_init_inv_wait(&immu->immu_ctx_inv_wait, "ctxglobal", B_TRUE);
1596 1598
1597 1599 immu_regs_wbf_flush(immu);
1598 1600
1599 1601 immu->immu_ctx_root = context_create(immu);
1600 1602
1601 1603 immu_regs_set_root_table(immu);
1602 1604
1603 1605 rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
1604 1606 immu_flush_context_gbl(immu, &immu->immu_ctx_inv_wait);
1605 1607 immu_flush_iotlb_gbl(immu, &immu->immu_ctx_inv_wait);
1606 1608 rw_exit(&(immu->immu_ctx_rwlock));
1607 1609 }
1608 1610
1609 1611
1610 1612 /*
1611 1613 * Find top pcib
1612 1614 */
1613 1615 static int
1614 1616 find_top_pcib(dev_info_t *dip, void *arg)
1615 1617 {
1616 1618 immu_devi_t *immu_devi;
1617 1619 dev_info_t **pcibdipp = (dev_info_t **)arg;
1618 1620
1619 1621 immu_devi = immu_devi_get(dip);
1620 1622
1621 1623 if (immu_devi->imd_pcib_type == IMMU_PCIB_PCI_PCI) {
1622 1624 *pcibdipp = dip;
1623 1625 }
1624 1626
1625 1627 return (DDI_WALK_CONTINUE);
1626 1628 }
1627 1629
1628 1630 static int
1629 1631 immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip,
1630 1632 dev_info_t *rdip, immu_flags_t immu_flags)
1631 1633 {
1632 1634 immu_devi_t *r_immu_devi;
1633 1635 immu_devi_t *d_immu_devi;
1634 1636 int r_bus;
1635 1637 int d_bus;
1636 1638 int r_devfunc;
1637 1639 int d_devfunc;
1638 1640 immu_pcib_t d_pcib_type;
1639 1641 dev_info_t *pcibdip;
1640 1642
1641 1643 if (ddip == NULL || rdip == NULL ||
1642 1644 ddip == root_devinfo || rdip == root_devinfo) {
1643 1645 ddi_err(DER_MODE, rdip, "immu_contexts_update: domain-dip or "
1644 1646 "request-dip are NULL or are root devinfo");
1645 1647 return (DDI_FAILURE);
1646 1648 }
1647 1649
1648 1650 /*
1649 1651 * We need to set the context fields
1650 1652 * based on what type of device rdip and ddip are.
1651 1653 * To do that we need the immu_devi field.
1652 1654 * Set the immu_devi field (if not already set)
1653 1655 */
1654 1656 if (immu_devi_set(ddip, immu_flags) == DDI_FAILURE) {
1655 1657 ddi_err(DER_MODE, rdip,
1656 1658 "immu_context_update: failed to set immu_devi for ddip");
1657 1659 return (DDI_FAILURE);
1658 1660 }
1659 1661
1660 1662 if (immu_devi_set(rdip, immu_flags) == DDI_FAILURE) {
1661 1663 ddi_err(DER_MODE, rdip,
1662 1664 "immu_context_update: failed to set immu_devi for rdip");
1663 1665 return (DDI_FAILURE);
1664 1666 }
1665 1667
1666 1668 d_immu_devi = immu_devi_get(ddip);
1667 1669 r_immu_devi = immu_devi_get(rdip);
1668 1670
1669 1671 d_bus = d_immu_devi->imd_bus;
1670 1672 d_devfunc = d_immu_devi->imd_devfunc;
1671 1673 d_pcib_type = d_immu_devi->imd_pcib_type;
1672 1674 r_bus = r_immu_devi->imd_bus;
1673 1675 r_devfunc = r_immu_devi->imd_devfunc;
1674 1676
1675 1677 if (rdip == ddip) {
1676 1678 /* rdip is a PCIE device. set context for it only */
1677 1679 context_set(immu, domain, immu->immu_ctx_root, r_bus,
1678 1680 r_devfunc);
1679 1681 #ifdef BUGGY_DRIVERS
1680 1682 } else if (r_immu_devi == d_immu_devi) {
1681 1683 #ifdef TEST
1682 1684 ddi_err(DER_WARN, rdip, "Driver bug: Devices 0x%lx and "
1683 1685 "0x%lx are identical", rdip, ddip);
1684 1686 #endif
1685 1687 /* rdip is a PCIE device. set context for it only */
1686 1688 context_set(immu, domain, immu->immu_ctx_root, r_bus,
1687 1689 r_devfunc);
1688 1690 #endif
1689 1691 } else if (d_pcib_type == IMMU_PCIB_PCIE_PCI) {
1690 1692 /*
1691 1693 * ddip is a PCIE_PCI bridge. Set context for ddip's
1692 1694 * secondary bus. If rdip is on ddip's secondary
1693 1695 * bus, set context for rdip. Else, set context
1694 1696 * for rdip's PCI bridge on ddip's secondary bus.
1695 1697 */
1696 1698 context_set(immu, domain, immu->immu_ctx_root,
1697 1699 d_immu_devi->imd_sec, 0);
1698 1700 if (d_immu_devi->imd_sec == r_bus) {
1699 1701 context_set(immu, domain, immu->immu_ctx_root,
1700 1702 r_bus, r_devfunc);
1701 1703 } else {
1702 1704 pcibdip = NULL;
1703 1705 if (immu_walk_ancestor(rdip, ddip, find_top_pcib,
1704 1706 &pcibdip, NULL, immu_flags) == DDI_SUCCESS &&
1705 1707 pcibdip != NULL) {
1706 1708 r_immu_devi = immu_devi_get(pcibdip);
1707 1709 r_bus = r_immu_devi->imd_bus;
1708 1710 r_devfunc = r_immu_devi->imd_devfunc;
1709 1711 context_set(immu, domain, immu->immu_ctx_root,
1710 1712 r_bus, r_devfunc);
1711 1713 } else {
1712 1714 ddi_err(DER_PANIC, rdip, "Failed to find PCI "
1713 1715 " bridge for PCI device");
1714 1716 /*NOTREACHED*/
1715 1717 }
1716 1718 }
1717 1719 } else if (d_pcib_type == IMMU_PCIB_PCI_PCI) {
1718 1720 context_set(immu, domain, immu->immu_ctx_root, d_bus,
1719 1721 d_devfunc);
1720 1722 } else if (d_pcib_type == IMMU_PCIB_ENDPOINT) {
1721 1723 /*
1722 1724 * ddip is a PCIE device which has a non-PCI device under it
1723 1725 * i.e. it is a PCI-nonPCI bridge. Example: pciicde-ata
1724 1726 */
1725 1727 context_set(immu, domain, immu->immu_ctx_root, d_bus,
1726 1728 d_devfunc);
1727 1729 } else {
1728 1730 ddi_err(DER_PANIC, rdip, "unknown device type. Cannot "
1729 1731 "set iommu context.");
1730 1732 /*NOTREACHED*/
1731 1733 }
1732 1734
1733 1735 /* XXX do we need a membar_producer() here */
1734 1736 return (DDI_SUCCESS);
1735 1737 }
1736 1738
1737 1739 /* ##################### END CONTEXT CODE ################################## */
1738 1740 /* ##################### MAPPING CODE ################################## */
1739 1741
1740 1742
#ifdef DEBUG
/*
 * PDTE_check()
 *	DEBUG-only sanity check of a page directory/table entry.
 *
 *	pdte		entry to validate
 *	next		next-level page table if the entry is a PDE,
 *			NULL if it is a leaf PTE
 *	paddr		physical address a leaf PTE must map
 *	rdip		dip used for diagnostics
 *	immu_flags	IMMU_FLAGS_READ / IMMU_FLAGS_WRITE permissions the
 *			entry is required to grant
 *
 *	Returns B_TRUE if the entry is consistent; otherwise logs the
 *	first problem found and returns B_FALSE.
 */
static boolean_t
PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr,
    dev_info_t *rdip, immu_flags_t immu_flags)
{
	boolean_t is_leaf = (next == NULL) ? B_TRUE : B_FALSE;

	/* the entry has to be present */
	if (!PDTE_P(pdte)) {
		ddi_err(DER_MODE, rdip, "No present flag");
		return (B_FALSE);
	}

	/*
	 * The most significant software field (PDTE_SW4) is the same as
	 * the present bit, which was just checked, so an assert suffices.
	 */
	ASSERT(PDTE_SW4(pdte));

	/*
	 * TM must be clear unless it is reserved; it is always reserved
	 * in a non-leaf entry.
	 */
	if (is_leaf && immu->immu_TM_reserved == B_FALSE && PDTE_TM(pdte)) {
		ddi_err(DER_MODE, rdip, "TM flag set");
		return (B_FALSE);
	}

	/* SW3 is unused and must be clear */
	if (PDTE_SW3(pdte)) {
		ddi_err(DER_MODE, rdip, "SW3 set");
		return (B_FALSE);
	}

	/*
	 * The address field must hold the PFN (PTE) or the paddr of the
	 * next level page table (PDE).
	 */
	if (is_leaf) {
		ASSERT(paddr % IMMU_PAGESIZE == 0);
		if (PDTE_PADDR(pdte) != paddr) {
			ddi_err(DER_MODE, rdip,
			    "PTE paddr mismatch: %lx != %lx",
			    PDTE_PADDR(pdte), paddr);
			return (B_FALSE);
		}
	} else if (PDTE_PADDR(pdte) != next->hwpg_paddr) {
		ddi_err(DER_MODE, rdip,
		    "PDE paddr mismatch: %lx != %lx",
		    PDTE_PADDR(pdte), next->hwpg_paddr);
		return (B_FALSE);
	}

	/*
	 * SNP must be clear unless it is reserved; it is always reserved
	 * in a non-leaf entry.
	 */
	if (is_leaf && immu->immu_SNP_reserved == B_FALSE && PDTE_SNP(pdte)) {
		ddi_err(DER_MODE, rdip, "SNP set");
		return (B_FALSE);
	}

	/* the second software-available field must be clear */
	if (PDTE_SW2(pdte)) {
		ddi_err(DER_MODE, rdip, "SW2 set");
		return (B_FALSE);
	}

	/* the super page field must be clear */
	if (PDTE_SP(pdte)) {
		ddi_err(DER_MODE, rdip, "SP set");
		return (B_FALSE);
	}

	/* the least significant software-available field must be clear */
	if (PDTE_SW1(pdte)) {
		ddi_err(DER_MODE, rdip, "SW1 set");
		return (B_FALSE);
	}

	/* requested permissions must be granted by the entry */
	if ((immu_flags & IMMU_FLAGS_READ) && !PDTE_READ(pdte)) {
		ddi_err(DER_MODE, rdip, "READ not set");
		return (B_FALSE);
	}

	if ((immu_flags & IMMU_FLAGS_WRITE) && !PDTE_WRITE(pdte)) {
		ddi_err(DER_MODE, rdip, "WRITE not set");
		return (B_FALSE);
	}

	return (B_TRUE);
}
#endif
1843 1845
1844 1846 /*ARGSUSED*/
1845 1847 static void
1846 1848 PTE_clear_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
1847 1849 uint64_t *dvma_ptr, uint64_t *npages_ptr, dev_info_t *rdip)
1848 1850 {
1849 1851 uint64_t npages;
1850 1852 uint64_t dvma;
1851 1853 pgtable_t *pgtable;
1852 1854 hw_pdte_t *hwp;
1853 1855 hw_pdte_t *shwp;
1854 1856 int idx;
1855 1857
1856 1858 pgtable = xlate->xlt_pgtable;
1857 1859 idx = xlate->xlt_idx;
1858 1860
1859 1861 dvma = *dvma_ptr;
1860 1862 npages = *npages_ptr;
1861 1863
1862 1864 /*
1863 1865 * since a caller gets a unique dvma for a physical address,
1864 1866 * no other concurrent thread will be writing to the same
1865 1867 * PTE even if it has the same paddr. So no locks needed.
1866 1868 */
1867 1869 shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
1868 1870
1869 1871 hwp = shwp;
1870 1872 for (; npages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {
1871 1873 PDTE_CLEAR_P(*hwp);
1872 1874 dvma += IMMU_PAGESIZE;
1873 1875 npages--;
1874 1876 }
1875 1877
1876 1878 *dvma_ptr = dvma;
1877 1879 *npages_ptr = npages;
1878 1880
1879 1881 xlate->xlt_idx = idx;
1880 1882 }
1881 1883
1882 1884 static void
1883 1885 xlate_setup(uint64_t dvma, xlate_t *xlate, int nlevels)
1884 1886 {
1885 1887 int level;
1886 1888 uint64_t offbits;
1887 1889
1888 1890 /*
1889 1891 * Skip the first 12 bits which is the offset into
1890 1892 * 4K PFN (phys page frame based on IMMU_PAGESIZE)
1891 1893 */
1892 1894 offbits = dvma >> IMMU_PAGESHIFT;
1893 1895
1894 1896 /* skip to level 1 i.e. leaf PTE */
1895 1897 for (level = 1, xlate++; level <= nlevels; level++, xlate++) {
1896 1898 xlate->xlt_level = level;
1897 1899 xlate->xlt_idx = (offbits & IMMU_PGTABLE_LEVEL_MASK);
1898 1900 ASSERT(xlate->xlt_idx <= IMMU_PGTABLE_MAXIDX);
1899 1901 xlate->xlt_pgtable = NULL;
1900 1902 offbits >>= IMMU_PGTABLE_LEVEL_STRIDE;
1901 1903 }
1902 1904 }
1903 1905
1904 1906 /*
1905 1907 * Read the pgtables
1906 1908 */
1907 1909 static boolean_t
1908 1910 PDE_lookup(domain_t *domain, xlate_t *xlate, int nlevels)
1909 1911 {
1910 1912 pgtable_t *pgtable;
1911 1913 pgtable_t *next;
1912 1914 uint_t idx;
1913 1915
1914 1916 /* start with highest level pgtable i.e. root */
1915 1917 xlate += nlevels;
1916 1918
1917 1919 if (xlate->xlt_pgtable == NULL) {
1918 1920 xlate->xlt_pgtable = domain->dom_pgtable_root;
1919 1921 }
1920 1922
1921 1923 for (; xlate->xlt_level > 1; xlate--) {
1922 1924 idx = xlate->xlt_idx;
1923 1925 pgtable = xlate->xlt_pgtable;
1924 1926
1925 1927 if ((xlate - 1)->xlt_pgtable) {
1926 1928 continue;
1927 1929 }
1928 1930
1929 1931 /* Lock the pgtable in read mode */
1930 1932 rw_enter(&(pgtable->swpg_rwlock), RW_READER);
1931 1933
1932 1934 /*
1933 1935 * since we are unmapping, the pgtable should
1934 1936 * already point to a leafier pgtable.
1935 1937 */
1936 1938 next = *(pgtable->swpg_next_array + idx);
1937 1939 (xlate - 1)->xlt_pgtable = next;
1938 1940 rw_exit(&(pgtable->swpg_rwlock));
1939 1941 if (next == NULL)
1940 1942 return (B_FALSE);
1941 1943 }
1942 1944
1943 1945 return (B_TRUE);
1944 1946 }
1945 1947
/*
 * immu_fault_walk: callback handed to vmem_walk() by
 * immu_print_fault_info().
 *
 * arg:  points at a uint64_t holding the faulting DVMA
 * base: start of the segment being visited
 * len:  length of that segment
 *
 * If the DVMA lies in [base, base + len) a warning is logged and *arg is
 * overwritten with ~0ULL, which the caller tests to learn that a match
 * was found.
 */
1946 1948 static void
1947 1949 immu_fault_walk(void *arg, void *base, size_t len)
1948 1950 {
1949 1951 uint64_t dvma, start;
1950 1952
1951 1953 dvma = *(uint64_t *)arg;
1952 1954 start = (uint64_t)(uintptr_t)base;
1953 1955
1954 1956 if (dvma >= start && dvma < (start + len)) {
1955 1957 ddi_err(DER_WARN, NULL,
1956 1958 "faulting DVMA address is in vmem arena "
1957 1959 "(%" PRIx64 "-%" PRIx64 ")",
1958 1960 start, start + len);
/* mark "found" for the caller */
1959 1961 *(uint64_t *)arg = ~0ULL;
1960 1962 }
1961 1963 }
1962 1964
/*
 * immu_print_fault_info: log diagnostic information for a fault.
 *
 * sid:  source id, used as the key into bdf_domain_hash
 * dvma: faulting DVMA address
 *
 * Reports, via ddi_err(DER_WARN, ...):
 *  - when no domain is registered for the sid (and returns),
 *  - whether the DVMA falls inside an allocated segment of the domain's
 *    vmem arena,
 *  - when the page-table walk finds no leaf pgtable (and returns),
 *  - otherwise the leaf PTE value and the paddr it carries.
 */
1963 1965 void
1964 1966 immu_print_fault_info(uint_t sid, uint64_t dvma)
1965 1967 {
1966 1968 int nlevels;
1967 1969 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
1968 1970 xlate_t *xlatep;
1969 1971 hw_pdte_t pte;
1970 1972 domain_t *domain;
1971 1973 immu_t *immu;
1972 1974 uint64_t dvma_arg;
1973 1975
1974 1976 if (mod_hash_find(bdf_domain_hash,
1975 1977 (void *)(uintptr_t)sid, (void *)&domain) != 0) {
1976 1978 ddi_err(DER_WARN, NULL,
1977 1979 "no domain for faulting SID %08x", sid);
1978 1980 return;
1979 1981 }
1980 1982
1981 1983 immu = domain->dom_immu;
1982 1984
/*
 * Walk a copy of the address: immu_fault_walk() replaces it with ~0ULL
 * when a containing segment is found, so `dvma' itself stays intact.
 */
1983 1985 dvma_arg = dvma;
1984 1986 vmem_walk(domain->dom_dvma_arena, VMEM_ALLOC, immu_fault_walk,
1985 1987 (void *)&dvma_arg);
1986 1988 if (dvma_arg != ~0ULL)
1987 1989 ddi_err(DER_WARN, domain->dom_dip,
1988 1990 "faulting DVMA address is not in vmem arena");
1989 1991
1990 1992 nlevels = immu->immu_dvma_nlevels;
1991 1993 xlate_setup(dvma, xlate, nlevels);
1992 1994
1993 1995 if (!PDE_lookup(domain, xlate, nlevels)) {
1994 1996 ddi_err(DER_WARN, domain->dom_dip,
1995 1997 "pte not found in domid %d for faulting addr %" PRIx64,
1996 1998 domain->dom_did, dvma);
1997 1999 return;
1998 2000 }
1999 2001
/* xlate[1] is the leaf level: index its hardware table to fetch the PTE */
2000 2002 xlatep = &xlate[1];
2001 2003 pte = *((hw_pdte_t *)
2002 2004 (xlatep->xlt_pgtable->hwpg_vaddr) + xlatep->xlt_idx);
2003 2005
/*
 * NOTE(review): the PRIx64 conversions below are paired with
 * (unsigned long long) casts -- confirm these agree on every build target.
 */
2004 2006 ddi_err(DER_WARN, domain->dom_dip,
2005 2007 "domid %d pte: %" PRIx64 "(paddr %" PRIx64 ")", domain->dom_did,
2006 2008 (unsigned long long)pte, (unsigned long long)PDTE_PADDR(pte));
2007 2009 }
2008 2010
/*
 * PTE_set_one: compose one leaf PTE for `paddr' and store it at *hwp.
 *
 * Non-DEBUG build: the PTE starts from immu->immu_ptemask and has the
 * paddr set; whatever was in *hwp is not consulted.
 *
 * DEBUG build: the PTE starts from the current *hwp. If it is already
 * marked present, a paddr mismatch is reported and the field setup is
 * skipped (with BUGGY_DRIVERS the function returns without storing).
 * Otherwise the individual fields are cleared/set, the software
 * "present" bit is set and immu_ptemask is OR-ed in.
 *
 * In both builds the read/write permission bits are then applied:
 * unconditionally with BUGGY_DRIVERS, otherwise according to immu_flags.
 */
2009 2011 /*ARGSUSED*/
2010 2012 static void
2011 2013 PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr,
2012 2014 dev_info_t *rdip, immu_flags_t immu_flags)
2013 2015 {
2014 2016 hw_pdte_t pte;
2015 2017
2016 2018 #ifndef DEBUG
2017 2019 pte = immu->immu_ptemask;
2018 2020 PDTE_SET_PADDR(pte, paddr);
2019 2021 #else
2020 2022 pte = *hwp;
2021 2023
2022 2024 if (PDTE_P(pte)) {
2023 2025 if (PDTE_PADDR(pte) != paddr) {
2024 2026 ddi_err(DER_MODE, rdip, "PTE paddr %lx != paddr %lx",
2025 2027 PDTE_PADDR(pte), paddr);
2026 2028 }
2027 2029 #ifdef BUGGY_DRIVERS
2028 2030 return;
2029 2031 #else
2030 2032 goto out;
2031 2033 #endif
2032 2034 }
2033 2035
2034 2036 /* clear TM field if not reserved */
2035 2037 if (immu->immu_TM_reserved == B_FALSE) {
2036 2038 PDTE_CLEAR_TM(pte);
2037 2039 }
2038 2040
2039 2041 /* Clear 3rd field for system software - not used */
2040 2042 PDTE_CLEAR_SW3(pte);
2041 2043
2042 2044 /* Set paddr */
2043 2045 ASSERT(paddr % IMMU_PAGESIZE == 0);
2044 2046 PDTE_CLEAR_PADDR(pte);
2045 2047 PDTE_SET_PADDR(pte, paddr);
2046 2048
2047 2049 /* clear SNP field if not reserved. */
2048 2050 if (immu->immu_SNP_reserved == B_FALSE) {
2049 2051 PDTE_CLEAR_SNP(pte);
2050 2052 }
2051 2053
2052 2054 /* Clear SW2 field available for software */
2053 2055 PDTE_CLEAR_SW2(pte);
2054 2056
2055 2057
2056 2058 /* SP is don't care for PTEs. Clear it for cleanliness */
2057 2059 PDTE_CLEAR_SP(pte);
2058 2060
2059 2061 /* Clear SW1 field available for software */
2060 2062 PDTE_CLEAR_SW1(pte);
2061 2063
2062 2064 /*
2063 2065 * Now that we are done writing the PTE
2064 2066 * set the "present" flag. Note this present
2065 2067 * flag is a bit in the PDE/PTE that the
2066 2068 * spec says is available for system software.
2067 2069 * This is an implementation detail of Solaris
2068 2070 * bare-metal Intel IOMMU.
2069 2071 * The present field in a PDE/PTE is not defined
2070 2072 * by the Vt-d spec
2071 2073 */
2072 2074
2073 2075 PDTE_SET_P(pte);
2074 2076
2075 2077 pte |= immu->immu_ptemask;
2076 2078
2077 2079 out:
2078 2080 #endif /* DEBUG */
2079 2081 #ifdef BUGGY_DRIVERS
2080 2082 PDTE_SET_READ(pte);
2081 2083 PDTE_SET_WRITE(pte);
2082 2084 #else
2083 2085 if (immu_flags & IMMU_FLAGS_READ)
2084 2086 PDTE_SET_READ(pte);
2085 2087 if (immu_flags & IMMU_FLAGS_WRITE)
2086 2088 PDTE_SET_WRITE(pte);
2087 2089 #endif /* BUGGY_DRIVERS */
2088 2090
/* single store of the fully composed entry */
2089 2091 *hwp = pte;
2090 2092 }
2091 2093
/*
 * PTE_set_all: fill PTEs in a single leaf pgtable.
 *
 * xlate:       leaf-level entry; xlt_pgtable is the leaf table and
 *              xlt_idx the first slot to write. xlt_idx is updated to
 *              the slot following the last one written.
 * dvma_ptr:    in: DVMA of the first page; out: DVMA of the next
 *              unmapped page, or 0 when nothing remains.
 * nvpages_ptr: in: pages still to map; out: pages left over after this
 *              leaf table, or 0 when nothing remains.
 * dcookies,
 * dcount:      array of physical ranges (dck_paddr, dck_npages) and its
 *              length. *nvpages_ptr is interpreted as counted from the
 *              END of this array, which is how the starting cookie and
 *              the offset into it are found.
 *
 * The loop stops when all pages are written, when the leaf table is
 * full (idx > IMMU_PGTABLE_MAXIDX) or when the cookies are exhausted.
 */
2092 2094 /*ARGSUSED*/
2093 2095 static void
2094 2096 PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
2095 2097 uint64_t *dvma_ptr, uint64_t *nvpages_ptr, immu_dcookie_t *dcookies,
2096 2098 int dcount, dev_info_t *rdip, immu_flags_t immu_flags)
2097 2099 {
2098 2100 paddr_t paddr;
2099 2101 uint64_t nvpages;
2100 2102 uint64_t nppages;
2101 2103 uint64_t dvma;
2102 2104 pgtable_t *pgtable;
2103 2105 hw_pdte_t *hwp;
2104 2106 hw_pdte_t *shwp;
2105 2107 int idx, nset;
2106 2108 int j;
2107 2109
2108 2110 pgtable = xlate->xlt_pgtable;
2109 2111 idx = xlate->xlt_idx;
2110 2112
2111 2113 dvma = *dvma_ptr;
2112 2114 nvpages = *nvpages_ptr;
2113 2115
2114 2116 /*
2115 2117 * since a caller gets a unique dvma for a physical address,
2116 2118 * no other concurrent thread will be writing to the same
2117 2119 * PTE even if it has the same paddr. So no locks needed.
↓ open down ↓ |
2046 lines elided |
↑ open up ↑ |
2118 2120 */
2119 2121 shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
2120 2122
2121 2123 hwp = shwp;
/*
 * Scan the cookies from the last one backwards, discounting whole
 * cookies, until the remaining page count fits inside cookie j.
 */
2122 2124 for (j = dcount - 1; j >= 0; j--) {
2123 2125 if (nvpages <= dcookies[j].dck_npages)
2124 2126 break;
2125 2127 nvpages -= dcookies[j].dck_npages;
2126 2128 }
2127 2129
/* the scan must have stopped on a valid cookie before dcookies[j] is read */
2130 + VERIFY(j >= 0);
/* nppages: pages left in cookie j; paddr: where those pages start */
2128 2131 nppages = nvpages;
2129 2132 paddr = dcookies[j].dck_paddr +
2130 2133 (dcookies[j].dck_npages - nppages) * IMMU_PAGESIZE;
2131 2134
2132 2135 nvpages = *nvpages_ptr;
2133 2136 nset = 0;
2134 2137 for (; nvpages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {
2135 2138 PTE_set_one(immu, hwp, paddr, rdip, immu_flags);
2136 2139 nset++;
2137 2140
2138 2141 ASSERT(PDTE_check(immu, *hwp, NULL, paddr, rdip, immu_flags)
2139 2142 == B_TRUE);
2140 2143 nppages--;
2141 2144 nvpages--;
2142 2145 paddr += IMMU_PAGESIZE;
2143 2146 dvma += IMMU_PAGESIZE;
2144 2147
/* current cookie used up: advance, and stop if that was the last one */
2145 2148 if (nppages == 0) {
2146 2149 j++;
2147 2150 }
2148 2151
2149 2152 if (j == dcount)
2150 2153 break;
2151 2154
2152 2155 if (nppages == 0) {
2153 2156 nppages = dcookies[j].dck_npages;
2154 2157 paddr = dcookies[j].dck_paddr;
2155 2158 }
2156 2159 }
2157 2160
/* report progress back to the caller; zero both when finished */
2158 2161 if (nvpages) {
2159 2162 *dvma_ptr = dvma;
2160 2163 *nvpages_ptr = nvpages;
2161 2164 } else {
2162 2165 *dvma_ptr = 0;
2163 2166 *nvpages_ptr = 0;
2164 2167 }
2165 2168
2166 2169 xlate->xlt_idx = idx;
2167 2170 }
2168 2171
/*
 * PDE_set_one: compose one page-directory entry that points at the
 * pgtable `next' (via next->hwpg_paddr) and store it at *hwp.
 *
 * If the entry is already marked present, its paddr is asserted to match
 * and the field setup is skipped; with BUGGY_DRIVERS the function then
 * returns without storing. Otherwise the software fields and SP are
 * cleared and the paddr is set. Finally the read/write permission bits
 * (unconditional with BUGGY_DRIVERS, else per immu_flags) and the
 * software "present" bit are set and the entry is written back.
 */
2169 2172 /*ARGSUSED*/
2170 2173 static void
2171 2174 PDE_set_one(immu_t *immu, hw_pdte_t *hwp, pgtable_t *next,
2172 2175 dev_info_t *rdip, immu_flags_t immu_flags)
2173 2176 {
2174 2177 hw_pdte_t pde;
2175 2178
2176 2179 pde = *hwp;
2177 2180
2178 2181 /* if PDE is already set, make sure it is correct */
2179 2182 if (PDTE_P(pde)) {
2180 2183 ASSERT(PDTE_PADDR(pde) == next->hwpg_paddr);
2181 2184 #ifdef BUGGY_DRIVERS
2182 2185 return;
2183 2186 #else
2184 2187 goto out;
2185 2188 #endif
2186 2189 }
2187 2190
2188 2191 /* Dont touch SW4, it is the present bit */
2189 2192
2190 2193 /* don't touch TM field it is reserved for PDEs */
2191 2194
2192 2195 /* 3rd field available for system software is not used */
2193 2196 PDTE_CLEAR_SW3(pde);
2194 2197
2195 2198 /* Set next level pgtable-paddr for PDE */
2196 2199 PDTE_CLEAR_PADDR(pde);
2197 2200 PDTE_SET_PADDR(pde, next->hwpg_paddr);
2198 2201
2199 2202 /* don't touch SNP field it is reserved for PDEs */
2200 2203
2201 2204 /* Clear second field available for system software */
2202 2205 PDTE_CLEAR_SW2(pde);
2203 2206
2204 2207 /* No super pages for PDEs */
2205 2208 PDTE_CLEAR_SP(pde);
2206 2209
2207 2210 /* Clear SW1 for software */
2208 2211 PDTE_CLEAR_SW1(pde);
2209 2212
2210 2213 /*
2211 2214 * Now that we are done writing the PDE
2212 2215 * set the "present" flag. Note this present
2213 2216 * flag is a bit in the PDE/PTE that the
2214 2217 * spec says is available for system software.
2215 2218 * This is an implementation detail of Solaris
2216 2219 * bare-metal Intel IOMMU.
2217 2220 * The present field in a PDE/PTE is not defined
2218 2221 * by the Vt-d spec
2219 2222 */
2220 2223
2221 2224 out:
2222 2225 #ifdef BUGGY_DRIVERS
2223 2226 PDTE_SET_READ(pde);
2224 2227 PDTE_SET_WRITE(pde);
2225 2228 #else
2226 2229 if (immu_flags & IMMU_FLAGS_READ)
2227 2230 PDTE_SET_READ(pde);
2228 2231 if (immu_flags & IMMU_FLAGS_WRITE)
2229 2232 PDTE_SET_WRITE(pde);
2230 2233 #endif
2231 2234
/* the present bit is set on both the fresh and the already-present path */
2232 2235 PDTE_SET_P(pde);
2233 2236
2234 2237 *hwp = pde;
2235 2238 }
2236 2239
2237 2240 /*
2238 2241 * Used to set PDEs
2239 2242 */
/*
 * PDE_set_all: walk from the domain's root pgtable (level `nlevels') down
 * to level 2, making sure every level has a child pgtable for the index
 * recorded in xlate[], and record each child in (xlate - 1)->xlt_pgtable.
 *
 * A missing child is supplied by pgtable_alloc() (allocation failure is
 * reported through ddi_err(DER_PANIC, ...)), installed under the parent's
 * rwlock held as writer, and its PDE written with PDE_set_one().
 * When the child already exists and BUGGY_DRIVERS is not defined, the
 * requested read/write permissions are OR-ed into the existing PDE.
 *
 * Returns B_TRUE when at least one new PDE was installed, else B_FALSE.
 */
2240 2243 static boolean_t
2241 2244 PDE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
2242 2245 dev_info_t *rdip, immu_flags_t immu_flags)
2243 2246 {
2244 2247 pgtable_t *pgtable;
2245 2248 pgtable_t *new;
2246 2249 pgtable_t *next;
2247 2250 hw_pdte_t *hwp;
2248 2251 int level;
2249 2252 uint_t idx;
2250 2253 krw_t rwtype;
2251 2254 boolean_t set = B_FALSE;
2252 2255
2253 2256 /* start with highest level pgtable i.e. root */
2254 2257 xlate += nlevels;
2255 2258
2256 2259 new = NULL;
2257 2260 xlate->xlt_pgtable = domain->dom_pgtable_root;
2258 2261 for (level = nlevels; level > 1; level--, xlate--) {
2259 2262 idx = xlate->xlt_idx;
2260 2263 pgtable = xlate->xlt_pgtable;
2261 2264
2262 2265 /* Lock the pgtable in READ mode first */
2263 2266 rw_enter(&(pgtable->swpg_rwlock), RW_READER);
2264 2267 rwtype = RW_READER;
/*
 * Re-entered after the lock was dropped and re-taken as writer, so the
 * slot is read again in case it was filled in the meantime.
 */
2265 2268 again:
2266 2269 hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
2267 2270 next = (pgtable->swpg_next_array)[idx];
2268 2271
2269 2272 /*
2270 2273 * check if leafier level already has a pgtable
2271 2274 * if yes, verify
2272 2275 */
2273 2276 if (next == NULL) {
/* an unused table left over from an earlier level is reused here */
2274 2277 if (new == NULL) {
2275 2278
2276 2279 IMMU_DPROBE2(immu__pdp__alloc, dev_info_t *,
2277 2280 rdip, int, level);
2278 2281
2279 2282 new = pgtable_alloc(immu, immu_flags);
2280 2283 if (new == NULL) {
2281 2284 ddi_err(DER_PANIC, rdip,
2282 2285 "pgtable alloc err");
2283 2286 }
2284 2287 pgtable_zero(new);
2285 2288 }
2286 2289
2287 2290 /* Change to a write lock */
2288 2291 if (rwtype == RW_READER &&
2289 2292 rw_tryupgrade(&(pgtable->swpg_rwlock)) == 0) {
2290 2293 rw_exit(&(pgtable->swpg_rwlock));
2291 2294 rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
2292 2295 rwtype = RW_WRITER;
2293 2296 goto again;
2294 2297 }
2295 2298 rwtype = RW_WRITER;
2296 2299 next = new;
2297 2300 (pgtable->swpg_next_array)[idx] = next;
2298 2301 new = NULL;
2299 2302 PDE_set_one(immu, hwp, next, rdip, immu_flags);
2300 2303 set = B_TRUE;
2301 2304 rw_downgrade(&(pgtable->swpg_rwlock));
2302 2305 rwtype = RW_READER;
2303 2306 }
2304 2307 #ifndef BUGGY_DRIVERS
2305 2308 else {
2306 2309 hw_pdte_t pde = *hwp;
2307 2310
2308 2311 /*
2309 2312 * If buggy driver we already set permission
2310 2313 * READ+WRITE so nothing to do for that case
2311 2314 * XXX Check that read writer perms change before
2312 2315 * actually setting perms. Also need to hold lock
2313 2316 */
2314 2317 if (immu_flags & IMMU_FLAGS_READ)
2315 2318 PDTE_SET_READ(pde);
2316 2319 if (immu_flags & IMMU_FLAGS_WRITE)
2317 2320 PDTE_SET_WRITE(pde);
2318 2321
2319 2322 *hwp = pde;
2320 2323 }
2321 2324 #endif
2322 2325
2323 2326 ASSERT(PDTE_check(immu, *hwp, next, 0, rdip, immu_flags)
2324 2327 == B_TRUE);
2325 2328
/* hand the child down to the next (leafier) level of the walk */
2326 2329 (xlate - 1)->xlt_pgtable = next;
2327 2330 rw_exit(&(pgtable->swpg_rwlock));
2328 2331 }
2329 2332
/* a table allocated but never installed is released */
2330 2333 if (new) {
2331 2334 pgtable_free(immu, new);
2332 2335 }
2333 2336
2334 2337 return (set);
2335 2338 }
2336 2339
2337 2340 /*
2338 2341 * dvma_map()
2339 2342 * map a contiguous range of DVMA pages
2340 2343 *
2341 2344 * immu: IOMMU unit for which we are generating DVMA cookies
2342 2345 * domain: domain
2343 2346 * sdvma: Starting dvma
2344 2347 * spaddr: Starting paddr
2345 2348 * npages: Number of pages
2346 2349 * rdip: requesting device
2347 2350 * immu_flags: flags
2348 2351 */
/*
 * NOTE: the parameter list above does not match the signature below.
 * The actual parameters are:
 *   domain:     domain; the IOMMU unit is taken from domain->dom_immu
 *   sdvma:      starting DVMA
 *   snvpages:   number of DVMA pages to map
 *   dcookies:   array of physical ranges backing the DVMA range
 *   dcount:     number of entries in dcookies
 *   rdip:       requesting device
 *   immu_flags: flags
 *
 * Returns B_TRUE if PDE_set_all() installed a new PDE on any iteration,
 * B_FALSE otherwise.
 */
2349 2352 static boolean_t
2350 2353 dvma_map(domain_t *domain, uint64_t sdvma, uint64_t snvpages,
2351 2354 immu_dcookie_t *dcookies, int dcount, dev_info_t *rdip,
2352 2355 immu_flags_t immu_flags)
2353 2356 {
2354 2357 uint64_t dvma;
2355 2358 uint64_t n;
2356 2359 immu_t *immu = domain->dom_immu;
2357 2360 int nlevels = immu->immu_dvma_nlevels;
2358 2361 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
2359 2362 boolean_t pde_set = B_FALSE;
2360 2363
2361 2364 n = snvpages;
2362 2365 dvma = sdvma;
2363 2366
/* each pass handles one leaf pgtable; PTE_set_all() advances dvma and n */
2364 2367 while (n > 0) {
2365 2368 xlate_setup(dvma, xlate, nlevels);
2366 2369
2367 2370 /* Lookup or allocate PGDIRs and PGTABLEs if necessary */
2368 2371 if (PDE_set_all(immu, domain, xlate, nlevels, rdip, immu_flags)
2369 2372 == B_TRUE) {
2370 2373 pde_set = B_TRUE;
2371 2374 }
2372 2375
2373 2376 /* set all matching ptes that fit into this leaf pgtable */
2374 2377 PTE_set_all(immu, domain, &xlate[1], &dvma, &n, dcookies,
2375 2378 dcount, rdip, immu_flags);
2376 2379 }
2377 2380
2378 2381 return (pde_set);
2379 2382 }
2380 2383
2381 2384 /*
2382 2385 * dvma_unmap()
2383 2386 * unmap a range of DVMAs
2384 2387 *
2385 2388 * immu: IOMMU unit state
2386 2389 * domain: domain for requesting device
2387 2390 * ddip: domain-dip
2388 2391 * dvma: starting DVMA
2389 2392 * npages: Number of IMMU pages to be unmapped
2390 2393 * rdip: requesting device
2391 2394 */
/*
 * NOTE: the parameter list above does not match the signature below.
 * The actual parameters are domain, sdvma (starting DVMA), snpages
 * (number of pages) and rdip; the IOMMU unit is taken from
 * domain->dom_immu and there is no ddip argument.
 *
 * A failed page-table lookup is reported with ddi_err(DER_PANIC, ...).
 */
2392 2395 static void
2393 2396 dvma_unmap(domain_t *domain, uint64_t sdvma, uint64_t snpages,
2394 2397 dev_info_t *rdip)
2395 2398 {
2396 2399 immu_t *immu = domain->dom_immu;
2397 2400 int nlevels = immu->immu_dvma_nlevels;
2398 2401 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
2399 2402 uint64_t n;
2400 2403 uint64_t dvma;
2401 2404
2402 2405 dvma = sdvma;
2403 2406 n = snpages;
2404 2407
/* dvma and n are passed by address to PTE_clear_all() on every pass */
2405 2408 while (n > 0) {
2406 2409 /* setup the xlate array */
2407 2410 xlate_setup(dvma, xlate, nlevels);
2408 2411
2409 2412 /* just lookup existing pgtables. Should never fail */
2410 2413 if (!PDE_lookup(domain, xlate, nlevels))
2411 2414 ddi_err(DER_PANIC, rdip,
2412 2415 "PTE not found for addr %" PRIx64,
2413 2416 (unsigned long long)dvma);
2414 2417
2415 2418 /* clear all matching ptes that fit into this leaf pgtable */
2416 2419 PTE_clear_all(immu, domain, &xlate[1], &dvma, &n, rdip);
2417 2420 }
2418 2421
2419 2422 /* No need to flush IOTLB after unmap */
2420 2423 }
2421 2424
/*
 * dvma_alloc: allocate `npages' IMMU pages of DVMA space from the
 * domain's vmem arena.
 *
 * dma_attr: supplies the alignment (raised to at least IMMU_PAGESIZE)
 *           and the [dma_attr_addr_lo, dma_attr_addr_hi] address window
 * kmf:      vmem flag passed through to vmem_xalloc()
 *
 * Returns the allocated address as a uint64_t; the caller in this file
 * treats 0 as allocation failure.
 */
2422 2425 static uint64_t
2423 2426 dvma_alloc(domain_t *domain, ddi_dma_attr_t *dma_attr, uint_t npages, int kmf)
2424 2427 {
2425 2428 uint64_t dvma;
2426 2429 size_t xsize, align;
2427 2430 uint64_t minaddr, maxaddr;
2428 2431
2429 2432 /* parameters */
2430 2433 xsize = npages * IMMU_PAGESIZE;
2431 2434 align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
2432 2435 minaddr = dma_attr->dma_attr_addr_lo;
2433 2436 maxaddr = dma_attr->dma_attr_addr_hi + 1;
2434 2437
2435 2438 /* handle the rollover cases */
/* addr_hi + 1 wrapped around: fall back to addr_hi itself as the bound */
2436 2439 if (maxaddr < dma_attr->dma_attr_addr_hi) {
2437 2440 maxaddr = dma_attr->dma_attr_addr_hi;
2438 2441 }
2439 2442
2440 2443 /*
2441 2444 * allocate from vmem arena.
2442 2445 */
2443 2446 dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
2444 2447 xsize, align, 0, 0, (void *)(uintptr_t)minaddr,
2445 2448 (void *)(uintptr_t)maxaddr, kmf);
2446 2449
2447 2450 return (dvma);
2448 2451 }
2449 2452
/*
 * dvma_prealloc: reserve IMMU_NPREPTES pages of DVMA space for a handle
 * and pre-build the page tables for it.
 *
 * rdip:     requesting device; its domain comes from IMMU_DEVI(rdip)
 * ihp:      per-handle private state. ihp_predvma receives the reserved
 *           DVMA (0 if the VM_NOSLEEP allocation failed), ihp_npremapped
 *           is reset to 0 and ihp_preptes[] receives a pointer to each
 *           leaf PTE covering the reserved range.
 * dma_attr: supplies alignment, address window and segment boundary
 *
 * The range is mapped against immu_precookie with read-only PTEs; the
 * directory entries above them are created with read+write permission.
 */
2450 2453 static void
2451 2454 dvma_prealloc(dev_info_t *rdip, immu_hdl_priv_t *ihp, ddi_dma_attr_t *dma_attr)
2452 2455 {
2453 2456 int nlevels;
2454 2457 xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0}, *xlp;
2455 2458 uint64_t dvma, n;
2456 2459 size_t xsize, align;
2457 2460 uint64_t minaddr, maxaddr, dmamax;
2458 2461 int on, npte, pindex;
2459 2462 hw_pdte_t *shwp;
2460 2463 immu_t *immu;
2461 2464 domain_t *domain;
2462 2465
2463 2466 /* parameters */
2464 2467 domain = IMMU_DEVI(rdip)->imd_domain;
2465 2468 immu = domain->dom_immu;
2466 2469 nlevels = immu->immu_dvma_nlevels;
2467 2470 xsize = IMMU_NPREPTES * IMMU_PAGESIZE;
2468 2471 align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
2469 2472 minaddr = dma_attr->dma_attr_addr_lo;
2470 2473 if (dma_attr->dma_attr_flags & _DDI_DMA_BOUNCE_ON_SEG)
2471 2474 dmamax = dma_attr->dma_attr_seg;
2472 2475 else
2473 2476 dmamax = dma_attr->dma_attr_addr_hi;
2474 2477 maxaddr = dmamax + 1;
2475 2478
/* dmamax + 1 wrapped around: use dmamax itself as the upper bound */
2476 2479 if (maxaddr < dmamax)
2477 2480 maxaddr = dmamax;
2478 2481
2479 2482 dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
2480 2483 xsize, align, 0, dma_attr->dma_attr_seg + 1,
2481 2484 (void *)(uintptr_t)minaddr, (void *)(uintptr_t)maxaddr, VM_NOSLEEP);
2482 2485
2483 2486 ihp->ihp_predvma = dvma;
2484 2487 ihp->ihp_npremapped = 0;
2485 2488 if (dvma == 0)
2486 2489 return;
2487 2490
2488 2491 n = IMMU_NPREPTES;
2489 2492 pindex = 0;
2490 2493
2491 2494 /*
2492 2495 * Set up a mapping at address 0, just so that all PDPs get allocated
2493 2496 * now. Although this initial mapping should never be used,
2494 2497 * explicitly set it to read-only, just to be safe.
2495 2498 */
/*
 * NOTE(review): the range mapped here starts at the DVMA just allocated
 * and is backed by immu_precookie; "address 0" above presumably refers to
 * the physical address held in that cookie -- confirm.
 */
2496 2499 while (n > 0) {
2497 2500 xlate_setup(dvma, xlate, nlevels);
2498 2501
2499 2502 (void) PDE_set_all(immu, domain, xlate, nlevels, rdip,
2500 2503 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2501 2504
/* remember the first PTE slot and page count before PTE_set_all() moves on */
2502 2505 xlp = &xlate[1];
2503 2506 shwp = (hw_pdte_t *)(xlp->xlt_pgtable->hwpg_vaddr)
2504 2507 + xlp->xlt_idx;
2505 2508 on = n;
2506 2509
2507 2510 PTE_set_all(immu, domain, xlp, &dvma, &n, &immu_precookie,
2508 2511 1, rdip, IMMU_FLAGS_READ);
2509 2512
/* number of PTEs written in this leaf table */
2510 2513 npte = on - n;
2511 2514
2512 2515 while (npte > 0) {
2513 2516 ihp->ihp_preptes[pindex++] = shwp;
2514 2517 #ifdef BUGGY_DRIVERS
2515 2518 PDTE_CLEAR_WRITE(*shwp);
2516 2519 #endif
2517 2520 shwp++;
2518 2521 npte--;
2519 2522 }
2520 2523 }
2521 2524 }
2522 2525
/*
 * dvma_prefree: undo dvma_prealloc() for a handle. If a DVMA range was
 * reserved (ihp_predvma != 0), its IMMU_NPREPTES pages are unmapped and
 * the range is returned to the domain's vmem arena.
 */
2523 2526 static void
2524 2527 dvma_prefree(dev_info_t *rdip, immu_hdl_priv_t *ihp)
2525 2528 {
2526 2529 domain_t *domain;
2527 2530
2528 2531 domain = IMMU_DEVI(rdip)->imd_domain;
2529 2532
2530 2533 if (ihp->ihp_predvma != 0) {
2531 2534 dvma_unmap(domain, ihp->ihp_predvma, IMMU_NPREPTES, rdip);
2532 2535 vmem_free(domain->dom_dvma_arena,
2533 2536 (void *)(uintptr_t)ihp->ihp_predvma,
2534 2537 IMMU_NPREPTES * IMMU_PAGESIZE);
2535 2538 }
2536 2539 }
2537 2540
/*
 * dvma_free: return `npages' IMMU pages starting at `dvma' to the
 * domain's vmem arena. Does nothing unless the domain's map type is
 * IMMU_MAPTYPE_XLATE.
 */
2538 2541 static void
2539 2542 dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages)
2540 2543 {
2541 2544 uint64_t size = npages * IMMU_PAGESIZE;
2542 2545
2543 2546 if (domain->dom_maptype != IMMU_MAPTYPE_XLATE)
2544 2547 return;
2545 2548
2546 2549 vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size);
2547 2550 }
2548 2551
/*
 * immu_map_dvmaseg: map the memory object described by `dmareq' into the
 * device's DVMA space as a single DVMA segment.
 *
 * rdip:    requesting device; its domain comes from IMMU_DEVI(rdip)
 * handle:  DMA handle; its dmai_attr supplies the allocation constraints
 * ihp:     per-handle private state (cookies, premapped PTEs, dvseg)
 * dmareq:  DMA request describing the object (page list, page array or
 *          virtual address) and the sleep policy
 * dma_out: filled in with a DMA_OTYP_DVADDR object describing the result
 *
 * Two paths are taken:
 *  - if the object fits in the handle's premapped range
 *    (npgalloc <= IMMU_NPREPTES and ihp_predvma != 0), the saved PTE
 *    pointers in ihp_preptes[] are written directly;
 *  - otherwise DVMA space is allocated with dvma_alloc() and physically
 *    contiguous pages are coalesced into dcookies[], which are handed to
 *    dvma_map() when the cookie array fills up and again at the end.
 *
 * Returns DDI_DMA_NORESOURCES if the DVMA allocation fails, otherwise
 * DDI_DMA_MAPPED.
 */
2549 2552 static int
2550 2553 immu_map_dvmaseg(dev_info_t *rdip, ddi_dma_handle_t handle,
2551 2554 immu_hdl_priv_t *ihp, struct ddi_dma_req *dmareq,
2552 2555 ddi_dma_obj_t *dma_out)
2553 2556 {
2554 2557 domain_t *domain;
2555 2558 immu_t *immu;
2556 2559 immu_flags_t immu_flags;
2557 2560 ddi_dma_atyp_t buftype;
2558 2561 ddi_dma_obj_t *dmar_object;
2559 2562 ddi_dma_attr_t *attrp;
2560 2563 uint64_t offset, paddr, dvma, sdvma, rwmask;
2561 2564 size_t npages, npgalloc;
2562 2565 uint_t psize, size, pcnt, dmax;
2563 2566 page_t **pparray;
2564 2567 caddr_t vaddr;
2565 2568 page_t *page;
2566 2569 struct as *vas;
2567 2570 immu_dcookie_t *dcookies;
2568 2571 int pde_set;
2569 2572
2570 2573 domain = IMMU_DEVI(rdip)->imd_domain;
2571 2574 immu = domain->dom_immu;
2572 2575 immu_flags = dma_to_immu_flags(dmareq);
2573 2576
2574 2577 attrp = &((ddi_dma_impl_t *)handle)->dmai_attr;
2575 2578
2576 2579 dmar_object = &dmareq->dmar_object;
2577 2580 pparray = dmar_object->dmao_obj.virt_obj.v_priv;
2578 2581 vaddr = dmar_object->dmao_obj.virt_obj.v_addr;
2579 2582 buftype = dmar_object->dmao_type;
2580 2583 size = dmar_object->dmao_size;
2581 2584
2582 2585 IMMU_DPROBE3(immu__map__dvma, dev_info_t *, rdip, ddi_dma_atyp_t,
2583 2586 buftype, uint_t, size);
2584 2587
2585 2588 dcookies = &ihp->ihp_dcookies[0];
2586 2589
2587 2590 pcnt = dmax = 0;
2588 2591
2589 2592 /* retrieve paddr, psize, offset from dmareq */
/*
 * NOTE(review): `vas' is assigned only in the DMA_OTYP_PAGES branch, yet
 * it is the other branch that tests it against NULL and passes it to
 * hat_getpfnum(). In that branch it is read before any assignment in this
 * function -- the assignment looks like it belongs in the else branch;
 * confirm and fix.
 */
2590 2593 if (buftype == DMA_OTYP_PAGES) {
2591 2594 page = dmar_object->dmao_obj.pp_obj.pp_pp;
2592 2595 offset = dmar_object->dmao_obj.pp_obj.pp_offset &
2593 2596 MMU_PAGEOFFSET;
2594 2597 paddr = pfn_to_pa(page->p_pagenum) + offset;
2595 2598 psize = MIN((MMU_PAGESIZE - offset), size);
2596 2599 page = page->p_next;
2597 2600 vas = dmar_object->dmao_obj.virt_obj.v_as;
2598 2601 } else {
2599 2602 if (vas == NULL) {
2600 2603 vas = &kas;
2601 2604 }
2602 2605 offset = (uintptr_t)vaddr & MMU_PAGEOFFSET;
2603 2606 if (pparray != NULL) {
2604 2607 paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset;
2605 2608 psize = MIN((MMU_PAGESIZE - offset), size);
2606 2609 pcnt++;
2607 2610 } else {
2608 2611 paddr = pfn_to_pa(hat_getpfnum(vas->a_hat,
2609 2612 vaddr)) + offset;
2610 2613 psize = MIN(size, (MMU_PAGESIZE - offset));
2611 2614 vaddr += psize;
2612 2615 }
2613 2616 }
2614 2617
/* total IMMU pages needed, counting the offset into the first page */
2615 2618 npgalloc = IMMU_BTOPR(size + offset);
2616 2619
/* rwmask is only computed, and only used, on the premapped path */
2617 2620 if (npgalloc <= IMMU_NPREPTES && ihp->ihp_predvma != 0) {
2618 2621 #ifdef BUGGY_DRIVERS
2619 2622 rwmask = PDTE_MASK_R | PDTE_MASK_W | immu->immu_ptemask;
2620 2623 #else
2621 2624 rwmask = immu->immu_ptemask;
2622 2625 if (immu_flags & IMMU_FLAGS_READ)
2623 2626 rwmask |= PDTE_MASK_R;
2624 2627 if (immu_flags & IMMU_FLAGS_WRITE)
2625 2628 rwmask |= PDTE_MASK_W;
2626 2629 #endif
2627 2630 #ifdef DEBUG
2628 2631 rwmask |= PDTE_MASK_P;
2629 2632 #endif
2630 2633 sdvma = ihp->ihp_predvma;
2631 2634 ihp->ihp_npremapped = npgalloc;
2632 2635 *ihp->ihp_preptes[0] =
2633 2636 PDTE_PADDR(paddr & ~MMU_PAGEOFFSET) | rwmask;
2634 2637 } else {
2635 2638 ihp->ihp_npremapped = 0;
2636 2639 sdvma = dvma_alloc(domain, attrp, npgalloc,
2637 2640 dmareq->dmar_fp == DDI_DMA_SLEEP ? VM_SLEEP : VM_NOSLEEP);
2638 2641 if (sdvma == 0)
2639 2642 return (DDI_DMA_NORESOURCES);
2640 2643
2641 2644 dcookies[0].dck_paddr = (paddr & ~MMU_PAGEOFFSET);
2642 2645 dcookies[0].dck_npages = 1;
2643 2646 }
2644 2647
2645 2648 IMMU_DPROBE3(immu__dvma__alloc, dev_info_t *, rdip, uint64_t, npgalloc,
2646 2649 uint64_t, sdvma);
2647 2650
/* the first page has been accounted for above; walk the remaining ones */
2648 2651 dvma = sdvma;
2649 2652 pde_set = 0;
2650 2653 npages = 1;
2651 2654 size -= psize;
2652 2655 while (size > 0) {
2653 2656 /* get the size for this page (i.e. partial or full page) */
2654 2657 psize = MIN(size, MMU_PAGESIZE);
2655 2658 if (buftype == DMA_OTYP_PAGES) {
2656 2659 /* get the paddr from the page_t */
2657 2660 paddr = pfn_to_pa(page->p_pagenum);
2658 2661 page = page->p_next;
↓ open down ↓ |
521 lines elided |
↑ open up ↑ |
2659 2662 } else if (pparray != NULL) {
2660 2663 /* index into the array of page_t's to get the paddr */
2661 2664 paddr = pfn_to_pa(pparray[pcnt]->p_pagenum);
2662 2665 pcnt++;
2663 2666 } else {
2664 2667 /* call into the VM to get the paddr */
2665 2668 paddr = pfn_to_pa(hat_getpfnum(vas->a_hat, vaddr));
2666 2669 vaddr += psize;
2667 2670 }
2668 2671
2669 - npages++;
2670 -
2671 2672 if (ihp->ihp_npremapped > 0) {
2672 - *ihp->ihp_preptes[npages - 1] =
2673 + *ihp->ihp_preptes[npages] =
2673 2674 PDTE_PADDR(paddr) | rwmask;
2674 2675 } else if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) {
2675 2676 dcookies[dmax].dck_npages++;
2676 2677 } else {
2677 2678 /* No, we need a new dcookie */
2678 2679 if (dmax == (IMMU_NDCK - 1)) {
2679 2680 /*
2680 2681 * Ran out of dcookies. Map them now.
2681 2682 */
2682 2683 if (dvma_map(domain, dvma,
2683 2684 npages, dcookies, dmax + 1, rdip,
↓ open down ↓ |
1 lines elided |
↑ open up ↑ |
2684 2685 immu_flags))
2685 2686 pde_set++;
2686 2687
2687 2688 IMMU_DPROBE4(immu__dvmamap__early,
2688 2689 dev_info_t *, rdip, uint64_t, dvma,
2689 2690 uint_t, npages, uint_t, dmax+1);
2690 2691
2691 2692 dvma += (npages << IMMU_PAGESHIFT);
2692 2693 npages = 0;
2693 2694 dmax = 0;
2694 - } else
2695 + } else {
2695 2696 dmax++;
2697 + }
2696 2698 dcookies[dmax].dck_paddr = paddr;
2697 2699 dcookies[dmax].dck_npages = 1;
2698 2700 }
2699 2701 size -= psize;
/*
 * NOTE(review): in the added lines below, once the early dvma_map() above
 * has reset npages to 0 this guard keeps it at 0 for every later
 * iteration, although pages keep being added to dcookies[]. The final
 * dvma_map() is conditional on npages > 0, so those pages would appear to
 * go unmapped -- confirm the intended accounting.
 */
2702 + if (npages != 0)
2703 + npages++;
2700 2704 }
2701 2705
2702 2706 /*
2703 2707 * Finish up, mapping all, or all of the remaining,
2704 2708 * physical memory ranges.
2705 2709 */
2706 2710 if (ihp->ihp_npremapped == 0 && npages > 0) {
2707 2711 IMMU_DPROBE4(immu__dvmamap__late, dev_info_t *, rdip, \
2708 2712 uint64_t, dvma, uint_t, npages, uint_t, dmax+1);
2709 2713
2710 2714 if (dvma_map(domain, dvma, npages, dcookies,
2711 2715 dmax + 1, rdip, immu_flags))
2712 2716 pde_set++;
2713 2717 }
2714 2718
2715 2719 /* Invalidate the IOTLB */
/* a whole-range flush is requested when any new PDE was installed */
2716 2720 immu_flush_iotlb_psi(immu, domain->dom_did, sdvma, npgalloc,
2717 2721 pde_set > 0 ? TLB_IVA_WHOLE : TLB_IVA_LEAF,
2718 2722 &ihp->ihp_inv_wait);
2719 2723
2720 2724 ihp->ihp_ndvseg = 1;
2721 2725 ihp->ihp_dvseg[0].dvs_start = sdvma;
2722 2726 ihp->ihp_dvseg[0].dvs_len = dmar_object->dmao_size;
2723 2727
2724 2728 dma_out->dmao_size = dmar_object->dmao_size;
2725 2729 dma_out->dmao_obj.dvma_obj.dv_off = offset & IMMU_PAGEOFFSET;
2726 2730 dma_out->dmao_obj.dvma_obj.dv_nseg = 1;
2727 2731 dma_out->dmao_obj.dvma_obj.dv_seg = &ihp->ihp_dvseg[0];
2728 2732 dma_out->dmao_type = DMA_OTYP_DVADDR;
2729 2733
2730 2734 return (DDI_DMA_MAPPED);
2731 2735 }
2732 2736
/*
 * immu_unmap_dvmaseg: release the DVMA segment described by `dmao'.
 *
 * Only the first segment (dv_seg[0]) is consulted. Its page count is
 * derived from the segment length plus the object's dv_off. The DVMA
 * range is always handed to dvma_free(); the PTEs are actually cleared,
 * followed by a write-buffer flush, only in DEBUG builds.
 *
 * Always returns DDI_SUCCESS.
 */
2733 2737 static int
2734 2738 immu_unmap_dvmaseg(dev_info_t *rdip, ddi_dma_obj_t *dmao)
2735 2739 {
2736 2740 uint64_t dvma, npages;
2737 2741 domain_t *domain;
2738 2742 struct dvmaseg *dvs;
2739 2743
2740 2744 domain = IMMU_DEVI(rdip)->imd_domain;
2741 2745 dvs = dmao->dmao_obj.dvma_obj.dv_seg;
2742 2746
2743 2747 dvma = dvs[0].dvs_start;
2744 2748 npages = IMMU_BTOPR(dvs[0].dvs_len + dmao->dmao_obj.dvma_obj.dv_off);
2745 2749
2746 2750 #ifdef DEBUG
2747 2751 /* Unmap only in DEBUG mode */
2748 2752 dvma_unmap(domain, dvma, npages, rdip);
2749 2753 #endif
2750 2754 dvma_free(domain, dvma, npages);
2751 2755
2752 2756 IMMU_DPROBE3(immu__dvma__free, dev_info_t *, rdip, uint_t, npages,
2753 2757 uint64_t, dvma);
2754 2758
2755 2759 #ifdef DEBUG
2756 2760 /*
2757 2761 * In the DEBUG case, the unmap was actually done,
2758 2762 * but an IOTLB flush was not done. So, an explicit
2759 2763 * write back flush is needed.
2760 2764 */
2761 2765 immu_regs_wbf_flush(domain->dom_immu);
2762 2766 #endif
2763 2767
2764 2768 return (DDI_SUCCESS);
2765 2769 }
2766 2770
2767 2771 /* ############################# Functions exported ######################## */
2768 2772
2769 2773 /*
2770 2774 * setup the DVMA subsystem
2771 2775 * this code runs only for the first IOMMU unit
2772 2776 */
/*
 * immu_dvma_setup: initialise the file-wide DVMA state (domain lock,
 * unity/xlate domain lists, BDF-to-domain hash) and then, for every
 * IOMMU unit on `listp', create its unity domain, initialise its domain
 * ids and context tables and mark it immu_dvma_setup.
 */
2773 2777 void
2774 2778 immu_dvma_setup(list_t *listp)
2775 2779 {
2776 2780 immu_t *immu;
2777 2781 uint_t kval;
2778 2782 size_t nchains;
2779 2783
2780 2784 /* locks */
2781 2785 mutex_init(&immu_domain_lock, NULL, MUTEX_DEFAULT, NULL);
2782 2786
2783 2787 /* Create lists */
2784 2788 list_create(&immu_unity_domain_list, sizeof (domain_t),
2785 2789 offsetof(domain_t, dom_maptype_node));
2786 2790 list_create(&immu_xlate_domain_list, sizeof (domain_t),
2787 2791 offsetof(domain_t, dom_maptype_node));
2788 2792
2789 2793 /* Setup BDF domain hash */
2790 2794 nchains = 0xff;
2791 2795 kval = mod_hash_iddata_gen(nchains);
2792 2796
/*
 * NOTE(review): the hash is created with KM_NOSLEEP and the result is
 * stored without being checked in this function -- confirm a failure
 * here cannot occur or is handled elsewhere.
 */
2793 2797 bdf_domain_hash = mod_hash_create_extended("BDF-DOMAIN_HASH",
2794 2798 nchains, mod_hash_null_keydtor, mod_hash_null_valdtor,
2795 2799 mod_hash_byid, (void *)(uintptr_t)kval, mod_hash_idkey_cmp,
2796 2800 KM_NOSLEEP);
2797 2801
2798 2802 immu = list_head(listp);
2799 2803 for (; immu; immu = list_next(listp, immu)) {
2800 2804 create_unity_domain(immu);
2801 2805 did_init(immu);
2802 2806 context_init(immu);
2803 2807 immu->immu_dvma_setup = B_TRUE;
2804 2808 }
2805 2809 }
2806 2810
2807 2811 /*
2808 2812 * Start up one DVMA unit
2809 2813 */
/*
 * Marks the unit as immu_dvma_running. A unit flagged immu_dvma_gfx_only
 * is left untouched while immu_gfxdvma_enable is B_FALSE.
 */
2810 2814 void
2811 2815 immu_dvma_startup(immu_t *immu)
2812 2816 {
2813 2817 if (immu_gfxdvma_enable == B_FALSE &&
2814 2818 immu->immu_dvma_gfx_only == B_TRUE) {
2815 2819 return;
2816 2820 }
2817 2821
2818 2822 /*
2819 2823 * DVMA will start once IOMMU is "running"
2820 2824 */
2821 2825 immu->immu_dvma_running = B_TRUE;
2822 2826 }
2823 2827
2824 2828 /*
2825 2829 * immu_dvma_physmem_update()
2826 2830 * called when the installed memory on a
2827 2831 * system increases, to expand domain DVMA
2828 2832 * for domains with UNITY mapping
2829 2833 */
/*
 * addr: start of the newly added physical range
 * size: its length in bytes
 *
 * For every domain on immu_unity_domain_list whose IOMMU does not report
 * passthrough capability, the range is mapped read+write with the DVMA
 * equal to the physical address. The whole walk runs under
 * immu_domain_lock.
 */
2830 2834 void
2831 2835 immu_dvma_physmem_update(uint64_t addr, uint64_t size)
2832 2836 {
2833 2837 uint64_t start;
2834 2838 uint64_t npages;
2835 2839 int dcount;
2836 2840 immu_dcookie_t dcookies[1] = {0};
2837 2841 domain_t *domain;
2838 2842
2839 2843 /*
2840 2844 * Just walk the system-wide list of domains with
2841 2845 * UNITY mapping. Both the list of *all* domains
2842 2846 * and *UNITY* domains is protected by the same
2843 2847 * single lock
2844 2848 */
2845 2849 mutex_enter(&immu_domain_lock);
2846 2850 domain = list_head(&immu_unity_domain_list);
2847 2851 for (; domain; domain = list_next(&immu_unity_domain_list, domain)) {
2848 2852 /*
2849 2853 * Nothing to do if the IOMMU supports passthrough.
2850 2854 */
2851 2855 if (IMMU_ECAP_GET_PT(domain->dom_immu->immu_regs_excap))
2852 2856 continue;
2853 2857
2854 2858 /* There is no vmem_arena for unity domains. Just map it */
2855 2859 ddi_err(DER_LOG, domain->dom_dip,
2856 2860 "iommu: unity-domain: Adding map "
2857 2861 "[0x%" PRIx64 " - 0x%" PRIx64 "]", addr, addr + size);
2858 2862
/*
 * Round the start down and the size up to IMMU pages; one extra page is
 * added to the count.
 */
2859 2863 start = IMMU_ROUNDOWN(addr);
2860 2864 npages = (IMMU_ROUNDUP(size) / IMMU_PAGESIZE) + 1;
2861 2865
/* a single cookie whose paddr equals the DVMA start: identity mapping */
2862 2866 dcookies[0].dck_paddr = start;
2863 2867 dcookies[0].dck_npages = npages;
2864 2868 dcount = 1;
2865 2869 (void) dvma_map(domain, start, npages,
2866 2870 dcookies, dcount, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2867 2871
2868 2872 }
2869 2873 mutex_exit(&immu_domain_lock);
2870 2874 }
2871 2875
/*
 * immu_dvma_device_setup: prepare a device for DVMA.
 *
 * rdip:       requesting device
 * immu_flags: flags passed through to the helpers called here
 *
 * Steps: locate the IOMMU unit for the device; redirect devices whose
 * parent node is "isa" to the lpc devinfo and "agpgart" nodes to the
 * graphics devinfo; obtain (or create) the device's domain; when a
 * redirect happened, also record the domain on the original dip; finally
 * update the root/context entries.
 *
 * Returns DDI_SUCCESS, or DDI_DMA_NORESOURCES when no IOMMU unit or
 * domain is available, the domain id is 0, no domain dip was produced,
 * or the context update fails.
 */
2872 2876 int
2873 2877 immu_dvma_device_setup(dev_info_t *rdip, immu_flags_t immu_flags)
2874 2878 {
2875 2879 dev_info_t *ddip, *odip;
2876 2880 immu_t *immu;
2877 2881 domain_t *domain;
2878 2882
/* remember the original dip so a redirect can be detected later */
2879 2883 odip = rdip;
2880 2884
2881 2885 immu = immu_dvma_get_immu(rdip, immu_flags);
2882 2886 if (immu == NULL) {
2883 2887 /*
2884 2888 * possible that there is no IOMMU unit for this device
2885 2889 * - BIOS bugs are one example.
2886 2890 */
2887 2891 ddi_err(DER_WARN, rdip, "No iommu unit found for device");
2888 2892 return (DDI_DMA_NORESOURCES);
2889 2893 }
2890 2894
2891 2895 /*
2892 2896 * redirect isa devices attached under lpc to lpc dip
2893 2897 */
/*
 * NOTE(review): in both redirect-failure paths below rdip is NULL at the
 * point it is passed to ddi_err() -- confirm ddi_err() accepts a NULL dip
 * (other callers in this file do pass NULL) or report against odip.
 */
2894 2898 if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) {
2895 2899 rdip = get_lpc_devinfo(immu, rdip, immu_flags);
2896 2900 if (rdip == NULL) {
2897 2901 ddi_err(DER_PANIC, rdip, "iommu redirect failed");
2898 2902 /*NOTREACHED*/
2899 2903 }
2900 2904 }
2901 2905
2902 2906 /* Reset immu, as redirection can change IMMU */
2903 2907 immu = NULL;
2904 2908
2905 2909 /*
2906 2910 * for gart, redirect to the real graphic devinfo
2907 2911 */
2908 2912 if (strcmp(ddi_node_name(rdip), "agpgart") == 0) {
2909 2913 rdip = get_gfx_devinfo(rdip);
2910 2914 if (rdip == NULL) {
2911 2915 ddi_err(DER_PANIC, rdip, "iommu redirect failed");
2912 2916 /*NOTREACHED*/
2913 2917 }
2914 2918 }
2915 2919
2916 2920 /*
2917 2921 * Setup DVMA domain for the device. This does
2918 2922 * work only the first time we do DVMA for a
2919 2923 * device.
2920 2924 */
2921 2925 ddip = NULL;
2922 2926 domain = device_domain(rdip, &ddip, immu_flags);
2923 2927 if (domain == NULL) {
2924 2928 ddi_err(DER_MODE, rdip, "Intel IOMMU setup failed for device");
2925 2929 return (DDI_DMA_NORESOURCES);
2926 2930 }
2927 2931
/* from here on use the unit that owns the domain */
2928 2932 immu = domain->dom_immu;
2929 2933
2930 2934 /*
2931 2935 * If a domain is found, we must also have a domain dip
2932 2936 * which is the topmost ancestor dip of rdip that shares
2933 2937 * the same domain with rdip.
2934 2938 */
2935 2939 if (domain->dom_did == 0 || ddip == NULL) {
2936 2940 ddi_err(DER_MODE, rdip, "domain did 0(%d) or ddip NULL(%p)",
2937 2941 domain->dom_did, ddip);
2938 2942 return (DDI_DMA_NORESOURCES);
2939 2943 }
2940 2944
/* a redirect happened: record the domain on the original dip as well */
2941 2945 if (odip != rdip)
2942 2946 set_domain(odip, ddip, domain);
2943 2947
2944 2948 /*
2945 2949 * Update the root and context entries
2946 2950 */
2947 2951 if (immu_context_update(immu, domain, ddip, rdip, immu_flags)
2948 2952 != DDI_SUCCESS) {
2949 2953 ddi_err(DER_MODE, rdip, "DVMA map: context update failed");
2950 2954 return (DDI_DMA_NORESOURCES);
2951 2955 }
2952 2956
2953 2957 return (DDI_SUCCESS);
2954 2958 }
2955 2959
2956 2960 int
2957 2961 immu_map_memrange(dev_info_t *rdip, memrng_t *mrng)
2958 2962 {
2959 2963 immu_dcookie_t dcookies[1] = {0};
2960 2964 boolean_t pde_set;
2961 2965 immu_t *immu;
2962 2966 domain_t *domain;
2963 2967 immu_inv_wait_t iw;
2964 2968
2965 2969 dcookies[0].dck_paddr = mrng->mrng_start;
2966 2970 dcookies[0].dck_npages = mrng->mrng_npages;
2967 2971
2968 2972 domain = IMMU_DEVI(rdip)->imd_domain;
2969 2973 immu = domain->dom_immu;
2970 2974
2971 2975 pde_set = dvma_map(domain, mrng->mrng_start,
2972 2976 mrng->mrng_npages, dcookies, 1, rdip,
2973 2977 IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
2974 2978
2975 2979 immu_init_inv_wait(&iw, "memrange", B_TRUE);
2976 2980
2977 2981 immu_flush_iotlb_psi(immu, domain->dom_did, mrng->mrng_start,
2978 2982 mrng->mrng_npages, pde_set == B_TRUE ?
2979 2983 TLB_IVA_WHOLE : TLB_IVA_LEAF, &iw);
2980 2984
2981 2985 return (DDI_SUCCESS);
2982 2986 }
2983 2987
2984 2988 immu_devi_t *
2985 2989 immu_devi_get(dev_info_t *rdip)
2986 2990 {
2987 2991 immu_devi_t *immu_devi;
2988 2992 volatile uintptr_t *vptr = (uintptr_t *)&(DEVI(rdip)->devi_iommu);
2989 2993
2990 2994 /* Just want atomic reads. No need for lock */
2991 2995 immu_devi = (immu_devi_t *)(uintptr_t)atomic_or_64_nv((uint64_t *)vptr,
2992 2996 0);
2993 2997 return (immu_devi);
2994 2998 }
2995 2999
2996 3000 /*ARGSUSED*/
2997 3001 int
2998 3002 immu_hdl_priv_ctor(void *buf, void *arg, int kmf)
2999 3003 {
3000 3004 immu_hdl_priv_t *ihp;
3001 3005
3002 3006 ihp = buf;
3003 3007 immu_init_inv_wait(&ihp->ihp_inv_wait, "dmahandle", B_FALSE);
3004 3008
3005 3009 return (0);
3006 3010 }
3007 3011
3008 3012 /*
3009 3013 * iommulib interface functions
3010 3014 */
3011 3015 static int
3012 3016 immu_probe(iommulib_handle_t handle, dev_info_t *dip)
3013 3017 {
3014 3018 immu_devi_t *immu_devi;
3015 3019 int ret;
3016 3020
3017 3021 if (!immu_enable)
3018 3022 return (DDI_FAILURE);
3019 3023
3020 3024 /*
3021 3025 * Make sure the device has all the IOMMU structures
3022 3026 * initialized. If this device goes through an IOMMU
3023 3027 * unit (e.g. this probe function returns success),
3024 3028 * this will be called at most N times, with N being
3025 3029 * the number of IOMMUs in the system.
3026 3030 *
3027 3031 * After that, when iommulib_nex_open succeeds,
3028 3032 * we can always assume that this device has all
3029 3033 * the structures initialized. IOMMU_USED(dip) will
3030 3034 * be true. There is no need to find the controlling
3031 3035 * IOMMU/domain again.
3032 3036 */
3033 3037 ret = immu_dvma_device_setup(dip, IMMU_FLAGS_NOSLEEP);
3034 3038 if (ret != DDI_SUCCESS)
3035 3039 return (ret);
3036 3040
3037 3041 immu_devi = IMMU_DEVI(dip);
3038 3042
3039 3043 /*
3040 3044 * For unity domains, there is no need to call in to
3041 3045 * the IOMMU code.
3042 3046 */
3043 3047 if (immu_devi->imd_domain->dom_did == IMMU_UNITY_DID)
3044 3048 return (DDI_FAILURE);
3045 3049
3046 3050 if (immu_devi->imd_immu->immu_dip == iommulib_iommu_getdip(handle))
3047 3051 return (DDI_SUCCESS);
3048 3052
3049 3053 return (DDI_FAILURE);
3050 3054 }
3051 3055
3052 3056 /*ARGSUSED*/
3053 3057 static int
3054 3058 immu_allochdl(iommulib_handle_t handle,
3055 3059 dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr,
3056 3060 int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *dma_handlep)
3057 3061 {
3058 3062 int ret;
3059 3063 immu_hdl_priv_t *ihp;
3060 3064 immu_t *immu;
3061 3065
3062 3066 ret = iommulib_iommu_dma_allochdl(dip, rdip, attr, waitfp,
3063 3067 arg, dma_handlep);
3064 3068 if (ret == DDI_SUCCESS) {
3065 3069 immu = IMMU_DEVI(rdip)->imd_immu;
3066 3070
3067 3071 ihp = kmem_cache_alloc(immu->immu_hdl_cache,
3068 3072 waitfp == DDI_DMA_SLEEP ? KM_SLEEP : KM_NOSLEEP);
3069 3073 if (ihp == NULL) {
3070 3074 (void) iommulib_iommu_dma_freehdl(dip, rdip,
3071 3075 *dma_handlep);
3072 3076 return (DDI_DMA_NORESOURCES);
3073 3077 }
3074 3078
3075 3079 if (IMMU_DEVI(rdip)->imd_use_premap)
3076 3080 dvma_prealloc(rdip, ihp, attr);
3077 3081 else {
3078 3082 ihp->ihp_npremapped = 0;
3079 3083 ihp->ihp_predvma = 0;
3080 3084 }
3081 3085 ret = iommulib_iommu_dmahdl_setprivate(dip, rdip, *dma_handlep,
3082 3086 ihp);
3083 3087 }
3084 3088 return (ret);
3085 3089 }
3086 3090
3087 3091 /*ARGSUSED*/
3088 3092 static int
3089 3093 immu_freehdl(iommulib_handle_t handle,
3090 3094 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle)
3091 3095 {
3092 3096 immu_hdl_priv_t *ihp;
3093 3097
3094 3098 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3095 3099 if (ihp != NULL) {
3096 3100 if (IMMU_DEVI(rdip)->imd_use_premap)
3097 3101 dvma_prefree(rdip, ihp);
3098 3102 kmem_cache_free(IMMU_DEVI(rdip)->imd_immu->immu_hdl_cache, ihp);
3099 3103 }
3100 3104
3101 3105 return (iommulib_iommu_dma_freehdl(dip, rdip, dma_handle));
3102 3106 }
3103 3107
3104 3108
3105 3109 /*ARGSUSED*/
3106 3110 static int
3107 3111 immu_bindhdl(iommulib_handle_t handle, dev_info_t *dip,
3108 3112 dev_info_t *rdip, ddi_dma_handle_t dma_handle,
3109 3113 struct ddi_dma_req *dma_req, ddi_dma_cookie_t *cookiep,
3110 3114 uint_t *ccountp)
3111 3115 {
3112 3116 int ret;
3113 3117 immu_hdl_priv_t *ihp;
3114 3118
3115 3119 ret = iommulib_iommu_dma_bindhdl(dip, rdip, dma_handle,
3116 3120 dma_req, cookiep, ccountp);
3117 3121
3118 3122 if (ret == DDI_DMA_MAPPED) {
3119 3123 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3120 3124 immu_flush_wait(IMMU_DEVI(rdip)->imd_immu, &ihp->ihp_inv_wait);
3121 3125 }
3122 3126
3123 3127 return (ret);
3124 3128 }
3125 3129
3126 3130 /*ARGSUSED*/
3127 3131 static int
3128 3132 immu_unbindhdl(iommulib_handle_t handle,
3129 3133 dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t dma_handle)
3130 3134 {
3131 3135 return (iommulib_iommu_dma_unbindhdl(dip, rdip, dma_handle));
3132 3136 }
3133 3137
3134 3138 /*ARGSUSED*/
3135 3139 static int
3136 3140 immu_sync(iommulib_handle_t handle, dev_info_t *dip,
3137 3141 dev_info_t *rdip, ddi_dma_handle_t dma_handle, off_t off,
3138 3142 size_t len, uint_t cachefl)
3139 3143 {
3140 3144 return (iommulib_iommu_dma_sync(dip, rdip, dma_handle, off, len,
3141 3145 cachefl));
3142 3146 }
3143 3147
3144 3148 /*ARGSUSED*/
3145 3149 static int
3146 3150 immu_win(iommulib_handle_t handle, dev_info_t *dip,
3147 3151 dev_info_t *rdip, ddi_dma_handle_t dma_handle, uint_t win,
3148 3152 off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep,
3149 3153 uint_t *ccountp)
3150 3154 {
3151 3155 return (iommulib_iommu_dma_win(dip, rdip, dma_handle, win, offp,
3152 3156 lenp, cookiep, ccountp));
3153 3157 }
3154 3158
3155 3159 /*ARGSUSED*/
3156 3160 static int
3157 3161 immu_mapobject(iommulib_handle_t handle, dev_info_t *dip,
3158 3162 dev_info_t *rdip, ddi_dma_handle_t dma_handle,
3159 3163 struct ddi_dma_req *dmareq, ddi_dma_obj_t *dmao)
3160 3164 {
3161 3165 immu_hdl_priv_t *ihp;
3162 3166
3163 3167 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3164 3168
3165 3169 return (immu_map_dvmaseg(rdip, dma_handle, ihp, dmareq, dmao));
3166 3170 }
3167 3171
3168 3172 /*ARGSUSED*/
3169 3173 static int
3170 3174 immu_unmapobject(iommulib_handle_t handle, dev_info_t *dip,
3171 3175 dev_info_t *rdip, ddi_dma_handle_t dma_handle, ddi_dma_obj_t *dmao)
3172 3176 {
3173 3177 immu_hdl_priv_t *ihp;
3174 3178
3175 3179 ihp = iommulib_iommu_dmahdl_getprivate(dip, rdip, dma_handle);
3176 3180 if (ihp->ihp_npremapped > 0)
3177 3181 return (DDI_SUCCESS);
3178 3182 return (immu_unmap_dvmaseg(rdip, dmao));
3179 3183 }
↓ open down ↓ |
470 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX