1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2017 Joyent, Inc.
  24  */
  25 /*
  26  * Copyright (c) 2010, Intel Corporation.
  27  * All rights reserved.
  28  */
  29 
  30 /*
  31  * PSMI 1.1 extensions are supported only in 2.6 and later versions.
  32  * PSMI 1.2 extensions are supported only in 2.7 and later versions.
  33  * PSMI 1.3 and 1.4 extensions are supported in Solaris 10.
  34  * PSMI 1.5 extensions are supported in Solaris Nevada.
  35  * PSMI 1.6 extensions are supported in Solaris Nevada.
  36  * PSMI 1.7 extensions are supported in Solaris Nevada.
  37  */
  38 #define PSMI_1_7
  39 
  40 #include <sys/processor.h>
  41 #include <sys/time.h>
  42 #include <sys/psm.h>
  43 #include <sys/smp_impldefs.h>
  44 #include <sys/inttypes.h>
  45 #include <sys/cram.h>
  46 #include <sys/acpi/acpi.h>
  47 #include <sys/acpica.h>
  48 #include <sys/psm_common.h>
  49 #include <sys/apic.h>
  50 #include <sys/apic_common.h>
  51 #include <sys/pit.h>
  52 #include <sys/ddi.h>
  53 #include <sys/sunddi.h>
  54 #include <sys/ddi_impldefs.h>
  55 #include <sys/pci.h>
  56 #include <sys/promif.h>
  57 #include <sys/x86_archext.h>
  58 #include <sys/cpc_impl.h>
  59 #include <sys/uadmin.h>
  60 #include <sys/panic.h>
  61 #include <sys/debug.h>
  62 #include <sys/archsystm.h>
  63 #include <sys/trap.h>
  64 #include <sys/machsystm.h>
  65 #include <sys/cpuvar.h>
  66 #include <sys/rm_platter.h>
  67 #include <sys/privregs.h>
  68 #include <sys/cyclic.h>
  69 #include <sys/note.h>
  70 #include <sys/pci_intr_lib.h>
  71 #include <sys/sunndi.h>
  72 #include <sys/hpet.h>
  73 #include <sys/clock.h>
  74 
/*
 * Part of mp_platform_common.c that's used only by pcplusmp & xpv_psm
 * but not apix.
 * These functions may be moved to xpv_psm later when apix and pcplusmp
 * are merged together.
 */
  81 
/*
 *      Local Function Prototypes
 *
 * These helpers are static, i.e. private to this file.
 */
static void apic_mark_vector(uchar_t oldvector, uchar_t newvector);
static void apic_xlate_vector_free_timeout_handler(void *arg);
static int apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
    int new_bind_cpu, int apicindex, int intin_no, int which_irq,
    struct ioapic_reprogram_data *drep);
static int apic_setup_irq_table(dev_info_t *dip, int irqno,
    struct apic_io_intr *intrp, struct intrspec *ispec, iflag_t *intr_flagp,
    int type);
static void apic_try_deferred_reprogram(int ipl, int vect);
static void delete_defer_repro_ent(int which_irq);
static void apic_ioapic_wait_pending_clear(int ioapicindex,
    int intin_no);

/*
 * Routines declared extern here rather than through a header; presumably
 * they are defined in the other PSM/ACPI source files -- TODO confirm.
 */
extern int apic_acpi_translate_pci_irq(dev_info_t *dip, int busid, int devid,
    int ipin, int *pci_irqp, iflag_t *intr_flagp);
extern int apic_handle_pci_pci_bridge(dev_info_t *idip, int child_devno,
    int child_ipin, struct apic_io_intr **intrp);
extern uchar_t acpi_find_ioapic(int irq);
extern struct apic_io_intr *apic_find_io_intr_w_busid(int irqno, int busid);
extern int apic_find_bus_id(int bustype);
extern int apic_find_intin(uchar_t ioapic, uchar_t intin);
extern void apic_record_rdt_entry(apic_irq_t *irqptr, int irq);

/*
 * ACPI SCI interrupt configuration; ioapic_init_intr() only sets the SCI up
 * when apic_sci_vect is greater than zero.
 */
extern  int apic_sci_vect;
extern  iflag_t apic_sci_flags;
/* ACPI HPET interrupt configuration; -1 if HPET not used */
extern  int apic_hpet_vect;
extern  iflag_t apic_hpet_flags;
extern  int     apic_intr_policy;
extern  char *psm_name;
 115 
/*
 * Largest value that fits in an unsigned char.  Used in this file to check
 * that an IPL can be narrowed to uchar_t without loss.
 */
#define UCHAR_MAX       UINT8_MAX
 120 
/* Max wait time (in repetitions) for flags to clear in an RDT entry. */
extern int apic_max_reps_clear_pending;

/*
 * Deferred-reprogramming state, one slot per irq.  The irq # is implicit in
 * the array index:
 */
struct ioapic_reprogram_data apic_reprogram_info[APIC_MAX_VECTOR+1];
/*
 * APIC_MAX_VECTOR + 1 is the maximum # of IRQs as well. apic_reprogram_info
 * is indexed by IRQ number, NOT by vector number.
 */

/* Interrupt redistribution tunables and statistics (defined elsewhere). */
extern  int     apic_int_busy_mark;
extern  int     apic_int_free_mark;
extern  int     apic_diff_for_redistribution;
extern  int     apic_sample_factor_redistribution;
extern  int     apic_redist_cpu_skip;
extern  int     apic_num_imbalance;
extern  int     apic_num_rebind;

/*
 * timeout for xlate_vector, mark_vector
 * NOTE(review): 16 * 10000 == 160000, so the unit is presumably
 * microseconds given the "160 millisec" annotation -- confirm at the
 * point of use.
 */
int     apic_revector_timeout = 16 * 10000; /* 160 millisec */

extern int      apic_defconf;
extern int      apic_irq_translate;

extern int      apic_use_acpi_madt_only;        /* 1=ONLY use MADT from ACPI */

extern  uchar_t apic_io_vectbase[MAX_IO_APIC];

/*
 * Per-I/O-APIC flag.  When it is set for the I/O APIC an irq is wired to,
 * apic_addspl_common()/apic_delspl_common() do not move the irq to a new
 * vector on an IPL change; they rewrite the vector's apic_ipls[] entry
 * instead.
 */
extern  boolean_t ioapic_mask_workaround[MAX_IO_APIC];

/*
 * First available slot to be used as IRQ index into the apic_irq_table
 * for those interrupts (like MSI/X) that don't have a physical IRQ.
 */
extern int apic_first_avail_irq;

/*
 * apic_defer_reprogram_lock ensures that only one processor is handling
 * deferred interrupt programming at *_intr_exit time.
 * It is put into its cleared state by ioapic_init_intr().
 */
static  lock_t  apic_defer_reprogram_lock;

/*
 * The current number of deferred reprogrammings outstanding
 */
uint_t  apic_reprogram_outstanding = 0;
 167 
#ifdef DEBUG
/*
 * Counters that keep track of deferred reprogramming stats
 * (only compiled into DEBUG builds)
 */
uint_t  apic_intr_deferrals = 0;
uint_t  apic_intr_deliver_timeouts = 0;
uint_t  apic_last_ditch_reprogram_failures = 0;
uint_t  apic_deferred_setup_failures = 0;
uint_t  apic_defer_repro_total_retries = 0;
uint_t  apic_defer_repro_successes = 0;
uint_t  apic_deferred_spurious_enters = 0;
#endif

/* Number of I/O APICs; bounds the masking loop in ioapic_init_intr() */
extern  int     apic_io_max;
extern  struct apic_io_intr *apic_io_intrp;

/*
 * Maps a vector to the irq using it.  apic_init_common() fills every slot
 * with APIC_RESV_IRQ, which marks the vector as free.
 */
uchar_t apic_vector_to_irq[APIC_MAX_VECTOR+1];

extern  uint32_t        eisa_level_intr_mask;
        /* At least MSB will be set if EISA bus */

extern  int     apic_pci_bus_total;
extern  uchar_t apic_single_pci_busid;

/*
 * Following declarations are for revectoring; used when ISRs at different
 * IPLs share an irq.
 * apic_revector_lock is put into its cleared state by ioapic_init_intr().
 */
static  lock_t  apic_revector_lock;
int     apic_revector_pending = 0;
static  uchar_t *apic_oldvec_to_newvec;
static  uchar_t *apic_newvec_to_oldvec;

/* ACPI Interrupt Source Override Structure ptr */
extern ACPI_MADT_INTERRUPT_OVERRIDE *acpi_isop;
/* presumably the number of override entries behind acpi_isop -- confirm */
extern int acpi_iso_cnt;
 204 
 205 /*
 206  * Auto-configuration routines
 207  */
 208 
 209 /*
 210  * Initialise vector->ipl and ipl->pri arrays. level_intr and irqtable
 211  * are also set to NULL. vector->irq is set to a value which cannot map
 212  * to a real irq to show that it is free.
 213  */
 214 void
 215 apic_init_common(void)
 216 {
 217         int     i, j, indx;
 218         int     *iptr;
 219 
 220         /*
 221          * Initialize apic_ipls from apic_vectortoipl.  This array is
 222          * used in apic_intr_enter to determine the IPL to use for the
 223          * corresponding vector.  On some systems, due to hardware errata
 224          * and interrupt sharing, the IPL may not correspond to the IPL listed
 225          * in apic_vectortoipl (see apic_addspl and apic_delspl).
 226          */
 227         for (i = 0; i < (APIC_AVAIL_VECTOR / APIC_VECTOR_PER_IPL); i++) {
 228                 indx = i * APIC_VECTOR_PER_IPL;
 229 
 230                 for (j = 0; j < APIC_VECTOR_PER_IPL; j++, indx++)
 231                         apic_ipls[indx] = apic_vectortoipl[i];
 232         }
 233 
 234         /* cpu 0 is always up (for now) */
 235         apic_cpus[0].aci_status = APIC_CPU_ONLINE | APIC_CPU_INTR_ENABLE;
 236 
 237         iptr = (int *)&apic_irq_table[0];
 238         for (i = 0; i <= APIC_MAX_VECTOR; i++) {
 239                 apic_level_intr[i] = 0;
 240                 *iptr++ = NULL;
 241                 apic_vector_to_irq[i] = APIC_RESV_IRQ;
 242 
 243                 /* These *must* be initted to B_TRUE! */
 244                 apic_reprogram_info[i].done = B_TRUE;
 245                 apic_reprogram_info[i].irqp = NULL;
 246                 apic_reprogram_info[i].tries = 0;
 247                 apic_reprogram_info[i].bindcpu = 0;
 248         }
 249 
 250         /*
 251          * Allocate a dummy irq table entry for the reserved entry.
 252          * This takes care of the race between removing an irq and
 253          * clock detecting a CPU in that irq during interrupt load
 254          * sampling.
 255          */
 256         apic_irq_table[APIC_RESV_IRQ] =
 257             kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
 258 
 259         mutex_init(&airq_mutex, NULL, MUTEX_DEFAULT, NULL);
 260 }
 261 
 262 void
 263 ioapic_init_intr(int mask_apic)
 264 {
 265         int ioapic_ix;
 266         struct intrspec ispec;
 267         apic_irq_t *irqptr;
 268         int i, j;
 269         ulong_t iflag;
 270 
 271         LOCK_INIT_CLEAR(&apic_revector_lock);
 272         LOCK_INIT_CLEAR(&apic_defer_reprogram_lock);
 273 
 274         /* mask interrupt vectors */
 275         for (j = 0; j < apic_io_max && mask_apic; j++) {
 276                 int intin_max;
 277 
 278                 ioapic_ix = j;
 279                 /* Bits 23-16 define the maximum redirection entries */
 280                 intin_max = (ioapic_read(ioapic_ix, APIC_VERS_CMD) >> 16)
 281                     & 0xff;
 282                 for (i = 0; i <= intin_max; i++)
 283                         ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * i, AV_MASK);
 284         }
 285 
 286         /*
 287          * Hack alert: deal with ACPI SCI interrupt chicken/egg here
 288          */
 289         if (apic_sci_vect > 0) {
 290                 /*
 291                  * acpica has already done add_avintr(); we just
 292                  * to finish the job by mimicing translate_irq()
 293                  *
 294                  * Fake up an intrspec and setup the tables
 295                  */
 296                 ispec.intrspec_vec = apic_sci_vect;
 297                 ispec.intrspec_pri = SCI_IPL;
 298 
 299                 if (apic_setup_irq_table(NULL, apic_sci_vect, NULL,
 300                     &ispec, &apic_sci_flags, DDI_INTR_TYPE_FIXED) < 0) {
 301                         cmn_err(CE_WARN, "!apic: SCI setup failed");
 302                         return;
 303                 }
 304                 irqptr = apic_irq_table[apic_sci_vect];
 305 
 306                 iflag = intr_clear();
 307                 lock_set(&apic_ioapic_lock);
 308 
 309                 /* Program I/O APIC */
 310                 (void) apic_setup_io_intr(irqptr, apic_sci_vect, B_FALSE);
 311 
 312                 lock_clear(&apic_ioapic_lock);
 313                 intr_restore(iflag);
 314 
 315                 irqptr->airq_share++;
 316         }
 317 
 318         /*
 319          * Hack alert: deal with ACPI HPET interrupt chicken/egg here.
 320          */
 321         if (apic_hpet_vect > 0) {
 322                 /*
 323                  * hpet has already done add_avintr(); we just need
 324                  * to finish the job by mimicing translate_irq()
 325                  *
 326                  * Fake up an intrspec and setup the tables
 327                  */
 328                 ispec.intrspec_vec = apic_hpet_vect;
 329                 ispec.intrspec_pri = CBE_HIGH_PIL;
 330 
 331                 if (apic_setup_irq_table(NULL, apic_hpet_vect, NULL,
 332                     &ispec, &apic_hpet_flags, DDI_INTR_TYPE_FIXED) < 0) {
 333                         cmn_err(CE_WARN, "!apic: HPET setup failed");
 334                         return;
 335                 }
 336                 irqptr = apic_irq_table[apic_hpet_vect];
 337 
 338                 iflag = intr_clear();
 339                 lock_set(&apic_ioapic_lock);
 340 
 341                 /* Program I/O APIC */
 342                 (void) apic_setup_io_intr(irqptr, apic_hpet_vect, B_FALSE);
 343 
 344                 lock_clear(&apic_ioapic_lock);
 345                 intr_restore(iflag);
 346 
 347                 irqptr->airq_share++;
 348         }
 349 }
 350 
 351 /*
 352  * Add mask bits to disable interrupt vector from happening
 353  * at or above IPL. In addition, it should remove mask bits
 354  * to enable interrupt vectors below the given IPL.
 355  *
 356  * Both add and delspl are complicated by the fact that different interrupts
 357  * may share IRQs. This can happen in two ways.
 358  * 1. The same H/W line is shared by more than 1 device
 359  * 1a. with interrupts at different IPLs
 360  * 1b. with interrupts at same IPL
 361  * 2. We ran out of vectors at a given IPL and started sharing vectors.
 362  * 1b and 2 should be handled gracefully, except for the fact some ISRs
 363  * will get called often when no interrupt is pending for the device.
 364  * For 1a, we handle it at the higher IPL.
 365  */
 366 /*ARGSUSED*/
 367 int
 368 apic_addspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
 369 {
 370         uchar_t vector;
 371         ulong_t iflag;
 372         apic_irq_t *irqptr, *irqheadptr;
 373         int irqindex;
 374 
 375         ASSERT(max_ipl <= UCHAR_MAX);
 376         irqindex = IRQINDEX(irqno);
 377 
 378         if ((irqindex == -1) || (!apic_irq_table[irqindex]))
 379                 return (PSM_FAILURE);
 380 
 381         mutex_enter(&airq_mutex);
 382         irqptr = irqheadptr = apic_irq_table[irqindex];
 383 
 384         DDI_INTR_IMPLDBG((CE_CONT, "apic_addspl: dip=0x%p type=%d irqno=0x%x "
 385             "vector=0x%x\n", (void *)irqptr->airq_dip,
 386             irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));
 387 
 388         while (irqptr) {
 389                 if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
 390                         break;
 391                 irqptr = irqptr->airq_next;
 392         }
 393         irqptr->airq_share++;
 394 
 395         mutex_exit(&airq_mutex);
 396 
 397         /* return if it is not hardware interrupt */
 398         if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
 399                 return (PSM_SUCCESS);
 400 
 401         /* Or if there are more interupts at a higher IPL */
 402         if (ipl != max_ipl)
 403                 return (PSM_SUCCESS);
 404 
 405         /*
 406          * if apic_picinit() has not been called yet, just return.
 407          * At the end of apic_picinit(), we will call setup_io_intr().
 408          */
 409 
 410         if (!apic_picinit_called)
 411                 return (PSM_SUCCESS);
 412 
 413         /*
 414          * Upgrade vector if max_ipl is not earlier ipl. If we cannot allocate,
 415          * return failure.
 416          */
 417         if (irqptr->airq_ipl != max_ipl &&
 418             !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
 419 
 420                 vector = apic_allocate_vector(max_ipl, irqindex, 1);
 421                 if (vector == 0) {
 422                         irqptr->airq_share--;
 423                         return (PSM_FAILURE);
 424                 }
 425                 irqptr = irqheadptr;
 426                 apic_mark_vector(irqptr->airq_vector, vector);
 427                 while (irqptr) {
 428                         irqptr->airq_vector = vector;
 429                         irqptr->airq_ipl = (uchar_t)max_ipl;
 430                         /*
 431                          * reprogram irq being added and every one else
 432                          * who is not in the UNINIT state
 433                          */
 434                         if ((VIRTIRQ(irqindex, irqptr->airq_share_id) ==
 435                             irqno) || (irqptr->airq_temp_cpu != IRQ_UNINIT)) {
 436                                 apic_record_rdt_entry(irqptr, irqindex);
 437 
 438                                 iflag = intr_clear();
 439                                 lock_set(&apic_ioapic_lock);
 440 
 441                                 (void) apic_setup_io_intr(irqptr, irqindex,
 442                                     B_FALSE);
 443 
 444                                 lock_clear(&apic_ioapic_lock);
 445                                 intr_restore(iflag);
 446                         }
 447                         irqptr = irqptr->airq_next;
 448                 }
 449                 return (PSM_SUCCESS);
 450 
 451         } else if (irqptr->airq_ipl != max_ipl &&
 452             ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
 453                 /*
 454                  * We cannot upgrade the vector, but we can change
 455                  * the IPL that this vector induces.
 456                  *
 457                  * Note that we subtract APIC_BASE_VECT from the vector
 458                  * here because this array is used in apic_intr_enter
 459                  * (no need to add APIC_BASE_VECT in that hot code
 460                  * path since we can do it in the rarely-executed path
 461                  * here).
 462                  */
 463                 apic_ipls[irqptr->airq_vector - APIC_BASE_VECT] =
 464                     (uchar_t)max_ipl;
 465 
 466                 irqptr = irqheadptr;
 467                 while (irqptr) {
 468                         irqptr->airq_ipl = (uchar_t)max_ipl;
 469                         irqptr = irqptr->airq_next;
 470                 }
 471 
 472                 return (PSM_SUCCESS);
 473         }
 474 
 475         ASSERT(irqptr);
 476 
 477         iflag = intr_clear();
 478         lock_set(&apic_ioapic_lock);
 479 
 480         (void) apic_setup_io_intr(irqptr, irqindex, B_FALSE);
 481 
 482         lock_clear(&apic_ioapic_lock);
 483         intr_restore(iflag);
 484 
 485         return (PSM_SUCCESS);
 486 }
 487 
 488 /*
 489  * Recompute mask bits for the given interrupt vector.
 490  * If there is no interrupt servicing routine for this
 491  * vector, this function should disable interrupt vector
 492  * from happening at all IPLs. If there are still
 493  * handlers using the given vector, this function should
 494  * disable the given vector from happening below the lowest
 495  * IPL of the remaining hadlers.
 496  */
 497 /*ARGSUSED*/
 498 int
 499 apic_delspl_common(int irqno, int ipl, int min_ipl, int max_ipl)
 500 {
 501         uchar_t vector;
 502         uint32_t bind_cpu;
 503         int intin, irqindex;
 504         int ioapic_ix;
 505         apic_irq_t      *irqptr, *preirqptr, *irqheadptr, *irqp;
 506         ulong_t iflag;
 507 
 508         mutex_enter(&airq_mutex);
 509         irqindex = IRQINDEX(irqno);
 510         irqptr = preirqptr = irqheadptr = apic_irq_table[irqindex];
 511 
 512         DDI_INTR_IMPLDBG((CE_CONT, "apic_delspl: dip=0x%p type=%d irqno=0x%x "
 513             "vector=0x%x\n", (void *)irqptr->airq_dip,
 514             irqptr->airq_mps_intr_index, irqno, irqptr->airq_vector));
 515 
 516         while (irqptr) {
 517                 if (VIRTIRQ(irqindex, irqptr->airq_share_id) == irqno)
 518                         break;
 519                 preirqptr = irqptr;
 520                 irqptr = irqptr->airq_next;
 521         }
 522         ASSERT(irqptr);
 523 
 524         irqptr->airq_share--;
 525 
 526         mutex_exit(&airq_mutex);
 527 
 528         /*
 529          * If there are more interrupts at a higher IPL, we don't need
 530          * to disable anything.
 531          */
 532         if (ipl < max_ipl)
 533                 return (PSM_SUCCESS);
 534 
 535         /* return if it is not hardware interrupt */
 536         if (irqptr->airq_mps_intr_index == RESERVE_INDEX)
 537                 return (PSM_SUCCESS);
 538 
 539         if (!apic_picinit_called) {
 540                 /*
 541                  * Clear irq_struct. If two devices shared an intpt
 542                  * line & 1 unloaded before picinit, we are hosed. But, then
 543                  * we hope the machine survive.
 544                  */
 545                 irqptr->airq_mps_intr_index = FREE_INDEX;
 546                 irqptr->airq_temp_cpu = IRQ_UNINIT;
 547                 apic_free_vector(irqptr->airq_vector);
 548                 return (PSM_SUCCESS);
 549         }
 550         /*
 551          * Downgrade vector to new max_ipl if needed. If we cannot allocate,
 552          * use old IPL. Not very elegant, but it should work.
 553          */
 554         if ((irqptr->airq_ipl != max_ipl) && (max_ipl != PSM_INVALID_IPL) &&
 555             !ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
 556                 apic_irq_t      *irqp;
 557                 if ((vector = apic_allocate_vector(max_ipl, irqno, 1))) {
 558                         apic_mark_vector(irqheadptr->airq_vector, vector);
 559                         irqp = irqheadptr;
 560                         while (irqp) {
 561                                 irqp->airq_vector = vector;
 562                                 irqp->airq_ipl = (uchar_t)max_ipl;
 563                                 if (irqp->airq_temp_cpu != IRQ_UNINIT) {
 564                                         apic_record_rdt_entry(irqp, irqindex);
 565 
 566                                         iflag = intr_clear();
 567                                         lock_set(&apic_ioapic_lock);
 568 
 569                                         (void) apic_setup_io_intr(irqp,
 570                                             irqindex, B_FALSE);
 571 
 572                                         lock_clear(&apic_ioapic_lock);
 573                                         intr_restore(iflag);
 574                                 }
 575                                 irqp = irqp->airq_next;
 576                         }
 577                 }
 578 
 579         } else if (irqptr->airq_ipl != max_ipl &&
 580             max_ipl != PSM_INVALID_IPL &&
 581             ioapic_mask_workaround[irqptr->airq_ioapicindex]) {
 582 
 583         /*
 584          * We cannot downgrade the IPL of the vector below the vector's
 585          * hardware priority. If we did, it would be possible for a
 586          * higher-priority hardware vector to interrupt a CPU running at an IPL
 587          * lower than the hardware priority of the interrupting vector (but
 588          * higher than the soft IPL of this IRQ). When this happens, we would
 589          * then try to drop the IPL BELOW what it was (effectively dropping
 590          * below base_spl) which would be potentially catastrophic.
 591          *
 592          * (e.g. Suppose the hardware vector associated with this IRQ is 0x40
 593          * (hardware IPL of 4).  Further assume that the old IPL of this IRQ
 594          * was 4, but the new IPL is 1.  If we forced vector 0x40 to result in
 595          * an IPL of 1, it would be possible for the processor to be executing
 596          * at IPL 3 and for an interrupt to come in on vector 0x40, interrupting
 597          * the currently-executing ISR.  When apic_intr_enter consults
 598          * apic_irqs[], it will return 1, bringing the IPL of the CPU down to 1
 599          * so even though the processor was running at IPL 4, an IPL 1
 600          * interrupt will have interrupted it, which must not happen)).
 601          *
 602          * Effectively, this means that the hardware priority corresponding to
 603          * the IRQ's IPL (in apic_ipls[]) cannot be lower than the vector's
 604          * hardware priority.
 605          *
 606          * (In the above example, then, after removal of the IPL 4 device's
 607          * interrupt handler, the new IPL will continue to be 4 because the
 608          * hardware priority that IPL 1 implies is lower than the hardware
 609          * priority of the vector used.)
 610          */
 611                 /* apic_ipls is indexed by vector, starting at APIC_BASE_VECT */
 612                 const int apic_ipls_index = irqptr->airq_vector -
 613                     APIC_BASE_VECT;
 614                 const int vect_inherent_hwpri = irqptr->airq_vector >>
 615                     APIC_IPL_SHIFT;
 616 
 617                 /*
 618                  * If there are still devices using this IRQ, determine the
 619                  * new ipl to use.
 620                  */
 621                 if (irqptr->airq_share) {
 622                         int vect_desired_hwpri, hwpri;
 623 
 624                         ASSERT(max_ipl < MAXIPL);
 625                         vect_desired_hwpri = apic_ipltopri[max_ipl] >>
 626                             APIC_IPL_SHIFT;
 627 
 628                         /*
 629                          * If the desired IPL's hardware priority is lower
 630                          * than that of the vector, use the hardware priority
 631                          * of the vector to determine the new IPL.
 632                          */
 633                         hwpri = (vect_desired_hwpri < vect_inherent_hwpri) ?
 634                             vect_inherent_hwpri : vect_desired_hwpri;
 635 
 636                         /*
 637                          * Now, to get the right index for apic_vectortoipl,
 638                          * we need to subtract APIC_BASE_VECT from the
 639                          * hardware-vector-equivalent (in hwpri).  Since hwpri
 640                          * is already shifted, we shift APIC_BASE_VECT before
 641                          * doing the subtraction.
 642                          */
 643                         hwpri -= (APIC_BASE_VECT >> APIC_IPL_SHIFT);
 644 
 645                         ASSERT(hwpri >= 0);
 646                         ASSERT(hwpri < MAXIPL);
 647                         max_ipl = apic_vectortoipl[hwpri];
 648                         apic_ipls[apic_ipls_index] = (uchar_t)max_ipl;
 649 
 650                         irqp = irqheadptr;
 651                         while (irqp) {
 652                                 irqp->airq_ipl = (uchar_t)max_ipl;
 653                                 irqp = irqp->airq_next;
 654                         }
 655                 } else {
 656                         /*
 657                          * No more devices on this IRQ, so reset this vector's
 658                          * element in apic_ipls to the original IPL for this
 659                          * vector
 660                          */
 661                         apic_ipls[apic_ipls_index] =
 662                             apic_vectortoipl[vect_inherent_hwpri];
 663                 }
 664         }
 665 
 666         /*
 667          * If there are still active interrupts, we are done.
 668          */
 669         if (irqptr->airq_share)
 670                 return (PSM_SUCCESS);
 671 
 672         iflag = intr_clear();
 673         lock_set(&apic_ioapic_lock);
 674 
 675         if (irqptr->airq_mps_intr_index == MSI_INDEX) {
 676                 /*
 677                  * Disable the MSI vector
 678                  * Make sure we only disable on the last
 679                  * of the multi-MSI support
 680                  */
 681                 if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
 682                         apic_pci_msi_disable_mode(irqptr->airq_dip,
 683                             DDI_INTR_TYPE_MSI);
 684                 }
 685         } else if (irqptr->airq_mps_intr_index == MSIX_INDEX) {
 686                 /*
 687                  * Disable the MSI-X vector
 688                  * needs to clear its mask and addr/data for each MSI-X
 689                  */
 690                 apic_pci_msi_unconfigure(irqptr->airq_dip, DDI_INTR_TYPE_MSIX,
 691                     irqptr->airq_origirq);
 692                 /*
 693                  * Make sure we only disable on the last MSI-X
 694                  */
 695                 if (i_ddi_intr_get_current_nenables(irqptr->airq_dip) == 1) {
 696                         apic_pci_msi_disable_mode(irqptr->airq_dip,
 697                             DDI_INTR_TYPE_MSIX);
 698                 }
 699         } else {
 700                 /*
 701                  * The assumption here is that this is safe, even for
 702                  * systems with IOAPICs that suffer from the hardware
 703                  * erratum because all devices have been quiesced before
 704                  * they unregister their interrupt handlers.  If that
 705                  * assumption turns out to be false, this mask operation
 706                  * can induce the same erratum result we're trying to
 707                  * avoid.
 708                  */
 709                 ioapic_ix = irqptr->airq_ioapicindex;
 710                 intin = irqptr->airq_intin_no;
 711                 ioapic_write(ioapic_ix, APIC_RDT_CMD + 2 * intin, AV_MASK);
 712         }
 713 
 714         apic_vt_ops->apic_intrmap_free_entry(&irqptr->airq_intrmap_private);
 715 
 716         /*
 717          * This irq entry is the only one in the chain.
 718          */
 719         if (irqheadptr->airq_next == NULL) {
 720                 ASSERT(irqheadptr == irqptr);
 721                 bind_cpu = irqptr->airq_temp_cpu;
 722                 if (((uint32_t)bind_cpu != IRQ_UNBOUND) &&
 723                     ((uint32_t)bind_cpu != IRQ_UNINIT)) {
 724                         ASSERT(apic_cpu_in_range(bind_cpu));
 725                         if (bind_cpu & IRQ_USER_BOUND) {
 726                                 /* If hardbound, temp_cpu == cpu */
 727                                 bind_cpu &= ~IRQ_USER_BOUND;
 728                                 apic_cpus[bind_cpu].aci_bound--;
 729                         } else
 730                                 apic_cpus[bind_cpu].aci_temp_bound--;
 731                 }
 732                 irqptr->airq_temp_cpu = IRQ_UNINIT;
 733                 irqptr->airq_mps_intr_index = FREE_INDEX;
 734                 lock_clear(&apic_ioapic_lock);
 735                 intr_restore(iflag);
 736                 apic_free_vector(irqptr->airq_vector);
 737                 return (PSM_SUCCESS);
 738         }
 739 
 740         /*
 741          * If we get here, we are sharing the vector and there are more than
 742          * one active irq entries in the chain.
 743          */
 744         lock_clear(&apic_ioapic_lock);
 745         intr_restore(iflag);
 746 
 747         mutex_enter(&airq_mutex);
 748         /* Remove the irq entry from the chain */
 749         if (irqptr == irqheadptr) { /* The irq entry is at the head */
 750                 apic_irq_table[irqindex] = irqptr->airq_next;
 751         } else {
 752                 preirqptr->airq_next = irqptr->airq_next;
 753         }
 754         /* Free the irq entry */
 755         kmem_free(irqptr, sizeof (apic_irq_t));
 756         mutex_exit(&airq_mutex);
 757 
 758         return (PSM_SUCCESS);
 759 }
 760 
 761 /*
 762  * apic_introp_xlate() replaces apic_translate_irq() and is
 763  * called only from apic_intr_ops().  With the new ADII framework,
 764  * the priority can no longer be retrieved through i_ddi_get_intrspec().
 765  * It has to be passed in from the caller.
 766  *
 767  * Return value:
 768  *      Success: irqno for the given device
 769  *      Failure: -1
 770  */
 771 int
 772 apic_introp_xlate(dev_info_t *dip, struct intrspec *ispec, int type)
 773 {
 774         char dev_type[16];
 775         int dev_len, pci_irq, newirq, bustype, devid, busid, i;
 776         int irqno = ispec->intrspec_vec;
 777         ddi_acc_handle_t cfg_handle;
 778         uchar_t ipin;
 779         struct apic_io_intr *intrp;
 780         iflag_t intr_flag;
 781         ACPI_SUBTABLE_HEADER    *hp;
 782         ACPI_MADT_INTERRUPT_OVERRIDE *isop;
 783         apic_irq_t *airqp;
 784         int parent_is_pci_or_pciex = 0;
 785         int child_is_pciex = 0;
 786 
 787         DDI_INTR_IMPLDBG((CE_CONT, "apic_introp_xlate: dip=0x%p name=%s "
 788             "type=%d irqno=0x%x\n", (void *)dip, ddi_get_name(dip), type,
 789             irqno));
 790 
 791         dev_len = sizeof (dev_type);
 792         if (ddi_getlongprop_buf(DDI_DEV_T_ANY, ddi_get_parent(dip),
 793             DDI_PROP_DONTPASS, "device_type", (caddr_t)dev_type,
 794             &dev_len) == DDI_PROP_SUCCESS) {
 795                 if ((strcmp(dev_type, "pci") == 0) ||
 796                     (strcmp(dev_type, "pciex") == 0))
 797                         parent_is_pci_or_pciex = 1;
 798         }
 799 
 800         if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip,
 801             DDI_PROP_DONTPASS, "compatible", (caddr_t)dev_type,
 802             &dev_len) == DDI_PROP_SUCCESS) {
 803                 if (strstr(dev_type, "pciex"))
 804                         child_is_pciex = 1;
 805         }
 806 
 807         if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
 808                 if ((airqp = apic_find_irq(dip, ispec, type)) != NULL) {
 809                         airqp->airq_iflag.bustype =
 810                             child_is_pciex ? BUS_PCIE : BUS_PCI;
 811                         return (apic_vector_to_irq[airqp->airq_vector]);
 812                 }
 813                 return (apic_setup_irq_table(dip, irqno, NULL, ispec,
 814                     NULL, type));
 815         }
 816 
 817         bustype = 0;
 818 
 819         /* check if we have already translated this irq */
 820         mutex_enter(&airq_mutex);
 821         newirq = apic_min_device_irq;
 822         for (; newirq <= apic_max_device_irq; newirq++) {
 823                 airqp = apic_irq_table[newirq];
 824                 while (airqp) {
 825                         if ((airqp->airq_dip == dip) &&
 826                             (airqp->airq_origirq == irqno) &&
 827                             (airqp->airq_mps_intr_index != FREE_INDEX)) {
 828 
 829                                 mutex_exit(&airq_mutex);
 830                                 return (VIRTIRQ(newirq, airqp->airq_share_id));
 831                         }
 832                         airqp = airqp->airq_next;
 833                 }
 834         }
 835         mutex_exit(&airq_mutex);
 836 
 837         if (apic_defconf)
 838                 goto defconf;
 839 
 840         if ((dip == NULL) || (!apic_irq_translate && !apic_enable_acpi))
 841                 goto nonpci;
 842 
 843         if (parent_is_pci_or_pciex) {
 844                 /* pci device */
 845                 if (acpica_get_bdf(dip, &busid, &devid, NULL) != 0)
 846                         goto nonpci;
 847                 if (busid == 0 && apic_pci_bus_total == 1)
 848                         busid = (int)apic_single_pci_busid;
 849 
 850                 if (pci_config_setup(dip, &cfg_handle) != DDI_SUCCESS)
 851                         return (-1);
 852                 ipin = pci_config_get8(cfg_handle, PCI_CONF_IPIN) - PCI_INTA;
 853                 pci_config_teardown(&cfg_handle);
 854                 if (apic_enable_acpi && !apic_use_acpi_madt_only) {
 855                         if (apic_acpi_translate_pci_irq(dip, busid, devid,
 856                             ipin, &pci_irq, &intr_flag) != ACPI_PSM_SUCCESS)
 857                                 return (-1);
 858 
 859                         intr_flag.bustype = child_is_pciex ? BUS_PCIE : BUS_PCI;
 860                         return (apic_setup_irq_table(dip, pci_irq, NULL, ispec,
 861                             &intr_flag, type));
 862                 } else {
 863                         pci_irq = ((devid & 0x1f) << 2) | (ipin & 0x3);
 864                         if ((intrp = apic_find_io_intr_w_busid(pci_irq, busid))
 865                             == NULL) {
 866                                 if ((pci_irq = apic_handle_pci_pci_bridge(dip,
 867                                     devid, ipin, &intrp)) == -1)
 868                                         return (-1);
 869                         }
 870                         return (apic_setup_irq_table(dip, pci_irq, intrp, ispec,
 871                             NULL, type));
 872                 }
 873         } else if (strcmp(dev_type, "isa") == 0)
 874                 bustype = BUS_ISA;
 875         else if (strcmp(dev_type, "eisa") == 0)
 876                 bustype = BUS_EISA;
 877 
 878 nonpci:
 879         if (apic_enable_acpi && !apic_use_acpi_madt_only) {
 880                 /* search iso entries first */
 881                 if (acpi_iso_cnt != 0) {
 882                         hp = (ACPI_SUBTABLE_HEADER *)acpi_isop;
 883                         i = 0;
 884                         while (i < acpi_iso_cnt) {
 885                                 if (hp->Type ==
 886                                     ACPI_MADT_TYPE_INTERRUPT_OVERRIDE) {
 887                                         isop =
 888                                             (ACPI_MADT_INTERRUPT_OVERRIDE *) hp;
 889                                         if (isop->Bus == 0 &&
 890                                             isop->SourceIrq == irqno) {
 891                                                 newirq = isop->GlobalIrq;
 892                                                 intr_flag.intr_po =
 893                                                     isop->IntiFlags &
 894                                                     ACPI_MADT_POLARITY_MASK;
 895                                                 intr_flag.intr_el =
 896                                                     (isop->IntiFlags &
 897                                                     ACPI_MADT_TRIGGER_MASK)
 898                                                     >> 2;
 899                                                 intr_flag.bustype = BUS_ISA;
 900 
 901                                                 return (apic_setup_irq_table(
 902                                                     dip, newirq, NULL, ispec,
 903                                                     &intr_flag, type));
 904 
 905                                         }
 906                                         i++;
 907                                 }
 908                                 hp = (ACPI_SUBTABLE_HEADER *)(((char *)hp) +
 909                                     hp->Length);
 910                         }
 911                 }
 912                 intr_flag.intr_po = INTR_PO_ACTIVE_HIGH;
 913                 intr_flag.intr_el = INTR_EL_EDGE;
 914                 intr_flag.bustype = BUS_ISA;
 915                 return (apic_setup_irq_table(dip, irqno, NULL, ispec,
 916                     &intr_flag, type));
 917         } else {
 918                 if (bustype == 0)       /* not initialized */
 919                         bustype = eisa_level_intr_mask ? BUS_EISA : BUS_ISA;
 920                 for (i = 0; i < 2; i++) {
 921                         if (((busid = apic_find_bus_id(bustype)) != -1) &&
 922                             ((intrp = apic_find_io_intr_w_busid(irqno, busid))
 923                             != NULL)) {
 924                                 if ((newirq = apic_setup_irq_table(dip, irqno,
 925                                     intrp, ispec, NULL, type)) != -1) {
 926                                         return (newirq);
 927                                 }
 928                                 goto defconf;
 929                         }
 930                         bustype = (bustype == BUS_EISA) ? BUS_ISA : BUS_EISA;
 931                 }
 932         }
 933 
 934 /* MPS default configuration */
 935 defconf:
 936         newirq = apic_setup_irq_table(dip, irqno, NULL, ispec, NULL, type);
 937         if (newirq == -1)
 938                 return (-1);
 939         ASSERT(IRQINDEX(newirq) == irqno);
 940         ASSERT(apic_irq_table[irqno]);
 941         return (newirq);
 942 }
 943 
 944 /*
 945  * Attempt to share vector with someone else
 946  */
 947 static int
 948 apic_share_vector(int irqno, iflag_t *intr_flagp, short intr_index, int ipl,
 949     uchar_t ioapicindex, uchar_t ipin, apic_irq_t **irqptrp)
 950 {
 951 #ifdef DEBUG
 952         apic_irq_t *tmpirqp = NULL;
 953 #endif /* DEBUG */
 954         apic_irq_t *irqptr, dummyirq;
 955         int     newirq, chosen_irq = -1, share = 127;
 956         int     lowest, highest, i;
 957         uchar_t share_id;
 958 
 959         DDI_INTR_IMPLDBG((CE_CONT, "apic_share_vector: irqno=0x%x "
 960             "intr_index=0x%x ipl=0x%x\n", irqno, intr_index, ipl));
 961 
 962         highest = apic_ipltopri[ipl] + APIC_VECTOR_MASK;
 963         lowest = apic_ipltopri[ipl-1] + APIC_VECTOR_PER_IPL;
 964 
 965         if (highest < lowest) /* Both ipl and ipl-1 map to same pri */
 966                 lowest -= APIC_VECTOR_PER_IPL;
 967         dummyirq.airq_mps_intr_index = intr_index;
 968         dummyirq.airq_ioapicindex = ioapicindex;
 969         dummyirq.airq_intin_no = ipin;
 970         if (intr_flagp)
 971                 dummyirq.airq_iflag = *intr_flagp;
 972         apic_record_rdt_entry(&dummyirq, irqno);
 973         for (i = lowest; i <= highest; i++) {
 974                 newirq = apic_vector_to_irq[i];
 975                 if (newirq == APIC_RESV_IRQ)
 976                         continue;
 977                 irqptr = apic_irq_table[newirq];
 978 
 979                 if ((dummyirq.airq_rdt_entry & 0xFF00) !=
 980                     (irqptr->airq_rdt_entry & 0xFF00))
 981                         /* not compatible */
 982                         continue;
 983 
 984                 if (irqptr->airq_share < share) {
 985                         share = irqptr->airq_share;
 986                         chosen_irq = newirq;
 987                 }
 988         }
 989         if (chosen_irq != -1) {
 990                 /*
 991                  * Assign a share id which is free or which is larger
 992                  * than the largest one.
 993                  */
 994                 share_id = 1;
 995                 mutex_enter(&airq_mutex);
 996                 irqptr = apic_irq_table[chosen_irq];
 997                 while (irqptr) {
 998                         if (irqptr->airq_mps_intr_index == FREE_INDEX) {
 999                                 share_id = irqptr->airq_share_id;
1000                                 break;
1001                         }
1002                         if (share_id <= irqptr->airq_share_id)
1003                                 share_id = irqptr->airq_share_id + 1;
1004 #ifdef DEBUG
1005                         tmpirqp = irqptr;
1006 #endif /* DEBUG */
1007                         irqptr = irqptr->airq_next;
1008                 }
1009                 if (!irqptr) {
1010                         irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
1011                         irqptr->airq_temp_cpu = IRQ_UNINIT;
1012                         irqptr->airq_next =
1013                             apic_irq_table[chosen_irq]->airq_next;
1014                         apic_irq_table[chosen_irq]->airq_next = irqptr;
1015 #ifdef  DEBUG
1016                         tmpirqp = apic_irq_table[chosen_irq];
1017 #endif /* DEBUG */
1018                 }
1019                 irqptr->airq_mps_intr_index = intr_index;
1020                 irqptr->airq_ioapicindex = ioapicindex;
1021                 irqptr->airq_intin_no = ipin;
1022                 if (intr_flagp)
1023                         irqptr->airq_iflag = *intr_flagp;
1024                 irqptr->airq_vector = apic_irq_table[chosen_irq]->airq_vector;
1025                 irqptr->airq_share_id = share_id;
1026                 apic_record_rdt_entry(irqptr, irqno);
1027                 *irqptrp = irqptr;
1028 #ifdef  DEBUG
1029                 /* shuffle the pointers to test apic_delspl path */
1030                 if (tmpirqp) {
1031                         tmpirqp->airq_next = irqptr->airq_next;
1032                         irqptr->airq_next = apic_irq_table[chosen_irq];
1033                         apic_irq_table[chosen_irq] = irqptr;
1034                 }
1035 #endif /* DEBUG */
1036                 mutex_exit(&airq_mutex);
1037                 return (VIRTIRQ(chosen_irq, share_id));
1038         }
1039         return (-1);
1040 }
1041 
1042 /*
1043  * Allocate/Initialize the apic_irq_table[] entry for given irqno. If the entry
1044  * is used already, we will try to allocate a new irqno.
1045  *
1046  * Return value:
1047  *      Success: irqno
1048  *      Failure: -1
1049  */
1050 static int
1051 apic_setup_irq_table(dev_info_t *dip, int irqno, struct apic_io_intr *intrp,
1052     struct intrspec *ispec, iflag_t *intr_flagp, int type)
1053 {
1054         int origirq;
1055         uchar_t ipl;
1056         int     newirq, intr_index;
1057         uchar_t ipin, ioapic, ioapicindex, vector;
1058         apic_irq_t *irqptr;
1059         major_t major;
1060         dev_info_t      *sdip;
1061 
1062         ASSERT(ispec != NULL);
1063 
1064         origirq = ispec->intrspec_vec;
1065         ipl = ispec->intrspec_pri;
1066 
1067         DDI_INTR_IMPLDBG((CE_CONT, "apic_setup_irq_table: dip=0x%p type=%d "
1068             "irqno=0x%x origirq=0x%x\n", (void *)dip, type, irqno, origirq));
1069 
1070         major =  (dip != NULL) ? ddi_driver_major(dip) : 0;
1071 
1072         if (DDI_INTR_IS_MSI_OR_MSIX(type)) {
1073                 /* MSI/X doesn't need to setup ioapic stuffs */
1074                 ioapicindex = 0xff;
1075                 ioapic = 0xff;
1076                 ipin = (uchar_t)0xff;
1077                 intr_index = (type == DDI_INTR_TYPE_MSI) ? MSI_INDEX :
1078                     MSIX_INDEX;
1079                 mutex_enter(&airq_mutex);
1080                 if ((irqno = apic_allocate_irq(apic_first_avail_irq)) == -1) {
1081                         mutex_exit(&airq_mutex);
1082                         /* need an irq for MSI/X to index into autovect[] */
1083                         cmn_err(CE_WARN, "No interrupt irq: %s instance %d",
1084                             ddi_get_name(dip), ddi_get_instance(dip));
1085                         return (-1);
1086                 }
1087                 mutex_exit(&airq_mutex);
1088 
1089         } else if (intrp != NULL) {
1090                 intr_index = (int)(intrp - apic_io_intrp);
1091                 ioapic = intrp->intr_destid;
1092                 ipin = intrp->intr_destintin;
1093                 /* Find ioapicindex. If destid was ALL, we will exit with 0. */
1094                 for (ioapicindex = apic_io_max - 1; ioapicindex; ioapicindex--)
1095                         if (apic_io_id[ioapicindex] == ioapic)
1096                                 break;
1097                 ASSERT((ioapic == apic_io_id[ioapicindex]) ||
1098                     (ioapic == INTR_ALL_APIC));
1099 
1100                 /* check whether this intin# has been used by another irqno */
1101                 if ((newirq = apic_find_intin(ioapicindex, ipin)) != -1) {
1102                         return (newirq);
1103                 }
1104 
1105         } else if (intr_flagp != NULL) {
1106                 /* ACPI case */
1107                 intr_index = ACPI_INDEX;
1108                 ioapicindex = acpi_find_ioapic(irqno);
1109                 ASSERT(ioapicindex != 0xFF);
1110                 ioapic = apic_io_id[ioapicindex];
1111                 ipin = irqno - apic_io_vectbase[ioapicindex];
1112                 if (apic_irq_table[irqno] &&
1113                     apic_irq_table[irqno]->airq_mps_intr_index == ACPI_INDEX) {
1114                         ASSERT(apic_irq_table[irqno]->airq_intin_no == ipin &&
1115                             apic_irq_table[irqno]->airq_ioapicindex ==
1116                             ioapicindex);
1117                         return (irqno);
1118                 }
1119 
1120         } else {
1121                 /* default configuration */
1122                 ioapicindex = 0;
1123                 ioapic = apic_io_id[ioapicindex];
1124                 ipin = (uchar_t)irqno;
1125                 intr_index = DEFAULT_INDEX;
1126         }
1127 
1128         if ((vector = apic_allocate_vector(ipl, irqno, 0)) == 0) {
1129                 if ((newirq = apic_share_vector(irqno, intr_flagp, intr_index,
1130                     ipl, ioapicindex, ipin, &irqptr)) != -1) {
1131                         irqptr->airq_ipl = ipl;
1132                         irqptr->airq_origirq = (uchar_t)origirq;
1133                         irqptr->airq_dip = dip;
1134                         irqptr->airq_major = major;
1135                         sdip = apic_irq_table[IRQINDEX(newirq)]->airq_dip;
1136                         /* This is OK to do really */
1137                         if (sdip == NULL) {
1138                                 cmn_err(CE_WARN, "Sharing vectors: %s"
1139                                     " instance %d and SCI",
1140                                     ddi_get_name(dip), ddi_get_instance(dip));
1141                         } else {
1142                                 cmn_err(CE_WARN, "Sharing vectors: %s"
1143                                     " instance %d and %s instance %d",
1144                                     ddi_get_name(sdip), ddi_get_instance(sdip),
1145                                     ddi_get_name(dip), ddi_get_instance(dip));
1146                         }
1147                         return (newirq);
1148                 }
1149                 /* try high priority allocation now  that share has failed */
1150                 if ((vector = apic_allocate_vector(ipl, irqno, 1)) == 0) {
1151                         cmn_err(CE_WARN, "No interrupt vector: %s instance %d",
1152                             ddi_get_name(dip), ddi_get_instance(dip));
1153                         return (-1);
1154                 }
1155         }
1156 
1157         mutex_enter(&airq_mutex);
1158         if (apic_irq_table[irqno] == NULL) {
1159                 irqptr = kmem_zalloc(sizeof (apic_irq_t), KM_SLEEP);
1160                 irqptr->airq_temp_cpu = IRQ_UNINIT;
1161                 apic_irq_table[irqno] = irqptr;
1162         } else {
1163                 irqptr = apic_irq_table[irqno];
1164                 if (irqptr->airq_mps_intr_index != FREE_INDEX) {
1165                         /*
1166                          * The slot is used by another irqno, so allocate
1167                          * a free irqno for this interrupt
1168                          */
1169                         newirq = apic_allocate_irq(apic_first_avail_irq);
1170                         if (newirq == -1) {
1171                                 mutex_exit(&airq_mutex);
1172                                 return (-1);
1173                         }
1174                         irqno = newirq;
1175                         irqptr = apic_irq_table[irqno];
1176                         if (irqptr == NULL) {
1177                                 irqptr = kmem_zalloc(sizeof (apic_irq_t),
1178                                     KM_SLEEP);
1179                                 irqptr->airq_temp_cpu = IRQ_UNINIT;
1180                                 apic_irq_table[irqno] = irqptr;
1181                         }
1182                         vector = apic_modify_vector(vector, newirq);
1183                 }
1184         }
1185         apic_max_device_irq = max(irqno, apic_max_device_irq);
1186         apic_min_device_irq = min(irqno, apic_min_device_irq);
1187         mutex_exit(&airq_mutex);
1188         irqptr->airq_ioapicindex = ioapicindex;
1189         irqptr->airq_intin_no = ipin;
1190         irqptr->airq_ipl = ipl;
1191         irqptr->airq_vector = vector;
1192         irqptr->airq_origirq = (uchar_t)origirq;
1193         irqptr->airq_share_id = 0;
1194         irqptr->airq_mps_intr_index = (short)intr_index;
1195         irqptr->airq_dip = dip;
1196         irqptr->airq_major = major;
1197         irqptr->airq_cpu = apic_bind_intr(dip, irqno, ioapic, ipin);
1198         if (intr_flagp)
1199                 irqptr->airq_iflag = *intr_flagp;
1200 
1201         if (!DDI_INTR_IS_MSI_OR_MSIX(type)) {
1202                 /* setup I/O APIC entry for non-MSI/X interrupts */
1203                 apic_record_rdt_entry(irqptr, irqno);
1204         }
1205         return (irqno);
1206 }
1207 
1208 /*
1209  * return the cpu to which this intr should be bound.
1210  * Check properties or any other mechanism to see if user wants it
1211  * bound to a specific CPU. If so, return the cpu id with high bit set.
1212  * If not, use the policy to choose a cpu and return the id.
1213  */
1214 uint32_t
1215 apic_bind_intr(dev_info_t *dip, int irq, uchar_t ioapicid, uchar_t intin)
1216 {
1217         int     instance, instno, prop_len, bind_cpu, count;
1218         uint_t  i, rc;
1219         uint32_t cpu;
1220         major_t major;
1221         char    *name, *drv_name, *prop_val, *cptr;
1222         char    prop_name[32];
1223         ulong_t iflag;
1224 
1225 
1226         if (apic_intr_policy == INTR_LOWEST_PRIORITY)
1227                 return (IRQ_UNBOUND);
1228 
1229         if (apic_nproc == 1)
1230                 return (0);
1231 
1232         if (dip == NULL) {
1233                 iflag = intr_clear();
1234                 lock_set(&apic_ioapic_lock);
1235                 bind_cpu = apic_get_next_bind_cpu();
1236                 lock_clear(&apic_ioapic_lock);
1237                 intr_restore(iflag);
1238 
1239                 cmn_err(CE_CONT, "!%s: irq 0x%x "
1240                     "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
1241                     psm_name, irq, apic_irq_table[irq]->airq_vector, ioapicid,
1242                     intin, bind_cpu & ~IRQ_USER_BOUND);
1243 
1244                 return ((uint32_t)bind_cpu);
1245         }
1246 
1247         name = ddi_get_name(dip);
1248         major = ddi_name_to_major(name);
1249         drv_name = ddi_major_to_name(major);
1250         instance = ddi_get_instance(dip);
1251         if (apic_intr_policy == INTR_ROUND_ROBIN_WITH_AFFINITY) {
1252                 i = apic_min_device_irq;
1253                 for (; i <= apic_max_device_irq; i++) {
1254                         if ((i == irq) || (apic_irq_table[i] == NULL) ||
1255                             (apic_irq_table[i]->airq_mps_intr_index
1256                             == FREE_INDEX))
1257                                 continue;
1258 
1259                         if ((apic_irq_table[i]->airq_major == major) &&
1260                             (!(apic_irq_table[i]->airq_cpu & IRQ_USER_BOUND))) {
1261                                 cpu = apic_irq_table[i]->airq_cpu;
1262 
1263                                 cmn_err(CE_CONT,
1264                                     "!%s: %s (%s) instance #%d "
1265                                     "irq 0x%x vector 0x%x ioapic 0x%x "
1266                                     "intin 0x%x is bound to cpu %d\n",
1267                                     psm_name,
1268                                     name, drv_name, instance, irq,
1269                                     apic_irq_table[irq]->airq_vector,
1270                                     ioapicid, intin, cpu);
1271                                 return (cpu);
1272                         }
1273                 }
1274         }
1275         /*
1276          * search for "drvname"_intpt_bind_cpus property first, the
1277          * syntax of the property should be "a[,b,c,...]" where
1278          * instance 0 binds to cpu a, instance 1 binds to cpu b,
1279          * instance 3 binds to cpu c...
1280          * ddi_getlongprop() will search /option first, then /
1281          * if "drvname"_intpt_bind_cpus doesn't exist, then find
1282          * intpt_bind_cpus property.  The syntax is the same, and
1283          * it applies to all the devices if its "drvname" specific
1284          * property doesn't exist
1285          */
1286         (void) strcpy(prop_name, drv_name);
1287         (void) strcat(prop_name, "_intpt_bind_cpus");
1288         rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0, prop_name,
1289             (caddr_t)&prop_val, &prop_len);
1290         if (rc != DDI_PROP_SUCCESS) {
1291                 rc = ddi_getlongprop(DDI_DEV_T_ANY, dip, 0,
1292                     "intpt_bind_cpus", (caddr_t)&prop_val, &prop_len);
1293         }
1294         if (rc == DDI_PROP_SUCCESS) {
1295                 for (i = count = 0; i < (prop_len - 1); i++)
1296                         if (prop_val[i] == ',')
1297                                 count++;
1298                 if (prop_val[i-1] != ',')
1299                         count++;
1300                 /*
1301                  * if somehow the binding instances defined in the
1302                  * property are not enough for this instno., then
1303                  * reuse the pattern for the next instance until
1304                  * it reaches the requested instno
1305                  */
1306                 instno = instance % count;
1307                 i = 0;
1308                 cptr = prop_val;
1309                 while (i < instno)
1310                         if (*cptr++ == ',')
1311                                 i++;
1312                 bind_cpu = stoi(&cptr);
1313                 kmem_free(prop_val, prop_len);
1314                 /* if specific CPU is bogus, then default to next cpu */
1315                 if (!apic_cpu_in_range(bind_cpu)) {
1316                         cmn_err(CE_WARN, "%s: %s=%s: CPU %d not present",
1317                             psm_name, prop_name, prop_val, bind_cpu);
1318                         rc = DDI_PROP_NOT_FOUND;
1319                 } else {
1320                         /* indicate that we are bound at user request */
1321                         bind_cpu |= IRQ_USER_BOUND;
1322                 }
1323                 /*
1324                  * no need to check apic_cpus[].aci_status, if specific CPU is
1325                  * not up, then post_cpu_start will handle it.
1326                  */
1327         }
1328 
1329         if (rc != DDI_PROP_SUCCESS) {
1330                 iflag = intr_clear();
1331                 lock_set(&apic_ioapic_lock);
1332                 bind_cpu = apic_get_next_bind_cpu();
1333                 lock_clear(&apic_ioapic_lock);
1334                 intr_restore(iflag);
1335         }
1336 
1337         cmn_err(CE_CONT, "!%s: %s (%s) instance %d irq 0x%x "
1338             "vector 0x%x ioapic 0x%x intin 0x%x is bound to cpu %d\n",
1339             psm_name, name, drv_name, instance, irq,
1340             apic_irq_table[irq]->airq_vector, ioapicid, intin,
1341             bind_cpu & ~IRQ_USER_BOUND);
1342 
1343         return ((uint32_t)bind_cpu);
1344 }
1345 
1346 /*
1347  * Mark vector as being in the process of being deleted. Interrupts
1348  * may still come in on some CPU. The moment an interrupt comes with
1349  * the new vector, we know we can free the old one. Called only from
1350  * addspl and delspl with interrupts disabled. Because an interrupt
1351  * can be shared, but no interrupt from either device may come in,
1352  * we also use a timeout mechanism, which we arbitrarily set to
1353  * apic_revector_timeout microseconds.
1354  */
1355 static void
1356 apic_mark_vector(uchar_t oldvector, uchar_t newvector)
1357 {
1358         ulong_t iflag;
1359 
1360         iflag = intr_clear();
1361         lock_set(&apic_revector_lock);
1362         if (!apic_oldvec_to_newvec) {
1363                 apic_oldvec_to_newvec =
1364                     kmem_zalloc(sizeof (newvector) * APIC_MAX_VECTOR * 2,
1365                     KM_NOSLEEP);
1366 
1367                 if (!apic_oldvec_to_newvec) {
1368                         /*
1369                          * This failure is not catastrophic.
1370                          * But, the oldvec will never be freed.
1371                          */
1372                         apic_error |= APIC_ERR_MARK_VECTOR_FAIL;
1373                         lock_clear(&apic_revector_lock);
1374                         intr_restore(iflag);
1375                         return;
1376                 }
1377                 apic_newvec_to_oldvec = &apic_oldvec_to_newvec[APIC_MAX_VECTOR];
1378         }
1379 
1380         /* See if we already did this for drivers which do double addintrs */
1381         if (apic_oldvec_to_newvec[oldvector] != newvector) {
1382                 apic_oldvec_to_newvec[oldvector] = newvector;
1383                 apic_newvec_to_oldvec[newvector] = oldvector;
1384                 apic_revector_pending++;
1385         }
1386         lock_clear(&apic_revector_lock);
1387         intr_restore(iflag);
1388         (void) timeout(apic_xlate_vector_free_timeout_handler,
1389             (void *)(uintptr_t)oldvector, drv_usectohz(apic_revector_timeout));
1390 }
1391 
1392 /*
1393  * xlate_vector is called from intr_enter if revector_pending is set.
1394  * It will xlate it if needed and mark the old vector as free.
1395  */
1396 uchar_t
1397 apic_xlate_vector(uchar_t vector)
1398 {
1399         uchar_t newvector, oldvector = 0;
1400 
1401         lock_set(&apic_revector_lock);
1402         /* Do we really need to do this ? */
1403         if (!apic_revector_pending) {
1404                 lock_clear(&apic_revector_lock);
1405                 return (vector);
1406         }
1407         if ((newvector = apic_oldvec_to_newvec[vector]) != 0)
1408                 oldvector = vector;
1409         else {
1410                 /*
1411                  * The incoming vector is new . See if a stale entry is
1412                  * remaining
1413                  */
1414                 if ((oldvector = apic_newvec_to_oldvec[vector]) != 0)
1415                         newvector = vector;
1416         }
1417 
1418         if (oldvector) {
1419                 apic_revector_pending--;
1420                 apic_oldvec_to_newvec[oldvector] = 0;
1421                 apic_newvec_to_oldvec[newvector] = 0;
1422                 apic_free_vector(oldvector);
1423                 lock_clear(&apic_revector_lock);
1424                 /* There could have been more than one reprogramming! */
1425                 return (apic_xlate_vector(newvector));
1426         }
1427         lock_clear(&apic_revector_lock);
1428         return (vector);
1429 }
1430 
1431 void
1432 apic_xlate_vector_free_timeout_handler(void *arg)
1433 {
1434         ulong_t iflag;
1435         uchar_t oldvector, newvector;
1436 
1437         oldvector = (uchar_t)(uintptr_t)arg;
1438         iflag = intr_clear();
1439         lock_set(&apic_revector_lock);
1440         if ((newvector = apic_oldvec_to_newvec[oldvector]) != 0) {
1441                 apic_free_vector(oldvector);
1442                 apic_oldvec_to_newvec[oldvector] = 0;
1443                 apic_newvec_to_oldvec[newvector] = 0;
1444                 apic_revector_pending--;
1445         }
1446 
1447         lock_clear(&apic_revector_lock);
1448         intr_restore(iflag);
1449 }
1450 
1451 /*
1452  * Bind interrupt corresponding to irq_ptr to bind_cpu.
1453  * Must be called with interrupts disabled and apic_ioapic_lock held
1454  */
1455 int
1456 apic_rebind(apic_irq_t *irq_ptr, int bind_cpu,
1457     struct ioapic_reprogram_data *drep)
1458 {
1459         int                     ioapicindex, intin_no;
1460         uint32_t                airq_temp_cpu;
1461         apic_cpus_info_t        *cpu_infop;
1462         uint32_t                rdt_entry;
1463         int                     which_irq;
1464         ioapic_rdt_t            irdt;
1465 
1466         which_irq = apic_vector_to_irq[irq_ptr->airq_vector];
1467 
1468         intin_no = irq_ptr->airq_intin_no;
1469         ioapicindex = irq_ptr->airq_ioapicindex;
1470         airq_temp_cpu = irq_ptr->airq_temp_cpu;
1471         if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu != IRQ_UNBOUND) {
1472                 if (airq_temp_cpu & IRQ_USER_BOUND)
1473                         /* Mask off high bit so it can be used as array index */
1474                         airq_temp_cpu &= ~IRQ_USER_BOUND;
1475 
1476                 ASSERT(apic_cpu_in_range(airq_temp_cpu));
1477         }
1478 
1479         /*
1480          * Can't bind to a CPU that's not accepting interrupts:
1481          */
1482         cpu_infop = &apic_cpus[bind_cpu & ~IRQ_USER_BOUND];
1483         if (!(cpu_infop->aci_status & APIC_CPU_INTR_ENABLE))
1484                 return (1);
1485 
1486         /*
1487          * If we are about to change the interrupt vector for this interrupt,
1488          * and this interrupt is level-triggered, attached to an IOAPIC,
1489          * has been delivered to a CPU and that CPU has not handled it
1490          * yet, we cannot reprogram the IOAPIC now.
1491          */
1492         if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
1493 
1494                 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex,
1495                     intin_no);
1496 
1497                 if ((irq_ptr->airq_vector != RDT_VECTOR(rdt_entry)) &&
1498                     apic_check_stuck_interrupt(irq_ptr, airq_temp_cpu,
1499                     bind_cpu, ioapicindex, intin_no, which_irq, drep) != 0) {
1500 
1501                         return (0);
1502                 }
1503 
1504                 /*
1505                  * NOTE: We do not unmask the RDT here, as an interrupt MAY
1506                  * still come in before we have a chance to reprogram it below.
1507                  * The reprogramming below will simultaneously change and
1508                  * unmask the RDT entry.
1509                  */
1510 
1511                 if ((uint32_t)bind_cpu == IRQ_UNBOUND) {
1512                         irdt.ir_lo =  AV_LDEST | AV_LOPRI |
1513                             irq_ptr->airq_rdt_entry;
1514 
1515                         irdt.ir_hi = AV_TOALL >> APIC_ID_BIT_OFFSET;
1516 
1517                         apic_vt_ops->apic_intrmap_alloc_entry(
1518                             &irq_ptr->airq_intrmap_private, NULL,
1519                             DDI_INTR_TYPE_FIXED, 1, ioapicindex);
1520                         apic_vt_ops->apic_intrmap_map_entry(
1521                             irq_ptr->airq_intrmap_private, (void *)&irdt,
1522                             DDI_INTR_TYPE_FIXED, 1);
1523                         apic_vt_ops->apic_intrmap_record_rdt(
1524                             irq_ptr->airq_intrmap_private, &irdt);
1525 
1526                         /* Write the RDT entry -- no specific CPU binding */
1527                         WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
1528                             irdt.ir_hi | AV_TOALL);
1529 
1530                         if (airq_temp_cpu != IRQ_UNINIT && airq_temp_cpu !=
1531                             IRQ_UNBOUND)
1532                                 apic_cpus[airq_temp_cpu].aci_temp_bound--;
1533 
1534                         /*
1535                          * Write the vector, trigger, and polarity portion of
1536                          * the RDT
1537                          */
1538                         WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
1539                             irdt.ir_lo);
1540 
1541                         irq_ptr->airq_temp_cpu = IRQ_UNBOUND;
1542                         return (0);
1543                 }
1544         }
1545 
1546         if (bind_cpu & IRQ_USER_BOUND) {
1547                 cpu_infop->aci_bound++;
1548         } else {
1549                 cpu_infop->aci_temp_bound++;
1550         }
1551         ASSERT(apic_cpu_in_range(bind_cpu));
1552 
1553         if ((airq_temp_cpu != IRQ_UNBOUND) && (airq_temp_cpu != IRQ_UNINIT)) {
1554                 apic_cpus[airq_temp_cpu].aci_temp_bound--;
1555         }
1556         if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
1557 
1558                 irdt.ir_lo = AV_PDEST | AV_FIXED | irq_ptr->airq_rdt_entry;
1559                 irdt.ir_hi = cpu_infop->aci_local_id;
1560 
1561                 apic_vt_ops->apic_intrmap_alloc_entry(
1562                     &irq_ptr->airq_intrmap_private, NULL, DDI_INTR_TYPE_FIXED,
1563                     1, ioapicindex);
1564                 apic_vt_ops->apic_intrmap_map_entry(
1565                     irq_ptr->airq_intrmap_private,
1566                     (void *)&irdt, DDI_INTR_TYPE_FIXED, 1);
1567                 apic_vt_ops->apic_intrmap_record_rdt(
1568                     irq_ptr->airq_intrmap_private, &irdt);
1569 
1570                 /* Write the RDT entry -- bind to a specific CPU: */
1571                 WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
1572                     irdt.ir_hi);
1573 
1574                 /* Write the vector, trigger, and polarity portion of the RDT */
1575                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapicindex, intin_no,
1576                     irdt.ir_lo);
1577 
1578         } else {
1579                 int type = (irq_ptr->airq_mps_intr_index == MSI_INDEX) ?
1580                     DDI_INTR_TYPE_MSI : DDI_INTR_TYPE_MSIX;
1581                 if (type == DDI_INTR_TYPE_MSI) {
1582                         if (irq_ptr->airq_ioapicindex ==
1583                             irq_ptr->airq_origirq) {
1584                                 /* first one */
1585                                 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
1586                                     "apic_pci_msi_enable_vector\n"));
1587                                 apic_pci_msi_enable_vector(irq_ptr,
1588                                     type, which_irq, irq_ptr->airq_vector,
1589                                     irq_ptr->airq_intin_no,
1590                                     cpu_infop->aci_local_id);
1591                         }
1592                         if ((irq_ptr->airq_ioapicindex +
1593                             irq_ptr->airq_intin_no - 1) ==
1594                             irq_ptr->airq_origirq) { /* last one */
1595                                 DDI_INTR_IMPLDBG((CE_CONT, "apic_rebind: call "
1596                                     "apic_pci_msi_enable_mode\n"));
1597                                 apic_pci_msi_enable_mode(irq_ptr->airq_dip,
1598                                     type, which_irq);
1599                         }
1600                 } else { /* MSI-X */
1601                         apic_pci_msi_enable_vector(irq_ptr, type,
1602                             irq_ptr->airq_origirq, irq_ptr->airq_vector, 1,
1603                             cpu_infop->aci_local_id);
1604                         apic_pci_msi_enable_mode(irq_ptr->airq_dip, type,
1605                             irq_ptr->airq_origirq);
1606                 }
1607         }
1608         irq_ptr->airq_temp_cpu = (uint32_t)bind_cpu;
1609         apic_redist_cpu_skip &= ~(1 << (bind_cpu & ~IRQ_USER_BOUND));
1610         return (0);
1611 }
1612 
1613 static void
1614 apic_last_ditch_clear_remote_irr(int ioapic_ix, int intin_no)
1615 {
1616         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no)
1617             & AV_REMOTE_IRR) != 0) {
1618                 /*
1619                  * Trying to clear the bit through normal
1620                  * channels has failed.  So as a last-ditch
1621                  * effort, try to set the trigger mode to
1622                  * edge, then to level.  This has been
1623                  * observed to work on many systems.
1624                  */
1625                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1626                     intin_no,
1627                     READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1628                     intin_no) & ~AV_LEVEL);
1629 
1630                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1631                     intin_no,
1632                     READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1633                     intin_no) | AV_LEVEL);
1634 
1635                 /*
1636                  * If the bit's STILL set, this interrupt may
1637                  * be hosed.
1638                  */
1639                 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1640                     intin_no) & AV_REMOTE_IRR) != 0) {
1641 
1642                         prom_printf("%s: Remote IRR still "
1643                             "not clear for IOAPIC %d intin %d.\n"
1644                             "\tInterrupts to this pin may cease "
1645                             "functioning.\n", psm_name, ioapic_ix,
1646                             intin_no);
1647 #ifdef DEBUG
1648                         apic_last_ditch_reprogram_failures++;
1649 #endif
1650                 }
1651         }
1652 }
1653 
1654 /*
1655  * This function is protected by apic_ioapic_lock coupled with the
1656  * fact that interrupts are disabled.
1657  */
1658 static void
1659 delete_defer_repro_ent(int which_irq)
1660 {
1661         ASSERT(which_irq >= 0);
1662         ASSERT(which_irq <= 255);
1663         ASSERT(LOCK_HELD(&apic_ioapic_lock));
1664 
1665         if (apic_reprogram_info[which_irq].done)
1666                 return;
1667 
1668         apic_reprogram_info[which_irq].done = B_TRUE;
1669 
1670 #ifdef DEBUG
1671         apic_defer_repro_total_retries +=
1672             apic_reprogram_info[which_irq].tries;
1673 
1674         apic_defer_repro_successes++;
1675 #endif
1676 
1677         if (--apic_reprogram_outstanding == 0) {
1678 
1679                 setlvlx = psm_intr_exit_fn();
1680         }
1681 }
1682 
1683 
1684 /*
1685  * Interrupts must be disabled during this function to prevent
1686  * self-deadlock.  Interrupts are disabled because this function
1687  * is called from apic_check_stuck_interrupt(), which is called
1688  * from apic_rebind(), which requires its caller to disable interrupts.
1689  */
1690 static void
1691 add_defer_repro_ent(apic_irq_t *irq_ptr, int which_irq, int new_bind_cpu)
1692 {
1693         ASSERT(which_irq >= 0);
1694         ASSERT(which_irq <= 255);
1695         ASSERT(!interrupts_enabled());
1696 
1697         /*
1698          * On the off-chance that there's already a deferred
1699          * reprogramming on this irq, check, and if so, just update the
1700          * CPU and irq pointer to which the interrupt is targeted, then return.
1701          */
1702         if (!apic_reprogram_info[which_irq].done) {
1703                 apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
1704                 apic_reprogram_info[which_irq].irqp = irq_ptr;
1705                 return;
1706         }
1707 
1708         apic_reprogram_info[which_irq].irqp = irq_ptr;
1709         apic_reprogram_info[which_irq].bindcpu = new_bind_cpu;
1710         apic_reprogram_info[which_irq].tries = 0;
1711         /*
1712          * This must be the last thing set, since we're not
1713          * grabbing any locks, apic_try_deferred_reprogram() will
1714          * make its decision about using this entry iff done
1715          * is false.
1716          */
1717         apic_reprogram_info[which_irq].done = B_FALSE;
1718 
1719         /*
1720          * If there were previously no deferred reprogrammings, change
1721          * setlvlx to call apic_try_deferred_reprogram()
1722          */
1723         if (++apic_reprogram_outstanding == 1) {
1724 
1725                 setlvlx = apic_try_deferred_reprogram;
1726         }
1727 }
1728 
1729 static void
1730 apic_try_deferred_reprogram(int prev_ipl, int irq)
1731 {
1732         int reproirq;
1733         ulong_t iflag;
1734         struct ioapic_reprogram_data *drep;
1735 
1736         (*psm_intr_exit_fn())(prev_ipl, irq);
1737 
1738         if (!lock_try(&apic_defer_reprogram_lock)) {
1739                 return;
1740         }
1741 
1742         /*
1743          * Acquire the apic_ioapic_lock so that any other operations that
1744          * may affect the apic_reprogram_info state are serialized.
1745          * It's still possible for the last deferred reprogramming to clear
1746          * between the time we entered this function and the time we get to
1747          * the for loop below.  In that case, *setlvlx will have been set
1748          * back to *_intr_exit and drep will be NULL. (There's no way to
1749          * stop that from happening -- we would need to grab a lock before
1750          * calling *setlvlx, which is neither realistic nor prudent).
1751          */
1752         iflag = intr_clear();
1753         lock_set(&apic_ioapic_lock);
1754 
1755         /*
1756          * For each deferred RDT entry, try to reprogram it now.  Note that
1757          * there is no lock acquisition to read apic_reprogram_info because
1758          * '.done' is set only after the other fields in the structure are set.
1759          */
1760 
1761         drep = NULL;
1762         for (reproirq = 0; reproirq <= APIC_MAX_VECTOR; reproirq++) {
1763                 if (apic_reprogram_info[reproirq].done == B_FALSE) {
1764                         drep = &apic_reprogram_info[reproirq];
1765                         break;
1766                 }
1767         }
1768 
1769         /*
1770          * Either we found a deferred action to perform, or
1771          * we entered this function spuriously, after *setlvlx
1772          * was restored to point to *_intr_exit.  Any other
1773          * permutation is invalid.
1774          */
1775         ASSERT(drep != NULL || *setlvlx == psm_intr_exit_fn());
1776 
1777         /*
1778          * Though we can't really do anything about errors
1779          * at this point, keep track of them for reporting.
1780          * Note that it is very possible for apic_setup_io_intr
1781          * to re-register this very timeout if the Remote IRR bit
1782          * has not yet cleared.
1783          */
1784 
1785 #ifdef DEBUG
1786         if (drep != NULL) {
1787                 if (apic_setup_io_intr(drep, reproirq, B_TRUE) != 0) {
1788                         apic_deferred_setup_failures++;
1789                 }
1790         } else {
1791                 apic_deferred_spurious_enters++;
1792         }
1793 #else
1794         if (drep != NULL)
1795                 (void) apic_setup_io_intr(drep, reproirq, B_TRUE);
1796 #endif
1797 
1798         lock_clear(&apic_ioapic_lock);
1799         intr_restore(iflag);
1800 
1801         lock_clear(&apic_defer_reprogram_lock);
1802 }
1803 
1804 static void
1805 apic_ioapic_wait_pending_clear(int ioapic_ix, int intin_no)
1806 {
1807         int waited;
1808 
1809         /*
1810          * Wait for the delivery pending bit to clear.
1811          */
1812         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
1813             (AV_LEVEL|AV_PENDING)) == (AV_LEVEL|AV_PENDING)) {
1814 
1815                 /*
1816                  * If we're still waiting on the delivery of this interrupt,
1817                  * continue to wait here until it is delivered (this should be
1818                  * a very small amount of time, but include a timeout just in
1819                  * case).
1820                  */
1821                 for (waited = 0; waited < apic_max_reps_clear_pending;
1822                     waited++) {
1823                         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1824                             intin_no) & AV_PENDING) == 0) {
1825                                 break;
1826                         }
1827                 }
1828         }
1829 }
1830 
1831 
1832 /*
1833  * Checks to see if the IOAPIC interrupt entry specified has its Remote IRR
1834  * bit set.  Calls functions that modify the function that setlvlx points to,
1835  * so that the reprogramming can be retried very shortly.
1836  *
1837  * This function will mask the RDT entry if the interrupt is level-triggered.
1838  * (The caller is responsible for unmasking the RDT entry.)
1839  *
1840  * Returns non-zero if the caller should defer IOAPIC reprogramming.
1841  */
1842 static int
1843 apic_check_stuck_interrupt(apic_irq_t *irq_ptr, int old_bind_cpu,
1844     int new_bind_cpu, int ioapic_ix, int intin_no, int which_irq,
1845     struct ioapic_reprogram_data *drep)
1846 {
1847         int32_t                 rdt_entry;
1848         int                     waited;
1849         int                     reps = 0;
1850 
1851         /*
1852          * Wait for the delivery pending bit to clear.
1853          */
1854         do {
1855                 ++reps;
1856 
1857                 apic_ioapic_wait_pending_clear(ioapic_ix, intin_no);
1858 
1859                 /*
1860                  * Mask the RDT entry, but only if it's a level-triggered
1861                  * interrupt
1862                  */
1863                 rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1864                     intin_no);
1865                 if ((rdt_entry & (AV_LEVEL|AV_MASK)) == AV_LEVEL) {
1866 
1867                         /* Mask it */
1868                         WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no,
1869                             AV_MASK | rdt_entry);
1870                 }
1871 
1872                 if ((rdt_entry & AV_LEVEL) == AV_LEVEL) {
1873                         /*
1874                          * If there was a race and an interrupt was injected
1875                          * just before we masked, check for that case here.
1876                          * Then, unmask the RDT entry and try again.  If we're
1877                          * on our last try, don't unmask (because we want the
1878                          * RDT entry to remain masked for the rest of the
1879                          * function).
1880                          */
1881                         rdt_entry = READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1882                             intin_no);
1883                         if ((rdt_entry & AV_PENDING) &&
1884                             (reps < apic_max_reps_clear_pending)) {
1885                                 /* Unmask it */
1886                                 WRITE_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1887                                     intin_no, rdt_entry & ~AV_MASK);
1888                         }
1889                 }
1890 
1891         } while ((rdt_entry & AV_PENDING) &&
1892             (reps < apic_max_reps_clear_pending));
1893 
1894 #ifdef DEBUG
1895                 if (rdt_entry & AV_PENDING)
1896                         apic_intr_deliver_timeouts++;
1897 #endif
1898 
1899         /*
1900          * If the remote IRR bit is set, then the interrupt has been sent
1901          * to a CPU for processing.  We have no choice but to wait for
1902          * that CPU to process the interrupt, at which point the remote IRR
1903          * bit will be cleared.
1904          */
1905         if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix, intin_no) &
1906             (AV_LEVEL|AV_REMOTE_IRR)) == (AV_LEVEL|AV_REMOTE_IRR)) {
1907 
1908                 /*
1909                  * If the CPU that this RDT is bound to is NOT the current
1910                  * CPU, wait until that CPU handles the interrupt and ACKs
1911                  * it.  If this interrupt is not bound to any CPU (that is,
1912                  * if it's bound to the logical destination of "anyone"), it
1913                  * may have been delivered to the current CPU so handle that
1914                  * case by deferring the reprogramming (below).
1915                  */
1916                 if ((old_bind_cpu != IRQ_UNBOUND) &&
1917                     (old_bind_cpu != IRQ_UNINIT) &&
1918                     (old_bind_cpu != psm_get_cpu_id())) {
1919                         for (waited = 0; waited < apic_max_reps_clear_pending;
1920                             waited++) {
1921                                 if ((READ_IOAPIC_RDT_ENTRY_LOW_DWORD(ioapic_ix,
1922                                     intin_no) & AV_REMOTE_IRR) == 0) {
1923 
1924                                         delete_defer_repro_ent(which_irq);
1925 
1926                                         /* Remote IRR has cleared! */
1927                                         return (0);
1928                                 }
1929                         }
1930                 }
1931 
1932                 /*
1933                  * If we waited and the Remote IRR bit is still not cleared,
1934                  * AND if we've invoked the timeout APIC_REPROGRAM_MAX_TIMEOUTS
1935                  * times for this interrupt, try the last-ditch workaround:
1936                  */
1937                 if (drep && drep->tries >= APIC_REPROGRAM_MAX_TRIES) {
1938 
1939                         apic_last_ditch_clear_remote_irr(ioapic_ix, intin_no);
1940 
1941                         /* Mark this one as reprogrammed: */
1942                         delete_defer_repro_ent(which_irq);
1943 
1944                         return (0);
1945                 } else {
1946 #ifdef DEBUG
1947                         apic_intr_deferrals++;
1948 #endif
1949 
1950                         /*
1951                          * If waiting for the Remote IRR bit (above) didn't
1952                          * allow it to clear, defer the reprogramming.
1953                          * Add a new deferred-programming entry if the
1954                          * caller passed a NULL one (and update the existing one
1955                          * in case anything changed).
1956                          */
1957                         add_defer_repro_ent(irq_ptr, which_irq, new_bind_cpu);
1958                         if (drep)
1959                                 drep->tries++;
1960 
1961                         /* Inform caller to defer IOAPIC programming: */
1962                         return (1);
1963                 }
1964 
1965         }
1966 
1967         /* Remote IRR is clear */
1968         delete_defer_repro_ent(which_irq);
1969 
1970         return (0);
1971 }
1972 
1973 /*
1974  * Called to migrate all interrupts at an irq to another cpu.
1975  * Must be called with interrupts disabled and apic_ioapic_lock held
1976  */
1977 int
1978 apic_rebind_all(apic_irq_t *irq_ptr, int bind_cpu)
1979 {
1980         apic_irq_t      *irqptr = irq_ptr;
1981         int             retval = 0;
1982 
1983         while (irqptr) {
1984                 if (irqptr->airq_temp_cpu != IRQ_UNINIT)
1985                         retval |= apic_rebind(irqptr, bind_cpu, NULL);
1986                 irqptr = irqptr->airq_next;
1987         }
1988 
1989         return (retval);
1990 }
1991 
/*
 * apic_intr_redistribute does all the messy computations for identifying
 * which interrupt to move to which CPU. Currently we do just one interrupt
 * at a time. This reduces the time we spend doing all this within the clock
 * interrupt. When it is done in idle, we could do more than 1.
 * First we find the most busy and the most free CPU (time in ISR only),
 * skipping those CPUs that have been identified as being ineligible
 * (cpu_skip). Then we look for IRQs which are closest to the difference
 * between the most busy CPU and the average ISR load. We try to find one
 * whose load is less than the difference. If none exists, then we choose
 * one larger than the difference, provided it does not make the most idle
 * CPU worse than the most busy one. In the end, we clear all the busy
 * fields for CPUs. For IRQs, they are cleared as they are scanned.
 */
2006 void
2007 apic_intr_redistribute(void)
2008 {
2009         int busiest_cpu, most_free_cpu;
2010         int cpu_free, cpu_busy, max_busy, min_busy;
2011         int min_free, diff;
2012         int average_busy, cpus_online;
2013         int i, busy;
2014         ulong_t iflag;
2015         apic_cpus_info_t *cpu_infop;
2016         apic_irq_t *min_busy_irq = NULL;
2017         apic_irq_t *max_busy_irq = NULL;
2018 
2019         busiest_cpu = most_free_cpu = -1;
2020         cpu_free = cpu_busy = max_busy = average_busy = 0;
2021         min_free = apic_sample_factor_redistribution;
2022         cpus_online = 0;
2023         /*
2024          * Below we will check for CPU_INTR_ENABLE, bound, temp_bound, temp_cpu
2025          * without ioapic_lock. That is OK as we are just doing statistical
2026          * sampling anyway and any inaccuracy now will get corrected next time
2027          * The call to rebind which actually changes things will make sure
2028          * we are consistent.
2029          */
2030         for (i = 0; i < apic_nproc; i++) {
2031                 if (apic_cpu_in_range(i) &&
2032                     !(apic_redist_cpu_skip & (1 << i)) &&
2033                     (apic_cpus[i].aci_status & APIC_CPU_INTR_ENABLE)) {
2034 
2035                         cpu_infop = &apic_cpus[i];
2036                         /*
2037                          * If no unbound interrupts or only 1 total on this
2038                          * CPU, skip
2039                          */
2040                         if (!cpu_infop->aci_temp_bound ||
2041                             (cpu_infop->aci_bound + cpu_infop->aci_temp_bound)
2042                             == 1) {
2043                                 apic_redist_cpu_skip |= 1 << i;
2044                                 continue;
2045                         }
2046 
2047                         busy = cpu_infop->aci_busy;
2048                         average_busy += busy;
2049                         cpus_online++;
2050                         if (max_busy < busy) {
2051                                 max_busy = busy;
2052                                 busiest_cpu = i;
2053                         }
2054                         if (min_free > busy) {
2055                                 min_free = busy;
2056                                 most_free_cpu = i;
2057                         }
2058                         if (busy > apic_int_busy_mark) {
2059                                 cpu_busy |= 1 << i;
2060                         } else {
2061                                 if (busy < apic_int_free_mark)
2062                                         cpu_free |= 1 << i;
2063                         }
2064                 }
2065         }
2066         if ((cpu_busy && cpu_free) ||
2067             (max_busy >= (min_free + apic_diff_for_redistribution))) {
2068 
2069                 apic_num_imbalance++;
2070 #ifdef  DEBUG
2071                 if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2072                         prom_printf(
2073                             "redistribute busy=%x free=%x max=%x min=%x",
2074                             cpu_busy, cpu_free, max_busy, min_free);
2075                 }
2076 #endif /* DEBUG */
2077 
2078 
2079                 average_busy /= cpus_online;
2080 
2081                 diff = max_busy - average_busy;
2082                 min_busy = max_busy; /* start with the max possible value */
2083                 max_busy = 0;
2084                 min_busy_irq = max_busy_irq = NULL;
2085                 i = apic_min_device_irq;
2086                 for (; i <= apic_max_device_irq; i++) {
2087                         apic_irq_t *irq_ptr;
2088                         /* Change to linked list per CPU ? */
2089                         if ((irq_ptr = apic_irq_table[i]) == NULL)
2090                                 continue;
2091                         /* Check for irq_busy & decide which one to move */
2092                         /* Also zero them for next round */
2093                         if ((irq_ptr->airq_temp_cpu == busiest_cpu) &&
2094                             irq_ptr->airq_busy) {
2095                                 if (irq_ptr->airq_busy < diff) {
2096                                         /*
2097                                          * Check for least busy CPU,
2098                                          * best fit or what ?
2099                                          */
2100                                         if (max_busy < irq_ptr->airq_busy) {
2101                                                 /*
2102                                                  * Most busy within the
2103                                                  * required differential
2104                                                  */
2105                                                 max_busy = irq_ptr->airq_busy;
2106                                                 max_busy_irq = irq_ptr;
2107                                         }
2108                                 } else {
2109                                         if (min_busy > irq_ptr->airq_busy) {
2110                                                 /*
2111                                                  * least busy, but more than
2112                                                  * the reqd diff
2113                                                  */
2114                                                 if (min_busy <
2115                                                     (diff + average_busy -
2116                                                     min_free)) {
2117                                                         /*
2118                                                          * Making sure new cpu
2119                                                          * will not end up
2120                                                          * worse
2121                                                          */
2122                                                         min_busy =
2123                                                             irq_ptr->airq_busy;
2124 
2125                                                         min_busy_irq = irq_ptr;
2126                                                 }
2127                                         }
2128                                 }
2129                         }
2130                         irq_ptr->airq_busy = 0;
2131                 }
2132 
2133                 if (max_busy_irq != NULL) {
2134 #ifdef  DEBUG
2135                         if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2136                                 prom_printf("rebinding %x to %x",
2137                                     max_busy_irq->airq_vector, most_free_cpu);
2138                         }
2139 #endif /* DEBUG */
2140                         iflag = intr_clear();
2141                         if (lock_try(&apic_ioapic_lock)) {
2142                                 if (apic_rebind_all(max_busy_irq,
2143                                     most_free_cpu) == 0) {
2144                                         /* Make change permenant */
2145                                         max_busy_irq->airq_cpu =
2146                                             (uint32_t)most_free_cpu;
2147                                 }
2148                                 lock_clear(&apic_ioapic_lock);
2149                         }
2150                         intr_restore(iflag);
2151 
2152                 } else if (min_busy_irq != NULL) {
2153 #ifdef  DEBUG
2154                         if (apic_verbose & APIC_VERBOSE_IOAPIC_FLAG) {
2155                                 prom_printf("rebinding %x to %x",
2156                                     min_busy_irq->airq_vector, most_free_cpu);
2157                         }
2158 #endif /* DEBUG */
2159 
2160                         iflag = intr_clear();
2161                         if (lock_try(&apic_ioapic_lock)) {
2162                                 if (apic_rebind_all(min_busy_irq,
2163                                     most_free_cpu) == 0) {
2164                                         /* Make change permenant */
2165                                         min_busy_irq->airq_cpu =
2166                                             (uint32_t)most_free_cpu;
2167                                 }
2168                                 lock_clear(&apic_ioapic_lock);
2169                         }
2170                         intr_restore(iflag);
2171 
2172                 } else {
2173                         if (cpu_busy != (1 << busiest_cpu)) {
2174                                 apic_redist_cpu_skip |= 1 << busiest_cpu;
2175                                 /*
2176                                  * We leave cpu_skip set so that next time we
2177                                  * can choose another cpu
2178                                  */
2179                         }
2180                 }
2181                 apic_num_rebind++;
2182         } else {
2183                 /*
2184                  * found nothing. Could be that we skipped over valid CPUs
2185                  * or we have balanced everything. If we had a variable
2186                  * ticks_for_redistribution, it could be increased here.
2187                  * apic_int_busy, int_free etc would also need to be
2188                  * changed.
2189                  */
2190                 if (apic_redist_cpu_skip)
2191                         apic_redist_cpu_skip = 0;
2192         }
2193         for (i = 0; i < apic_nproc; i++) {
2194                 if (apic_cpu_in_range(i)) {
2195                         apic_cpus[i].aci_busy = 0;
2196                 }
2197         }
2198 }
2199 
2200 void
2201 apic_cleanup_busy(void)
2202 {
2203         int i;
2204         apic_irq_t *irq_ptr;
2205 
2206         for (i = 0; i < apic_nproc; i++) {
2207                 if (apic_cpu_in_range(i)) {
2208                         apic_cpus[i].aci_busy = 0;
2209                 }
2210         }
2211 
2212         for (i = apic_min_device_irq; i <= apic_max_device_irq; i++) {
2213                 if ((irq_ptr = apic_irq_table[i]) != NULL)
2214                         irq_ptr->airq_busy = 0;
2215         }
2216 }