1 
   2 /*
   3  * CDDL HEADER START
   4  *
   5  * The contents of this file are subject to the terms of the
   6  * Common Development and Distribution License (the "License").
   7  * You may not use this file except in compliance with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 /*
  26  * Copyright (c) 2009-2010, Intel Corporation.
  27  * All rights reserved.
  28  */
  29 
  30 #define PSMI_1_7
  31 #include <sys/smp_impldefs.h>
  32 #include <sys/psm.h>
  33 #include <sys/psm_modctl.h>
  34 #include <sys/pit.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/strlog.h>
  37 #include <sys/clock.h>
  38 #include <sys/debug.h>
  39 #include <sys/rtc.h>
  40 #include <sys/x86_archext.h>
  41 #include <sys/cpupart.h>
  42 #include <sys/cpuvar.h>
  43 #include <sys/cpu_event.h>
  44 #include <sys/cmt.h>
  45 #include <sys/cpu.h>
  46 #include <sys/disp.h>
  47 #include <sys/archsystm.h>
  48 #include <sys/machsystm.h>
  49 #include <sys/sysmacros.h>
  50 #include <sys/memlist.h>
  51 #include <sys/param.h>
  52 #include <sys/promif.h>
  53 #include <sys/cpu_pm.h>
  54 #if defined(__xpv)
  55 #include <sys/hypervisor.h>
  56 #endif
  57 #include <sys/mach_intr.h>
  58 #include <vm/hat_i86.h>
  59 #include <sys/kdi_machimpl.h>
  60 #include <sys/sdt.h>
  61 #include <sys/hpet.h>
  62 #include <sys/sunddi.h>
  63 #include <sys/sunndi.h>
  64 #include <sys/cpc_pcbe.h>
  65 
  66 #define OFFSETOF(s, m)          (size_t)(&(((s *)0)->m))
  67 
  68 /*
  69  *      Local function prototypes
  70  */
  71 static int mp_disable_intr(processorid_t cpun);
  72 static void mp_enable_intr(processorid_t cpun);
  73 static void mach_init();
  74 static void mach_picinit();
  75 static int machhztomhz(uint64_t cpu_freq_hz);
  76 static uint64_t mach_getcpufreq(void);
  77 static void mach_fixcpufreq(void);
  78 static int mach_clkinit(int, int *);
  79 static void mach_smpinit(void);
  80 static int mach_softlvl_to_vect(int ipl);
  81 static void mach_get_platform(int owner);
  82 static void mach_construct_info();
  83 static int mach_translate_irq(dev_info_t *dip, int irqno);
  84 static int mach_intr_ops(dev_info_t *, ddi_intr_handle_impl_t *,
  85     psm_intr_op_t, int *);
  86 static void mach_notify_error(int level, char *errmsg);
  87 static hrtime_t dummy_hrtime(void);
  88 static void dummy_scalehrtime(hrtime_t *);
  89 static uint64_t dummy_unscalehrtime(hrtime_t);
  90 void cpu_idle(void);
  91 static void cpu_wakeup(cpu_t *, int);
  92 #ifndef __xpv
  93 void cpu_idle_mwait(void);
  94 static void cpu_wakeup_mwait(cpu_t *, int);
  95 #endif
  96 static int mach_cpu_create_devinfo(cpu_t *cp, dev_info_t **dipp);
  97 
  98 /*
  99  *      External reference functions
 100  */
 101 extern void return_instr();
 102 extern uint64_t freq_tsc(uint32_t *);
 103 #if defined(__i386)
 104 extern uint64_t freq_notsc(uint32_t *);
 105 #endif
 106 extern void pc_gethrestime(timestruc_t *);
 107 extern int cpuid_get_coreid(cpu_t *);
 108 extern int cpuid_get_chipid(cpu_t *);
 109 
 110 /*
 111  *      PSM functions initialization
 112  */
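/*
 * Most of these start out as no-op stubs (return_instr), simple defaults,
 * or NULL, and are re-vectored by mach_init() and its helpers once a PSM
 * module has been selected.
 */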
 113 void (*psm_shutdownf)(int, int) = (void (*)(int, int))return_instr;
 114 void (*psm_preshutdownf)(int, int) = (void (*)(int, int))return_instr;
 115 void (*psm_notifyf)(int)        = (void (*)(int))return_instr;
 116 void (*psm_set_idle_cpuf)(int)  = (void (*)(int))return_instr;
 117 void (*psm_unset_idle_cpuf)(int) = (void (*)(int))return_instr;
 118 void (*psminitf)()              = mach_init;
 119 void (*picinitf)()              = return_instr;
 120 int (*clkinitf)(int, int *)     = (int (*)(int, int *))return_instr;
 121 int (*ap_mlsetup)()             = (int (*)(void))return_instr;
 122 void (*send_dirintf)()          = return_instr;
 123 void (*setspl)(int)             = (void (*)(int))return_instr;
 124 int (*addspl)(int, int, int, int) = (int (*)(int, int, int, int))return_instr;
 125 int (*delspl)(int, int, int, int) = (int (*)(int, int, int, int))return_instr;
 126 int (*get_pending_spl)(void)    = (int (*)(void))return_instr;
 127 int (*addintr)(void *, int, avfunc, char *, int, caddr_t, caddr_t,
 128     uint64_t *, dev_info_t *) = NULL;
 129 void (*remintr)(void *, int, avfunc, int) = NULL;
 130 void (*kdisetsoftint)(int, struct av_softinfo *)=
 131         (void (*)(int, struct av_softinfo *))return_instr;
 132 void (*setsoftint)(int, struct av_softinfo *)=
 133         (void (*)(int, struct av_softinfo *))return_instr;
 134 int (*slvltovect)(int)          = (int (*)(int))return_instr;
 135 int (*setlvl)(int, int *)       = (int (*)(int, int *))return_instr;
 136 void (*setlvlx)(int, int)       = (void (*)(int, int))return_instr;
 137 int (*psm_disable_intr)(int)    = mp_disable_intr;
 138 void (*psm_enable_intr)(int)    = mp_enable_intr;
 139 hrtime_t (*gethrtimef)(void)    = dummy_hrtime;
 140 hrtime_t (*gethrtimeunscaledf)(void)    = dummy_hrtime;
 141 void (*scalehrtimef)(hrtime_t *)        = dummy_scalehrtime;
 142 uint64_t (*unscalehrtimef)(hrtime_t)    = dummy_unscalehrtime;
 143 int (*psm_translate_irq)(dev_info_t *, int) = mach_translate_irq;
 144 void (*gethrestimef)(timestruc_t *) = pc_gethrestime;
 145 void (*psm_notify_error)(int, char *) = (void (*)(int, char *))NULL;
 146 int (*psm_get_clockirq)(int) = NULL;
 147 int (*psm_get_ipivect)(int, int) = NULL;
 148 uchar_t (*psm_get_ioapicid)(uchar_t) = NULL;
 149 uint32_t (*psm_get_localapicid)(uint32_t) = NULL;
 150 uchar_t (*psm_xlate_vector_by_irq)(uchar_t) = NULL;
 151 
 152 int (*psm_clkinit)(int) = NULL;
 153 void (*psm_timer_reprogram)(hrtime_t) = NULL;
 154 void (*psm_timer_enable)(void) = NULL;
 155 void (*psm_timer_disable)(void) = NULL;
 156 void (*psm_post_cyclic_setup)(void *arg) = NULL;
 157 int (*psm_intr_ops)(dev_info_t *, ddi_intr_handle_impl_t *, psm_intr_op_t,
 158     int *) = mach_intr_ops;
 159 int (*psm_state)(psm_state_request_t *) = (int (*)(psm_state_request_t *))
 160     return_instr;
 161 
 162 void (*notify_error)(int, char *) = (void (*)(int, char *))return_instr;
 163 void (*hrtime_tick)(void)       = return_instr;
 164 
 165 int (*psm_cpu_create_devinfo)(cpu_t *, dev_info_t **) = mach_cpu_create_devinfo;
 166 int (*psm_cpu_get_devinfo)(cpu_t *, dev_info_t **) = NULL;
 167 
 168 /* global IRM pool for APIX (PSM) module */
 169 ddi_irm_pool_t *apix_irm_pool_p = NULL;
 170 
 171 /*
 172  * True if the generic TSC code is our source of hrtime, rather than whatever
 173  * the PSM can provide.
 174  */
 175 #ifdef __xpv
 176 int tsc_gethrtime_enable = 0;
 177 #else
 178 int tsc_gethrtime_enable = 1;
 179 #endif
 180 int tsc_gethrtime_initted = 0;
 181 
 182 /*
 183  * True if the hrtime implementation is "hires"; namely, better than microdata.
 184  */
 185 int gethrtime_hires = 0;
 186 
 187 /*
 188  * Local Static Data
 189  */
 190 static struct psm_ops mach_ops;
 191 static struct psm_ops *mach_set[4] = {&mach_ops, NULL, NULL, NULL};
 192 static ushort_t mach_ver[4] = {0, 0, 0, 0};
 193 
 194 /*
 195  * virtualization support for psm
 196  */
 197 void *psm_vt_ops = NULL;
 198 /*
 199  * If non-zero, idle cpus will become "halted" when there's
 200  * no work to do.
 201  */
 202 int     idle_cpu_use_hlt = 1;
 203 
 204 #ifndef __xpv
 205 /*
 206  * If non-zero, idle cpus will use mwait if available to halt instead of hlt.
 207  */
 208 int     idle_cpu_prefer_mwait = 1;
 209 /*
 210  * Set to 0 to avoid MONITOR+CLFLUSH assertion.
 211  */
 212 int     idle_cpu_assert_cflush_monitor = 1;
 213 
 214 /*
 215  * If non-zero, idle cpus will not use power saving Deep C-States idle loop.
 216  */
 217 int     idle_cpu_no_deep_c = 0;
 218 /*
 219  * Non-power saving idle loop and wakeup pointers.
 220  * Allows user to toggle Deep Idle power saving feature on/off.
 221  */
 222 void    (*non_deep_idle_cpu)() = cpu_idle;
 223 void    (*non_deep_idle_disp_enq_thread)(cpu_t *, int);
 224 
 225 /*
 226  * Object for the kernel to access the HPET.
 227  */
 228 hpet_t hpet;
 229 
 230 #endif  /* ifndef __xpv */
 231 
 232 uint_t cp_haltset_fanout = 0;
 233 
 234 /*ARGSUSED*/
 235 int
 236 pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw)
 237 {
 238         switch (hw) {
 239         case PGHW_IPIPE:
 240                 if (is_x86_feature(x86_featureset, X86FSET_HTT)) {
 241                         /*
 242                          * Hyper-threading is SMT
 243                          */
 244                         return (1);
 245                 } else {
 246                         return (0);
 247                 }
 248         case PGHW_FPU:
 249                 if (cpuid_get_cores_per_compunit(cp) > 1)
 250                         return (1);
 251                 else
 252                         return (0);
 253         case PGHW_PROCNODE:
 254                 if (cpuid_get_procnodes_per_pkg(cp) > 1)
 255                         return (1);
 256                 else
 257                         return (0);
 258         case PGHW_CHIP:
 259                 if (is_x86_feature(x86_featureset, X86FSET_CMP) ||
 260                     is_x86_feature(x86_featureset, X86FSET_HTT))
 261                         return (1);
 262                 else
 263                         return (0);
 264         case PGHW_CACHE:
 265                 if (cpuid_get_ncpu_sharing_last_cache(cp) > 1)
 266                         return (1);
 267                 else
 268                         return (0);
 269         case PGHW_POW_ACTIVE:
 270                 if (cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE) != (id_t)-1)
 271                         return (1);
 272                 else
 273                         return (0);
 274         case PGHW_POW_IDLE:
 275                 if (cpupm_domain_id(cp, CPUPM_DTYPE_IDLE) != (id_t)-1)
 276                         return (1);
 277                 else
 278                         return (0);
 279         default:
 280                 return (0);
 281         }
 282 }
 283 
 284 /*
 285  * Compare two CPUs and see if they have a pghw_type_t sharing relationship
 286  * If pghw_type_t is an unsupported hardware type, then return -1
 287  */
 288 int
 289 pg_plat_cpus_share(cpu_t *cpu_a, cpu_t *cpu_b, pghw_type_t hw)
 290 {
 291         id_t pgp_a, pgp_b;
 292 
 293         pgp_a = pg_plat_hw_instance_id(cpu_a, hw);
 294         pgp_b = pg_plat_hw_instance_id(cpu_b, hw);
 295 
 296         if (pgp_a == -1 || pgp_b == -1)
 297                 return (-1);
 298 
 299         return (pgp_a == pgp_b);
 300 }
 301 
 302 /*
 303  * Return a physical instance identifier for known hardware sharing
 304  * relationships
 305  */
 306 id_t
 307 pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw)
 308 {
 309         switch (hw) {
 310         case PGHW_IPIPE:
 311                 return (cpuid_get_coreid(cpu));
 312         case PGHW_CACHE:
 313                 return (cpuid_get_last_lvl_cacheid(cpu));
 314         case PGHW_FPU:
 315                 return (cpuid_get_compunitid(cpu));
 316         case PGHW_PROCNODE:
 317                 return (cpuid_get_procnodeid(cpu));
 318         case PGHW_CHIP:
 319                 return (cpuid_get_chipid(cpu));
 320         case PGHW_POW_ACTIVE:
 321                 return (cpupm_domain_id(cpu, CPUPM_DTYPE_ACTIVE));
 322         case PGHW_POW_IDLE:
 323                 return (cpupm_domain_id(cpu, CPUPM_DTYPE_IDLE));
 324         default:
 325                 return (-1);
 326         }
 327 }
 328 
 329 /*
 330  * Express preference for optimizing for sharing relationship
 331  * hw1 vs hw2
 332  */
 333 pghw_type_t
 334 pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2)
 335 {
 336         int i, rank1, rank2;
 337 
 338         static pghw_type_t hw_hier[] = {
 339                 PGHW_IPIPE,
 340                 PGHW_CACHE,
 341                 PGHW_FPU,
 342                 PGHW_PROCNODE,
 343                 PGHW_CHIP,
 344                 PGHW_POW_IDLE,
 345                 PGHW_POW_ACTIVE,
 346                 PGHW_NUM_COMPONENTS
 347         };
 348 
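        /*
         * Entries later in hw_hier[] outrank earlier ones; for example,
         * pg_plat_hw_rank(PGHW_IPIPE, PGHW_CACHE) returns PGHW_CACHE.
         */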
 349         for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) {
 350                 if (hw_hier[i] == hw1)
 351                         rank1 = i;
 352                 if (hw_hier[i] == hw2)
 353                         rank2 = i;
 354         }
 355 
 356         if (rank1 > rank2)
 357                 return (hw1);
 358         else
 359                 return (hw2);
 360 }
 361 
 362 /*
 363  * Override the default CMT dispatcher policy for the specified
 364  * hardware sharing relationship
 365  */
 366 pg_cmt_policy_t
 367 pg_plat_cmt_policy(pghw_type_t hw)
 368 {
 369         /*
 370          * For shared caches, also load balance across them to
 371          * maximize aggregate cache capacity
 372          *
 373          * On AMD family 0x15 CPUs, cores come in pairs called
 374          * compute units, sharing the FPU and the L1I and L2
 375          * caches. Use balancing and cache affinity.
 376          */
 377         switch (hw) {
 378         case PGHW_FPU:
 379         case PGHW_CACHE:
 380                 return (CMT_BALANCE|CMT_AFFINITY);
 381         default:
 382                 return (CMT_NO_POLICY);
 383         }
 384 }
 385 
 386 id_t
 387 pg_plat_get_core_id(cpu_t *cpu)
 388 {
 389         return ((id_t)cpuid_get_coreid(cpu));
 390 }
 391 
 392 void
 393 cmp_set_nosteal_interval(void)
 394 {
 395         /* Set the nosteal interval (used by disp_getbest()) to 100us */
 396         nosteal_nsec = 100000UL;
 397 }
 398 
 399 /*
 400  * Routine to ensure that initial callers of hrtime get 0 as the return value
 401  */
 402 static hrtime_t
 403 dummy_hrtime(void)
 404 {
 405         return (0);
 406 }
 407 
 408 /* ARGSUSED */
 409 static void
 410 dummy_scalehrtime(hrtime_t *ticks)
 411 {}
 412 
 413 static uint64_t
 414 dummy_unscalehrtime(hrtime_t nsecs)
 415 {
 416         return ((uint64_t)nsecs);
 417 }
 418 
 419 /*
 420  * Supports Deep C-State power saving idle loop.
 421  */
 422 void
 423 cpu_idle_adaptive(void)
 424 {
 425         (*CPU->cpu_m.mcpu_idle_cpu)();
 426 }
 427 
 428 /*
 429  * Function called by CPU idle notification framework to check whether CPU
 430  * has been awakened. It will be called with interrupt disabled.
 431  * If CPU has been awakened, call cpu_idle_exit() to notify CPU idle
 432  * notification framework.
 433  */
 434 /*ARGSUSED*/
 435 static void
 436 cpu_idle_check_wakeup(void *arg)
 437 {
 438         /*
 439          * Toggle interrupt flag to detect pending interrupts.
 440          * If interrupt happened, do_interrupt() will notify CPU idle
 441          * notification framework so no need to call cpu_idle_exit() here.
 442          */
 443         sti();
 444         SMT_PAUSE();
 445         cli();
 446 }
 447 
 448 /*
 449  * Idle the present CPU until wakened via an interrupt
 450  */
 451 void
 452 cpu_idle(void)
 453 {
 454         cpu_t           *cpup = CPU;
 455         processorid_t   cpu_sid = cpup->cpu_seqid;
 456         cpupart_t       *cp = cpup->cpu_part;
 457         int             hset_update = 1;
 458 
 459         /*
 460          * If this CPU is online, and there are multiple CPUs
 461          * in the system, then we should note our halting
 462          * by adding ourselves to the partition's halted CPU
 463          * bitmap. This allows other CPUs to find/awaken us when
 464          * work becomes available.
 465          */
 466         if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
 467                 hset_update = 0;
 468 
 469         /*
 470          * Add ourselves to the partition's halted CPUs bitmap
 471          * and set our HALTED flag, if necessary.
 472          *
 473          * When a thread becomes runnable, it is placed on the queue
 474          * and then the halted CPU bitmap is checked to determine who
 475          * (if anyone) should be awakened. We therefore need to first
 476          * add ourselves to the bitmap, and then check if there
 477          * is any work available. The order is important to prevent a race
 478          * that can lead to work languishing on a run queue somewhere while
 479          * this CPU remains halted.
 480          *
 481          * Either the producing CPU will see we're halted and will awaken us,
 482          * or this CPU will see the work available in disp_anywork().
 483          *
 484          * Note that memory barriers after updating the HALTED flag
 485          * are not necessary since an atomic operation (updating the bitset)
 486          * immediately follows. On x86 the atomic operation acts as a
 487          * memory barrier for the update of cpu_disp_flags.
 488          */
 489         if (hset_update) {
 490                 cpup->cpu_disp_flags |= CPU_DISP_HALTED;
 491                 bitset_atomic_add(&cp->cp_haltset, cpu_sid);
 492         }
 493 
 494         /*
 495          * Check to make sure there's really nothing to do.
 496          * Work destined for this CPU may become available after
 497          * this check. We'll be notified through the clearing of our
 498          * bit in the halted CPU bitmap, and a poke.
 499          */
 500         if (disp_anywork()) {
 501                 if (hset_update) {
 502                         cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 503                         bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 504                 }
 505                 return;
 506         }
 507 
 508         /*
 509          * We're on our way to being halted.
 510          *
 511          * Disable interrupts now, so that we'll awaken immediately
 512          * after halting if someone tries to poke us between now and
 513          * the time we actually halt.
 514          *
 515          * We check for the presence of our bit after disabling interrupts.
 516          * If it's cleared, we'll return. If the bit is cleared after
 517          * we check then the poke will pop us out of the halted state.
 518          *
 519          * This means that the ordering of the poke and the clearing
 520          * of the bit by cpu_wakeup is important.
 521          * cpu_wakeup() must clear, then poke.
 522          * cpu_idle() must disable interrupts, then check for the bit.
 523          */
 524         cli();
 525 
 526         if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
 527                 cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 528                 sti();
 529                 return;
 530         }
 531 
 532         /*
 533          * The check for anything locally runnable is here for performance
 534          * and isn't needed for correctness. disp_nrunnable ought to be
 535          * in our cache still, so it's inexpensive to check, and if there
 536          * is anything runnable we won't have to wait for the poke.
 537          */
 538         if (cpup->cpu_disp->disp_nrunnable != 0) {
 539                 if (hset_update) {
 540                         cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 541                         bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 542                 }
 543                 sti();
 544                 return;
 545         }
 546 
 547         if (cpu_idle_enter(IDLE_STATE_C1, 0,
 548             cpu_idle_check_wakeup, NULL) == 0) {
 549                 mach_cpu_idle();
 550                 cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
 551         }
 552 
 553         /*
 554          * We're no longer halted
 555          */
 556         if (hset_update) {
 557                 cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 558                 bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 559         }
 560 }
 561 
 562 
 563 /*
 564  * If "cpu" is halted, then wake it up clearing its halted bit in advance.
 565  * Otherwise, see if other CPUs in the cpu partition are halted and need to
 566  * be woken up so that they can steal the thread we placed on this CPU.
 567  * This function is only used on MP systems.
 568  */
 569 static void
 570 cpu_wakeup(cpu_t *cpu, int bound)
 571 {
 572         uint_t          cpu_found;
 573         processorid_t   cpu_sid;
 574         cpupart_t       *cp;
 575 
 576         cp = cpu->cpu_part;
 577         cpu_sid = cpu->cpu_seqid;
 578         if (bitset_in_set(&cp->cp_haltset, cpu_sid)) {
 579                 /*
 580                  * Clear the halted bit for that CPU since it will be
 581                  * poked in a moment.
 582                  */
 583                 bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 584                 /*
 585                  * We may find the current CPU present in the halted cpuset
 586                  * if we're in the context of an interrupt that occurred
 587                  * before we had a chance to clear our bit in cpu_idle().
 588                  * Poking ourself is obviously unnecessary, since if
 589                  * we're here, we're not halted.
 590                  */
 591                 if (cpu != CPU)
 592                         poke_cpu(cpu->cpu_id);
 593                 return;
 594         } else {
 595                 /*
 596                  * This cpu isn't halted, but it's idle or undergoing a
 597                  * context switch. No need to awaken anyone else.
 598                  */
 599                 if (cpu->cpu_thread == cpu->cpu_idle_thread ||
 600                     cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL)
 601                         return;
 602         }
 603 
 604         /*
 605          * No need to wake up other CPUs if this is for a bound thread.
 606          */
 607         if (bound)
 608                 return;
 609 
 610         /*
 611          * The CPU specified for wakeup isn't currently halted, so check
 612          * to see if there are any other halted CPUs in the partition,
 613          * and if there are then awaken one.
 614          */
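        /*
         * Retry if another CPU claims the chosen CPU first; in that case
         * bitset_atomic_test_and_del() fails and we look again.
         */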
 615         do {
 616                 cpu_found = bitset_find(&cp->cp_haltset);
 617                 if (cpu_found == (uint_t)-1)
 618                         return;
 619         } while (bitset_atomic_test_and_del(&cp->cp_haltset, cpu_found) < 0);
 620 
 621         if (cpu_found != CPU->cpu_seqid) {
 622                 poke_cpu(cpu_seq[cpu_found]->cpu_id);
 623         }
 624 }
 625 
 626 #ifndef __xpv
 627 /*
 628  * Function called by CPU idle notification framework to check whether CPU
 629  * has been awakened. It will be called with interrupt disabled.
 630  * If CPU has been awakened, call cpu_idle_exit() to notify CPU idle
 631  * notification framework.
 632  */
 633 static void
 634 cpu_idle_mwait_check_wakeup(void *arg)
 635 {
 636         volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
 637 
 638         ASSERT(arg != NULL);
 639         if (*mcpu_mwait != MWAIT_HALTED) {
 640                 /*
 641                  * CPU has been awakened, notify CPU idle notification system.
 642                  */
 643                 cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
 644         } else {
 645                 /*
 646                  * Toggle interrupt flag to detect pending interrupts.
 647                  * If interrupt happened, do_interrupt() will notify CPU idle
 648                  * notification framework so no need to call cpu_idle_exit()
 649                  * here.
 650                  */
 651                 sti();
 652                 SMT_PAUSE();
 653                 cli();
 654         }
 655 }
 656 
 657 /*
 658  * Idle the present CPU until awakened via touching its monitored line
 659  */
 660 void
 661 cpu_idle_mwait(void)
 662 {
 663         volatile uint32_t       *mcpu_mwait = CPU->cpu_m.mcpu_mwait;
 664         cpu_t                   *cpup = CPU;
 665         processorid_t           cpu_sid = cpup->cpu_seqid;
 666         cpupart_t               *cp = cpup->cpu_part;
 667         int                     hset_update = 1;
 668 
 669         /*
 670          * Set our mcpu_mwait here, so we can tell if anyone tries to
 671          * wake us between now and when we call mwait.  No other cpu will
 672          * attempt to set our mcpu_mwait until we add ourself to the halted
 673          * CPU bitmap.
 674          */
 675         *mcpu_mwait = MWAIT_HALTED;
 676 
 677         /*
 678          * If this CPU is online, and there are multiple CPUs
 679          * in the system, then we should note our halting
 680          * by adding ourselves to the partition's halted CPU
 681          * bitmap. This allows other CPUs to find/awaken us when
 682          * work becomes available.
 683          */
 684         if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
 685                 hset_update = 0;
 686 
 687         /*
 688          * Add ourselves to the partition's halted CPUs bitmap
 689          * and set our HALTED flag, if necessary.
 690          *
 691          * When a thread becomes runnable, it is placed on the queue
 692          * and then the halted CPU bitmap is checked to determine who
 693          * (if anyone) should be awakened. We therefore need to first
 694          * add ourselves to the bitmap, and then check if there
 695          * is any work available.
 696          *
 697          * Note that memory barriers after updating the HALTED flag
 698          * are not necessary since an atomic operation (updating the bitmap)
 699          * immediately follows. On x86 the atomic operation acts as a
 700          * memory barrier for the update of cpu_disp_flags.
 701          */
 702         if (hset_update) {
 703                 cpup->cpu_disp_flags |= CPU_DISP_HALTED;
 704                 bitset_atomic_add(&cp->cp_haltset, cpu_sid);
 705         }
 706 
 707         /*
 708          * Check to make sure there's really nothing to do.
 709          * Work destined for this CPU may become available after
 710          * this check. We'll be notified through the clearing of our
 711          * bit in the halted CPU bitmap, and a write to our mcpu_mwait.
 712          *
 713          * disp_anywork() checks disp_nrunnable, so we do not have to later.
 714          */
 715         if (disp_anywork()) {
 716                 if (hset_update) {
 717                         cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 718                         bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 719                 }
 720                 return;
 721         }
 722 
 723         /*
 724          * We're on our way to being halted.
 725          * To avoid a lost wakeup, arm the monitor before checking if another
 726          * cpu wrote to mcpu_mwait to wake us up.
 727          */
 728         i86_monitor(mcpu_mwait, 0, 0);
 729         if (*mcpu_mwait == MWAIT_HALTED) {
 730                 if (cpu_idle_enter(IDLE_STATE_C1, 0,
 731                     cpu_idle_mwait_check_wakeup, (void *)mcpu_mwait) == 0) {
 732                         if (*mcpu_mwait == MWAIT_HALTED) {
 733                                 i86_mwait(0, 0);
 734                         }
 735                         cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
 736                 }
 737         }
 738 
 739         /*
 740          * We're no longer halted
 741          */
 742         if (hset_update) {
 743                 cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 744                 bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 745         }
 746 }
 747 
 748 /*
 749  * If "cpu" is halted in mwait, then wake it up clearing its halted bit in
 750  * advance.  Otherwise, see if other CPUs in the cpu partition are halted and
 751  * need to be woken up so that they can steal the thread we placed on this CPU.
 752  * This function is only used on MP systems.
 753  */
 754 static void
 755 cpu_wakeup_mwait(cpu_t *cp, int bound)
 756 {
 757         cpupart_t       *cpu_part;
 758         uint_t          cpu_found;
 759         processorid_t   cpu_sid;
 760 
 761         cpu_part = cp->cpu_part;
 762         cpu_sid = cp->cpu_seqid;
 763 
 764         /*
 765          * Clear the halted bit for that CPU since it will be woken up
 766          * in a moment.
 767          */
 768         if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
 769                 /*
 770                  * Clear the halted bit for that CPU since it will be
 771                  * poked in a moment.
 772                  */
 773                 bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
 774                 /*
 775                  * We may find the current CPU present in the halted cpuset
 776                  * if we're in the context of an interrupt that occurred
 777                  * before we had a chance to clear our bit in cpu_idle().
 778                  * Waking ourself is obviously unnecessary, since if
 779                  * we're here, we're not halted.
 780                  *
 781                  * monitor/mwait wakeup via writing to our cache line is
 782                  * harmless and less expensive than always checking if we
 783                  * are waking ourself which is an uncommon case.
 784                  */
 785                 MWAIT_WAKEUP(cp);       /* write to monitored line */
 786                 return;
 787         } else {
 788                 /*
 789                  * This cpu isn't halted, but it's idle or undergoing a
 790                  * context switch. No need to awaken anyone else.
 791                  */
 792                 if (cp->cpu_thread == cp->cpu_idle_thread ||
 793                     cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
 794                         return;
 795         }
 796 
 797         /*
 798          * No need to wake up other CPUs if the thread we just enqueued
 799          * is bound.
 800          */
 801         if (bound || ncpus == 1)
 802                 return;
 803 
 804         /*
 805          * See if there's any other halted CPUs. If there are, then
 806          * select one, and awaken it.
 807          * It's possible that after we find a CPU, somebody else
 808          * will awaken it before we get the chance.
 809          * In that case, look again.
 810          */
 811         do {
 812                 cpu_found = bitset_find(&cpu_part->cp_haltset);
 813                 if (cpu_found == (uint_t)-1)
 814                         return;
 815         } while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
 816             cpu_found) < 0);
 817 
 818         /*
 819          * Do not check if cpu_found is ourself as monitor/mwait
 820          * wakeup is cheap.
 821          */
 822         MWAIT_WAKEUP(cpu_seq[cpu_found]); /* write to monitored line */
 823 }
 824 
 825 #endif
 826 
 827 void (*cpu_pause_handler)(volatile char *) = NULL;
 828 
 829 static int
 830 mp_disable_intr(int cpun)
 831 {
 832         /*
 833          * switch to the offline cpu
 834          */
 835         affinity_set(cpun);
 836         /*
 837          * raise ipl to just below cross call
 838          */
 839         splx(XC_SYS_PIL - 1);
 840         /*
 841          *      set base spl to prevent the next swtch to idle from
 842          *      lowering back to ipl 0
 843          */
 844         CPU->cpu_intr_actv |= (1 << (XC_SYS_PIL - 1));
 845         set_base_spl();
 846         affinity_clear();
 847         return (DDI_SUCCESS);
 848 }
 849 
 850 static void
 851 mp_enable_intr(int cpun)
 852 {
 853         /*
 854          * switch to the online cpu
 855          */
 856         affinity_set(cpun);
 857         /*
 858          * clear the interrupt active mask
 859          */
 860         CPU->cpu_intr_actv &= ~(1 << (XC_SYS_PIL - 1));
 861         set_base_spl();
 862         (void) spl0();
 863         affinity_clear();
 864 }
 865 
 866 static void
 867 mach_get_platform(int owner)
 868 {
 869         void            **srv_opsp;
 870         void            **clt_opsp;
 871         int             i;
 872         int             total_ops;
 873 
 874         /* fix up psm ops */
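        /*
         * Older PSMI versions define progressively shorter psm_ops
         * structures, so use the client's version to bound how many
         * function pointers may safely be copied below.
         */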
 875         srv_opsp = (void **)mach_set[0];
 876         clt_opsp = (void **)mach_set[owner];
 877         if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01)
 878                 total_ops = sizeof (struct psm_ops_ver01) /
 879                     sizeof (void (*)(void));
 880         else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_1)
 881                 /* no psm_notify_func */
 882                 total_ops = OFFSETOF(struct psm_ops, psm_notify_func) /
 883                     sizeof (void (*)(void));
 884         else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_2)
 885                 /* no psm_timer funcs */
 886                 total_ops = OFFSETOF(struct psm_ops, psm_timer_reprogram) /
 887                     sizeof (void (*)(void));
 888         else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_3)
 889                 /* no psm_preshutdown function */
 890                 total_ops = OFFSETOF(struct psm_ops, psm_preshutdown) /
 891                     sizeof (void (*)(void));
 892         else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_4)
 893                 /* no psm_intr_ops function */
 894                 total_ops = OFFSETOF(struct psm_ops, psm_intr_ops) /
 895                     sizeof (void (*)(void));
 896         else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_5)
 897                 /* no psm_state function */
 898                 total_ops = OFFSETOF(struct psm_ops, psm_state) /
 899                     sizeof (void (*)(void));
 900         else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_6)
 901                 /* no psm_cpu_ops function */
 902                 total_ops = OFFSETOF(struct psm_ops, psm_cpu_ops) /
 903                     sizeof (void (*)(void));
 904         else
 905                 total_ops = sizeof (struct psm_ops) / sizeof (void (*)(void));
 906 
 907         /*
 908          * Save the version of the PSM module, in case we need to
 909          * behave differently based on version.
 910          */
 911         mach_ver[0] = mach_ver[owner];
 912 
 913         for (i = 0; i < total_ops; i++)
 914                 if (clt_opsp[i] != NULL)
 915                         srv_opsp[i] = clt_opsp[i];
 916 }
 917 
 918 static void
 919 mach_construct_info()
 920 {
 921         struct psm_sw *swp;
 922         int     mach_cnt[PSM_OWN_OVERRIDE+1] = {0};
 923         int     conflict_owner = 0;
 924 
 925         if (psmsw->psw_forw == psmsw)
 926                 panic("No valid PSM modules found");
 927         mutex_enter(&psmsw_lock);
 928         for (swp = psmsw->psw_forw; swp != psmsw; swp = swp->psw_forw) {
 929                 if (!(swp->psw_flag & PSM_MOD_IDENTIFY))
 930                         continue;
 931                 mach_set[swp->psw_infop->p_owner] = swp->psw_infop->p_ops;
 932                 mach_ver[swp->psw_infop->p_owner] = swp->psw_infop->p_version;
 933                 mach_cnt[swp->psw_infop->p_owner]++;
 934         }
 935         mutex_exit(&psmsw_lock);
 936 
 937         mach_get_platform(PSM_OWN_SYS_DEFAULT);
 938 
 939         /* check to see if there are any conflicts */
 940         if (mach_cnt[PSM_OWN_EXCLUSIVE] > 1)
 941                 conflict_owner = PSM_OWN_EXCLUSIVE;
 942         if (mach_cnt[PSM_OWN_OVERRIDE] > 1)
 943                 conflict_owner = PSM_OWN_OVERRIDE;
 944         if (conflict_owner) {
 945                 /* remove all psm modules except uppc */
 946                 cmn_err(CE_WARN,
 947                     "Conflicts detected on the following PSM modules:");
 948                 mutex_enter(&psmsw_lock);
 949                 for (swp = psmsw->psw_forw; swp != psmsw; swp = swp->psw_forw) {
 950                         if (swp->psw_infop->p_owner == conflict_owner)
 951                                 cmn_err(CE_WARN, "%s ",
 952                                     swp->psw_infop->p_mach_idstring);
 953                 }
 954                 mutex_exit(&psmsw_lock);
 955                 cmn_err(CE_WARN,
 956                     "Setting the system back to SINGLE processor mode!");
 957                 cmn_err(CE_WARN,
 958                     "Please edit /etc/mach to remove the invalid PSM module.");
 959                 return;
 960         }
 961 
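        /*
         * Layer platform-specific ops on top of the system default set:
         * exclusive modules first, then override modules.
         */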
 962         if (mach_set[PSM_OWN_EXCLUSIVE])
 963                 mach_get_platform(PSM_OWN_EXCLUSIVE);
 964 
 965         if (mach_set[PSM_OWN_OVERRIDE])
 966                 mach_get_platform(PSM_OWN_OVERRIDE);
 967 }
 968 
 969 static void
 970 mach_init()
 971 {
 972         struct psm_ops  *pops;
 973 
 974         mach_construct_info();
 975 
 976         pops = mach_set[0];
 977 
 978         /* register the interrupt and clock initialization routines */
 979         picinitf = mach_picinit;
 980         clkinitf = mach_clkinit;
 981         psm_get_clockirq = pops->psm_get_clockirq;
 982 
 983         /* register the interrupt setup code */
 984         slvltovect = mach_softlvl_to_vect;
 985         addspl  = pops->psm_addspl;
 986         delspl  = pops->psm_delspl;
 987 
 988         if (pops->psm_translate_irq)
 989                 psm_translate_irq = pops->psm_translate_irq;
 990         if (pops->psm_intr_ops)
 991                 psm_intr_ops = pops->psm_intr_ops;
 992 
 993 #if defined(PSMI_1_2) || defined(PSMI_1_3) || defined(PSMI_1_4)
 994         /*
 995          * Time-of-day functionality now handled in TOD modules.
 996          * (Warn about PSM modules that think that we're going to use
 997          * their ops vectors.)
 998          */
 999         if (pops->psm_tod_get)
1000                 cmn_err(CE_WARN, "obsolete psm_tod_get op %p",
1001                     (void *)pops->psm_tod_get);
1002 
1003         if (pops->psm_tod_set)
1004                 cmn_err(CE_WARN, "obsolete psm_tod_set op %p",
1005                     (void *)pops->psm_tod_set);
1006 #endif
1007 
1008         if (pops->psm_notify_error) {
1009                 psm_notify_error = mach_notify_error;
1010                 notify_error = pops->psm_notify_error;
1011         }
1012 
1013         (*pops->psm_softinit)();
1014 
1015         /*
1016          * Initialize the dispatcher's function hooks to enable CPU halting
1017          * when idle.  Set both the deep-idle and non-deep-idle hooks.
1018          *
1019          * Assume we can use power saving deep-idle loop cpu_idle_adaptive.
1020          * Platform deep-idle driver will reset our idle loop to
1021          * non_deep_idle_cpu if power saving deep-idle feature is not available.
1022          *
1023          * Do not use monitor/mwait if idle_cpu_use_hlt is not set (spin idle)
1024          * or idle_cpu_prefer_mwait is not set.
1025          * Allocate monitor/mwait buffer for cpu0.
1026          */
1027 #ifndef __xpv
1028         non_deep_idle_disp_enq_thread = disp_enq_thread;
1029 #endif
1030         if (idle_cpu_use_hlt) {
1031                 idle_cpu = cpu_idle_adaptive;
1032                 CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
1033 #ifndef __xpv
1034                 if (is_x86_feature(x86_featureset, X86FSET_MWAIT) &&
1035                     idle_cpu_prefer_mwait) {
1036                         CPU->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU);
1037                         /*
1038                          * Protect ourself from insane mwait size.
1039                          */
1040                         if (CPU->cpu_m.mcpu_mwait == NULL) {
1041 #ifdef DEBUG
1042                                 cmn_err(CE_NOTE, "Using hlt idle.  Cannot "
1043                                     "handle cpu 0 mwait size.");
1044 #endif
1045                                 idle_cpu_prefer_mwait = 0;
1046                                 CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
1047                         } else {
1048                                 CPU->cpu_m.mcpu_idle_cpu = cpu_idle_mwait;
1049                         }
1050                 } else {
1051                         CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
1052                 }
1053                 non_deep_idle_cpu = CPU->cpu_m.mcpu_idle_cpu;
1054 
1055                 /*
1056                  * Disable power saving deep idle loop?
1057                  */
1058                 if (idle_cpu_no_deep_c) {
1059                         idle_cpu = non_deep_idle_cpu;
1060                 }
1061 #endif
1062         }
1063 
1064         mach_smpinit();
1065 }
1066 
1067 static void
1068 mach_smpinit(void)
1069 {
1070         struct psm_ops  *pops;
1071         processorid_t cpu_id;
1072         int cnt;
1073         cpuset_t cpumask;
1074 
1075         pops = mach_set[0];
1076         CPUSET_ZERO(cpumask);
1077 
1078         cpu_id = -1;
1079         cpu_id = (*pops->psm_get_next_processorid)(cpu_id);
1080         /*
1081          * Only add boot_ncpus CPUs to mp_cpus. Other CPUs will be handled
1082          * by CPU DR driver at runtime.
1083          */
1084         for (cnt = 0; cpu_id != -1 && cnt < boot_ncpus; cnt++) {
1085                 CPUSET_ADD(cpumask, cpu_id);
1086                 cpu_id = (*pops->psm_get_next_processorid)(cpu_id);
1087         }
1088 
1089         mp_cpus = cpumask;
1090 
1091         /* MP related routines */
1092         ap_mlsetup = pops->psm_post_cpu_start;
1093         send_dirintf = pops->psm_send_ipi;
1094 
1095         /* optional MP related routines */
1096         if (pops->psm_shutdown)
1097                 psm_shutdownf = pops->psm_shutdown;
1098         if (pops->psm_preshutdown)
1099                 psm_preshutdownf = pops->psm_preshutdown;
1100         if (pops->psm_notify_func)
1101                 psm_notifyf = pops->psm_notify_func;
1102         if (pops->psm_set_idlecpu)
1103                 psm_set_idle_cpuf = pops->psm_set_idlecpu;
1104         if (pops->psm_unset_idlecpu)
1105                 psm_unset_idle_cpuf = pops->psm_unset_idlecpu;
1106 
1107         psm_clkinit = pops->psm_clkinit;
1108 
1109         if (pops->psm_timer_reprogram)
1110                 psm_timer_reprogram = pops->psm_timer_reprogram;
1111 
1112         if (pops->psm_timer_enable)
1113                 psm_timer_enable = pops->psm_timer_enable;
1114 
1115         if (pops->psm_timer_disable)
1116                 psm_timer_disable = pops->psm_timer_disable;
1117 
1118         if (pops->psm_post_cyclic_setup)
1119                 psm_post_cyclic_setup = pops->psm_post_cyclic_setup;
1120 
1121         if (pops->psm_state)
1122                 psm_state = pops->psm_state;
1123 
1124         /*
1125          * Set these vectors here so they can be used by Suspend/Resume
1126          * on UP machines.
1127          */
1128         if (pops->psm_disable_intr)
1129                 psm_disable_intr = pops->psm_disable_intr;
1130         if (pops->psm_enable_intr)
1131                 psm_enable_intr  = pops->psm_enable_intr;
1132 
1133         /* check for multiple CPUs */
1134         if (cnt < 2 && plat_dr_support_cpu() == B_FALSE)
1135                 return;
1136 
1137         /* check for MP platforms */
1138         if (pops->psm_cpu_start == NULL)
1139                 return;
1140 
1141         /*
1142          * Set the dispatcher hook to enable cpu "wake up"
1143          * when a thread becomes runnable.
1144          */
1145         if (idle_cpu_use_hlt) {
1146                 disp_enq_thread = cpu_wakeup;
1147 #ifndef __xpv
1148                 if (is_x86_feature(x86_featureset, X86FSET_MWAIT) &&
1149                     idle_cpu_prefer_mwait)
1150                         disp_enq_thread = cpu_wakeup_mwait;
1151                 non_deep_idle_disp_enq_thread = disp_enq_thread;
1152 #endif
1153         }
1154 
1155         psm_get_ipivect = pops->psm_get_ipivect;
1156 
1157         (void) add_avintr((void *)NULL, XC_HI_PIL, xc_serv, "xc_intr",
1158             (*pops->psm_get_ipivect)(XC_HI_PIL, PSM_INTR_IPI_HI),
1159             NULL, NULL, NULL, NULL);
1160 
1161         (void) (*pops->psm_get_ipivect)(XC_CPUPOKE_PIL, PSM_INTR_POKE);
1162 }
1163 
1164 static void
1165 mach_picinit()
1166 {
1167         struct psm_ops  *pops;
1168 
1169         pops = mach_set[0];
1170 
1171         /* register the interrupt handlers */
1172         setlvl = pops->psm_intr_enter;
1173         setlvlx = pops->psm_intr_exit;
1174 
1175         /* initialize the interrupt hardware */
1176         (*pops->psm_picinit)();
1177 
1178         /* set interrupt mask for current ipl */
1179         setspl = pops->psm_setspl;
1180         cli();
1181         setspl(CPU->cpu_pri);
1182 }
1183 
1184 uint_t  cpu_freq;       /* MHz */
1185 uint64_t cpu_freq_hz;   /* measured (in hertz) */
1186 
1187 #define MEGA_HZ         1000000
1188 
1189 #ifdef __xpv
1190 
1191 int xpv_cpufreq_workaround = 1;
1192 int xpv_cpufreq_verbose = 0;
1193 
1194 #else   /* __xpv */
1195 
1196 static uint64_t
1197 mach_calchz(uint32_t pit_counter, uint64_t *processor_clks)
1198 {
1199         uint64_t cpu_hz;
1200 
1201         if ((pit_counter == 0) || (*processor_clks == 0) ||
1202             (*processor_clks > (((uint64_t)-1) / PIT_HZ)))
1203                 return (0);
1204 
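        /*
         * Illustrative example (numbers assumed, not measured): with the
         * PIT running at PIT_HZ (~1.193182 MHz), a sample of 2,515,000
         * processor clocks against pit_counter = 1000 PIT ticks yields
         * roughly 1193182 * 2515000 / 1000 ~= 3.0 GHz.
         */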
1205         cpu_hz = ((uint64_t)PIT_HZ * *processor_clks) / pit_counter;
1206 
1207         return (cpu_hz);
1208 }
1209 
1210 #endif  /* __xpv */
1211 
1212 static uint64_t
1213 mach_getcpufreq(void)
1214 {
1215 #if defined(__xpv)
1216         vcpu_time_info_t *vti = &CPU->cpu_m.mcpu_vcpu_info->time;
1217         uint64_t cpu_hz;
1218 
1219         /*
1220          * During dom0 bringup, it was noted that on at least one older
1221          * Intel HT machine, the hypervisor initially gives a tsc_to_system_mul
1222          * value that is quite wrong (the 3.06GHz clock was reported
1223          * as 4.77GHz)
1224          *
1225          * The curious thing is, that if you stop the kernel at entry,
1226          * breakpoint here and inspect the value with kmdb, the value
1227          * is correct - but if you don't stop and simply enable the
1228          * printf statement (below), you can see the bad value printed
1229          * here.  Almost as if something kmdb did caused the hypervisor to
1230          * figure it out correctly.  And, note that the hypervisor
1231          * eventually -does- figure it out correctly ... if you look at
1232          * the field later in the life of dom0, it is correct.
1233          *
1234          * For now, on dom0, we employ a slightly cheesy workaround of
1235          * using the DOM0_PHYSINFO hypercall.
1236          */
1237         if (DOMAIN_IS_INITDOMAIN(xen_info) && xpv_cpufreq_workaround) {
1238                 cpu_hz = 1000 * xpv_cpu_khz();
1239         } else {
1240                 cpu_hz = (UINT64_C(1000000000) << 32) / vti->tsc_to_system_mul;
1241 
1242                 if (vti->tsc_shift < 0)
1243                         cpu_hz <<= -vti->tsc_shift;
1244                 else
1245                         cpu_hz >>= vti->tsc_shift;
1246         }
1247 
1248         if (xpv_cpufreq_verbose)
1249                 printf("mach_getcpufreq: system_mul 0x%x, shift %d, "
1250                     "cpu_hz %" PRId64 "Hz\n",
1251                     vti->tsc_to_system_mul, vti->tsc_shift, cpu_hz);
1252 
1253         return (cpu_hz);
1254 #else   /* __xpv */
1255         uint32_t pit_counter;
1256         uint64_t processor_clks;
1257 
1258         if (is_x86_feature(x86_featureset, X86FSET_TSC)) {
1259                 /*
1260                  * We have a TSC. freq_tsc() knows how to measure the number
1261                  * of clock cycles sampled against the PIT.
1262                  */
1263                 ulong_t flags = clear_int_flag();
1264                 processor_clks = freq_tsc(&pit_counter);
1265                 restore_int_flag(flags);
1266                 return (mach_calchz(pit_counter, &processor_clks));
1267         } else if (x86_vendor == X86_VENDOR_Cyrix || x86_type == X86_TYPE_P5) {
1268 #if defined(__amd64)
1269                 panic("mach_getcpufreq: no TSC!");
1270 #elif defined(__i386)
1271                 /*
1272                  * We are a Cyrix based on a 6x86 core or an Intel Pentium
1273                  * for which freq_notsc() knows how to measure the number of
1274                  * elapsed clock cycles sampled against the PIT
1275                  */
1276                 ulong_t flags = clear_int_flag();
1277                 processor_clks = freq_notsc(&pit_counter);
1278                 restore_int_flag(flags);
1279                 return (mach_calchz(pit_counter, &processor_clks));
1280 #endif  /* __i386 */
1281         }
1282 
1283         /* We do not know how to calculate cpu frequency for this cpu. */
1284         return (0);
1285 #endif  /* __xpv */
1286 }
1287 
1288 /*
1289  * If the clock speed of a cpu is found to be reported incorrectly, do not add
1290  * to this array, instead improve the accuracy of the algorithm that determines
1291  * the clock speed of the processor or extend the implementation to support the
1292  * vendor as appropriate. This is here only to support adjusting the speed on
1293  * older slower processors that mach_fixcpufreq() would not be able to account
1294  * for otherwise.
1295  */
1296 static int x86_cpu_freq[] = { 60, 75, 80, 90, 120, 160, 166, 175, 180, 233 };
1297 
1298 /*
1299  * On fast processors the clock frequency that is measured may be off by
1300  * a few MHz from the value printed on the part. This is a combination of
1301  * two factors: for such fast parts, being off by this much is within
1302  * manufacturing tolerances, and the measurement itself is difficult
1303  * enough to introduce a small error. This function uses some
1304  * heuristics in order to tweak the value that was measured to match what
1305  * is most likely printed on the part.
1306  *
1307  * Some examples:
1308  *      AMD Athlon 1000 mhz measured as 998 mhz
1309  *      Intel Pentium III Xeon 733 mhz measured as 731 mhz
1310  *      Intel Pentium IV 1500 mhz measured as 1495 mhz
1311  *
1312  * If in the future this function is no longer sufficient to correct
1313  * for the error in the measurement, then the algorithm used to perform
1314  * the measurement will have to be improved in order to increase accuracy
1315  * rather than adding horrible and questionable kludges here.
1316  *
1317  * This is called after the cyclics subsystem because of the potential
1318  * that the heuristics within may give a worse estimate of the clock
1319  * frequency than the value that was measured.
1320  */
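/*
 * Worked example (the Athlon case above): a measured 998 MHz gives
 * mul = (3 * 998 + 100) / 200 = 15 and near66 = 3001 / 3 = 1000, so
 * delta66 = 2; near50 = 20 * 50 = 1000, so delta50 = 2.  Since delta66
 * is not less than delta50, near50 is chosen and, with delta <= 6, the
 * reported frequency is corrected from 998 to 1000 MHz.
 */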
1321 static void
1322 mach_fixcpufreq(void)
1323 {
1324         uint32_t freq, mul, near66, delta66, near50, delta50, fixed, delta, i;
1325 
1326         freq = (uint32_t)cpu_freq;
1327 
1328         /*
1329          * Find the nearest integer multiple of 200/3 (about 66) MHz to the
1330          * measured speed taking into account that the 667 MHz parts were
1331          * the first to round-up.
1332          */
1333         mul = (uint32_t)((3 * (uint64_t)freq + 100) / 200);
1334         near66 = (uint32_t)((200 * (uint64_t)mul + ((mul >= 10) ? 1 : 0)) / 3);
1335         delta66 = (near66 > freq) ? (near66 - freq) : (freq - near66);
1336 
1337         /* Find the nearest integer multiple of 50 MHz to the measured speed */
1338         mul = (freq + 25) / 50;
1339         near50 = mul * 50;
1340         delta50 = (near50 > freq) ? (near50 - freq) : (freq - near50);
1341 
1342         /* Find the closer of the two */
1343         if (delta66 < delta50) {
1344                 fixed = near66;
1345                 delta = delta66;
1346         } else {
1347                 fixed = near50;
1348                 delta = delta50;
1349         }
1350 
1351         if (fixed > INT_MAX)
1352                 return;
1353 
1354         /*
1355          * Some older parts have a core clock frequency that is not an
1356          * integral multiple of 50 or 66 MHz. Check if one of the old
1357          * clock frequencies is closer to the measured value than any
1358  * of the integral multiples of 50 and 66, and if so set fixed
1359          * and delta appropriately to represent the closest value.
1360          */
1361         i = sizeof (x86_cpu_freq) / sizeof (int);
1362         while (i > 0) {
1363                 i--;
1364 
1365                 if (x86_cpu_freq[i] <= freq) {
1366                         mul = freq - x86_cpu_freq[i];
1367 
1368                         if (mul < delta) {
1369                                 fixed = x86_cpu_freq[i];
1370                                 delta = mul;
1371                         }
1372 
1373                         break;
1374                 }
1375 
1376                 mul = x86_cpu_freq[i] - freq;
1377 
1378                 if (mul < delta) {
1379                         fixed = x86_cpu_freq[i];
1380                         delta = mul;
1381                 }
1382         }
1383 
1384         /*
1385          * Set a reasonable maximum for how much to correct the measured
1386          * result by. This check is here to prevent the adjustment made
1387          * by this function from being more harm than good. It is entirely
1388          * possible that in the future parts will be made that are not
1389          * integral multiples of 66 or 50 in clock frequency or that
1390          * someone may overclock a part to some odd frequency. If the
1391          * measured value is farther from the corrected value than
1392          * allowed, then assume the corrected value is in error and use
1393          * the measured value.
1394          */
1395         if (6 < delta)
1396                 return;
1397 
1398         cpu_freq = (int)fixed;
1399 }
1400 
1401 
1402 static int
1403 machhztomhz(uint64_t cpu_freq_hz)
1404 {
1405         uint64_t cpu_mhz;
1406 
1407         /* Round to nearest MHz */
1408         cpu_mhz = (cpu_freq_hz + (MEGA_HZ / 2)) / MEGA_HZ;
1409 
1410         if (cpu_mhz > INT_MAX)
1411                 return (0);
1412 
1413         return ((int)cpu_mhz);
1414 
1415 }
1416 
1417 
1418 static int
1419 mach_clkinit(int preferred_mode, int *set_mode)
1420 {
1421         struct psm_ops  *pops;
1422         int resolution;
1423 
1424         pops = mach_set[0];
1425 
1426         cpu_freq_hz = mach_getcpufreq();
1427 
1428         cpu_freq = machhztomhz(cpu_freq_hz);
1429 
1430         if (!is_x86_feature(x86_featureset, X86FSET_TSC) || (cpu_freq == 0))
1431                 tsc_gethrtime_enable = 0;
1432 
1433 #ifndef __xpv
1434         if (tsc_gethrtime_enable) {
1435                 tsc_hrtimeinit(cpu_freq_hz);
1436         } else
1437 #endif
1438         {
1439                 if (pops->psm_hrtimeinit)
1440                         (*pops->psm_hrtimeinit)();
1441                 gethrtimef = pops->psm_gethrtime;
1442                 gethrtimeunscaledf = gethrtimef;
1443                 /* scalehrtimef will remain dummy */
1444         }
1445 
1446         mach_fixcpufreq();
1447 
1448         if (mach_ver[0] >= PSM_INFO_VER01_3) {
1449                 if (preferred_mode == TIMER_ONESHOT) {
1450 
1451                         resolution = (*pops->psm_clkinit)(0);
1452                         if (resolution != 0)  {
1453                                 *set_mode = TIMER_ONESHOT;
1454                                 return (resolution);
1455                         }
1456                 }
1457 
1458                 /*
1459                  * either periodic mode was requested or could not set to
1460                  * one-shot mode
1461                  */
1462                 resolution = (*pops->psm_clkinit)(hz);
1463                 /*
1464                  * psm should be able to do periodic, so we do not check
1465                  * for return value of psm_clkinit here.
1466                  */
1467                 *set_mode = TIMER_PERIODIC;
1468                 return (resolution);
1469         } else {
1470                 /*
1471                  * PSMI interface prior to PSMI_3 does not define a return
1472                  * value for psm_clkinit, so the return value is ignored.
1473                  */
1474                 (void) (*pops->psm_clkinit)(hz);
1475                 *set_mode = TIMER_PERIODIC;
1476                 return (nsec_per_tick);
1477         }
1478 }
1479 
1480 
1481 /*ARGSUSED*/
1482 static int
1483 mach_softlvl_to_vect(int ipl)
1484 {
1485         setsoftint = av_set_softint_pending;
1486         kdisetsoftint = kdi_av_set_softint_pending;
1487 
1488         return (PSM_SV_SOFTWARE);
1489 }
1490 
1491 #ifdef DEBUG
1492 /*
1493  * This is here to allow us to simulate cpus that refuse to start.
1494  */
1495 cpuset_t cpufailset;
1496 #endif
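/*
 * A cpu present in cpufailset (set, for instance, from kmdb before the
 * cpu is brought up) causes mach_cpu_start() and mach_cpuid_start()
 * below to return 0 without calling into the PSM, simulating a cpu
 * that refuses to start.
 */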
1497 
1498 int
1499 mach_cpu_start(struct cpu *cp, void *ctx)
1500 {
1501         struct psm_ops *pops = mach_set[0];
1502         processorid_t id = cp->cpu_id;
1503 
1504 #ifdef DEBUG
1505         if (CPU_IN_SET(cpufailset, id))
1506                 return (0);
1507 #endif
1508         return ((*pops->psm_cpu_start)(id, ctx));
1509 }
1510 
1511 int
1512 mach_cpuid_start(processorid_t id, void *ctx)
1513 {
1514         struct psm_ops *pops = mach_set[0];
1515 
1516 #ifdef DEBUG
1517         if (CPU_IN_SET(cpufailset, id))
1518                 return (0);
1519 #endif
1520         return ((*pops->psm_cpu_start)(id, ctx));
1521 }
1522 
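/*
 * The CPU hotplug helpers below (stop, add and remove) all dispatch
 * through the optional psm_cpu_ops entry point, passing a
 * psm_cpu_request_t that describes the requested operation.  If the PSM
 * does not implement psm_cpu_ops, ENOTSUP is returned for all of them.
 */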
1523 int
1524 mach_cpu_stop(cpu_t *cp, void *ctx)
1525 {
1526         struct psm_ops *pops = mach_set[0];
1527         psm_cpu_request_t request;
1528 
1529         if (pops->psm_cpu_ops == NULL) {
1530                 return (ENOTSUP);
1531         }
1532 
1533         ASSERT(cp->cpu_id != -1);
1534         request.pcr_cmd = PSM_CPU_STOP;
1535         request.req.cpu_stop.cpuid = cp->cpu_id;
1536         request.req.cpu_stop.ctx = ctx;
1537 
1538         return ((*pops->psm_cpu_ops)(&request));
1539 }
1540 
1541 int
1542 mach_cpu_add(mach_cpu_add_arg_t *argp, processorid_t *cpuidp)
1543 {
1544         int rc;
1545         struct psm_ops *pops = mach_set[0];
1546         psm_cpu_request_t request;
1547 
1548         if (pops->psm_cpu_ops == NULL) {
1549                 return (ENOTSUP);
1550         }
1551 
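        /*
         * cpu_add.cpuid is primed to -1; on success the PSM fills in the
         * id of the newly added cpu, which is then handed back to the
         * caller through cpuidp.
         */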
1552         request.pcr_cmd = PSM_CPU_ADD;
1553         request.req.cpu_add.argp = argp;
1554         request.req.cpu_add.cpuid = -1;
1555         rc = (*pops->psm_cpu_ops)(&request);
1556         if (rc == 0) {
1557                 ASSERT(request.req.cpu_add.cpuid != -1);
1558                 *cpuidp = request.req.cpu_add.cpuid;
1559         }
1560 
1561         return (rc);
1562 }
1563 
1564 int
1565 mach_cpu_remove(processorid_t cpuid)
1566 {
1567         struct psm_ops *pops = mach_set[0];
1568         psm_cpu_request_t request;
1569 
1570         if (pops->psm_cpu_ops == NULL) {
1571                 return (ENOTSUP);
1572         }
1573 
1574         request.pcr_cmd = PSM_CPU_REMOVE;
1575         request.req.cpu_remove.cpuid = cpuid;
1576 
1577         return ((*pops->psm_cpu_ops)(&request));
1578 }
1579 
1580 /*
1581  * Default handler to create a device node for a CPU.
1582  * One reference is held on the created device node.
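 * The caller is responsible for eventually releasing that reference,
 * for example with ndi_rele_devi().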
1583  */
1584 static int
1585 mach_cpu_create_devinfo(cpu_t *cp, dev_info_t **dipp)
1586 {
1587         int rv, circ;
1588         dev_info_t *dip;
1589         static kmutex_t cpu_node_lock;
1590         static dev_info_t *cpu_nex_devi = NULL;
1591 
1592         ASSERT(cp != NULL);
1593         ASSERT(dipp != NULL);
1594         *dipp = NULL;
1595 
1596         if (cpu_nex_devi == NULL) {
1597                 mutex_enter(&cpu_node_lock);
1598                 /* First check whether the cpus nexus node already exists. */
1599                 cpu_nex_devi = ddi_find_devinfo("cpus", -1, 0);
1600                 /* Create the cpus node if it doesn't exist. */
1601                 if (cpu_nex_devi == NULL) {
1602                         ndi_devi_enter(ddi_root_node(), &circ);
1603                         rv = ndi_devi_alloc(ddi_root_node(), "cpus",
1604                             (pnode_t)DEVI_SID_NODEID, &dip);
1605                         if (rv != NDI_SUCCESS) {
1606                                 mutex_exit(&cpu_node_lock);
1607                                 cmn_err(CE_CONT,
1608                                     "?failed to create cpu nexus device.\n");
1609                                 return (PSM_FAILURE);
1610                         }
1611                         ASSERT(dip != NULL);
1612                         (void) ndi_devi_online(dip, 0);
1613                         ndi_devi_exit(ddi_root_node(), circ);
1614                         cpu_nex_devi = dip;
1615                 }
1616                 mutex_exit(&cpu_node_lock);
1617         }
1618 
1619         /*
1620          * Create a child node for the cpu identified by cp->cpu_id.
1621          */
1622         ndi_devi_enter(cpu_nex_devi, &circ);
1623         dip = ddi_add_child(cpu_nex_devi, "cpu", DEVI_SID_NODEID, -1);
1624         if (dip == NULL) {
1625                 cmn_err(CE_CONT,
1626                     "?failed to create device node for cpu%d.\n", cp->cpu_id);
1627                 rv = PSM_FAILURE;
1628         } else {
1629                 *dipp = dip;
1630                 (void) ndi_hold_devi(dip);
1631                 rv = PSM_SUCCESS;
1632         }
1633         ndi_devi_exit(cpu_nex_devi, circ);
1634 
1635         return (rv);
1636 }
1637 
1638 /*
1639  * Create a cpu device node in the device tree and online it.
1640  * If requested, return the created dip with a reference held.
1641  */
1642 int
1643 mach_cpu_create_device_node(struct cpu *cp, dev_info_t **dipp)
1644 {
1645         int rv;
1646         dev_info_t *dip = NULL;
1647 
1648         ASSERT(psm_cpu_create_devinfo != NULL);
1649         rv = psm_cpu_create_devinfo(cp, &dip);
1650         if (rv == PSM_SUCCESS) {
1651                 cpuid_set_cpu_properties(dip, cp->cpu_id, cp->cpu_m.mcpu_cpi);
1652                 /* Recursively attach driver for parent nexus device. */
1653                 if (i_ddi_attach_node_hierarchy(ddi_get_parent(dip)) ==
1654                     DDI_SUCCESS) {
1655                         /* Configure cpu itself and descendants. */
1656                         (void) ndi_devi_online(dip,
1657                             NDI_ONLINE_ATTACH | NDI_CONFIG);
1658                 }
1659                 if (dipp != NULL) {
1660                         *dipp = dip;
1661                 } else {
1662                         (void) ndi_rele_devi(dip);
1663                 }
1664         }
1665 
1666         return (rv);
1667 }
1668 
1669 /*
1670  * On return, *dipp contains one of the following values:
1671  * - NULL if no device node was found
1672  * - a pointer to the device node if one was found
1673  */
1674 int
1675 mach_cpu_get_device_node(struct cpu *cp, dev_info_t **dipp)
1676 {
1677         *dipp = NULL;
1678         if (psm_cpu_get_devinfo != NULL) {
1679                 if (psm_cpu_get_devinfo(cp, dipp) == PSM_SUCCESS) {
1680                         return (PSM_SUCCESS);
1681                 }
1682         }
1683 
1684         return (PSM_FAILURE);
1685 }
1686 
1687 /*ARGSUSED*/
1688 static int
1689 mach_translate_irq(dev_info_t *dip, int irqno)
1690 {
1691         return (irqno); /* default to NO translation */
1692 }
1693 
1694 static void
1695 mach_notify_error(int level, char *errmsg)
1696 {
1697         /*
1698          * SL_FATAL is passed in once panicstr is set; deliver it
1699          * as CE_PANIC.  Also, translate SL_ codes back to CE_
1700          * codes for the PSMI handler.
1701          */
1702         if (level & SL_FATAL)
1703                 (*notify_error)(CE_PANIC, errmsg);
1704         else if (level & SL_WARN)
1705                 (*notify_error)(CE_WARN, errmsg);
1706         else if (level & SL_NOTE)
1707                 (*notify_error)(CE_NOTE, errmsg);
1708         else if (level & SL_CONSOLE)
1709                 (*notify_error)(CE_CONT, errmsg);
1710 }
1711 
1712 /*
1713  * This provides the default basic intr_ops interface for the new DDI
1714  * interrupt framework when the PSM doesn't supply one.
1715  *
1716  * Input:
1717  * dip     - pointer to the dev_info structure of the requested device
1718  * hdlp    - pointer to the internal interrupt handle structure for the
1719  *           requested interrupt
1720  * intr_op - opcode for this call
1721  * result  - pointer to the integer that will hold the result to be
1722  *           passed back if return value is PSM_SUCCESS
1723  *
1724  * Output:
1725  * return value is either PSM_SUCCESS or PSM_FAILURE
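 *
 * Only the query-style operations (CHECK_MSI, ALLOC_VECTORS, FREE_VECTORS,
 * NAVAIL_VECTORS, XLATE_VECTOR and GET_CAP) are handled here; all other
 * opcodes fail with PSM_FAILURE.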
1726  */
1727 static int
1728 mach_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
1729     psm_intr_op_t intr_op, int *result)
1730 {
1731         struct intrspec *ispec;
1732 
1733         switch (intr_op) {
1734         case PSM_INTR_OP_CHECK_MSI:
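                /*
                 * Mask the MSI and MSI-X type bits out of ih_type; the
                 * remaining type bits (if any) are returned in *result.
                 */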
1735                 *result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
1736                     DDI_INTR_TYPE_MSIX);
1737                 break;
1738         case PSM_INTR_OP_ALLOC_VECTORS:
1739                 if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
1740                         *result = 1;
1741                 else
1742                         *result = 0;
1743                 break;
1744         case PSM_INTR_OP_FREE_VECTORS:
1745                 break;
1746         case PSM_INTR_OP_NAVAIL_VECTORS:
1747                 if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
1748                         *result = 1;
1749                 else
1750                         *result = 0;
1751                 break;
1752         case PSM_INTR_OP_XLATE_VECTOR:
1753                 ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
1754                 *result = psm_translate_irq(dip, ispec->intrspec_vec);
1755                 break;
1756         case PSM_INTR_OP_GET_CAP:
1757                 *result = 0;
1758                 break;
1759         case PSM_INTR_OP_GET_PENDING:
1760         case PSM_INTR_OP_CLEAR_MASK:
1761         case PSM_INTR_OP_SET_MASK:
1762         case PSM_INTR_OP_GET_SHARED:
1763         case PSM_INTR_OP_SET_PRI:
1764         case PSM_INTR_OP_SET_CAP:
1765         case PSM_INTR_OP_SET_CPU:
1766         case PSM_INTR_OP_GET_INTR:
1767         default:
1768                 return (PSM_FAILURE);
1769         }
1770         return (PSM_SUCCESS);
1771 }
1772 /*
1773  * Return 1 if CMT load balancing policies should be
1774  * implemented across instances of the specified hardware
1775  * sharing relationship.
1776  */
1777 int
1778 pg_cmt_load_bal_hw(pghw_type_t hw)
1779 {
1780         if (hw == PGHW_IPIPE ||
1781             hw == PGHW_FPU ||
1782             hw == PGHW_PROCNODE ||
1783             hw == PGHW_CHIP)
1784                 return (1);
1785         else
1786                 return (0);
1787 }
1788 /*
1789  * Return 1 if thread affinity policies should be implemented
1790  * for instances of the specified hardware sharing relationship.
1791  */
1792 int
1793 pg_cmt_affinity_hw(pghw_type_t hw)
1794 {
1795         if (hw == PGHW_CACHE)
1796                 return (1);
1797         else
1798                 return (0);
1799 }
1800 
1801 /*
1802  * Return the number of counter events requested to measure hardware capacity
1803  * and utilization, and set up CPC requests for the specified CPU as needed.
1804  *
1805  * May return 0 when platform- or processor-specific code knows that no CPC
1806  * events should be programmed on this CPU, or -1 when platform- or
1807  * processor-specific code doesn't know which counter events are best to use
1808  * and common code should decide for itself.
1809  */
1810 int
1811 /* LINTED E_FUNC_ARG_UNUSED */
1812 cu_plat_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs)
1813 {
1814         const char      *impl_name;
1815 
1816         /*
1817          * Return an error if pcbe_ops is not set.
1818          */
1819         if (pcbe_ops == NULL)
1820                 return (-1);
1821 
1822         /*
1823          * Return that no CPC events should be programmed on hyperthreaded
1824          * Pentium 4 processors, and return an error for all other x86
1825          * processors to tell common code to decide what counter events to
1826          * program on those CPUs for measuring hardware capacity and utilization.
1827          */
1828         impl_name = pcbe_ops->pcbe_impl_name();
1829         if (impl_name != NULL && strcmp(impl_name, PCBE_IMPL_NAME_P4HT) == 0)
1830                 return (0);
1831         else
1832                 return (-1);
1833 }