1 
   2 /*
   3  * CDDL HEADER START
   4  *
   5  * The contents of this file are subject to the terms of the
   6  * Common Development and Distribution License (the "License").
   7  * You may not use this file except in compliance with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 /*
  26  * Copyright (c) 2009-2010, Intel Corporation.
  27  * All rights reserved.
  28  */
  29 
  30 #define PSMI_1_7
  31 #include <sys/smp_impldefs.h>
  32 #include <sys/psm.h>
  33 #include <sys/psm_modctl.h>
  34 #include <sys/pit.h>
  35 #include <sys/cmn_err.h>
  36 #include <sys/strlog.h>
  37 #include <sys/clock.h>
  38 #include <sys/debug.h>
  39 #include <sys/rtc.h>
  40 #include <sys/x86_archext.h>
  41 #include <sys/cpupart.h>
  42 #include <sys/cpuvar.h>
  43 #include <sys/cpu_event.h>
  44 #include <sys/cmt.h>
  45 #include <sys/cpu.h>
  46 #include <sys/disp.h>
  47 #include <sys/archsystm.h>
  48 #include <sys/machsystm.h>
  49 #include <sys/sysmacros.h>
  50 #include <sys/memlist.h>
  51 #include <sys/param.h>
  52 #include <sys/promif.h>
  53 #include <sys/cpu_pm.h>
  54 #if defined(__xpv)
  55 #include <sys/hypervisor.h>
  56 #endif
  57 #include <sys/mach_intr.h>
  58 #include <vm/hat_i86.h>
  59 #include <sys/kdi_machimpl.h>
  60 #include <sys/sdt.h>
  61 #include <sys/hpet.h>
  62 #include <sys/sunddi.h>
  63 #include <sys/sunndi.h>
  64 #include <sys/cpc_pcbe.h>
  65 
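     /*
      * Byte offset of member "m" within structure type "s"; used below to
      * size partial psm_ops tables supplied by older PSMI versions.
      */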
  66 #define OFFSETOF(s, m)          (size_t)(&(((s *)0)->m))
  67 
  68 /*
  69  *      Local function prototypes
  70  */
  71 static int mp_disable_intr(processorid_t cpun);
  72 static void mp_enable_intr(processorid_t cpun);
  73 static void mach_init();
  74 static void mach_picinit();
  75 static int machhztomhz(uint64_t cpu_freq_hz);
  76 static uint64_t mach_getcpufreq(void);
  77 static void mach_fixcpufreq(void);
  78 static int mach_clkinit(int, int *);
  79 static void mach_smpinit(void);
  80 static int mach_softlvl_to_vect(int ipl);
  81 static void mach_get_platform(int owner);
  82 static void mach_construct_info();
  83 static int mach_translate_irq(dev_info_t *dip, int irqno);
  84 static int mach_intr_ops(dev_info_t *, ddi_intr_handle_impl_t *,
  85     psm_intr_op_t, int *);
  86 static void mach_notify_error(int level, char *errmsg);
  87 static hrtime_t dummy_hrtime(void);
  88 static void dummy_scalehrtime(hrtime_t *);
  89 static uint64_t dummy_unscalehrtime(hrtime_t);
  90 void cpu_idle(void);
  91 static void cpu_wakeup(cpu_t *, int);
  92 #ifndef __xpv
  93 void cpu_idle_mwait(void);
  94 static void cpu_wakeup_mwait(cpu_t *, int);
  95 #endif
  96 static int mach_cpu_create_devinfo(cpu_t *cp, dev_info_t **dipp);
  97 
  98 /*
  99  *      External reference functions
 100  */
 101 extern void return_instr();
 102 extern uint64_t freq_tsc(uint32_t *);
 103 #if defined(__i386)
 104 extern uint64_t freq_notsc(uint32_t *);
 105 #endif
 106 extern void pc_gethrestime(timestruc_t *);
 107 extern int cpuid_get_coreid(cpu_t *);
 108 extern int cpuid_get_chipid(cpu_t *);
 109 
 110 /*
 111  *      PSM functions initialization
 112  */
 113 void (*psm_shutdownf)(int, int) = (void (*)(int, int))return_instr;
 114 void (*psm_preshutdownf)(int, int) = (void (*)(int, int))return_instr;
 115 void (*psm_notifyf)(int)        = (void (*)(int))return_instr;
 116 void (*psm_set_idle_cpuf)(int)  = (void (*)(int))return_instr;
 117 void (*psm_unset_idle_cpuf)(int) = (void (*)(int))return_instr;
 118 void (*psminitf)()              = mach_init;
 119 void (*picinitf)()              = return_instr;
 120 int (*clkinitf)(int, int *)     = (int (*)(int, int *))return_instr;
 121 int (*ap_mlsetup)()             = (int (*)(void))return_instr;
 122 void (*send_dirintf)()          = return_instr;
 123 void (*setspl)(int)             = (void (*)(int))return_instr;
 124 int (*addspl)(int, int, int, int) = (int (*)(int, int, int, int))return_instr;
 125 int (*delspl)(int, int, int, int) = (int (*)(int, int, int, int))return_instr;
 126 int (*get_pending_spl)(void)    = (int (*)(void))return_instr;
 127 int (*addintr)(void *, int, avfunc, char *, int, caddr_t, caddr_t,
 128     uint64_t *, dev_info_t *) = NULL;
 129 void (*remintr)(void *, int, avfunc, int) = NULL;
 130 void (*kdisetsoftint)(int, struct av_softinfo *)=
 131         (void (*)(int, struct av_softinfo *))return_instr;
 132 void (*setsoftint)(int, struct av_softinfo *)=
 133         (void (*)(int, struct av_softinfo *))return_instr;
 134 int (*slvltovect)(int)          = (int (*)(int))return_instr;
 135 int (*setlvl)(int, int *)       = (int (*)(int, int *))return_instr;
 136 void (*setlvlx)(int, int)       = (void (*)(int, int))return_instr;
 137 int (*psm_disable_intr)(int)    = mp_disable_intr;
 138 void (*psm_enable_intr)(int)    = mp_enable_intr;
 139 hrtime_t (*gethrtimef)(void)    = dummy_hrtime;
 140 hrtime_t (*gethrtimeunscaledf)(void)    = dummy_hrtime;
 141 void (*scalehrtimef)(hrtime_t *)        = dummy_scalehrtime;
 142 uint64_t (*unscalehrtimef)(hrtime_t)    = dummy_unscalehrtime;
 143 int (*psm_translate_irq)(dev_info_t *, int) = mach_translate_irq;
 144 void (*gethrestimef)(timestruc_t *) = pc_gethrestime;
 145 void (*psm_notify_error)(int, char *) = (void (*)(int, char *))NULL;
 146 int (*psm_get_clockirq)(int) = NULL;
 147 int (*psm_get_ipivect)(int, int) = NULL;
 148 uchar_t (*psm_get_ioapicid)(uchar_t) = NULL;
 149 uint32_t (*psm_get_localapicid)(uint32_t) = NULL;
 150 uchar_t (*psm_xlate_vector_by_irq)(uchar_t) = NULL;
 151 
 152 int (*psm_clkinit)(int) = NULL;
 153 void (*psm_timer_reprogram)(hrtime_t) = NULL;
 154 void (*psm_timer_enable)(void) = NULL;
 155 void (*psm_timer_disable)(void) = NULL;
 156 void (*psm_post_cyclic_setup)(void *arg) = NULL;
 157 int (*psm_intr_ops)(dev_info_t *, ddi_intr_handle_impl_t *, psm_intr_op_t,
 158     int *) = mach_intr_ops;
 159 int (*psm_state)(psm_state_request_t *) = (int (*)(psm_state_request_t *))
 160     return_instr;
 161 
 162 void (*notify_error)(int, char *) = (void (*)(int, char *))return_instr;
 163 void (*hrtime_tick)(void)       = return_instr;
 164 
 165 int (*psm_cpu_create_devinfo)(cpu_t *, dev_info_t **) = mach_cpu_create_devinfo;
 166 int (*psm_cpu_get_devinfo)(cpu_t *, dev_info_t **) = NULL;
 167 
 168 /* global IRM pool for APIX (PSM) module */
 169 ddi_irm_pool_t *apix_irm_pool_p = NULL;
 170 
 171 /*
 172  * True if the generic TSC code is our source of hrtime, rather than whatever
 173  * the PSM can provide.
 174  */
 175 #ifdef __xpv
 176 int tsc_gethrtime_enable = 0;
 177 #else
 178 int tsc_gethrtime_enable = 1;
 179 #endif
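     /* Set non-zero once the TSC-based hrtime machinery has been initialized. */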
 180 int tsc_gethrtime_initted = 0;
 181 
 182 /*
 183  * True if the hrtime implementation is "hires"; namely, better than microdata.
 184  */
 185 int gethrtime_hires = 0;
 186 
 187 /*
 188  * Local Static Data
 189  */
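     /*
      * mach_set[0]/mach_ver[0] hold the merged psm_ops and version actually
      * used by the kernel; the remaining slots are filled per PSM owner type
      * (system default, exclusive, override) by mach_construct_info().
      */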
 190 static struct psm_ops mach_ops;
 191 static struct psm_ops *mach_set[4] = {&mach_ops, NULL, NULL, NULL};
 192 static ushort_t mach_ver[4] = {0, 0, 0, 0};
 193 
 194 /*
 195  * virtualization support for psm
 196  */
 197 void *psm_vt_ops = NULL;
 198 /*
 199  * If non-zero, idle cpus will become "halted" when there's
 200  * no work to do.
 201  */
 202 int     idle_cpu_use_hlt = 1;
 203 
 204 #ifndef __xpv
 205 /*
 206  * If non-zero, idle cpus will use mwait if available to halt instead of hlt.
 207  */
 208 int     idle_cpu_prefer_mwait = 1;
 209 /*
 210  * Set to 0 to avoid MONITOR+CLFLUSH assertion.
 211  */
 212 int     idle_cpu_assert_cflush_monitor = 1;
 213 
 214 /*
 215  * If non-zero, idle cpus will not use the power-saving Deep C-State idle loop.
 216  */
 217 int     idle_cpu_no_deep_c = 0;
 218 /*
 219  * Non-power saving idle loop and wakeup pointers.
 220  * These allow the user to toggle the Deep Idle power-saving feature on/off.
 221  */
 222 void    (*non_deep_idle_cpu)() = cpu_idle;
 223 void    (*non_deep_idle_disp_enq_thread)(cpu_t *, int);
 224 
 225 /*
 226  * Object for the kernel to access the HPET.
 227  */
 228 hpet_t hpet;
 229 
 230 #endif  /* ifndef __xpv */
 231 
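     /*
      * Fanout applied when each partition's halted CPU bitset (cp_haltset)
      * is initialized.
      */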
 232 uint_t cp_haltset_fanout = 0;
 233 
 234 /*ARGSUSED*/
 235 int
 236 pg_plat_hw_shared(cpu_t *cp, pghw_type_t hw)
 237 {
 238         switch (hw) {
 239         case PGHW_IPIPE:
 240                 if (is_x86_feature(x86_featureset, X86FSET_HTT)) {
 241                         /*
 242                          * Hyper-threading is SMT
 243                          */
 244                         return (1);
 245                 } else {
 246                         return (0);
 247                 }
 248         case PGHW_PROCNODE:
 249                 if (cpuid_get_procnodes_per_pkg(cp) > 1)
 250                         return (1);
 251                 else
 252                         return (0);
 253         case PGHW_CHIP:
 254                 if (is_x86_feature(x86_featureset, X86FSET_CMP) ||
 255                     is_x86_feature(x86_featureset, X86FSET_HTT))
 256                         return (1);
 257                 else
 258                         return (0);
 259         case PGHW_CACHE:
 260                 if (cpuid_get_ncpu_sharing_last_cache(cp) > 1)
 261                         return (1);
 262                 else
 263                         return (0);
 264         case PGHW_POW_ACTIVE:
 265                 if (cpupm_domain_id(cp, CPUPM_DTYPE_ACTIVE) != (id_t)-1)
 266                         return (1);
 267                 else
 268                         return (0);
 269         case PGHW_POW_IDLE:
 270                 if (cpupm_domain_id(cp, CPUPM_DTYPE_IDLE) != (id_t)-1)
 271                         return (1);
 272                 else
 273                         return (0);
 274         default:
 275                 return (0);
 276         }
 277 }
 278 
 279 /*
 280  * Compare two CPUs and see if they have a pghw_type_t sharing relationship
 281  * If pghw_type_t is an unsupported hardware type, then return -1
 282  */
 283 int
 284 pg_plat_cpus_share(cpu_t *cpu_a, cpu_t *cpu_b, pghw_type_t hw)
 285 {
 286         id_t pgp_a, pgp_b;
 287 
 288         pgp_a = pg_plat_hw_instance_id(cpu_a, hw);
 289         pgp_b = pg_plat_hw_instance_id(cpu_b, hw);
 290 
 291         if (pgp_a == -1 || pgp_b == -1)
 292                 return (-1);
 293 
 294         return (pgp_a == pgp_b);
 295 }
 296 
 297 /*
 298  * Return a physical instance identifier for known hardware sharing
 299  * relationships
 300  */
 301 id_t
 302 pg_plat_hw_instance_id(cpu_t *cpu, pghw_type_t hw)
 303 {
 304         switch (hw) {
 305         case PGHW_IPIPE:
 306                 return (cpuid_get_coreid(cpu));
 307         case PGHW_CACHE:
 308                 return (cpuid_get_last_lvl_cacheid(cpu));
 309         case PGHW_PROCNODE:
 310                 return (cpuid_get_procnodeid(cpu));
 311         case PGHW_CHIP:
 312                 return (cpuid_get_chipid(cpu));
 313         case PGHW_POW_ACTIVE:
 314                 return (cpupm_domain_id(cpu, CPUPM_DTYPE_ACTIVE));
 315         case PGHW_POW_IDLE:
 316                 return (cpupm_domain_id(cpu, CPUPM_DTYPE_IDLE));
 317         default:
 318                 return (-1);
 319         }
 320 }
 321 
 322 /*
 323  * Express preference for optimizing for sharing relationship
 324  * hw1 vs hw2
 325  */
 326 pghw_type_t
 327 pg_plat_hw_rank(pghw_type_t hw1, pghw_type_t hw2)
 328 {
 329         int i, rank1, rank2;
 330 
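             /*
              * Ordered from least to most preferred; the relationship that
              * appears later in this list wins the comparison below.
              */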
 331         static pghw_type_t hw_hier[] = {
 332                 PGHW_IPIPE,
 333                 PGHW_CACHE,
 334                 PGHW_PROCNODE,
 335                 PGHW_CHIP,
 336                 PGHW_POW_IDLE,
 337                 PGHW_POW_ACTIVE,
 338                 PGHW_NUM_COMPONENTS
 339         };
 340 
 341         for (i = 0; hw_hier[i] != PGHW_NUM_COMPONENTS; i++) {
 342                 if (hw_hier[i] == hw1)
 343                         rank1 = i;
 344                 if (hw_hier[i] == hw2)
 345                         rank2 = i;
 346         }
 347 
 348         if (rank1 > rank2)
 349                 return (hw1);
 350         else
 351                 return (hw2);
 352 }
 353 
 354 /*
 355  * Override the default CMT dispatcher policy for the specified
 356  * hardware sharing relationship
 357  */
 358 pg_cmt_policy_t
 359 pg_plat_cmt_policy(pghw_type_t hw)
 360 {
 361         /*
 362          * For shared caches, also load balance across them to
 363          * maximize aggregate cache capacity
 364          */
 365         switch (hw) {
 366         case PGHW_CACHE:
 367                 return (CMT_BALANCE|CMT_AFFINITY);
 368         default:
 369                 return (CMT_NO_POLICY);
 370         }
 371 }
 372 
 373 id_t
 374 pg_plat_get_core_id(cpu_t *cpu)
 375 {
 376         return ((id_t)cpuid_get_coreid(cpu));
 377 }
 378 
 379 void
 380 cmp_set_nosteal_interval(void)
 381 {
 382         /* Set the nosteal interval (used by disp_getbest()) to 100us */
 383         nosteal_nsec = 100000UL;
 384 }
 385 
 386 /*
 387  * Routine to ensure that initial callers of hrtime get 0 as the return value
 388  */
 389 static hrtime_t
 390 dummy_hrtime(void)
 391 {
 392         return (0);
 393 }
 394 
 395 /* ARGSUSED */
 396 static void
 397 dummy_scalehrtime(hrtime_t *ticks)
 398 {}
 399 
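     /*
      * Placeholder unscalehrtime routine; simply returns its argument as an
      * unsigned value.
      */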
 400 static uint64_t
 401 dummy_unscalehrtime(hrtime_t nsecs)
 402 {
 403         return ((uint64_t)nsecs);
 404 }
 405 
 406 /*
 407  * Supports Deep C-State power saving idle loop.
 408  */
 409 void
 410 cpu_idle_adaptive(void)
 411 {
 412         (*CPU->cpu_m.mcpu_idle_cpu)();
 413 }
 414 
 415 /*
 416  * Function called by the CPU idle notification framework to check whether the
 417  * CPU has been awakened. It will be called with interrupts disabled.
 418  * If the CPU has been awakened, call cpu_idle_exit() to notify the CPU idle
 419  * notification framework.
 420  */
 421 /*ARGSUSED*/
 422 static void
 423 cpu_idle_check_wakeup(void *arg)
 424 {
 425         /*
 426          * Toggle interrupt flag to detect pending interrupts.
 427          * If an interrupt happened, do_interrupt() will notify the CPU idle
 428          * notification framework, so there is no need to call cpu_idle_exit() here.
 429          */
 430         sti();
 431         SMT_PAUSE();
 432         cli();
 433 }
 434 
 435 /*
 436  * Idle the present CPU until awakened via an interrupt
 437  */
 438 void
 439 cpu_idle(void)
 440 {
 441         cpu_t           *cpup = CPU;
 442         processorid_t   cpu_sid = cpup->cpu_seqid;
 443         cpupart_t       *cp = cpup->cpu_part;
 444         int             hset_update = 1;
 445 
 446         /*
 447          * If this CPU is online and there are multiple CPUs
 448          * in the system, then we should note our halting
 449          * by adding ourselves to the partition's halted CPU
 450          * bitmap. This allows other CPUs to find/awaken us when
 451          * work becomes available.
 452          */
 453         if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
 454                 hset_update = 0;
 455 
 456         /*
 457          * Add ourselves to the partition's halted CPUs bitmap
 458          * and set our HALTED flag, if necessary.
 459          *
 460          * When a thread becomes runnable, it is placed on the queue
 461          * and then the halted CPU bitmap is checked to determine who
 462          * (if anyone) should be awakened. We therefore need to first
 463          * add ourselves to the bitmap, and then check if there
 464          * is any work available. The order is important to prevent a race
 465          * that can lead to work languishing on a run queue somewhere while
 466          * this CPU remains halted.
 467          *
 468          * Either the producing CPU will see we're halted and will awaken us,
 469          * or this CPU will see the work available in disp_anywork().
 470          *
 471          * Note that memory barriers after updating the HALTED flag
 472          * are not necessary since an atomic operation (updating the bitset)
 473          * immediately follows. On x86 the atomic operation acts as a
 474          * memory barrier for the update of cpu_disp_flags.
 475          */
 476         if (hset_update) {
 477                 cpup->cpu_disp_flags |= CPU_DISP_HALTED;
 478                 bitset_atomic_add(&cp->cp_haltset, cpu_sid);
 479         }
 480 
 481         /*
 482          * Check to make sure there's really nothing to do.
 483          * Work destined for this CPU may become available after
 484          * this check. We'll be notified through the clearing of our
 485          * bit in the halted CPU bitmap, and a poke.
 486          */
 487         if (disp_anywork()) {
 488                 if (hset_update) {
 489                         cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 490                         bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 491                 }
 492                 return;
 493         }
 494 
 495         /*
 496          * We're on our way to being halted.
 497          *
 498          * Disable interrupts now, so that we'll awaken immediately
 499          * after halting if someone tries to poke us between now and
 500          * the time we actually halt.
 501          *
 502          * We check for the presence of our bit after disabling interrupts.
 503          * If it's cleared, we'll return. If the bit is cleared after
 504          * we check then the poke will pop us out of the halted state.
 505          *
 506          * This means that the ordering of the poke and the clearing
 507          * of the bit by cpu_wakeup is important.
 508          * cpu_wakeup() must clear, then poke.
 509          * cpu_idle() must disable interrupts, then check for the bit.
 510          */
 511         cli();
 512 
 513         if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
 514                 cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 515                 sti();
 516                 return;
 517         }
 518 
 519         /*
 520          * The check for anything locally runnable is here for performance
 521          * and isn't needed for correctness. disp_nrunnable ought to be
 522          * in our cache still, so it's inexpensive to check, and if there
 523          * is anything runnable we won't have to wait for the poke.
 524          */
 525         if (cpup->cpu_disp->disp_nrunnable != 0) {
 526                 if (hset_update) {
 527                         cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 528                         bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 529                 }
 530                 sti();
 531                 return;
 532         }
 533 
 534         if (cpu_idle_enter(IDLE_STATE_C1, 0,
 535             cpu_idle_check_wakeup, NULL) == 0) {
 536                 mach_cpu_idle();
 537                 cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
 538         }
 539 
 540         /*
 541          * We're no longer halted
 542          */
 543         if (hset_update) {
 544                 cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 545                 bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 546         }
 547 }
 548 
 549 
 550 /*
 551  * If "cpu" is halted, then wake it up, clearing its halted bit in advance.
 552  * Otherwise, see if other CPUs in the cpu partition are halted and need to
 553  * be woken up so that they can steal the thread we placed on this CPU.
 554  * This function is only used on MP systems.
 555  */
 556 static void
 557 cpu_wakeup(cpu_t *cpu, int bound)
 558 {
 559         uint_t          cpu_found;
 560         processorid_t   cpu_sid;
 561         cpupart_t       *cp;
 562 
 563         cp = cpu->cpu_part;
 564         cpu_sid = cpu->cpu_seqid;
 565         if (bitset_in_set(&cp->cp_haltset, cpu_sid)) {
 566                 /*
 567                  * Clear the halted bit for that CPU since it will be
 568                  * poked in a moment.
 569                  */
 570                 bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 571                 /*
 572                  * We may find the current CPU present in the halted cpuset
 573                  * if we're in the context of an interrupt that occurred
 574                  * before we had a chance to clear our bit in cpu_idle().
 575                  * Poking ourself is obviously unnecessary, since if
 576                  * we're here, we're not halted.
 577                  */
 578                 if (cpu != CPU)
 579                         poke_cpu(cpu->cpu_id);
 580                 return;
 581         } else {
 582                 /*
 583                  * This cpu isn't halted, but it's idle or undergoing a
 584                  * context switch. No need to awaken anyone else.
 585                  */
 586                 if (cpu->cpu_thread == cpu->cpu_idle_thread ||
 587                     cpu->cpu_disp_flags & CPU_DISP_DONTSTEAL)
 588                         return;
 589         }
 590 
 591         /*
 592          * No need to wake up other CPUs if this is for a bound thread.
 593          */
 594         if (bound)
 595                 return;
 596 
 597         /*
 598          * The CPU specified for wakeup isn't currently halted, so check
 599          * to see if there are any other halted CPUs in the partition,
 600          * and if there are then awaken one.
 601          */
 602         do {
 603                 cpu_found = bitset_find(&cp->cp_haltset);
 604                 if (cpu_found == (uint_t)-1)
 605                         return;
 606         } while (bitset_atomic_test_and_del(&cp->cp_haltset, cpu_found) < 0);
 607 
 608         if (cpu_found != CPU->cpu_seqid) {
 609                 poke_cpu(cpu_seq[cpu_found]->cpu_id);
 610         }
 611 }
 612 
 613 #ifndef __xpv
 614 /*
 615  * Function called by the CPU idle notification framework to check whether the
 616  * CPU has been awakened. It will be called with interrupts disabled.
 617  * If the CPU has been awakened, call cpu_idle_exit() to notify the CPU idle
 618  * notification framework.
 619  */
 620 static void
 621 cpu_idle_mwait_check_wakeup(void *arg)
 622 {
 623         volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;
 624 
 625         ASSERT(arg != NULL);
 626         if (*mcpu_mwait != MWAIT_HALTED) {
 627                 /*
 628                  * CPU has been awakened, notify CPU idle notification system.
 629                  */
 630                 cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
 631         } else {
 632                 /*
 633                  * Toggle interrupt flag to detect pending interrupts.
 634                  * If an interrupt happened, do_interrupt() will notify the
 635                  * CPU idle notification framework, so there is no need to
 636                  * call cpu_idle_exit() here.
 637                  */
 638                 sti();
 639                 SMT_PAUSE();
 640                 cli();
 641         }
 642 }
 643 
 644 /*
 645  * Idle the present CPU until awakened via touching its monitored line
 646  */
 647 void
 648 cpu_idle_mwait(void)
 649 {
 650         volatile uint32_t       *mcpu_mwait = CPU->cpu_m.mcpu_mwait;
 651         cpu_t                   *cpup = CPU;
 652         processorid_t           cpu_sid = cpup->cpu_seqid;
 653         cpupart_t               *cp = cpup->cpu_part;
 654         int                     hset_update = 1;
 655 
 656         /*
 657          * Set our mcpu_mwait here, so we can tell if anyone tries to
 658          * wake us between now and when we call mwait.  No other cpu will
 659          * attempt to set our mcpu_mwait until we add ourself to the halted
 660          * CPU bitmap.
 661          */
 662         *mcpu_mwait = MWAIT_HALTED;
 663 
 664         /*
 665          * If this CPU is online and there are multiple CPUs
 666          * in the system, then we should note our halting
 667          * by adding ourselves to the partition's halted CPU
 668          * bitmap. This allows other CPUs to find/awaken us when
 669          * work becomes available.
 670          */
 671         if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
 672                 hset_update = 0;
 673 
 674         /*
 675          * Add ourselves to the partition's halted CPUs bitmap
 676          * and set our HALTED flag, if necessary.
 677          *
 678          * When a thread becomes runnable, it is placed on the queue
 679          * and then the halted CPU bitmap is checked to determine who
 680          * (if anyone) should be awakened. We therefore need to first
 681          * add ourselves to the bitmap, and then check if there
 682          * is any work available.
 683          *
 684          * Note that memory barriers after updating the HALTED flag
 685          * are not necessary since an atomic operation (updating the bitmap)
 686          * immediately follows. On x86 the atomic operation acts as a
 687          * memory barrier for the update of cpu_disp_flags.
 688          */
 689         if (hset_update) {
 690                 cpup->cpu_disp_flags |= CPU_DISP_HALTED;
 691                 bitset_atomic_add(&cp->cp_haltset, cpu_sid);
 692         }
 693 
 694         /*
 695          * Check to make sure there's really nothing to do.
 696          * Work destined for this CPU may become available after
 697          * this check. We'll be notified through the clearing of our
 698          * bit in the halted CPU bitmap, and a write to our mcpu_mwait.
 699          *
 700          * disp_anywork() checks disp_nrunnable, so we do not have to later.
 701          */
 702         if (disp_anywork()) {
 703                 if (hset_update) {
 704                         cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 705                         bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 706                 }
 707                 return;
 708         }
 709 
 710         /*
 711          * We're on our way to being halted.
 712          * To avoid a lost wakeup, arm the monitor before checking if another
 713          * cpu wrote to mcpu_mwait to wake us up.
 714          */
 715         i86_monitor(mcpu_mwait, 0, 0);
 716         if (*mcpu_mwait == MWAIT_HALTED) {
 717                 if (cpu_idle_enter(IDLE_STATE_C1, 0,
 718                     cpu_idle_mwait_check_wakeup, (void *)mcpu_mwait) == 0) {
 719                         if (*mcpu_mwait == MWAIT_HALTED) {
 720                                 i86_mwait(0, 0);
 721                         }
 722                         cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
 723                 }
 724         }
 725 
 726         /*
 727          * We're no longer halted
 728          */
 729         if (hset_update) {
 730                 cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
 731                 bitset_atomic_del(&cp->cp_haltset, cpu_sid);
 732         }
 733 }
 734 
 735 /*
 736  * If "cpu" is halted in mwait, then wake it up, clearing its halted bit in
 737  * advance.  Otherwise, see if other CPUs in the cpu partition are halted and
 738  * need to be woken up so that they can steal the thread we placed on this CPU.
 739  * This function is only used on MP systems.
 740  */
 741 static void
 742 cpu_wakeup_mwait(cpu_t *cp, int bound)
 743 {
 744         cpupart_t       *cpu_part;
 745         uint_t          cpu_found;
 746         processorid_t   cpu_sid;
 747 
 748         cpu_part = cp->cpu_part;
 749         cpu_sid = cp->cpu_seqid;
 750 
 751         /*
 752          * If the target CPU is halted in mwait, wake it by writing to its
 753          * monitored line.
 754          */
 755         if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
 756                 /*
 757                  * Clear the halted bit for that CPU since it will be
 758                  * woken in a moment.
 759                  */
 760                 bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);
 761                 /*
 762                  * We may find the current CPU present in the halted cpuset
 763                  * if we're in the context of an interrupt that occurred
 764                  * before we had a chance to clear our bit in cpu_idle().
 765                  * Waking ourself is obviously unnecessary, since if
 766                  * we're here, we're not halted.
 767                  *
 768                  * monitor/mwait wakeup via writing to our cache line is
 769                  * harmless and less expensive than always checking if we
 770                  * are waking ourself which is an uncommon case.
 771                  */
 772                 MWAIT_WAKEUP(cp);       /* write to monitored line */
 773                 return;
 774         } else {
 775                 /*
 776                  * This cpu isn't halted, but it's idle or undergoing a
 777                  * context switch. No need to awaken anyone else.
 778                  */
 779                 if (cp->cpu_thread == cp->cpu_idle_thread ||
 780                     cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
 781                         return;
 782         }
 783 
 784         /*
 785          * No need to wake up other CPUs if the thread we just enqueued
 786          * is bound.
 787          */
 788         if (bound || ncpus == 1)
 789                 return;
 790 
 791         /*
 792          * See if there's any other halted CPUs. If there are, then
 793          * select one, and awaken it.
 794          * It's possible that after we find a CPU, somebody else
 795          * will awaken it before we get the chance.
 796          * In that case, look again.
 797          */
 798         do {
 799                 cpu_found = bitset_find(&cpu_part->cp_haltset);
 800                 if (cpu_found == (uint_t)-1)
 801                         return;
 802         } while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
 803             cpu_found) < 0);
 804 
 805         /*
 806          * Do not check if cpu_found is ourself as monitor/mwait
 807          * wakeup is cheap.
 808          */
 809         MWAIT_WAKEUP(cpu_seq[cpu_found]); /* write to monitored line */
 810 }
 811 
 812 #endif
 813 
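     /*
      * Optional platform hook invoked on a CPU while it is paused; left NULL
      * when no handler has been installed.
      */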
 814 void (*cpu_pause_handler)(volatile char *) = NULL;
 815 
 816 static int
 817 mp_disable_intr(int cpun)
 818 {
 819         /*
 820          * switch to the offline cpu
 821          */
 822         affinity_set(cpun);
 823         /*
 824          * raise ipl to just below cross call
 825          */
 826         splx(XC_SYS_PIL - 1);
 827         /*
 828          *      set base spl to prevent the next swtch to idle from
 829          *      lowering back to ipl 0
 830          */
 831         CPU->cpu_intr_actv |= (1 << (XC_SYS_PIL - 1));
 832         set_base_spl();
 833         affinity_clear();
 834         return (DDI_SUCCESS);
 835 }
 836 
 837 static void
 838 mp_enable_intr(int cpun)
 839 {
 840         /*
 841          * switch to the online cpu
 842          */
 843         affinity_set(cpun);
 844         /*
 845          * clear the interrupt active mask
 846          */
 847         CPU->cpu_intr_actv &= ~(1 << (XC_SYS_PIL - 1));
 848         set_base_spl();
 849         (void) spl0();
 850         affinity_clear();
 851 }
 852 
 853 static void
 854 mach_get_platform(int owner)
 855 {
 856         void            **srv_opsp;
 857         void            **clt_opsp;
 858         int             i;
 859         int             total_ops;
 860 
 861         /* fix up psm ops */
 862         srv_opsp = (void **)mach_set[0];
 863         clt_opsp = (void **)mach_set[owner];
 864         if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01)
 865                 total_ops = sizeof (struct psm_ops_ver01) /
 866                     sizeof (void (*)(void));
 867         else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_1)
 868                 /* no psm_notify_func */
 869                 total_ops = OFFSETOF(struct psm_ops, psm_notify_func) /
 870                     sizeof (void (*)(void));
 871         else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_2)
 872                 /* no psm_timer funcs */
 873                 total_ops = OFFSETOF(struct psm_ops, psm_timer_reprogram) /
 874                     sizeof (void (*)(void));
 875         else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_3)
 876                 /* no psm_preshutdown function */
 877                 total_ops = OFFSETOF(struct psm_ops, psm_preshutdown) /
 878                     sizeof (void (*)(void));
 879         else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_4)
 880                 /* no psm_intr_ops function */
 881                 total_ops = OFFSETOF(struct psm_ops, psm_intr_ops) /
 882                     sizeof (void (*)(void));
 883         else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_5)
 884                 /* no psm_state function */
 885                 total_ops = OFFSETOF(struct psm_ops, psm_state) /
 886                     sizeof (void (*)(void));
 887         else if (mach_ver[owner] == (ushort_t)PSM_INFO_VER01_6)
 888                 /* no psm_cpu_ops function */
 889                 total_ops = OFFSETOF(struct psm_ops, psm_cpu_ops) /
 890                     sizeof (void (*)(void));
 891         else
 892                 total_ops = sizeof (struct psm_ops) / sizeof (void (*)(void));
 893 
 894         /*
 895          * Save the version of the PSM module, in case we need to
 896          * behave differently based on version.
 897          */
 898         mach_ver[0] = mach_ver[owner];
 899 
 900         for (i = 0; i < total_ops; i++)
 901                 if (clt_opsp[i] != NULL)
 902                         srv_opsp[i] = clt_opsp[i];
 903 }
 904 
 905 static void
 906 mach_construct_info()
 907 {
 908         struct psm_sw *swp;
 909         int     mach_cnt[PSM_OWN_OVERRIDE+1] = {0};
 910         int     conflict_owner = 0;
 911 
 912         if (psmsw->psw_forw == psmsw)
 913                 panic("No valid PSM modules found");
 914         mutex_enter(&psmsw_lock);
 915         for (swp = psmsw->psw_forw; swp != psmsw; swp = swp->psw_forw) {
 916                 if (!(swp->psw_flag & PSM_MOD_IDENTIFY))
 917                         continue;
 918                 mach_set[swp->psw_infop->p_owner] = swp->psw_infop->p_ops;
 919                 mach_ver[swp->psw_infop->p_owner] = swp->psw_infop->p_version;
 920                 mach_cnt[swp->psw_infop->p_owner]++;
 921         }
 922         mutex_exit(&psmsw_lock);
 923 
 924         mach_get_platform(PSM_OWN_SYS_DEFAULT);
 925 
 926         /* check to see whether there are any conflicts */
 927         if (mach_cnt[PSM_OWN_EXCLUSIVE] > 1)
 928                 conflict_owner = PSM_OWN_EXCLUSIVE;
 929         if (mach_cnt[PSM_OWN_OVERRIDE] > 1)
 930                 conflict_owner = PSM_OWN_OVERRIDE;
 931         if (conflict_owner) {
 932                 /* conflicts found: warn and fall back to the default PSM (uppc) */
 933                 cmn_err(CE_WARN,
 934                     "Conflicts detected on the following PSM modules:");
 935                 mutex_enter(&psmsw_lock);
 936                 for (swp = psmsw->psw_forw; swp != psmsw; swp = swp->psw_forw) {
 937                         if (swp->psw_infop->p_owner == conflict_owner)
 938                                 cmn_err(CE_WARN, "%s ",
 939                                     swp->psw_infop->p_mach_idstring);
 940                 }
 941                 mutex_exit(&psmsw_lock);
 942                 cmn_err(CE_WARN,
 943                     "Setting the system back to SINGLE processor mode!");
 944                 cmn_err(CE_WARN,
 945                     "Please edit /etc/mach to remove the invalid PSM module.");
 946                 return;
 947         }
 948 
 949         if (mach_set[PSM_OWN_EXCLUSIVE])
 950                 mach_get_platform(PSM_OWN_EXCLUSIVE);
 951 
 952         if (mach_set[PSM_OWN_OVERRIDE])
 953                 mach_get_platform(PSM_OWN_OVERRIDE);
 954 }
 955 
 956 static void
 957 mach_init()
 958 {
 959         struct psm_ops  *pops;
 960 
 961         mach_construct_info();
 962 
 963         pops = mach_set[0];
 964 
 965         /* register the interrupt and clock initialization routines */
 966         picinitf = mach_picinit;
 967         clkinitf = mach_clkinit;
 968         psm_get_clockirq = pops->psm_get_clockirq;
 969 
 970         /* register the interrupt setup code */
 971         slvltovect = mach_softlvl_to_vect;
 972         addspl  = pops->psm_addspl;
 973         delspl  = pops->psm_delspl;
 974 
 975         if (pops->psm_translate_irq)
 976                 psm_translate_irq = pops->psm_translate_irq;
 977         if (pops->psm_intr_ops)
 978                 psm_intr_ops = pops->psm_intr_ops;
 979 
 980 #if defined(PSMI_1_2) || defined(PSMI_1_3) || defined(PSMI_1_4)
 981         /*
 982          * Time-of-day functionality now handled in TOD modules.
 983          * (Warn about PSM modules that think that we're going to use
 984          * their ops vectors.)
 985          */
 986         if (pops->psm_tod_get)
 987                 cmn_err(CE_WARN, "obsolete psm_tod_get op %p",
 988                     (void *)pops->psm_tod_get);
 989 
 990         if (pops->psm_tod_set)
 991                 cmn_err(CE_WARN, "obsolete psm_tod_set op %p",
 992                     (void *)pops->psm_tod_set);
 993 #endif
 994 
 995         if (pops->psm_notify_error) {
 996                 psm_notify_error = mach_notify_error;
 997                 notify_error = pops->psm_notify_error;
 998         }
 999 
1000         (*pops->psm_softinit)();
1001 
1002         /*
1003          * Initialize the dispatcher's function hooks to enable CPU halting
1004          * when idle.  Set both the deep-idle and non-deep-idle hooks.
1005          *
1006          * Assume we can use power saving deep-idle loop cpu_idle_adaptive.
1007          * Platform deep-idle driver will reset our idle loop to
1008          * non_deep_idle_cpu if power saving deep-idle feature is not available.
1009          *
1010          * Do not use monitor/mwait if idle_cpu_use_hlt is not set (spin idle)
1011          * or idle_cpu_prefer_mwait is not set.
1012          * Allocate monitor/mwait buffer for cpu0.
1013          */
1014 #ifndef __xpv
1015         non_deep_idle_disp_enq_thread = disp_enq_thread;
1016 #endif
1017         if (idle_cpu_use_hlt) {
1018                 idle_cpu = cpu_idle_adaptive;
1019                 CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
1020 #ifndef __xpv
1021                 if (is_x86_feature(x86_featureset, X86FSET_MWAIT) &&
1022                     idle_cpu_prefer_mwait) {
1023                         CPU->cpu_m.mcpu_mwait = cpuid_mwait_alloc(CPU);
1024                         /*
1025                          * Protect ourselves from an unusable mwait buffer size.
1026                          */
1027                         if (CPU->cpu_m.mcpu_mwait == NULL) {
1028 #ifdef DEBUG
1029                                 cmn_err(CE_NOTE, "Using hlt idle.  Cannot "
1030                                     "handle cpu 0 mwait size.");
1031 #endif
1032                                 idle_cpu_prefer_mwait = 0;
1033                                 CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
1034                         } else {
1035                                 CPU->cpu_m.mcpu_idle_cpu = cpu_idle_mwait;
1036                         }
1037                 } else {
1038                         CPU->cpu_m.mcpu_idle_cpu = cpu_idle;
1039                 }
1040                 non_deep_idle_cpu = CPU->cpu_m.mcpu_idle_cpu;
1041 
1042                 /*
1043                  * Disable power saving deep idle loop?
1044                  */
1045                 if (idle_cpu_no_deep_c) {
1046                         idle_cpu = non_deep_idle_cpu;
1047                 }
1048 #endif
1049         }
1050 
1051         mach_smpinit();
1052 }
1053 
1054 static void
1055 mach_smpinit(void)
1056 {
1057         struct psm_ops  *pops;
1058         processorid_t cpu_id;
1059         int cnt;
1060         cpuset_t cpumask;
1061 
1062         pops = mach_set[0];
1063         CPUSET_ZERO(cpumask);
1064 
1065         cpu_id = -1;
1066         cpu_id = (*pops->psm_get_next_processorid)(cpu_id);
1067         /*
1068          * Only add boot_ncpus CPUs to mp_cpus. Other CPUs will be handled
1069          * by the CPU DR driver at runtime.
1070          */
1071         for (cnt = 0; cpu_id != -1 && cnt < boot_ncpus; cnt++) {
1072                 CPUSET_ADD(cpumask, cpu_id);
1073                 cpu_id = (*pops->psm_get_next_processorid)(cpu_id);
1074         }
1075 
1076         mp_cpus = cpumask;
1077 
1078         /* MP related routines */
1079         ap_mlsetup = pops->psm_post_cpu_start;
1080         send_dirintf = pops->psm_send_ipi;
1081 
1082         /* optional MP related routines */
1083         if (pops->psm_shutdown)
1084                 psm_shutdownf = pops->psm_shutdown;
1085         if (pops->psm_preshutdown)
1086                 psm_preshutdownf = pops->psm_preshutdown;
1087         if (pops->psm_notify_func)
1088                 psm_notifyf = pops->psm_notify_func;
1089         if (pops->psm_set_idlecpu)
1090                 psm_set_idle_cpuf = pops->psm_set_idlecpu;
1091         if (pops->psm_unset_idlecpu)
1092                 psm_unset_idle_cpuf = pops->psm_unset_idlecpu;
1093 
1094         psm_clkinit = pops->psm_clkinit;
1095 
1096         if (pops->psm_timer_reprogram)
1097                 psm_timer_reprogram = pops->psm_timer_reprogram;
1098 
1099         if (pops->psm_timer_enable)
1100                 psm_timer_enable = pops->psm_timer_enable;
1101 
1102         if (pops->psm_timer_disable)
1103                 psm_timer_disable = pops->psm_timer_disable;
1104 
1105         if (pops->psm_post_cyclic_setup)
1106                 psm_post_cyclic_setup = pops->psm_post_cyclic_setup;
1107 
1108         if (pops->psm_state)
1109                 psm_state = pops->psm_state;
1110 
1111         /*
1112          * Set these vectors here so they can be used by Suspend/Resume
1113          * on UP machines.
1114          */
1115         if (pops->psm_disable_intr)
1116                 psm_disable_intr = pops->psm_disable_intr;
1117         if (pops->psm_enable_intr)
1118                 psm_enable_intr  = pops->psm_enable_intr;
1119 
1120         /* check for multiple CPUs */
1121         if (cnt < 2 && plat_dr_support_cpu() == B_FALSE)
1122                 return;
1123 
1124         /* check for MP platforms */
1125         if (pops->psm_cpu_start == NULL)
1126                 return;
1127 
1128         /*
1129          * Set the dispatcher hook to enable cpu "wake up"
1130          * when a thread becomes runnable.
1131          */
1132         if (idle_cpu_use_hlt) {
1133                 disp_enq_thread = cpu_wakeup;
1134 #ifndef __xpv
1135                 if (is_x86_feature(x86_featureset, X86FSET_MWAIT) &&
1136                     idle_cpu_prefer_mwait)
1137                         disp_enq_thread = cpu_wakeup_mwait;
1138                 non_deep_idle_disp_enq_thread = disp_enq_thread;
1139 #endif
1140         }
1141 
1142         psm_get_ipivect = pops->psm_get_ipivect;
1143 
1144         (void) add_avintr((void *)NULL, XC_HI_PIL, xc_serv, "xc_intr",
1145             (*pops->psm_get_ipivect)(XC_HI_PIL, PSM_INTR_IPI_HI),
1146             NULL, NULL, NULL, NULL);
1147 
1148         (void) (*pops->psm_get_ipivect)(XC_CPUPOKE_PIL, PSM_INTR_POKE);
1149 }
1150 
1151 static void
1152 mach_picinit()
1153 {
1154         struct psm_ops  *pops;
1155 
1156         pops = mach_set[0];
1157 
1158         /* register the interrupt handlers */
1159         setlvl = pops->psm_intr_enter;
1160         setlvlx = pops->psm_intr_exit;
1161 
1162         /* initialize the interrupt hardware */
1163         (*pops->psm_picinit)();
1164 
1165         /* set interrupt mask for current ipl */
1166         setspl = pops->psm_setspl;
1167         cli();
1168         setspl(CPU->cpu_pri);
1169 }
1170 
1171 uint_t  cpu_freq;       /* MHz */
1172 uint64_t cpu_freq_hz;   /* measured (in hertz) */
1173 
1174 #define MEGA_HZ         1000000
1175 
1176 #ifdef __xpv
1177 
1178 int xpv_cpufreq_workaround = 1;
1179 int xpv_cpufreq_verbose = 0;
1180 
1181 #else   /* __xpv */
1182 
1183 static uint64_t
1184 mach_calchz(uint32_t pit_counter, uint64_t *processor_clks)
1185 {
1186         uint64_t cpu_hz;
1187 
1188         if ((pit_counter == 0) || (*processor_clks == 0) ||
1189             (*processor_clks > (((uint64_t)-1) / PIT_HZ)))
1190                 return (0);
1191 
1192         cpu_hz = ((uint64_t)PIT_HZ * *processor_clks) / pit_counter;
1193 
1194         return (cpu_hz);
1195 }
1196 
1197 #endif  /* __xpv */
1198 
1199 static uint64_t
1200 mach_getcpufreq(void)
1201 {
1202 #if defined(__xpv)
1203         vcpu_time_info_t *vti = &CPU->cpu_m.mcpu_vcpu_info->time;
1204         uint64_t cpu_hz;
1205 
1206         /*
1207          * During dom0 bringup, it was noted that on at least one older
1208          * Intel HT machine, the hypervisor initially gives a tsc_to_system_mul
1209          * value that is quite wrong (the 3.06GHz clock was reported
1210          * as 4.77GHz)
1211          *
1212          * The curious thing is, that if you stop the kernel at entry,
1213          * breakpoint here and inspect the value with kmdb, the value
1214          * is correct - but if you don't stop and simply enable the
1215          * printf statement (below), you can see the bad value printed
1216          * here.  Almost as if something kmdb did caused the hypervisor to
1217          * figure it out correctly.  And, note that the hypervisor
1218          * eventually -does- figure it out correctly ... if you look at
1219          * the field later in the life of dom0, it is correct.
1220          *
1221          * For now, on dom0, we employ a slightly cheesy workaround of
1222          * using the DOM0_PHYSINFO hypercall.
1223          */
1224         if (DOMAIN_IS_INITDOMAIN(xen_info) && xpv_cpufreq_workaround) {
1225                 cpu_hz = 1000 * xpv_cpu_khz();
1226         } else {
1227                 cpu_hz = (UINT64_C(1000000000) << 32) / vti->tsc_to_system_mul;
1228 
1229                 if (vti->tsc_shift < 0)
1230                         cpu_hz <<= -vti->tsc_shift;
1231                 else
1232                         cpu_hz >>= vti->tsc_shift;
1233         }
1234 
1235         if (xpv_cpufreq_verbose)
1236                 printf("mach_getcpufreq: system_mul 0x%x, shift %d, "
1237                     "cpu_hz %" PRId64 "Hz\n",
1238                     vti->tsc_to_system_mul, vti->tsc_shift, cpu_hz);
1239 
1240         return (cpu_hz);
1241 #else   /* __xpv */
1242         uint32_t pit_counter;
1243         uint64_t processor_clks;
1244 
1245         if (is_x86_feature(x86_featureset, X86FSET_TSC)) {
1246                 /*
1247                  * We have a TSC. freq_tsc() knows how to measure the number
1248                  * of clock cycles sampled against the PIT.
1249                  */
1250                 ulong_t flags = clear_int_flag();
1251                 processor_clks = freq_tsc(&pit_counter);
1252                 restore_int_flag(flags);
1253                 return (mach_calchz(pit_counter, &processor_clks));
1254         } else if (x86_vendor == X86_VENDOR_Cyrix || x86_type == X86_TYPE_P5) {
1255 #if defined(__amd64)
1256                 panic("mach_getcpufreq: no TSC!");
1257 #elif defined(__i386)
1258                 /*
1259                  * We are a Cyrix based on a 6x86 core or an Intel Pentium
1260                  * for which freq_notsc() knows how to measure the number of
1261                  * elapsed clock cycles sampled against the PIT
1262                  */
1263                 ulong_t flags = clear_int_flag();
1264                 processor_clks = freq_notsc(&pit_counter);
1265                 restore_int_flag(flags);
1266                 return (mach_calchz(pit_counter, &processor_clks));
1267 #endif  /* __i386 */
1268         }
1269 
1270         /* We do not know how to calculate cpu frequency for this cpu. */
1271         return (0);
1272 #endif  /* __xpv */
1273 }
1274 
1275 /*
1276  * If the clock speed of a cpu is found to be reported incorrectly, do not add
1277  * to this array; instead, improve the accuracy of the algorithm that determines
1278  * the clock speed of the processor or extend the implementation to support the
1279  * vendor as appropriate. This is here only to support adjusting the speed on
1280  * older slower processors that mach_fixcpufreq() would not be able to account
1281  * for otherwise.
1282  */
1283 static int x86_cpu_freq[] = { 60, 75, 80, 90, 120, 160, 166, 175, 180, 233 };
1284 
1285 /*
1286  * On fast processors the clock frequency that is measured may be off by
1287  * a few MHz from the value printed on the part. This is because, for such
1288  * fast parts, being off by this much is within the manufacturing tolerances,
1289  * and because the measurement itself is subject to small errors.
1290  * This function uses some
1291  * heuristics in order to tweak the value that was measured to match what
1292  * is most likely printed on the part.
1293  *
1294  * Some examples:
1295  *      AMD Athlon 1000 MHz measured as 998 MHz
1296  *      Intel Pentium III Xeon 733 MHz measured as 731 MHz
1297  *      Intel Pentium IV 1500 MHz measured as 1495 MHz
1298  *
1299  * If in the future this function is no longer sufficient to correct
1300  * for the error in the measurement, then the algorithm used to perform
1301  * the measurement will have to be improved in order to increase accuracy
1302  * rather than adding horrible and questionable kludges here.
1303  *
1304  * This is called after the cyclics subsystem because of the potential
1305  * that the heuristics within may give a worse estimate of the clock
1306  * frequency than the value that was measured.
1307  */
1308 static void
1309 mach_fixcpufreq(void)
1310 {
1311         uint32_t freq, mul, near66, delta66, near50, delta50, fixed, delta, i;
1312 
1313         freq = (uint32_t)cpu_freq;
1314 
1315         /*
1316          * Find the nearest integer multiple of 200/3 (about 66) MHz to the
1317          * measured speed, taking into account that the 667 MHz parts were
1318          * the first to round up.
1319          */
1320         mul = (uint32_t)((3 * (uint64_t)freq + 100) / 200);
1321         near66 = (uint32_t)((200 * (uint64_t)mul + ((mul >= 10) ? 1 : 0)) / 3);
1322         delta66 = (near66 > freq) ? (near66 - freq) : (freq - near66);
1323 
1324         /* Find the nearest integer multiple of 50 MHz to the measured speed */
1325         mul = (freq + 25) / 50;
1326         near50 = mul * 50;
1327         delta50 = (near50 > freq) ? (near50 - freq) : (freq - near50);
1328 
1329         /* Find the closer of the two */
1330         if (delta66 < delta50) {
1331                 fixed = near66;
1332                 delta = delta66;
1333         } else {
1334                 fixed = near50;
1335                 delta = delta50;
1336         }
1337 
1338         if (fixed > INT_MAX)
1339                 return;
1340 
1341         /*
1342          * Some older parts have a core clock frequency that is not an
1343          * integral multiple of 50 or 66 MHz. Check if one of the old
1344          * clock frequencies is closer to the measured value than any
1345  * of the integral multiples of 50 and 66, and if so set fixed
1346          * and delta appropriately to represent the closest value.
1347          */
1348         i = sizeof (x86_cpu_freq) / sizeof (int);
1349         while (i > 0) {
1350                 i--;
1351 
1352                 if (x86_cpu_freq[i] <= freq) {
1353                         mul = freq - x86_cpu_freq[i];
1354 
1355                         if (mul < delta) {
1356                                 fixed = x86_cpu_freq[i];
1357                                 delta = mul;
1358                         }
1359 
1360                         break;
1361                 }
1362 
1363                 mul = x86_cpu_freq[i] - freq;
1364 
1365                 if (mul < delta) {
1366                         fixed = x86_cpu_freq[i];
1367                         delta = mul;
1368                 }
1369         }
1370 
1371         /*
1372          * Set a reasonable maximum for how much to correct the measured
1373          * result by. This check is here to prevent the adjustment made
1374          * by this function from being more harm than good. It is entirely
1375          * possible that in the future parts will be made that are not
1376          * integral multiples of 66 or 50 in clock frequency or that
1377          * someone may overclock a part to some odd frequency. If the
1378          * measured value is farther from the corrected value than
1379          * allowed, then assume the corrected value is in error and use
1380          * the measured value.
1381          */
1382         if (6 < delta)
1383                 return;
1384 
1385         cpu_freq = (int)fixed;
1386 }
1387 
1388 
1389 static int
1390 machhztomhz(uint64_t cpu_freq_hz)
1391 {
1392         uint64_t cpu_mhz;
1393 
1394         /* Round to the nearest MHz */
1395         cpu_mhz = (cpu_freq_hz + (MEGA_HZ / 2)) / MEGA_HZ;
1396 
1397         if (cpu_mhz > INT_MAX)
1398                 return (0);
1399 
1400         return ((int)cpu_mhz);
1401 
1402 }
1403 
1404 
1405 static int
1406 mach_clkinit(int preferred_mode, int *set_mode)
1407 {
1408         struct psm_ops  *pops;
1409         int resolution;
1410 
1411         pops = mach_set[0];
1412 
1413         cpu_freq_hz = mach_getcpufreq();
1414 
1415         cpu_freq = machhztomhz(cpu_freq_hz);
1416 
1417         if (!is_x86_feature(x86_featureset, X86FSET_TSC) || (cpu_freq == 0))
1418                 tsc_gethrtime_enable = 0;
1419 
1420 #ifndef __xpv
1421         if (tsc_gethrtime_enable) {
1422                 tsc_hrtimeinit(cpu_freq_hz);
1423         } else
1424 #endif
1425         {
1426                 if (pops->psm_hrtimeinit)
1427                         (*pops->psm_hrtimeinit)();
1428                 gethrtimef = pops->psm_gethrtime;
1429                 gethrtimeunscaledf = gethrtimef;
1430                 /* scalehrtimef will remain dummy */
1431         }
1432 
1433         mach_fixcpufreq();
1434 
1435         if (mach_ver[0] >= PSM_INFO_VER01_3) {
1436                 if (preferred_mode == TIMER_ONESHOT) {
1437 
1438                         resolution = (*pops->psm_clkinit)(0);
1439                         if (resolution != 0)  {
1440                                 *set_mode = TIMER_ONESHOT;
1441                                 return (resolution);
1442                         }
1443                 }
1444 
1445                 /*
1446                  * either periodic mode was requested, or one-shot mode
1447                  * could not be set
1448                  */
1449                 resolution = (*pops->psm_clkinit)(hz);
1450                 /*
1451                  * The PSM should be able to do periodic mode, so we do not
1452                  * check the return value of psm_clkinit here.
1453                  */
1454                 *set_mode = TIMER_PERIODIC;
1455                 return (resolution);
1456         } else {
1457                 /*
1458                  * PSMI interface prior to PSMI_3 does not define a return
1459                  * value for psm_clkinit, so the return value is ignored.
1460                  */
1461                 (void) (*pops->psm_clkinit)(hz);
1462                 *set_mode = TIMER_PERIODIC;
1463                 return (nsec_per_tick);
1464         }
1465 }
1466 
1467 
1468 /*ARGSUSED*/
1469 static int
1470 mach_softlvl_to_vect(int ipl)
1471 {
1472         setsoftint = av_set_softint_pending;
1473         kdisetsoftint = kdi_av_set_softint_pending;
1474 
1475         return (PSM_SV_SOFTWARE);
1476 }
1477 
1478 #ifdef DEBUG
1479 /*
1480  * This is here to allow us to simulate cpus that refuse to start.
1481  */
1482 cpuset_t cpufailset;
1483 #endif
1484 
1485 int
1486 mach_cpu_start(struct cpu *cp, void *ctx)
1487 {
1488         struct psm_ops *pops = mach_set[0];
1489         processorid_t id = cp->cpu_id;
1490 
1491 #ifdef DEBUG
1492         if (CPU_IN_SET(cpufailset, id))
1493                 return (0);
1494 #endif
1495         return ((*pops->psm_cpu_start)(id, ctx));
1496 }
1497 
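     /*
      * Same as mach_cpu_start(), but takes a CPU id rather than a cpu_t.
      */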
1498 int
1499 mach_cpuid_start(processorid_t id, void *ctx)
1500 {
1501         struct psm_ops *pops = mach_set[0];
1502 
1503 #ifdef DEBUG
1504         if (CPU_IN_SET(cpufailset, id))
1505                 return (0);
1506 #endif
1507         return ((*pops->psm_cpu_start)(id, ctx));
1508 }
1509 
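     /*
      * Ask the PSM to stop the given CPU via a PSM_CPU_STOP request.
      * Returns ENOTSUP if the PSM does not provide psm_cpu_ops.
      */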
1510 int
1511 mach_cpu_stop(cpu_t *cp, void *ctx)
1512 {
1513         struct psm_ops *pops = mach_set[0];
1514         psm_cpu_request_t request;
1515 
1516         if (pops->psm_cpu_ops == NULL) {
1517                 return (ENOTSUP);
1518         }
1519 
1520         ASSERT(cp->cpu_id != -1);
1521         request.pcr_cmd = PSM_CPU_STOP;
1522         request.req.cpu_stop.cpuid = cp->cpu_id;
1523         request.req.cpu_stop.ctx = ctx;
1524 
1525         return ((*pops->psm_cpu_ops)(&request));
1526 }
1527 
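     /*
      * Hot-add a CPU via a PSM_CPU_ADD request.  On success the PSM fills
      * in the id of the new CPU, which is returned through cpuidp.
      */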
1528 int
1529 mach_cpu_add(mach_cpu_add_arg_t *argp, processorid_t *cpuidp)
1530 {
1531         int rc;
1532         struct psm_ops *pops = mach_set[0];
1533         psm_cpu_request_t request;
1534 
1535         if (pops->psm_cpu_ops == NULL) {
1536                 return (ENOTSUP);
1537         }
1538 
1539         request.pcr_cmd = PSM_CPU_ADD;
1540         request.req.cpu_add.argp = argp;
1541         request.req.cpu_add.cpuid = -1;
1542         rc = (*pops->psm_cpu_ops)(&request);
1543         if (rc == 0) {
1544                 ASSERT(request.req.cpu_add.cpuid != -1);
1545                 *cpuidp = request.req.cpu_add.cpuid;
1546         }
1547 
1548         return (rc);
1549 }
1550 
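     /*
      * Remove the given CPU via a PSM_CPU_REMOVE request.  Returns ENOTSUP
      * if the PSM does not provide psm_cpu_ops.
      */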
1551 int
1552 mach_cpu_remove(processorid_t cpuid)
1553 {
1554         struct psm_ops *pops = mach_set[0];
1555         psm_cpu_request_t request;
1556 
1557         if (pops->psm_cpu_ops == NULL) {
1558                 return (ENOTSUP);
1559         }
1560 
1561         request.pcr_cmd = PSM_CPU_REMOVE;
1562         request.req.cpu_remove.cpuid = cpuid;
1563 
1564         return ((*pops->psm_cpu_ops)(&request));
1565 }
1566 
1567 /*
1568  * Default handler to create a device node for a CPU.
1569  * One reference is held on the created device node.
1570  */
1571 static int
1572 mach_cpu_create_devinfo(cpu_t *cp, dev_info_t **dipp)
1573 {
1574         int rv, circ;
1575         dev_info_t *dip;
1576         static kmutex_t cpu_node_lock;
1577         static dev_info_t *cpu_nex_devi = NULL;
1578 
1579         ASSERT(cp != NULL);
1580         ASSERT(dipp != NULL);
1581         *dipp = NULL;
1582 
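             /*
              * The "cpus" nexus node is looked up (and created if necessary)
              * only once; cpu_node_lock serializes that first-time work.
              */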
1583         if (cpu_nex_devi == NULL) {
1584                 mutex_enter(&cpu_node_lock);
1585                 /* First check whether the "cpus" nexus node exists. */
1586                 cpu_nex_devi = ddi_find_devinfo("cpus", -1, 0);
1587                 /* Create the "cpus" node if it doesn't exist. */
1588                 if (cpu_nex_devi == NULL) {
1589                         ndi_devi_enter(ddi_root_node(), &circ);
1590                         rv = ndi_devi_alloc(ddi_root_node(), "cpus",
1591                             (pnode_t)DEVI_SID_NODEID, &dip);
1592                         if (rv != NDI_SUCCESS) {
1593                                 mutex_exit(&cpu_node_lock);
1594                                 cmn_err(CE_CONT,
1595                                     "?failed to create cpu nexus device.\n");
1596                                 return (PSM_FAILURE);
1597                         }
1598                         ASSERT(dip != NULL);
1599                         (void) ndi_devi_online(dip, 0);
1600                         ndi_devi_exit(ddi_root_node(), circ);
1601                         cpu_nex_devi = dip;
1602                 }
1603                 mutex_exit(&cpu_node_lock);
1604         }
1605 
1606         /*
1607          * Create a child node for the CPU identified by cp->cpu_id.
1608          */
1609         ndi_devi_enter(cpu_nex_devi, &circ);
1610         dip = ddi_add_child(cpu_nex_devi, "cpu", DEVI_SID_NODEID, -1);
1611         if (dip == NULL) {
1612                 cmn_err(CE_CONT,
1613                     "?failed to create device node for cpu%d.\n", cp->cpu_id);
1614                 rv = PSM_FAILURE;
1615         } else {
1616                 *dipp = dip;
1617                 (void) ndi_hold_devi(dip);
1618                 rv = PSM_SUCCESS;
1619         }
1620         ndi_devi_exit(cpu_nex_devi, circ);
1621 
1622         return (rv);
1623 }
1624 
1625 /*
1626  * Create a device node for the CPU in the device tree and online it.
1627  * If requested, return the created dip with a reference count held.
1628  */
1629 int
1630 mach_cpu_create_device_node(struct cpu *cp, dev_info_t **dipp)
1631 {
1632         int rv;
1633         dev_info_t *dip = NULL;
1634 
1635         ASSERT(psm_cpu_create_devinfo != NULL);
1636         rv = psm_cpu_create_devinfo(cp, &dip);
1637         if (rv == PSM_SUCCESS) {
1638                 cpuid_set_cpu_properties(dip, cp->cpu_id, cp->cpu_m.mcpu_cpi);
1639                 /* Recursively attach driver for parent nexus device. */
1640                 if (i_ddi_attach_node_hierarchy(ddi_get_parent(dip)) ==
1641                     DDI_SUCCESS) {
1642                         /* Configure cpu itself and descendants. */
1643                         (void) ndi_devi_online(dip,
1644                             NDI_ONLINE_ATTACH | NDI_CONFIG);
1645                 }
1646                 if (dipp != NULL) {
1647                         *dipp = dip;
1648                 } else {
1649                         (void) ndi_rele_devi(dip);
1650                 }
1651         }
1652 
1653         return (rv);
1654 }
1655 
1656 /*
1657  * On return, *dipp contains one of the following values:
1658  * - NULL if no device node was found
1659  * - a pointer to the device node if found
1660  */
1661 int
1662 mach_cpu_get_device_node(struct cpu *cp, dev_info_t **dipp)
1663 {
1664         *dipp = NULL;
1665         if (psm_cpu_get_devinfo != NULL) {
1666                 if (psm_cpu_get_devinfo(cp, dipp) == PSM_SUCCESS) {
1667                         return (PSM_SUCCESS);
1668                 }
1669         }
1670 
1671         return (PSM_FAILURE);
1672 }
1673 
1674 /*ARGSUSED*/
1675 static int
1676 mach_translate_irq(dev_info_t *dip, int irqno)
1677 {
1678         return (irqno); /* default to NO translation */
1679 }
1680 
1681 static void
1682 mach_notify_error(int level, char *errmsg)
1683 {
1684         /*
1685          * SL_FATAL is passed in once panicstr is set; deliver it
1686          * as CE_PANIC.  Also, translate SL_ codes back to CE_
1687          * codes for the psmi handler.
1688          */
1689         if (level & SL_FATAL)
1690                 (*notify_error)(CE_PANIC, errmsg);
1691         else if (level & SL_WARN)
1692                 (*notify_error)(CE_WARN, errmsg);
1693         else if (level & SL_NOTE)
1694                 (*notify_error)(CE_NOTE, errmsg);
1695         else if (level & SL_CONSOLE)
1696                 (*notify_error)(CE_CONT, errmsg);
1697 }
1698 
1699 /*
1700  * Provides the default basic intr_ops interface for the new DDI
1701  * interrupt framework if the PSM doesn't have one.
1702  *
1703  * Input:
1704  * dip     - pointer to the dev_info structure of the requested device
1705  * hdlp    - pointer to the internal interrupt handle structure for the
1706  *           requested interrupt
1707  * intr_op - opcode for this call
1708  * result  - pointer to the integer that will hold the result to be
1709  *           passed back if return value is PSM_SUCCESS
1710  *
1711  * Output:
1712  * return value is either PSM_SUCCESS or PSM_FAILURE
1713  */
1714 static int
1715 mach_intr_ops(dev_info_t *dip, ddi_intr_handle_impl_t *hdlp,
1716     psm_intr_op_t intr_op, int *result)
1717 {
1718         struct intrspec *ispec;
1719 
1720         switch (intr_op) {
1721         case PSM_INTR_OP_CHECK_MSI:
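                     /*
                      * The default PSM has no MSI/MSI-X support, so mask those
                      * interrupt types off the result.
                      */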
1722                 *result = hdlp->ih_type & ~(DDI_INTR_TYPE_MSI |
1723                     DDI_INTR_TYPE_MSIX);
1724                 break;
1725         case PSM_INTR_OP_ALLOC_VECTORS:
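                     /* Only fixed interrupts get a vector; MSI/MSI-X get none. */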
1726                 if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
1727                         *result = 1;
1728                 else
1729                         *result = 0;
1730                 break;
1731         case PSM_INTR_OP_FREE_VECTORS:
1732                 break;
1733         case PSM_INTR_OP_NAVAIL_VECTORS:
1734                 if (hdlp->ih_type == DDI_INTR_TYPE_FIXED)
1735                         *result = 1;
1736                 else
1737                         *result = 0;
1738                 break;
1739         case PSM_INTR_OP_XLATE_VECTOR:
1740                 ispec = ((ihdl_plat_t *)hdlp->ih_private)->ip_ispecp;
1741                 *result = psm_translate_irq(dip, ispec->intrspec_vec);
1742                 break;
1743         case PSM_INTR_OP_GET_CAP:
1744                 *result = 0;
1745                 break;
1746         case PSM_INTR_OP_GET_PENDING:
1747         case PSM_INTR_OP_CLEAR_MASK:
1748         case PSM_INTR_OP_SET_MASK:
1749         case PSM_INTR_OP_GET_SHARED:
1750         case PSM_INTR_OP_SET_PRI:
1751         case PSM_INTR_OP_SET_CAP:
1752         case PSM_INTR_OP_SET_CPU:
1753         case PSM_INTR_OP_GET_INTR:
1754         default:
1755                 return (PSM_FAILURE);
1756         }
1757         return (PSM_SUCCESS);
1758 }

1759 /*
1760  * Return 1 if CMT load balancing policies should be
1761  * implemented across instances of the specified hardware
1762  * sharing relationship.
1763  */
1764 int
1765 pg_cmt_load_bal_hw(pghw_type_t hw)
1766 {
1767         if (hw == PGHW_IPIPE ||
1768             hw == PGHW_FPU ||
1769             hw == PGHW_PROCNODE ||
1770             hw == PGHW_CHIP)
1771                 return (1);
1772         else
1773                 return (0);
1774 }

1775 /*
1776  * Return 1 if thread affinity policies should be implemented
1777  * for instances of the specified hardware sharing relationship.
1778  */
1779 int
1780 pg_cmt_affinity_hw(pghw_type_t hw)
1781 {
1782         if (hw == PGHW_CACHE)
1783                 return (1);
1784         else
1785                 return (0);
1786 }
1787 
1788 /*
1789  * Return the number of counter events requested to measure hardware capacity
1790  * and utilization, and set up CPC requests for the specified CPU as needed.
1791  *
1792  * May return 0 when platform- or processor-specific code knows that no CPC
1793  * events should be programmed on this CPU, or -1 when platform- or processor-
1794  * specific code doesn't know which counter events are best to use and common
1795  * code should decide for itself.
1796  */
1797 int
1798 /* LINTED E_FUNC_ARG_UNUSED */
1799 cu_plat_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs)
1800 {
1801         const char      *impl_name;
1802 
1803         /*
1804          * Return error if pcbe_ops not set
1805          */
1806         if (pcbe_ops == NULL)
1807                 return (-1);
1808 
1809         /*
1810          * Return that no CPC events should be programmed on a hyperthreaded
1811          * Pentium 4, and return an error for all other x86 processors to tell
1812          * common code to decide what counter events to program on those CPUs
1813          * for measuring hardware capacity and utilization.
1814          */
1815         impl_name = pcbe_ops->pcbe_impl_name();
1816         if (impl_name != NULL && strcmp(impl_name, PCBE_IMPL_NAME_P4HT) == 0)
1817                 return (0);
1818         else
1819                 return (-1);
1820 }