]> git.proxmox.com Git - rustc.git/blobdiff - vendor/measureme/src/counters.rs
Merge tag 'debian/1.52.1+dfsg1-1_exp2' into proxmox/buster
[rustc.git] / vendor / measureme / src / counters.rs
diff --git a/vendor/measureme/src/counters.rs b/vendor/measureme/src/counters.rs
new file mode 100644 (file)
index 0000000..7d155ab
--- /dev/null
@@ -0,0 +1,1012 @@
+//! Profiling counters and their implementation.\r
+//!\r
+//! # Available counters\r
+//!\r
+//! Name (for [`Counter::by_name()`]) | Counter                      | OSes  | CPUs\r
+//! --------------------------------- | -------                      | ----  | ----\r
+//! `wall-time`                       | [`WallTime`]                 | any   | any\r
+//! `instructions:u`                  | [`Instructions`]             | Linux | `x86_64`\r
+//! `instructions-minus-irqs:u`       | [`InstructionsMinusIrqs`]    | Linux | `x86_64`<br>- AMD (since K8)<br>- Intel (since Sandy Bridge)\r
+//! `instructions-minus-r0420:u`      | [`InstructionsMinusRaw0420`] | Linux | `x86_64`<br>- AMD (Zen)\r
+//!\r
+//! *Note: `:u` suffixes for hardware performance counters come from the Linux `perf`\r
+//! tool, and indicate that the counter is only active while userspace code executes\r
+//! (i.e. it's paused while the kernel handles syscalls, interrupts, etc.).*\r
+//!\r
+//! # Limitations and caveats\r
+//!\r
+//! *Note: for more information, also see the GitHub PR which first implemented hardware\r
+//! performance counter support ([#143](https://github.com/rust-lang/measureme/pull/143)).*\r
+//!\r
+//! The hardware performance counters (i.e. all counters other than `wall-time`) are limited to:\r
+//! * nightly Rust (gated on `features = ["nightly"]`), for `asm!`\r
+//! * Linux, for out-of-the-box performance counter reads from userspace\r
+//!   * other OSes could work through custom kernel extensions/drivers, in the future\r
+//! * `x86_64` CPUs, mostly due to lack of other available test hardware\r
+//!   * new architectures would be easier to support (on Linux) than new OSes\r
+//!   * easiest to add would be 32-bit `x86` (aka `i686`), which would reuse\r
+//!     most of the `x86_64` CPU model detection logic\r
+//! * specific (newer) CPU models, for certain non-standard counters\r
+//!   * e.g. `instructions-minus-irqs:u` requires a "hardware interrupts" (aka "IRQs")\r
+//!     counter, which is implemented differently between vendors / models (if at all)\r
+//! * single-threaded programs (counters only work on the thread they were created on)\r
+//!   * for profiling `rustc`, this means only "check mode" (`--emit=metadata`)\r
+//!     is supported currently (`-Z no-llvm-threads` could also work)\r
+//!   * unclear what the best approach for handling multiple threads would be\r
+//!   * changing the API (e.g. to require per-thread profiler handles) could result\r
+//!     in a more efficient implementation, but would also be less ergonomic\r
+//!   * profiling data from multithreaded programs would be harder to use due to\r
+//!     noise from synchronization mechanisms, non-deterministic work-stealing, etc.\r
+//!\r
+//! For ergonomic reasons, the public API doesn't vary based on `features` or target.\r
+//! Instead, attempting to create any unsupported counter will return `Err`, just\r
+//! like it does for any issue detected at runtime (e.g. incompatible CPU model).\r
+//!\r
+//! When counting instructions specifically, these factors will impact the profiling quality:\r
+//! * high-level non-determinism (e.g. user interactions, networking)\r
+//!   * the ideal use-case is a mostly-deterministic program, e.g. a compiler like `rustc`\r
+//!   * if I/O can be isolated to separate profiling events, and doesn't impact\r
+//!     execution in a more subtle way (see below), the deterministic parts of\r
+//!     the program can still be profiled with high accuracy\r
+//!   * intentional uses of randomness may change execution paths, though for\r
+//!     cryptographic operations specifically, "constant time" implementations\r
+//!     are preferred / necessary (in order to limit an external observer's\r
+//!     ability to infer secrets), so they're not as much of a problem\r
+//!   * even otherwise-deterministic machine-local communication (to e.g. system\r
+//!     services or drivers) can behave unpredictably (especially under load)\r
+//!     * while we haven't observed this in the wild yet, it's possible for\r
+//!       file reads/writes to be split up into multiple smaller chunks\r
+//!       (and therefore take more userspace instructions to fully read/write)\r
+//! * low-level non-determinism (e.g. ASLR, randomized `HashMap`s, timers)\r
+//!   * ASLR ("Address Space Layout Randomization") may be provided by the OS for\r
+//!     security reasons, or accidentally caused through allocations that depend on\r
+//!     random data (even as low-entropy as e.g. the base 10 length of a process ID)\r
+//!   * on Linux ASLR can be disabled by running the process under `setarch -R`\r
+//!   * this impacts `rustc` and LLVM, which rely on keying `HashMap`s by addresses\r
+//!     (typically of interned data) as an optimization, and while non-deterministic\r
+//!     outputs are considered bugs, the instructions executed can still vary a lot,\r
+//!     even when the externally observable behavior is perfectly repeatable\r
+//!   * `HashMap`s are involved in more than one way:\r
+//!     * both the executed instructions, and the shape of the allocations depend\r
+//!       on both the hasher state and choice of keys (as the buckets are in\r
+//!       a flat array indexed by some of the lower bits of the key hashes)\r
+//!     * so every `HashMap` with keys being/containing addresses will amplify\r
+//!       ASLR and ASLR-like effects, making the entire program more sensitive\r
+//!     * the default hasher is randomized, and while `rustc` doesn't use it,\r
+//!       proc macros can (and will), and it's harder to disable than Linux ASLR\r
+//!   * most ways of measuring time will inherently never perfectly align with\r
+//!     exact points in the program's execution, making time behave like another\r
+//!     low-entropy source of randomness - this also means timers will elapse at\r
+//!     unpredictable points (which can further impact the rest of the execution)\r
+//!     * this includes the common thread scheduler technique of preempting the\r
+//!       currently executing thread with a periodic timer interrupt, so the exact\r
+//!       interleaving of multiple threads will likely not be reproducible without\r
+//!       special OS configuration, or tools that emulate a deterministic scheduler\r
+//!     * `jemalloc` (the allocator used by `rustc`, at least in official releases)\r
+//!       has a 10 second "purge timer", which can introduce an ASLR-like effect,\r
+//!       unless disabled with `MALLOC_CONF=dirty_decay_ms:0,muzzy_decay_ms:0`\r
+//! * hardware flaws (whether in the design or implementation)\r
+//!   * hardware interrupts ("IRQs") and exceptions (like page faults) cause\r
+//!     overcounting (1 instruction per interrupt, possibly the `iret` from the\r
+//!     kernel handler back to the interrupted userspace program)\r
+//!     * this is the reason why `instructions-minus-irqs:u` should be preferred\r
+//!       to `instructions:u`, where the former is available\r
+//!     * there are system-wide options (e.g. `CONFIG_NO_HZ_FULL`) for removing\r
+//!       some interrupts from the cores used for profiling, but they're not as\r
+//!       complete of a solution, nor easy to set up in the first place\r
+//!   * AMD Zen CPUs have a speculative execution feature (dubbed `SpecLockMap`),\r
+//!     which can cause non-deterministic overcounting for instructions following\r
+//!     an atomic instruction (such as found in heap allocators, or `measureme`)\r
+//!     * this is automatically detected, with a `log` message pointing the user\r
+//!       to <https://github.com/mozilla/rr/wiki/Zen> for guidance on how to\r
+//!       disable `SpecLockMap` on their system (sadly requires root access)\r
+//!\r
+//! Even if some of the above caveats apply for some profiling setup, as long as\r
+//! the counters function, they can still be used, and compared with `wall-time`.\r
+//! Chances are, they will still have less variance, as everything that impacts\r
+//! instruction counts will also impact any time measurements.\r
+//!\r
+//! Also keep in mind that instruction counts do not properly reflect all kinds\r
+//! of workloads, e.g. SIMD throughput and cache locality are unaccounted for.\r
+\r
+use std::error::Error;\r
+use std::time::Instant;\r
+\r
+// HACK(eddyb) this is semantically `warn!` but uses `error!` because\r
+// that's the only log level enabled by default - see also\r
+// https://github.com/rust-lang/rust/issues/76824\r
+//\r
+// The message literal is prefixed with "[WARNING] " (via `concat!`), so the\r
+// output is still recognizable as a warning despite being logged as an error.\r
+// Any tokens following the literal (e.g. format arguments) are forwarded as-is.\r
+macro_rules! really_warn {\r
+    ($msg:literal $($rest:tt)*) => {\r
+        error!(concat!("[WARNING] ", $msg) $($rest)*)\r
+    }\r
+}\r
+\r
+/// A profiling counter, selected at runtime via [`Counter::by_name()`].\r
+///\r
+/// Each variant wraps the counter implementation of the same name.\r
+pub enum Counter {\r
+    WallTime(WallTime),\r
+    Instructions(Instructions),\r
+    InstructionsMinusIrqs(InstructionsMinusIrqs),\r
+    InstructionsMinusRaw0420(InstructionsMinusRaw0420),\r
+}\r
+\r
+impl Counter {\r
+    /// Creates the counter registered under `name`.\r
+    ///\r
+    /// Returns `Err` when `name` matches none of the known counter names, or\r
+    /// when the constructor of the selected counter fails.\r
+    pub fn by_name(name: &str) -> Result<Self, Box<dyn Error + Send + Sync>> {\r
+        match name {\r
+            WallTime::NAME => Ok(Counter::WallTime(WallTime::new())),\r
+            Instructions::NAME => Instructions::new().map(Counter::Instructions),\r
+            InstructionsMinusIrqs::NAME => {\r
+                InstructionsMinusIrqs::new().map(Counter::InstructionsMinusIrqs)\r
+            }\r
+            InstructionsMinusRaw0420::NAME => {\r
+                InstructionsMinusRaw0420::new().map(Counter::InstructionsMinusRaw0420)\r
+            }\r
+            unknown => Err(format!("{:?} is not a valid counter name", unknown).into()),\r
+        }\r
+    }\r
+\r
+    /// Renders a JSON object holding the counter's name and the list of\r
+    /// `[unit, multiplier]` pairs its values can be displayed in.\r
+    pub(super) fn describe_as_json(&self) -> String {\r
+        const TIME_UNITS: &str =\r
+            r#"[["ns", 1], ["μs", 1000], ["ms", 1000000], ["s", 1000000000]]"#;\r
+        const INSTRUCTION_UNITS: &str = r#"[["instructions", 1]]"#;\r
+\r
+        let name = match self {\r
+            Counter::WallTime(_) => WallTime::NAME,\r
+            Counter::Instructions(_) => Instructions::NAME,\r
+            Counter::InstructionsMinusIrqs(_) => InstructionsMinusIrqs::NAME,\r
+            Counter::InstructionsMinusRaw0420(_) => InstructionsMinusRaw0420::NAME,\r
+        };\r
+        let units = match self {\r
+            Counter::WallTime(_) => TIME_UNITS,\r
+            Counter::Instructions(_)\r
+            | Counter::InstructionsMinusIrqs(_)\r
+            | Counter::InstructionsMinusRaw0420(_) => INSTRUCTION_UNITS,\r
+        };\r
+\r
+        format!(r#"{{ "name": "{}", "units": {} }}"#, name, units)\r
+    }\r
+\r
+    /// Current value of the counter, relative to when it was created.\r
+    #[inline]\r
+    pub(super) fn since_start(&self) -> u64 {\r
+        match self {\r
+            Counter::WallTime(wall_time) => wall_time.since_start(),\r
+            Counter::Instructions(instructions) => instructions.since_start(),\r
+            Counter::InstructionsMinusIrqs(minus_irqs) => minus_irqs.since_start(),\r
+            Counter::InstructionsMinusRaw0420(minus_r0420) => minus_r0420.since_start(),\r
+        }\r
+    }\r
+}\r
+\r
+/// "Monotonic clock" with nanosecond precision (using [`std::time::Instant`]).\r
+///\r
+/// Can be obtained with `Counter::by_name("wall-time")`.\r
+pub struct WallTime {\r
+    /// Instant captured when the counter was created; readings are relative to it.\r
+    start: Instant,\r
+}\r
+\r
+impl WallTime {\r
+    const NAME: &'static str = "wall-time";\r
+\r
+    /// Starts a wall-time counter at the current instant.\r
+    pub fn new() -> Self {\r
+        let start = Instant::now();\r
+        WallTime { start }\r
+    }\r
+\r
+    /// Nanoseconds elapsed since the counter was created (truncated to `u64`).\r
+    #[inline]\r
+    fn since_start(&self) -> u64 {\r
+        let nanos: u128 = self.start.elapsed().as_nanos();\r
+        nanos as u64\r
+    }\r
+}\r
+\r
+/// "Instructions retired" hardware performance counter (userspace-only).\r
+///\r
+/// Can be obtained with `Counter::by_name("instructions:u")`.\r
+pub struct Instructions {\r
+    /// Underlying hardware counter.\r
+    instructions: hw::Counter,\r
+    /// Raw counter value read at creation; subtracted from later readings.\r
+    start: u64,\r
+}\r
+\r
+impl Instructions {\r
+    const NAME: &'static str = "instructions:u";\r
+\r
+    /// Detects the CPU model, opens the instructions counter and records its\r
+    /// current value as the baseline for later readings.\r
+    pub fn new() -> Result<Self, Box<dyn Error + Send + Sync>> {\r
+        let cpu = hw::CpuModel::detect()?;\r
+        let instructions = hw::Counter::new(&cpu, HwCounterType::Instructions)?;\r
+        let baseline = instructions.read();\r
+        Ok(Instructions {\r
+            instructions,\r
+            start: baseline,\r
+        })\r
+    }\r
+\r
+    /// Instructions counted since creation (wrapping on overflow).\r
+    #[inline]\r
+    fn since_start(&self) -> u64 {\r
+        let now = self.instructions.read();\r
+        now.wrapping_sub(self.start)\r
+    }\r
+}\r
+\r
+/// More accurate [`Instructions`] (subtracting hardware interrupt counts).\r
+///\r
+/// Can be obtained with `Counter::by_name("instructions-minus-irqs:u")`.\r
+pub struct InstructionsMinusIrqs {\r
+    /// Hardware counter for retired instructions.\r
+    instructions: hw::Counter,\r
+    /// Hardware counter whose value is subtracted from `instructions`.\r
+    irqs: hw::Counter,\r
+    /// `instructions - irqs` as read at creation; subtracted from later readings.\r
+    start: u64,\r
+}\r
+\r
+impl InstructionsMinusIrqs {\r
+    const NAME: &'static str = "instructions-minus-irqs:u";\r
+\r
+    /// Detects the CPU model, opens the instructions and IRQs counters, and\r
+    /// records their current difference as the baseline for later readings.\r
+    pub fn new() -> Result<Self, Box<dyn Error + Send + Sync>> {\r
+        let cpu = hw::CpuModel::detect()?;\r
+        let instructions = hw::Counter::new(&cpu, HwCounterType::Instructions)?;\r
+        let irqs = hw::Counter::new(&cpu, HwCounterType::Irqs)?;\r
+\r
+        // Both counters are read together, so the baseline is a consistent pair.\r
+        let (instructions_at_start, irqs_at_start) = (&instructions, &irqs).read();\r
+        Ok(InstructionsMinusIrqs {\r
+            start: instructions_at_start.wrapping_sub(irqs_at_start),\r
+            instructions,\r
+            irqs,\r
+        })\r
+    }\r
+\r
+    /// `instructions - irqs` accumulated since creation (wrapping on overflow).\r
+    #[inline]\r
+    fn since_start(&self) -> u64 {\r
+        let (retired, interrupts) = (&self.instructions, &self.irqs).read();\r
+        let net = retired.wrapping_sub(interrupts);\r
+        net.wrapping_sub(self.start)\r
+    }\r
+}\r
+\r
+/// (Experimental) Like [`InstructionsMinusIrqs`] (but using an undocumented `r0420:u` counter).\r
+///\r
+/// Can be obtained with `Counter::by_name("instructions-minus-r0420:u")`.\r
+//\r
+// HACK(eddyb) this is a variant of `instructions-minus-irqs:u`, where `r0420`\r
+// is subtracted, instead of the usual "hardware interrupts" (aka IRQs).\r
+// `r0420` is an undocumented counter on AMD Zen CPUs which appears to count\r
+// both hardware interrupts and exceptions (such as page faults), though\r
+// it's unclear yet what exactly it's counting (could even be `iret`s).\r
+//\r
+// Implemented as a newtype around `InstructionsMinusIrqs`, whose `irqs` field\r
+// holds the `r0420` counter instead of the regular IRQs counter.\r
+pub struct InstructionsMinusRaw0420(InstructionsMinusIrqs);\r
+\r
+impl InstructionsMinusRaw0420 {\r
+    const NAME: &'static str = "instructions-minus-r0420:u";\r
+\r
+    /// Detects the CPU model, opens the instructions and `r0420` counters, and\r
+    /// records their current difference as the baseline for later readings.\r
+    pub fn new() -> Result<Self, Box<dyn Error + Send + Sync>> {\r
+        let cpu = hw::CpuModel::detect()?;\r
+        let instructions = hw::Counter::new(&cpu, HwCounterType::Instructions)?;\r
+        let r0420 = hw::Counter::new(&cpu, HwCounterType::Raw0420)?;\r
+\r
+        let (instructions_at_start, r0420_at_start) = (&instructions, &r0420).read();\r
+        let inner = InstructionsMinusIrqs {\r
+            start: instructions_at_start.wrapping_sub(r0420_at_start),\r
+            instructions,\r
+            // The `r0420` counter takes the place of the IRQs counter.\r
+            irqs: r0420,\r
+        };\r
+        Ok(InstructionsMinusRaw0420(inner))\r
+    }\r
+\r
+    /// `instructions - r0420` accumulated since creation (delegates to the\r
+    /// wrapped [`InstructionsMinusIrqs`]).\r
+    #[inline]\r
+    fn since_start(&self) -> u64 {\r
+        let InstructionsMinusRaw0420(inner) = self;\r
+        inner.since_start()\r
+    }\r
+}\r
+\r
+/// Reading of one or more hardware counters.\r
+///\r
+/// Implemented for `hw::Counter` (a single value) and for a pair of\r
+/// `&hw::Counter`s (both values read together).\r
+trait HwCounterRead {\r
+    /// Value(s) produced by a read.\r
+    type Output;\r
+    /// Reads the current value(s) of the counter(s).\r
+    fn read(&self) -> Self::Output;\r
+}\r
+\r
+/// Kind of hardware event a `hw::Counter` should be configured to count.\r
+enum HwCounterType {\r
+    /// Retired instructions.\r
+    Instructions,\r
+    /// Hardware interrupts, using the CPU-model-specific event config.\r
+    Irqs,\r
+    /// The undocumented raw `r0420` event (only observed on AMD Zen CPUs).\r
+    Raw0420,\r
+}\r
+\r
+/// Message directing users to the `measureme` issue tracker.\r
+// NOTE(review): presumably appended to diagnostics for unexpected failures;\r
+// the use sites are not in this part of the file - confirm there.\r
+const BUG_REPORT_MSG: &str =\r
+    "please report this to https://github.com/rust-lang/measureme/issues/new";\r
+\r
+/// Linux x86_64 implementation based on `perf_event_open` and `rdpmc`.\r
+#[cfg(all(feature = "nightly", target_arch = "x86_64", target_os = "linux"))]\r
+mod hw {\r
+    use memmap::{Mmap, MmapOptions};\r
+    use perf_event_open_sys::{bindings::*, perf_event_open};\r
+    use std::convert::TryInto;\r
+    use std::error::Error;\r
+    use std::fs;\r
+    use std::mem;\r
+    use std::os::unix::io::FromRawFd;\r
+\r
+    /// A single hardware performance counter opened via `perf_event_open`.\r
+    pub(super) struct Counter {\r
+        /// Mapping of the event's `perf_event_mmap_page`.\r
+        mmap: Mmap,\r
+        /// Register index passed to `rdpmc` (the mmap page's `index` minus one).\r
+        reg_idx: u32,\r
+    }\r
+\r
+    impl Counter {\r
+        /// Opens the hardware counter described by `counter_type`, using\r
+        /// `model` to pick CPU-specific event configs where needed.\r
+        ///\r
+        /// Returns `Err` if the CPU model has no usable IRQs counter config,\r
+        /// or if opening/validating the perf event fails.\r
+        pub(super) fn new(\r
+            model: &CpuModel,\r
+            counter_type: super::HwCounterType,\r
+        ) -> Result<Self, Box<dyn Error + Send + Sync>> {\r
+            let (type_, hw_id) = match counter_type {\r
+                super::HwCounterType::Instructions => (\r
+                    perf_type_id_PERF_TYPE_HARDWARE,\r
+                    perf_hw_id_PERF_COUNT_HW_INSTRUCTIONS,\r
+                ),\r
+                super::HwCounterType::Irqs => {\r
+                    let irqs_config = model.irqs_counter_config()?;\r
+                    (perf_type_id_PERF_TYPE_RAW, irqs_config)\r
+                }\r
+                super::HwCounterType::Raw0420 => {\r
+                    // Still attempt to open the counter on other CPUs, but\r
+                    // let the user know this is uncharted territory.\r
+                    if !matches!(model, CpuModel::Amd(AmdGen::Zen)) {\r
+                        really_warn!(\r
+                            "Counter::new: the undocumented `r0420` performance \\r
+                             counter has only been observed on AMD Zen CPUs"\r
+                        );\r
+                    }\r
+\r
+                    (perf_type_id_PERF_TYPE_RAW, 0x04_20)\r
+                }\r
+            };\r
+            Self::with_type_and_hw_id(type_, hw_id)\r
+        }\r
+\r
+        fn with_type_and_hw_id(\r
+            type_: perf_type_id,\r
+            hw_id: u32,\r
+        ) -> Result<Self, Box<dyn Error + Send + Sync>> {\r
+            let mut attrs = perf_event_attr {\r
+                size: mem::size_of::<perf_event_attr>().try_into().unwrap(),\r
+                type_,\r
+                config: hw_id.into(),\r
+                ..perf_event_attr::default()\r
+            };\r
+\r
+            // Only record same-thread, any CPUs, and only userspace (no kernel/hypervisor).\r
+            // NOTE(eddyb) `pid = 0`, despite talking about "process id", means\r
+            // "calling process/thread", *not* "any thread in the calling process"\r
+            // (i.e. "process" is interchangeable with "main thread of the process")\r
+            // FIXME(eddyb) introduce per-thread counters and/or use `inherit`\r
+            // (and `inherit_stat`? though they might not be appropriate here)\r
+            // to be able to read the counter on more than just the initial thread.\r
+            let pid = 0;\r
+            let cpu = -1;\r
+            let group_fd = -1;\r
+            attrs.set_exclude_kernel(1);\r
+            attrs.set_exclude_hv(1);\r
+\r
+            let file = unsafe {\r
+                let fd =\r
+                    perf_event_open(&mut attrs, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC.into());\r
+                if fd < 0 {\r
+                    Err(std::io::Error::from_raw_os_error(-fd))\r
+                } else {\r
+                    Ok(fs::File::from_raw_fd(fd))\r
+                }\r
+            };\r
+            let file = file.map_err(|e| format!("perf_event_open failed: {:?}", e))?;\r
+\r
+            let mmap = unsafe {\r
+                MmapOptions::new()\r
+                    .len(mem::size_of::<perf_event_mmap_page>())\r
+                    .map(&file)\r
+            };\r
+            let mmap = mmap.map_err(|e| format!("perf_event_mmap_page: mmap failed: {:?}", e))?;\r
+\r
+            let mut counter = Counter { mmap, reg_idx: 0 };\r
+\r
+            let (version, compat_version, caps, index, pmc_width) = counter\r
+                .access_mmap_page_with_seqlock(|mp| {\r
+                    (\r
+                        mp.version,\r
+                        mp.compat_version,\r
+                        unsafe { mp.__bindgen_anon_1.__bindgen_anon_1 },\r
+                        mp.index,\r
+                        mp.pmc_width,\r
+                    )\r
+                });\r
+\r
+            info!(\r
+                "Counter::new: version={} compat_version={} index={:#x}",\r
+                version, compat_version, index,\r
+            );\r
+\r
+            if caps.cap_user_rdpmc() == 0 {\r
+                return Err(format!(\r
+                    "perf_event_mmap_page: missing cap_user_rdpmc{}",\r
+                    if caps.cap_bit0_is_deprecated() == 0 && caps.cap_bit0() == 1 {\r
+                        " (ignoring legacy/broken rdpmc support)"\r
+                    } else {\r
+                        ""\r
+                    }\r
+                )\r
+                .into());\r
+            }\r
+\r
+            if index == 0 {\r
+                return Err(format!(\r
+                    "perf_event_mmap_page: no allocated hardware register (ran out?)"\r
+                )\r
+                .into());\r
+            }\r
+            counter.reg_idx = index - 1;\r
+\r
+            if (cfg!(not(accurate_seqlock_rdpmc)) || true) && pmc_width != 48 {\r
+                return Err(format!(\r
+                    "perf_event_mmap_page: {}-bit hardware counter found, only 48-bit supported",\r
+                    pmc_width\r
+                )\r
+                .into());\r
+            }\r
+\r
+            Ok(counter)\r
+        }\r
+\r
+        /// Try to access the mmap page, retrying the `attempt` closure as long\r
+        /// as the "seqlock" sequence number changes (which indicates the kernel\r
+        /// has updated one or more fields within the mmap page).\r
+        ///\r
+        /// Returns the result of the first `attempt` call during which the\r
+        /// sequence number stayed the same; `attempt` may therefore be invoked\r
+        /// several times and should be free of side effects.\r
+        #[inline]\r
+        fn access_mmap_page_with_seqlock<T>(\r
+            &self,\r
+            attempt: impl Fn(&perf_event_mmap_page) -> T,\r
+        ) -> T {\r
+            // FIXME(eddyb) it's probably UB to use regular reads, especially\r
+            // from behind `&T`, with the only synchronization being barriers.\r
+            // Probably needs atomic reads, and stronger ones at that, for the\r
+            // `lock` field, than the fields (which would be `Relaxed`?).\r
+            let mmap_page = unsafe { &*(self.mmap.as_ptr() as *const perf_event_mmap_page) };\r
+            let barrier = || std::sync::atomic::fence(std::sync::atomic::Ordering::Acquire);\r
+\r
+            loop {\r
+                // Grab the "seqlock" - the kernel will update this value when it\r
+                // updates any of the other fields that may be read in `attempt`.\r
+                let seq_lock = mmap_page.lock;\r
+                barrier();\r
+\r
+                let result = attempt(mmap_page);\r
+\r
+                // If nothing has changed, we're done. Otherwise, keep retrying.\r
+                barrier();\r
+                if mmap_page.lock == seq_lock {\r
+                    return result;\r
+                }\r
+            }\r
+        }\r
+    }\r
+\r
+    impl super::HwCounterRead for Counter {\r
+        type Output = u64;\r
+\r
+        /// Reads the counter with `rdpmc` and sign-extends the result from the\r
+        /// counter width to 64 bits.\r
+        ///\r
+        /// Because of the `&& false` below, the seqlock-based branch is never\r
+        /// taken: the value always comes from `rdpmc(self.reg_idx)`, with an\r
+        /// offset of `0` and a width of 48 bits.\r
+        #[inline]\r
+        fn read(&self) -> u64 {\r
+            // HACK(eddyb) keep the accurate code around while not using it,\r
+            // to minimize overhead without losing the more complex implementation.\r
+            let (counter, offset, pmc_width) = if cfg!(accurate_seqlock_rdpmc) && false {\r
+                self.access_mmap_page_with_seqlock(|mp| {\r
+                    let caps = unsafe { mp.__bindgen_anon_1.__bindgen_anon_1 };\r
+                    assert_ne!(caps.cap_user_rdpmc(), 0);\r
+\r
+                    (\r
+                        rdpmc(mp.index.checked_sub(1).unwrap()),\r
+                        mp.offset,\r
+                        mp.pmc_width,\r
+                    )\r
+                })\r
+            } else {\r
+                (rdpmc(self.reg_idx), 0, 48)\r
+            };\r
+\r
+            let counter = offset + (counter as i64);\r
+\r
+            // Sign-extend the `pmc_width`-bit value to `i64`.\r
+            (counter << (64 - pmc_width) >> (64 - pmc_width)) as u64\r
+        }\r
+    }\r
+\r
+    impl super::HwCounterRead for (&Counter, &Counter) {\r
+        type Output = (u64, u64);\r
+\r
+        /// Reads both counters together (see `rdpmc_pair`), sign-extending each\r
+        /// raw value from the hardcoded 48-bit counter width to 64 bits.\r
+        #[inline]\r
+        fn read(&self) -> (u64, u64) {\r
+            let (first, second) = *self;\r
+\r
+            // HACK(eddyb) keep the accurate code around while not using it,\r
+            // to minimize overhead without losing the more complex implementation.\r
+            let read_individually =\r
+                (cfg!(accurate_seqlock_rdpmc) || cfg!(unserialized_rdpmc)) && false;\r
+            if read_individually {\r
+                return (first.read(), second.read());\r
+            }\r
+\r
+            const PMC_WIDTH: u32 = 48;\r
+            const UNUSED_BITS: u32 = 64 - PMC_WIDTH;\r
+\r
+            // Sign-extend a `PMC_WIDTH`-bit value to `i64` (returned as `u64`).\r
+            let sign_extend = |raw: u64| ((raw as i64) << UNUSED_BITS >> UNUSED_BITS) as u64;\r
+\r
+            let (raw_first, raw_second) = rdpmc_pair(first.reg_idx, second.reg_idx);\r
+            (sign_extend(raw_first), sign_extend(raw_second))\r
+        }\r
+    }\r
+\r
+    /// Read the hardware performance counter indicated by `reg_idx`.\r
+    ///\r
+    /// If the counter is signed, sign extension should be performed based on\r
+    /// the width of the register (32 to 64 bits, e.g. 48-bit seems common).\r
+    ///\r
+    /// Returns the raw 64-bit value assembled from the two 32-bit halves that\r
+    /// `rdpmc` leaves in `edx` (high) and `eax` (low).\r
+    #[inline(always)]\r
+    fn rdpmc(reg_idx: u32) -> u64 {\r
+        let (lo, hi): (u32, u32);\r
+        unsafe {\r
+            // NOTE(eddyb) below comment is outdated (the other branch uses `cpuid`).\r
+            // The `&& false` means this unserialized branch is never taken; the\r
+            // `cpuid`-serialized variant in the `else` branch is what runs.\r
+            if cfg!(unserialized_rdpmc) && false {\r
+                // FIXME(eddyb) the Intel and AMD manuals warn about the need for\r
+                // "serializing instructions" before/after `rdpmc`, if avoiding any\r
+                // reordering is desired, but do not agree on the full set of usable\r
+                // "serializing instructions" (e.g. `mfence` isn't listed in both).\r
+                //\r
+                // The only usable, and guaranteed to work, "serializing instruction"\r
+                // appears to be `cpuid`, but it doesn't seem easy to use, especially\r
+                // due to the overlap in registers with `rdpmc` itself, and it might\r
+                // have too high of a cost, compared to serialization benefits (if any).\r
+                asm!("rdpmc", in("ecx") reg_idx, out("eax") lo, out("edx") hi, options(nostack));\r
+            } else {\r
+                asm!(\r
+                    // Dummy `cpuid(0)` to serialize instruction execution.\r
+                    "xor %eax, %eax", // Intel syntax: "xor eax, eax"\r
+                    "cpuid",\r
+\r
+                    "mov {rdpmc_ecx:e}, %ecx", // Intel syntax: "mov ecx, {rdpmc_ecx:e}"\r
+                    "rdpmc",\r
+                    rdpmc_ecx = in(reg) reg_idx,\r
+                    out("eax") lo,\r
+                    out("edx") hi,\r
+\r
+                    // `cpuid` clobbers (not overwritten by `rdpmc`).\r
+                    out("ebx") _,\r
+                    out("ecx") _,\r
+\r
+                    options(nostack),\r
+\r
+                    // HACK(eddyb) LLVM 9 and older do not support modifiers\r
+                    // in Intel syntax inline asm; whenever Rust minimum LLVM\r
+                    // version becomes LLVM 10, remove and replace above\r
+                    // instructions with Intel syntax version (from comments).\r
+                    options(att_syntax),\r
+                );\r
+            }\r
+        }\r
+        // Combine the low (`eax`) and high (`edx`) halves into one 64-bit value.\r
+        lo as u64 | (hi as u64) << 32\r
+    }\r
+\r
+    /// Read two hardware performance counters at once (see `rdpmc`).\r
+    ///\r
+    /// Should be more efficient/accurate than two `rdpmc` calls, as it\r
+    /// only requires one "serializing instruction", rather than two.\r
+    ///\r
+    /// Returns the raw 64-bit values of counter `a_reg_idx` and counter\r
+    /// `b_reg_idx`, in that order (no sign extension is performed here).\r
+    #[inline(always)]\r
+    fn rdpmc_pair(a_reg_idx: u32, b_reg_idx: u32) -> (u64, u64) {\r
+        let (a_lo, a_hi): (u32, u32);\r
+        let (b_lo, b_hi): (u32, u32);\r
+        unsafe {\r
+            asm!(\r
+                // Dummy `cpuid(0)` to serialize instruction execution.\r
+                "xor %eax, %eax", // Intel syntax: "xor eax, eax"\r
+                "cpuid",\r
+\r
+                // First `rdpmc`: its `eax`/`edx` results are moved to scratch\r
+                // registers, as the second `rdpmc` overwrites `eax`/`edx`.\r
+                "mov {a_rdpmc_ecx:e}, %ecx", // Intel syntax: "mov ecx, {a_rdpmc_ecx:e}"\r
+                "rdpmc",\r
+                "mov %eax, {a_rdpmc_eax:e}", // Intel syntax: "mov {a_rdpmc_eax:e}, eax"\r
+                "mov %edx, {a_rdpmc_edx:e}", // Intel syntax: "mov {a_rdpmc_edx:e}, edx"\r
+                "mov {b_rdpmc_ecx:e}, %ecx", // Intel syntax: "mov ecx, {b_rdpmc_ecx:e}"\r
+                "rdpmc",\r
+                a_rdpmc_ecx = in(reg) a_reg_idx,\r
+                a_rdpmc_eax = out(reg) a_lo,\r
+                a_rdpmc_edx = out(reg) a_hi,\r
+                b_rdpmc_ecx = in(reg) b_reg_idx,\r
+                out("eax") b_lo,\r
+                out("edx") b_hi,\r
+\r
+                // `cpuid` clobbers (not overwritten by `rdpmc`).\r
+                out("ebx") _,\r
+                out("ecx") _,\r
+\r
+                options(nostack),\r
+\r
+                // HACK(eddyb) LLVM 9 and older do not support modifiers\r
+                // in Intel syntax inline asm; whenever Rust minimum LLVM\r
+                // version becomes LLVM 10, remove and replace above\r
+                // instructions with Intel syntax version (from comments).\r
+                options(att_syntax),\r
+            );\r
+        }\r
+        // Combine each counter's low and high halves into one 64-bit value.\r
+        (\r
+            a_lo as u64 | (a_hi as u64) << 32,\r
+            b_lo as u64 | (b_hi as u64) << 32,\r
+        )\r
+    }\r
+\r
+    /// Categorization of `x86_64` CPUs, primarily based on how they\r
+    /// support counting "hardware interrupts" (documented or not).\r
+    pub(super) enum CpuModel {\r
+        /// AMD CPU of the given generation.\r
+        Amd(AmdGen),\r
+        /// Intel CPU of the given generation.\r
+        Intel(IntelGen),\r
+    }\r
+\r
+    /// Generations of AMD CPUs distinguished by [`CpuModel`].\r
+    pub(super) enum AmdGen {\r
+        /// K8 (Hammer) to Jaguar / Puma.\r
+        PreZen,\r
+\r
+        /// Zen / Zen+ / Zen 2.\r
+        Zen,\r
+\r
+        /// Unknown AMD CPU, contemporary to/succeeding Zen/Zen+/Zen 2,\r
+        /// but likely similar to them.\r
+        UnknownMaybeZenLike,\r
+    }\r
+\r
+    /// Generations of Intel CPUs distinguished by [`CpuModel`] (see each\r
+    /// variant for the product lines it covers and how well counting\r
+    /// "hardware interrupts" is documented/confirmed on them).\r
+    pub(super) enum IntelGen {\r
+        /// Intel CPU predating Sandy Bridge. These are the only CPUs we\r
+        /// can't support (more) accurate instruction counting on, as they\r
+        /// don't (appear to) have any way to count "hardware interrupts".\r
+        PreBridge,\r
+\r
+        /// Sandy Bridge / Ivy Bridge:\r
+        /// * client: Sandy Bridge (M/H) / Ivy Bridge (M/H/Gladden)\r
+        /// * server: Sandy Bridge (E/EN/EP) / Ivy Bridge (E/EN/EP/EX)\r
+        ///\r
+        /// Intel doesn't document support for counting "hardware interrupts"\r
+        /// prior to Skylake, but testing found that `HW_INTERRUPTS.RECEIVED`\r
+        /// from Skylake has existed, with the same config, as far back as\r
+        /// "Sandy Bridge" (but before that it mapped to a different event).\r
+        ///\r
+        /// These are the (pre-Skylake) *Bridge CPU models confirmed so far:\r
+        /// * Sandy Bridge (client) Family 6 Model 42\r
+        ///     Intel(R) Core(TM) i5-2520M CPU @ 2.50GHz (@alyssais)\r
+        /// * Ivy Bridge (client) Family 6 Model 58\r
+        ///     Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz (@eddyb)\r
+        ///\r
+        /// We later found this paper, which on page 5 lists 12 counters,\r
+        /// for each of Nehalem/Westmere, Sandy Bridge and Ivy Bridge:\r
+        /// http://web.eece.maine.edu/~vweaver/projects/deterministic/deterministic_counters.pdf\r
+        /// It appears that both Sandy Bridge and Ivy Bridge used to have\r
+        /// `HW_INTERRUPTS.RECEIVED` documented, before Intel removed every\r
+        /// mention of the counter from newer versions of their manuals.\r
+        Bridge,\r
+\r
+        /// Haswell / Broadwell:\r
+        /// * client: Haswell (S/ULT/GT3e) / Broadwell (U/Y/S/H/C/W)\r
+        /// * server: Haswell (E/EP/EX) / Broadwell (E/EP/EX/DE/Hewitt Lake)\r
+        ///\r
+        /// Equally as undocumented as "Sandy Bridge / Ivy Bridge" (see above).\r
+        ///\r
+        /// These are the (pre-Skylake) *Well CPU models confirmed so far:\r
+        /// * Haswell (client) Family 6 Model 60\r
+        ///     Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz (@m-ou-se)\r
+        /// * Haswell (server) Family 6 Model 63\r
+        ///     Intel(R) Xeon(R) CPU E5-2697 v3 @ 2.60GHz (@cuviper)\r
+        /// * Haswell (client + GT3e) Family 6 Model 70\r
+        ///     Intel(R) Core(TM) i7-4750HQ CPU @ 2.00GHz (@nagisa)\r
+        ///     Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz (@m-ou-se)\r
+        Well,\r
+\r
+        /// Skylake / Skylake-derived:\r
+        /// * client: Skylake (Y/U/DT/H/S) / Kaby Lake (Y/U/DT/H/S/X) / Coffee Lake (U/S/H/E)\r
+        /// * server: Skylake (SP/X/DE/W) / Cascade Lake (SP/X/W)\r
+        ///\r
+        /// Both "client" and "server" product lines have documented support\r
+        /// for counting "hardware interrupts" (`HW_INTERRUPTS.RECEIVED`).\r
+        ///\r
+        /// Intel does not make it clear that future product lines, such as\r
+        /// "Ice Lake", will continue to support this (or with what config),\r
+        /// and even "Comet Lake" (aka "10th gen") isn't explicitly listed.\r
+        Lake,\r
+\r
+        /// Unknown Intel CPU, contemporary to/succeeding *Bridge/*Well/*Lake,\r
+        /// but likely similar to them.\r
+        UnknownMaybeLakeLike,\r
+    }\r
+\r
+    impl CpuModel {\r
+        /// Detect the vendor, family and model of the current CPU using `cpuid` (leaves 0 and 1).\r
+        pub(super) fn detect() -> Result<Self, Box<dyn Error + Send + Sync>> {\r
+            let cpuid0 = unsafe { std::arch::x86_64::__cpuid(0) };\r
+            let cpuid1 = unsafe { std::arch::x86_64::__cpuid(1) };\r
+            let mut vendor = [0; 12];\r
+            vendor[0..4].copy_from_slice(&cpuid0.ebx.to_le_bytes());\r
+            vendor[4..8].copy_from_slice(&cpuid0.edx.to_le_bytes());\r
+            vendor[8..12].copy_from_slice(&cpuid0.ecx.to_le_bytes());\r
+\r
+            let vendor = std::str::from_utf8(&vendor).map_err(|_| {\r
+                format!(\r
+                    "cpuid returned non-UTF-8 vendor name: cpuid(0)={:?} cpuid(1)={:?}",\r
+                    cpuid0, cpuid1\r
+                )\r
+            })?;\r
+\r
+            let version = cpuid1.eax;\r
+\r
+            let mut family = (version >> 8) & 0xf;\r
+            if family == 15 {\r
+                // Base family 15 is an escape value: add the extended family (bits 27:20).\r
+                family += (version >> 20) & 0xff;\r
+            }\r
+\r
+            let mut model = (version >> 4) & 0xf;\r
+            if family >= 15 || vendor == "GenuineIntel" && family == 6 {\r
+                // The extended model (bits 19:16) provides the upper nibble of the model.\r
+                model += ((version >> 16) & 0xf) << 4;\r
+            }\r
+\r
+            info!(\r
+                "CpuModel::detect: vendor={:?} family={} model={}",\r
+                vendor, family, model\r
+            );\r
+\r
+            match vendor {\r
+                "AuthenticAMD" => {\r
+                    use self::AmdGen::*;\r
+\r
+                    let (gen, name) = match (family, model) {\r
+                        (0..=14, _) | (19, _) => {\r
+                            return Err(format!(\r
+                                "impossible AMD64 CPU detected (Family {} Model {}); {}",\r
+                                family,\r
+                                model,\r
+                                super::BUG_REPORT_MSG\r
+                            )\r
+                            .into());\r
+                        }\r
+\r
+                        (15, _) => (PreZen, "K8 (Hammer)"),\r
+                        (16, _) => (PreZen, "K10 (Barcelona/Shanghai/Istanbul)"),\r
+                        (17, _) => (PreZen, "K8+K10 hybrid (Turion X2 Ultra)"),\r
+                        (18, _) => (PreZen, "Fusion"),\r
+                        (20, _) => (PreZen, "Bobcat"),\r
+                        (21, _) => (PreZen, "Bulldozer / Piledriver / Steamroller / Excavator"),\r
+                        (22, _) => (PreZen, "Jaguar / Puma"),\r
+\r
+                        (23, 1) => (Zen, "Zen (Naples/Whitehaven/Summit Ridge/Snowy Owl)"),\r
+                        (23, 17) => (Zen, "Zen (Raven Ridge)"),\r
+                        (23, 24) => (Zen, "Zen (Banded Kestrel/Dali) / Zen+ (Picasso)"),\r
+                        (23, 8) => (Zen, "Zen+ (Pinnacle Ridge)"),\r
+                        (23, 49) => (Zen, "Zen 2 (Rome/Castle Peak)"),\r
+                        (23, 113) => (Zen, "Zen 2 (Matisse)"),\r
+\r
+                        (23..=0xffff_ffff, _) => {\r
+                            really_warn!(\r
+                                "CpuModel::detect: unknown AMD CPU (Family {} Model {}), \\r
+                                 assuming Zen-like; {}",\r
+                                family,\r
+                                model,\r
+                                super::BUG_REPORT_MSG\r
+                            );\r
+\r
+                            (UnknownMaybeZenLike, "")\r
+                        }\r
+                    };\r
+\r
+                    if !name.is_empty() {\r
+                        info!("CpuModel::detect: known AMD CPU: {}", name);\r
+                    }\r
+\r
+                    // The `SpecLockMap` (speculative atomic aka `lock` instruction\r
+                    // execution, unclear what "Map" refers to) feature in AMD Zen CPUs\r
+                    // causes non-deterministic overcounting of atomic instructions,\r
+                    // presumably whenever it has to roll back the speculation\r
+                    // (as in, the performance counters aren't rolled back).\r
+                    // Even though this may be rare when uncontended, it adds up.\r
+                    //\r
+                    // There is an MSR bit (`MSRC001_1020[54]`) that's not officially\r
+                    // documented, but which several motherboards and profiling tools\r
+                    // set whenever IBS (Instruction-Based Sampling) is in use, and\r
+                    // it is sometimes referred to as "disabling `SpecLockMap`"\r
+                    // (hence having a name for the feature that speculates `lock`s).\r
+                    //\r
+                    // One way we could detect that the bit has been set would be to\r
+                    // parse `uname().release` (aka `uname -r`) and look for versions\r
+                    // which are known to include the patch suggested in this thread:\r
+                    // https://github.com/mozilla/rr/issues/2034#issuecomment-693761247\r
+                    //\r
+                    // However, one may set the bit using e.g. `wrmsr`, even on older\r
+                    // kernels, so a more reliable approach is to execute some atomics\r
+                    // and look at the `SpecLockMapCommit` (`r0825:u`) Zen counter,\r
+                    // which only reliably remains `0` when `SpecLockMap` is disabled.\r
+                    if matches!(gen, Zen | UnknownMaybeZenLike) {\r
+                        if let Ok(spec_lock_map_commit) =\r
+                            Counter::with_type_and_hw_id(perf_type_id_PERF_TYPE_RAW, 0x08_25)\r
+                        {\r
+                            use super::HwCounterRead;\r
+\r
+                            let start_spec_lock_map_commit = spec_lock_map_commit.read();\r
+\r
+                            // Execute an atomic (`lock`) instruction, which should\r
+                            // start speculative execution for following instructions\r
+                            // (as long as `SpecLockMap` isn't disabled).\r
+                            let mut atomic: u64 = 0;\r
+                            let mut _tmp: u64 = 0;\r
+                            unsafe {\r
+                                asm!(\r
+                                    // Intel syntax: "lock xadd [{atomic}], {tmp}"\r
+                                    "lock xadd {tmp}, ({atomic})",\r
+\r
+                                    atomic = in(reg) &mut atomic,\r
+                                    tmp = inout(reg) _tmp,\r
+\r
+                                    // HACK(eddyb) LLVM 9 and older do not support modifiers\r
+                                    // in Intel syntax inline asm; whenever Rust minimum LLVM\r
+                                    // version becomes LLVM 10, remove and replace above\r
+                                    // instructions with Intel syntax version (from comments).\r
+                                    options(att_syntax),\r
+                                );\r
+                            }\r
+\r
+                            if spec_lock_map_commit.read() != start_spec_lock_map_commit {\r
+                                really_warn!(\r
+                                    "CpuModel::detect: SpecLockMap detected, in AMD {} CPU; \\r
+                                     this may add some non-deterministic noise - \\r
+                                     for information on disabling SpecLockMap, see \\r
+                                     https://github.com/mozilla/rr/wiki/Zen",\r
+                                    name\r
+                                );\r
+                            }\r
+                        }\r
+                    }\r
+\r
+                    Ok(CpuModel::Amd(gen))\r
+                }\r
+\r
+                "GenuineIntel" => {\r
+                    use self::IntelGen::*;\r
+\r
+                    let (gen, name) = match (family, model) {\r
+                        // No need to name these, they're unsupported anyway.\r
+                        (0..=5, _) => (PreBridge, ""),\r
+                        (15, _) => (PreBridge, "Netburst"),\r
+                        (6, 0..=41) => (PreBridge, ""),\r
+\r
+                        // Older Xeon Phi CPUs, misplaced in Family 6.\r
+                        (6, 87) => (PreBridge, "Knights Landing"),\r
+                        (6, 133) => (PreBridge, "Knights Mill"),\r
+\r
+                        // Older Atom CPUs, interleaved with other CPUs.\r
+                        // FIXME(eddyb) figure out if these are like *Bridge/*Well.\r
+                        (6, 53) | (6, 54) => (PreBridge, "Saltwell"),\r
+                        (6, 55) | (6, 74) | (6, 77) | (6, 90) | (6, 93) => {\r
+                            (PreBridge, "Silvermont")\r
+                        }\r
+                        (6, 76) => (PreBridge, "Airmont (Cherry Trail/Braswell)"),\r
+\r
+                        // Older server CPUs, numbered out of order.\r
+                        (6, 44) => (PreBridge, "Westmere (Gulftown/EP)"),\r
+                        (6, 46) => (PreBridge, "Nehalem (EX)"),\r
+                        (6, 47) => (PreBridge, "Westmere (EX)"),\r
+\r
+                        (6, 42) => (Bridge, "Sandy Bridge (M/H)"),\r
+                        (6, 45) => (Bridge, "Sandy Bridge (E/EN/EP)"),\r
+                        (6, 58) => (Bridge, "Ivy Bridge (M/H/Gladden)"),\r
+                        (6, 62) => (Bridge, "Ivy Bridge (E/EN/EP/EX)"),\r
+\r
+                        (6, 60) => (Well, "Haswell (S)"),\r
+                        (6, 61) => (Well, "Broadwell (U/Y/S)"),\r
+                        (6, 63) => (Well, "Haswell (E/EP/EX)"),\r
+                        (6, 69) => (Well, "Haswell (ULT)"),\r
+                        (6, 70) => (Well, "Haswell (GT3e)"),\r
+                        (6, 71) => (Well, "Broadwell (H/C/W)"),\r
+                        (6, 79) => (Well, "Broadwell (E/EP/EX)"),\r
+                        (6, 86) => (Well, "Broadwell (DE/Hewitt Lake)"),\r
+\r
+                        (6, 78) => (Lake, "Skylake (Y/U)"),\r
+                        (6, 85) => (Lake, "Skylake (SP/X/DE/W) / Cascade Lake (SP/X/W)"),\r
+                        (6, 94) => (Lake, "Skylake (DT/H/S)"),\r
+                        (6, 142) => (Lake, "Kaby Lake (Y/U) / Coffee Lake (U)"),\r
+                        (6, 158) => (Lake, "Kaby Lake (DT/H/S/X) / Coffee Lake (S/H/E)"),\r
+\r
+                        (6..=14, _) | (16..=0xffff_ffff, _) => {\r
+                            really_warn!(\r
+                                "CpuModel::detect: unknown Intel CPU (Family {} Model {}), \\r
+                                 assuming Skylake-like; {}",\r
+                                family,\r
+                                model,\r
+                                super::BUG_REPORT_MSG\r
+                            );\r
+\r
+                            (UnknownMaybeLakeLike, "")\r
+                        }\r
+                    };\r
+\r
+                    if !name.is_empty() {\r
+                        info!("CpuModel::detect: known Intel CPU: {}", name);\r
+                    }\r
+\r
+                    Ok(CpuModel::Intel(gen))\r
+                }\r
+\r
+                _ => Err(format!(\r
+                    "cpuid returned unknown CPU vendor {:?}; version={:#x}",\r
+                    vendor, version\r
+                )\r
+                .into()),\r
+            }\r
+        }\r
+\r
+        /// Look up the raw event configuration used on this CPU to count\r
+        /// "hardware interrupts", whether or not the vendor documents it.\r
+        ///\r
+        /// # Errors\r
+        ///\r
+        /// Fails for Intel CPUs predating Sandy Bridge, which are not supported yet.\r
+        fn irqs_counter_config(&self) -> Result<u32, Box<dyn Error + Send + Sync>> {\r
+            let config = match self {\r
+                CpuModel::Amd(AmdGen::PreZen) => 0x00_cf,\r
+                CpuModel::Amd(AmdGen::Zen) | CpuModel::Amd(AmdGen::UnknownMaybeZenLike) => 0x00_2c,\r
+\r
+                // The only generation without a known config: bail out early.\r
+                CpuModel::Intel(IntelGen::PreBridge) => {\r
+                    return Err(format!(\r
+                        "counting IRQs not yet supported on Intel CPUs \\r
+                         predating Sandy Bridge; {}",\r
+                        super::BUG_REPORT_MSG\r
+                    )\r
+                    .into());\r
+                }\r
+\r
+                // Every later (or unknown, assumed similar) Intel generation\r
+                // shares a single config.\r
+                CpuModel::Intel(IntelGen::Bridge)\r
+                | CpuModel::Intel(IntelGen::Well)\r
+                | CpuModel::Intel(IntelGen::Lake)\r
+                | CpuModel::Intel(IntelGen::UnknownMaybeLakeLike) => 0x01_cb,\r
+            };\r
+\r
+            Ok(config)\r
+        }\r
+    }\r
+}\r
+\r
+#[cfg(not(all(feature = "nightly", target_arch = "x86_64", target_os = "linux")))]\r
+mod hw {\r
+    use std::error::Error;\r
+\r
+    pub(super) enum Counter {}\r
+\r
+    impl Counter {\r
+        pub(super) fn new(\r
+            model: &CpuModel,\r
+            _: super::HwCounterType,\r
+        ) -> Result<Self, Box<dyn Error + Send + Sync>> {\r
+            match *model {}\r
+        }\r
+    }\r
+\r
+    impl super::HwCounterRead for Counter {\r
+        type Output = u64;\r
+\r
+        #[inline]\r
+        fn read(&self) -> u64 {\r
+            match *self {}\r
+        }\r
+    }\r
+\r
+    impl super::HwCounterRead for (&Counter, &Counter) {\r
+        type Output = (u64, u64);\r
+\r
+        #[inline]\r
+        fn read(&self) -> (u64, u64) {\r
+            match *self.0 {}\r
+        }\r
+    }\r
+\r
+    pub(super) enum CpuModel {}\r
+\r
+    impl CpuModel {\r
+        pub(super) fn detect() -> Result<Self, Box<dyn Error + Send + Sync>> {\r
+            // HACK(eddyb) mark `really_warn!` (and transitively `log` macros)\r
+            // and `BUG_REPORT_MSG` as "used" to silence warnings.\r
+            if false {\r
+                really_warn!("unsupported; {}", super::BUG_REPORT_MSG);\r
+            }\r
+\r
+            let mut msg = String::new();\r
+            let mut add_error = |s| {\r
+                if !msg.is_empty() {\r
+                    msg += "; ";\r
+                }\r
+                msg += s;\r
+            };\r
+\r
+            if cfg!(not(feature = "nightly")) {\r
+                add_error("only supported with measureme's \"nightly\" feature");\r
+            }\r
+\r
+            if cfg!(not(target_arch = "x86_64")) {\r
+                add_error("only supported architecture is x86_64");\r
+            }\r
+\r
+            if cfg!(not(target_os = "linux")) {\r
+                add_error("only supported OS is Linux");\r
+            }\r
+\r
+            Err(msg.into())\r
+        }\r
+    }\r
+}\r