[mirror_ubuntu-hirsute-kernel.git] / kernel / watchdog_hld.c

/*
 * Detect hard lockups on a system
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
 * Note: Most of this code is borrowed heavily from the original softlockup
 * detector, so thanks to Ingo for the initial implementation.
 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
 * to those contributors as well.
 */

#define pr_fmt(fmt) "NMI watchdog: " fmt

#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/sched/debug.h>

#include <asm/irq_regs.h>
#include <linux/perf_event.h>

static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);

static unsigned long hardlockup_allcpu_dumped;
static bool hardlockup_detector_disabled;

/*
 * Flag the NMI watchdog on the executing CPU as "touched".  The overflow
 * callback consumes the flag and skips one lockup check when it is set.
 */
void arch_touch_nmi_watchdog(void)
{
	/*
	 * The raw accessor is deliberate: some callers run with preemption
	 * enabled.  Preemptible code has interrupts enabled as well, in
	 * which case the watchdog is not expected to go off anyway.
	 */
	raw_cpu_write(watchdog_nmi_touch, true);
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);

#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
static DEFINE_PER_CPU(ktime_t, last_timestamp);
static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;

void watchdog_update_hrtimer_threshold(u64 period)
{
	/*
	 * Background:
	 *
	 * The hrtimer period is (watchdog_threshold * 2) / 5, i.e. the
	 * hrtimer runs at 2.5 times the rate of the NMI watchdog and is
	 * expected to fire 2-3 times before the NMI watchdog expires.
	 *
	 * On x86 the NMI watchdog counts unhalted CPU cycles.  With
	 * Turbo-Mode enabled the cycles can tick much faster than the
	 * nominal CPU frequency suggests, so the NMI may arrive after a
	 * shorter period than the calculated one.  With a large enough
	 * Turbo-Mode factor the NMI period can drop below the hrtimer
	 * period, which would produce false positives.
	 *
	 * To avoid that, the NMI handler compares the time between two NMI
	 * samples against the threshold stored here and ignores samples
	 * which come in too early.
	 *
	 * Twice the period passed in amounts to 4/5 of the real watchdog
	 * threshold, which guarantees that the hrtimer fires at least once
	 * within the real watchdog threshold.
	 */
	watchdog_hrtimer_sample_threshold = 2 * period;
}

/*
 * Filter out NMI samples which arrive too soon after the previous accepted
 * one.
 *
 * Returns true when the caller should go on and check for a hard lockup,
 * false when this sample has to be ignored.
 */
static bool watchdog_check_timestamp(void)
{
	ktime_t now = ktime_get_mono_fast_ns();
	ktime_t elapsed = now - __this_cpu_read(last_timestamp);

	/*
	 * Early samples are dropped, but at most nine times in a row; the
	 * tenth one is let through regardless.  Should ktime be jiffies
	 * based, a stalled timer keeps jiffies from being incremented, the
	 * stored timestamp goes stale and the filter would otherwise never
	 * trigger.
	 */
	if (elapsed < watchdog_hrtimer_sample_threshold &&
	    __this_cpu_inc_return(nmi_rearmed) < 10)
		return false;

	/* Sample accepted: restart the rearm count and remember the time */
	__this_cpu_write(nmi_rearmed, 0);
	__this_cpu_write(last_timestamp, now);
	return true;
}
#else
/* No timestamp filtering configured: every NMI sample gets checked. */
static inline bool watchdog_check_timestamp(void)
{
	return true;
}
#endif

/*
 * Attribute template for the watchdog perf event: a pinned hardware
 * CPU-cycles counter which is created in disabled state.  The sample
 * period is filled in by watchdog_nmi_enable() right before the event
 * is created.
 */
static struct perf_event_attr wd_hw_attr = {
	.size		= sizeof(struct perf_event_attr),
	.type		= PERF_TYPE_HARDWARE,
	.config		= PERF_COUNT_HW_CPU_CYCLES,
	.disabled	= 1,
	.pinned		= 1,
};

/*
 * Callback function for perf event subsystem
 *
 * Registered as the overflow handler of the per-CPU watchdog event.
 *
 * @event: the watchdog perf event which overflowed
 * @data:  sample data handed in by perf (unused)
 * @regs:  register state at the time of the overflow, may be NULL
 *
 * Reports a hard lockup at most once per lockup episode on this CPU and
 * panics afterwards if hardlockup_panic is set.
 */
static void watchdog_overflow_callback(struct perf_event *event,
		 struct perf_sample_data *data,
		 struct pt_regs *regs)
{
	/* Ensure the watchdog never gets throttled */
	event->hw.interrupts = 0;

	if (atomic_read(&watchdog_park_in_progress) != 0)
		return;

	/* Somebody touched the watchdog: consume the flag, skip this round */
	if (__this_cpu_read(watchdog_nmi_touch)) {
		__this_cpu_write(watchdog_nmi_touch, false);
		return;
	}

	if (!watchdog_check_timestamp())
		return;

	/* check for a hardlockup
	 * This is done by making sure our timer interrupt
	 * is incrementing.  The timer interrupt should have
	 * fired multiple times before we overflow'd.  If it hasn't
	 * then this is a good indication the cpu is stuck
	 */
	if (!is_hardlockup()) {
		/* CPU is making progress: allow a future lockup to be reported */
		__this_cpu_write(hard_watchdog_warn, false);
		return;
	}

	/* only print hardlockups once */
	if (__this_cpu_read(hard_watchdog_warn))
		return;

	/* The message must end in a newline so the record is terminated */
	pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
		 smp_processor_id());
	print_modules();
	print_irqtrace_events(current);
	if (regs)
		show_regs(regs);
	else
		dump_stack();

	/*
	 * Perform all-CPU dump only once to avoid multiple hardlockups
	 * generating interleaving traces
	 */
	if (sysctl_hardlockup_all_cpu_backtrace &&
			!test_and_set_bit(0, &hardlockup_allcpu_dumped))
		trigger_allbutself_cpu_backtrace();

	if (hardlockup_panic)
		nmi_panic(regs, "Hard LOCKUP");

	__this_cpu_write(hard_watchdog_warn, true);
}

/*
 * People like the simple clean cpu node info on boot.
 * Reduce the watchdog noise by only printing messages
 * that are different from what cpu0 displayed.
 */
static unsigned long firstcpu_err;
static atomic_t watchdog_cpus;

int watchdog_nmi_enable(unsigned int cpu)
{
	struct perf_event_attr *wd_attr;
	struct perf_event *event = per_cpu(watchdog_ev, cpu);
	int firstcpu = 0;

	/* nothing to do if the hard lockup detector is disabled */
	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
		goto out;

	/* A failure disabled the hardlockup detector permanently */
	if (hardlockup_detector_disabled)
		return -ENODEV;

	/* is it already setup and enabled? */
	if (event && event->state > PERF_EVENT_STATE_OFF)
		goto out;

	/* it is setup but not enabled */
	if (event != NULL)
		goto out_enable;

	if (atomic_inc_return(&watchdog_cpus) == 1)
		firstcpu = 1;

	wd_attr = &wd_hw_attr;
	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);

	/* Try to register using hardware perf events */
	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);

	/* save the first cpu's error for future comparision */
	if (firstcpu && IS_ERR(event))
		firstcpu_err = PTR_ERR(event);

	if (!IS_ERR(event)) {
		/* only print for the first cpu initialized */
		if (firstcpu || firstcpu_err)
			pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
		goto out_save;
	}

	/* skip displaying the same error again */
	if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
		return PTR_ERR(event);

	/* vary the KERN level based on the returned errno */
	if (PTR_ERR(event) == -EOPNOTSUPP)
		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
	else if (PTR_ERR(event) == -ENOENT)
		pr_warn("disabled (cpu%i): hardware events not enabled\n",
			 cpu);
	else
		pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
			cpu, PTR_ERR(event));

	pr_info("Disabling hard lockup detector permanently\n");
	hardlockup_detector_disabled = true;

	return PTR_ERR(event);

	/* success path */
out_save:
	per_cpu(watchdog_ev, cpu) = event;
out_enable:
	perf_event_enable(per_cpu(watchdog_ev, cpu));
out:
	return 0;
}

/*
 * Disable and release the hard lockup perf event of @cpu, if there is one.
 */
void watchdog_nmi_disable(unsigned int cpu)
{
	struct perf_event *ev = per_cpu(watchdog_ev, cpu);

	/* no event set up for this cpu */
	if (!ev)
		return;

	perf_event_disable(ev);
	per_cpu(watchdog_ev, cpu) = NULL;

	/* should be in cleanup, but blocks oprofile */
	perf_event_release_kernel(ev);

	/*
	 * Last event gone: clear the saved error because
	 * watchdog_nmi_enable() expects it to be zero initially.
	 */
	if (atomic_dec_and_test(&watchdog_cpus))
		firstcpu_err = 0;
}

/**
 * hardlockup_detector_perf_stop - Globally stop watchdog events
 *
 * Disables the watchdog perf event of every online CPU which has one.
 * The events stay allocated.  Asserts via lockdep that the caller holds
 * the cpus lock.
 *
 * Special interface for x86 to handle the perf HT bug.
 */
void __init hardlockup_detector_perf_stop(void)
{
	int cpu;

	lockdep_assert_cpus_held();

	for_each_online_cpu(cpu) {
		struct perf_event *ev = per_cpu(watchdog_ev, cpu);

		/* no watchdog event on this cpu */
		if (!ev)
			continue;

		perf_event_disable(ev);
	}
}

/**
 * hardlockup_detector_perf_restart - Globally restart watchdog events
 *
 * Enables the watchdog perf event of every online CPU which has one,
 * unless the hard lockup detector is not enabled in watchdog_enabled.
 * Asserts via lockdep that the caller holds the cpus lock.
 *
 * Special interface for x86 to handle the perf HT bug.
 */
void __init hardlockup_detector_perf_restart(void)
{
	int cpu;

	lockdep_assert_cpus_held();

	/* detector is off: leave the events disabled */
	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
		return;

	for_each_online_cpu(cpu) {
		struct perf_event *ev = per_cpu(watchdog_ev, cpu);

		/* no watchdog event on this cpu */
		if (!ev)
			continue;

		perf_event_enable(ev);
	}
}
Commit	Line	Data
73ce0511 BM	1	/*
	2	* Detect hard lockups on a system
	3	*
	4	* started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
	5	*
	6	* Note: Most of this code is borrowed heavily from the original softlockup
	7	* detector, so thanks to Ingo for the initial implementation.
	8	* Some chunks also taken from the old x86-specific nmi watchdog code, thanks
	9	* to those contributors as well.
	10	*/
	11
	12	#define pr_fmt(fmt) "NMI watchdog: " fmt
	13
	14	#include <linux/nmi.h>
	15	#include <linux/module.h>
b17b0153 IM	16	#include <linux/sched/debug.h>
b17b0153 IM	17
73ce0511 BM	18	#include <asm/irq_regs.h>
	19	#include <linux/perf_event.h>
	20
	21	static DEFINE_PER_CPU(bool, hard_watchdog_warn);
	22	static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
	23	static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
	24
73ce0511	25	static unsigned long hardlockup_allcpu_dumped;
20d853fd	26	static bool hardlockup_detector_disabled;
73ce0511	27
f2e0cff8	28	void arch_touch_nmi_watchdog(void)
73ce0511 BM	29	{
	30	/*
	31	* Using __raw here because some code paths have
	32	* preemption enabled. If preemption is enabled
	33	* then interrupts should be enabled too, in which
	34	* case we shouldn't have to worry about the watchdog
	35	* going off.
	36	*/
	37	raw_cpu_write(watchdog_nmi_touch, true);
73ce0511	38	}
f2e0cff8	39	EXPORT_SYMBOL(arch_touch_nmi_watchdog);
73ce0511	40
7edaeb68 TG	41	#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
	42	static DEFINE_PER_CPU(ktime_t, last_timestamp);
	43	static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
	44	static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;
	45
	46	void watchdog_update_hrtimer_threshold(u64 period)
	47	{
	48	/*
	49	* The hrtimer runs with a period of (watchdog_threshold * 2) / 5
	50	*
	51	* So it runs effectively with 2.5 times the rate of the NMI
	52	* watchdog. That means the hrtimer should fire 2-3 times before
	53	* the NMI watchdog expires. The NMI watchdog on x86 is based on
	54	* unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles
	55	* might run way faster than expected and the NMI fires in a
	56	* smaller period than the one deduced from the nominal CPU
	57	* frequency. Depending on the Turbo-Mode factor this might be fast
	58	* enough to get the NMI period smaller than the hrtimer watchdog
	59	* period and trigger false positives.
	60	*
	61	* The sample threshold is used to check in the NMI handler whether
	62	* the minimum time between two NMI samples has elapsed. That
	63	* prevents false positives.
	64	*
	65	* Set this to 4/5 of the actual watchdog threshold period so the
	66	* hrtimer is guaranteed to fire at least once within the real
	67	* watchdog threshold.
	68	*/
	69	watchdog_hrtimer_sample_threshold = period * 2;
	70	}
	71
	72	static bool watchdog_check_timestamp(void)
	73	{
	74	ktime_t delta, now = ktime_get_mono_fast_ns();
	75
	76	delta = now - __this_cpu_read(last_timestamp);
	77	if (delta < watchdog_hrtimer_sample_threshold) {
	78	/*
	79	* If ktime is jiffies based, a stalled timer would prevent
	80	* jiffies from being incremented and the filter would look
	81	* at a stale timestamp and never trigger.
	82	*/
	83	if (__this_cpu_inc_return(nmi_rearmed) < 10)
	84	return false;
	85	}
	86	__this_cpu_write(nmi_rearmed, 0);
	87	__this_cpu_write(last_timestamp, now);
	88	return true;
	89	}
	90	#else
	91	static inline bool watchdog_check_timestamp(void)
	92	{
	93	return true;
	94	}
	95	#endif
	96
73ce0511 BM	97	static struct perf_event_attr wd_hw_attr = {
	98	.type = PERF_TYPE_HARDWARE,
	99	.config = PERF_COUNT_HW_CPU_CYCLES,
	100	.size = sizeof(struct perf_event_attr),
	101	.pinned = 1,
	102	.disabled = 1,
	103	};
	104
	105	/* Callback function for perf event subsystem */
	106	static void watchdog_overflow_callback(struct perf_event *event,
	107	struct perf_sample_data *data,
	108	struct pt_regs *regs)
	109	{
	110	/* Ensure the watchdog never gets throttled */
	111	event->hw.interrupts = 0;
	112
b94f5118 DZ	113	if (atomic_read(&watchdog_park_in_progress) != 0)
	114	return;
	115
73ce0511 BM	116	if (__this_cpu_read(watchdog_nmi_touch) == true) {
	117	__this_cpu_write(watchdog_nmi_touch, false);
	118	return;
	119	}
	120
7edaeb68 TG	121	if (!watchdog_check_timestamp())
	122	return;
	123
73ce0511 BM	124	/* check for a hardlockup
	125	* This is done by making sure our timer interrupt
	126	* is incrementing. The timer interrupt should have
	127	* fired multiple times before we overflow'd. If it hasn't
	128	* then this is a good indication the cpu is stuck
	129	*/
	130	if (is_hardlockup()) {
	131	int this_cpu = smp_processor_id();
	132
	133	/* only print hardlockups once */
	134	if (__this_cpu_read(hard_watchdog_warn) == true)
	135	return;
	136
	137	pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
	138	print_modules();
	139	print_irqtrace_events(current);
	140	if (regs)
	141	show_regs(regs);
	142	else
	143	dump_stack();
	144
	145	/*
	146	* Perform all-CPU dump only once to avoid multiple hardlockups
	147	* generating interleaving traces
	148	*/
	149	if (sysctl_hardlockup_all_cpu_backtrace &&
	150	!test_and_set_bit(0, &hardlockup_allcpu_dumped))
	151	trigger_allbutself_cpu_backtrace();
	152
	153	if (hardlockup_panic)
	154	nmi_panic(regs, "Hard LOCKUP");
	155
	156	__this_cpu_write(hard_watchdog_warn, true);
	157	return;
	158	}
	159
	160	__this_cpu_write(hard_watchdog_warn, false);
	161	return;
	162	}
	163
	164	/*
	165	* People like the simple clean cpu node info on boot.
	166	* Reduce the watchdog noise by only printing messages
	167	* that are different from what cpu0 displayed.
	168	*/
8dcde9de PB	169	static unsigned long firstcpu_err;
8dcde9de PB	170	static atomic_t watchdog_cpus;
73ce0511 BM	171
	172	int watchdog_nmi_enable(unsigned int cpu)
	173	{
	174	struct perf_event_attr *wd_attr;
	175	struct perf_event *event = per_cpu(watchdog_ev, cpu);
8dcde9de	176	int firstcpu = 0;
73ce0511 BM	177
	178	/* nothing to do if the hard lockup detector is disabled */
	179	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
	180	goto out;
	181
20d853fd TG	182	/* A failure disabled the hardlockup detector permanently */
	183	if (hardlockup_detector_disabled)
	184	return -ENODEV;
	185
73ce0511 BM	186	/* is it already setup and enabled? */
	187	if (event && event->state > PERF_EVENT_STATE_OFF)
	188	goto out;
	189
	190	/* it is setup but not enabled */
	191	if (event != NULL)
	192	goto out_enable;
	193
8dcde9de PB	194	if (atomic_inc_return(&watchdog_cpus) == 1)
	195	firstcpu = 1;
	196
73ce0511 BM	197	wd_attr = &wd_hw_attr;
	198	wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
	199
	200	/* Try to register using hardware perf events */
	201	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
	202
8dcde9de PB	203	/* save the first cpu's error for future comparision */
	204	if (firstcpu && IS_ERR(event))
	205	firstcpu_err = PTR_ERR(event);
73ce0511 BM	206
73ce0511 BM	207	if (!IS_ERR(event)) {
8dcde9de PB	208	/* only print for the first cpu initialized */
8dcde9de PB	209	if (firstcpu \|\| firstcpu_err)
73ce0511 BM	210	pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
	211	goto out_save;
	212	}
	213
73ce0511	214	/* skip displaying the same error again */
8dcde9de	215	if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
73ce0511 BM	216	return PTR_ERR(event);
	217
	218	/* vary the KERN level based on the returned errno */
	219	if (PTR_ERR(event) == -EOPNOTSUPP)
	220	pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
	221	else if (PTR_ERR(event) == -ENOENT)
	222	pr_warn("disabled (cpu%i): hardware events not enabled\n",
	223	cpu);
	224	else
	225	pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
	226	cpu, PTR_ERR(event));
	227
20d853fd TG	228	pr_info("Disabling hard lockup detector permanently\n");
20d853fd TG	229	hardlockup_detector_disabled = true;
73ce0511 BM	230
	231	return PTR_ERR(event);
	232
	233	/* success path */
	234	out_save:
	235	per_cpu(watchdog_ev, cpu) = event;
	236	out_enable:
	237	perf_event_enable(per_cpu(watchdog_ev, cpu));
	238	out:
	239	return 0;
	240	}
	241
	242	void watchdog_nmi_disable(unsigned int cpu)
	243	{
	244	struct perf_event *event = per_cpu(watchdog_ev, cpu);
	245
	246	if (event) {
	247	perf_event_disable(event);
	248	per_cpu(watchdog_ev, cpu) = NULL;
	249
	250	/* should be in cleanup, but blocks oprofile */
	251	perf_event_release_kernel(event);
8dcde9de	252
73ce0511	253	/* watchdog_nmi_enable() expects this to be zero initially. */
8dcde9de PB	254	if (atomic_dec_and_test(&watchdog_cpus))
8dcde9de PB	255	firstcpu_err = 0;
73ce0511 BM	256	}
73ce0511 BM	257	}
d0b6e0a8 PZ	258
	259	/**
	260	* hardlockup_detector_perf_stop - Globally stop watchdog events
	261	*
	262	* Special interface for x86 to handle the perf HT bug.
	263	*/
	264	void __init hardlockup_detector_perf_stop(void)
	265	{
	266	int cpu;
	267
	268	lockdep_assert_cpus_held();
	269
	270	for_each_online_cpu(cpu) {
	271	struct perf_event *event = per_cpu(watchdog_ev, cpu);
	272
	273	if (event)
	274	perf_event_disable(event);
	275	}
	276	}
	277
	278	/**
	279	* hardlockup_detector_perf_restart - Globally restart watchdog events
	280	*
	281	* Special interface for x86 to handle the perf HT bug.
	282	*/
	283	void __init hardlockup_detector_perf_restart(void)
	284	{
	285	int cpu;
	286
	287	lockdep_assert_cpus_held();
	288
	289	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
	290	return;
	291
	292	for_each_online_cpu(cpu) {
	293	struct perf_event *event = per_cpu(watchdog_ev, cpu);
	294
	295	if (event)
	296	perf_event_enable(event);
	297	}
	298	}