Skip to content

Commit 2341366

Browse files
Finn Thain authored and gregkh committed
powerpc/tau: Disable TAU between measurements
[ Upstream commit e63d6fb ]

Enabling CONFIG_TAU_INT causes random crashes:

Unrecoverable exception 1700 at c0009414 (msr=1000)
Oops: Unrecoverable exception, sig: 6 [#1]
BE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2 PowerMac
Modules linked in:
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.7.0-pmac-00043-gd5f545e1a8593 #5
NIP: c0009414 LR: c0009414 CTR: c00116fc
REGS: c0799eb8 TRAP: 1700 Not tainted (5.7.0-pmac-00043-gd5f545e1a8593)
MSR: 00001000 <ME> CR: 22000228 XER: 00000100

GPR00: 00000000 c0799f70 c076e300 00800000 0291c0ac 00e00000 c076e300 00049032
GPR08: 00000001 c00116fc 00000000 dfbd3200 ffffffff 007f80a8 00000000 00000000
GPR16: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 c075ce04
GPR24: c075ce04 dfff8880 c07b0000 c075ce04 00080000 00000001 c079ef98 c079ef5c
NIP [c0009414] arch_cpu_idle+0x24/0x6c
LR [c0009414] arch_cpu_idle+0x24/0x6c
Call Trace:
[c0799f70] [00000001] 0x1 (unreliable)
[c0799f80] [c0060990] do_idle+0xd8/0x17c
[c0799fa0] [c0060ba4] cpu_startup_entry+0x20/0x28
[c0799fb0] [c072d220] start_kernel+0x434/0x44c
[c0799ff0] [00003860] 0x3860
Instruction dump:
XXXXXXXX XXXXXXXX XXXXXXXX 3d20c07b XXXXXXXX XXXXXXXX XXXXXXXX 7c0802a6
XXXXXXXX XXXXXXXX XXXXXXXX 4e800421 XXXXXXXX XXXXXXXX XXXXXXXX 7d2000a6
---[ end trace 3a0c9b5cb216db6b ]---

Resolve this problem by disabling each THRMn comparator when handling the associated THRMn interrupt and by disabling the TAU entirely when updating THRMn thresholds.

Fixes: 1da177e ("Linux-2.6.12-rc2")
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Stan Johnson <userm57@yahoo.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/5a0ba3dc5612c7aac596727331284a3676c08472.1599260540.git.fthain@telegraphics.com.au
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent 9f7cb67 commit 2341366

2 files changed

Lines changed: 26 additions & 48 deletions

File tree

arch/powerpc/kernel/tau_6xx.c

Lines changed: 24 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -37,8 +37,6 @@ static struct tau_temp
3737

3838
struct timer_list tau_timer;
3939

40-
#undef DEBUG
41-
4240
/* TODO: put these in a /proc interface, with some sanity checks, and maybe
4341
* dynamic adjustment to minimize # of interrupts */
4442
/* configurable values for step size and how much to expand the window when
@@ -71,42 +69,33 @@ void set_thresholds(unsigned long cpu)
7169

7270
void TAUupdate(int cpu)
7371
{
74-
unsigned thrm;
75-
76-
#ifdef DEBUG
77-
printk("TAUupdate ");
78-
#endif
72+
u32 thrm;
73+
u32 bits = THRM1_TIV | THRM1_TIN | THRM1_V;
7974

8075
/* if both thresholds are crossed, the step_sizes cancel out
8176
* and the window winds up getting expanded twice. */
82-
if((thrm = mfspr(SPRN_THRM1)) & THRM1_TIV){ /* is valid? */
83-
if(thrm & THRM1_TIN){ /* crossed low threshold */
84-
if (tau[cpu].low >= step_size){
85-
tau[cpu].low -= step_size;
86-
tau[cpu].high -= (step_size - window_expand);
87-
}
88-
tau[cpu].grew = 1;
89-
#ifdef DEBUG
90-
printk("low threshold crossed ");
91-
#endif
77+
thrm = mfspr(SPRN_THRM1);
78+
if ((thrm & bits) == bits) {
79+
mtspr(SPRN_THRM1, 0);
80+
81+
if (tau[cpu].low >= step_size) {
82+
tau[cpu].low -= step_size;
83+
tau[cpu].high -= (step_size - window_expand);
9284
}
85+
tau[cpu].grew = 1;
86+
pr_debug("%s: low threshold crossed\n", __func__);
9387
}
94-
if((thrm = mfspr(SPRN_THRM2)) & THRM1_TIV){ /* is valid? */
95-
if(thrm & THRM1_TIN){ /* crossed high threshold */
96-
if (tau[cpu].high <= 127-step_size){
97-
tau[cpu].low += (step_size - window_expand);
98-
tau[cpu].high += step_size;
99-
}
100-
tau[cpu].grew = 1;
101-
#ifdef DEBUG
102-
printk("high threshold crossed ");
103-
#endif
88+
thrm = mfspr(SPRN_THRM2);
89+
if ((thrm & bits) == bits) {
90+
mtspr(SPRN_THRM2, 0);
91+
92+
if (tau[cpu].high <= 127 - step_size) {
93+
tau[cpu].low += (step_size - window_expand);
94+
tau[cpu].high += step_size;
10495
}
96+
tau[cpu].grew = 1;
97+
pr_debug("%s: high threshold crossed\n", __func__);
10598
}
106-
107-
#ifdef DEBUG
108-
printk("grew = %d\n", tau[cpu].grew);
109-
#endif
11099
}
111100

112101
#ifdef CONFIG_TAU_INT
@@ -131,18 +120,18 @@ void TAUException(struct pt_regs * regs)
131120
static void tau_timeout(void * info)
132121
{
133122
int cpu;
134-
unsigned long flags;
135123
int size;
136124
int shrink;
137125

138-
/* disabling interrupts *should* be okay */
139-
local_irq_save(flags);
140126
cpu = smp_processor_id();
141127

142128
#ifndef CONFIG_TAU_INT
143129
TAUupdate(cpu);
144130
#endif
145131

132+
/* Stop thermal sensor comparisons and interrupts */
133+
mtspr(SPRN_THRM3, 0);
134+
146135
size = tau[cpu].high - tau[cpu].low;
147136
if (size > min_window && ! tau[cpu].grew) {
148137
/* do an exponential shrink of half the amount currently over size */
@@ -164,18 +153,12 @@ static void tau_timeout(void * info)
164153

165154
set_thresholds(cpu);
166155

167-
/*
168-
* Do the enable every time, since otherwise a bunch of (relatively)
169-
* complex sleep code needs to be added. One mtspr every time
170-
* tau_timeout is called is probably not a big deal.
171-
*
156+
/* Restart thermal sensor comparisons and interrupts.
172157
* The "PowerPC 740 and PowerPC 750 Microprocessor Datasheet"
173158
* recommends that "the maximum value be set in THRM3 under all
174159
* conditions."
175160
*/
176161
mtspr(SPRN_THRM3, THRM3_SITV(0x1fff) | THRM3_E);
177-
178-
local_irq_restore(flags);
179162
}
180163

181164
static void tau_timeout_smp(unsigned long unused)

arch/powerpc/platforms/Kconfig

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -242,20 +242,15 @@ config TAU
242242
temp is actually what /proc/cpuinfo says it is.
243243

244244
config TAU_INT
245-
bool "Interrupt driven TAU driver (DANGEROUS)"
245+
bool "Interrupt driven TAU driver (EXPERIMENTAL)"
246246
depends on TAU
247247
---help---
248248
The TAU supports an interrupt driven mode which causes an interrupt
249249
whenever the temperature goes out of range. This is the fastest way
250250
to get notified the temp has exceeded a range. With this option off,
251251
a timer is used to re-check the temperature periodically.
252252

253-
However, on some cpus it appears that the TAU interrupt hardware
254-
is buggy and can cause a situation which would lead unexplained hard
255-
lockups.
256-
257-
Unless you are extending the TAU driver, or enjoy kernel/hardware
258-
debugging, leave this option off.
253+
If in doubt, say N here.
259254

260255
config TAU_AVERAGE
261256
bool "Average high and low temp"

0 commit comments

Comments
 (0)