commit 8974189222159154c55f24ddad33e3613960521a
Author: Peter Zijlstra <peterz@infradead.org>
Date:   Thu Jun 16 10:50:40 2016 +0200

sched/fair: Fix cfs_rq avg tracking underflow

As per commit:

  b7fa30c9cc48 ("sched/fair: Fix post_init_entity_util_avg() serialization")

> the code generated from update_cfs_rq_load_avg():
>
>	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
>		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>		sa->load_avg = max_t(long, sa->load_avg - r, 0);
>		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
>		removed_load = 1;
>	}
>
> turns into:
>
>  ffffffff81087064:   49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
>  ffffffff8108706b:   48 85 c0                test   %rax,%rax
>  ffffffff8108706e:   74 40                   je     ffffffff810870b0 <update_blocked_averages+0xc0>
>  ffffffff81087070:   4c 89 f8                mov    %r15,%rax
>  ffffffff81087073:   49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
>  ffffffff8108707a:   49 29 45 70             sub    %rax,0x70(%r13)
>  ffffffff8108707e:   4c 89 f9                mov    %r15,%rcx
>  ffffffff81087081:   bb 01 00 00 00          mov    $0x1,%ebx
>  ffffffff81087086:   49 83 7d 70 00          cmpq   $0x0,0x70(%r13)
>  ffffffff8108708b:   49 0f 49 4d 70          cmovns 0x70(%r13),%rcx
>
> Which you'll note ends up with sa->load_avg -= r in memory at
> ffffffff8108707a.

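In C terms the above is roughly (an informal reading of the asm, added here
for illustration):

	r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
	sa->load_avg -= r;			/* intermediate value is stored to memory */
	tmp = 0;
	if ((s64)sa->load_avg >= 0)		/* cmovns: clamp only after the store */
		tmp = sa->load_avg;		/* tmp is the clamped value used from here on */
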
So I _should_ have looked at other unserialized users of ->load_avg,
but alas. Luckily nikbor reported a similar /0 from task_h_load() which
instantly triggered recollection of this here problem.

Aside from the intermediate value hitting memory and causing problems,
there's another problem: the underflow detection relies on the sign
bit. This reduces the effective width of the variables; IOW, it's
effectively the same as having these variables be of signed type.

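For illustration, a minimal standalone userspace sketch (not part of the
patch) of what sign-bit clamping does to a value with the top bit set on a
typical two's-complement machine, compared with the wrap-around check:

	#include <stdio.h>
	#include <limits.h>

	int main(void)
	{
		unsigned long avg = (unsigned long)LONG_MAX + 10;	/* top bit set */
		unsigned long r = 1;

		/* what max_t(long, avg - r, 0) boils down to: */
		long clamped = (long)(avg - r) > 0 ? (long)(avg - r) : 0;

		/* wrap-around check, as sub_positive() below does: */
		unsigned long res = avg - r;
		if (res > avg)
			res = 0;

		printf("sign-bit clamp: %ld\n", clamped);	/* 0: the large value is lost */
		printf("wrap-around   : %lu\n", res);		/* avg - 1: still correct     */
		return 0;
	}
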
This patch switches to a different means of unsigned underflow
detection that does not rely on the sign bit. This allows the variables
to use the 'full' unsigned range. And it does so with an explicit
LOAD - STORE to ensure any intermediate value will never be visible in
memory, allowing these unserialized loads.

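A lockless reader then only ever observes either the old value or the new,
clamped one, never the in-between result; roughly (illustrative sketch, not
taken from the patch):

	/* reader side, e.g. a task_h_load()-style path: */
	unsigned long load = READ_ONCE(cfs_rq->avg.load_avg);	/* old or clamped, never in-between */
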
Note: GCC generates crap code for this, might warrant a look later.

Note2: I say 'full' above: if we end up at U*_MAX we'll still explode;
maybe we should do clamping on add too.

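Such a clamping add might look roughly like the below (purely a hypothetical
sketch; the name add_clamped and its use are not part of this patch):

	#define add_clamped(_ptr, _val) do {				\
		typeof(_ptr) ptr = (_ptr);				\
		typeof(*ptr) val = (_val);				\
		typeof(*ptr) res, var = READ_ONCE(*ptr);		\
		res = var + val;					\
		if (res < var)			/* wrapped: overflow */	\
			res = (typeof(*ptr))-1;	/* clamp to U*_MAX */	\
		WRITE_ONCE(*ptr, res);					\
	} while (0)
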
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yuyang Du <yuyang.du@intel.com>
Cc: bsegall@google.com
Cc: kernel@kyup.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: steve.muckle@linaro.org
Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
Link: http://lkml.kernel.org/r/20160617091948.GJ30927@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>

---
 kernel/sched/fair.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2682,6 +2682,23 @@ static inline void update_tg_load_avg(st
 
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(*ptr) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+	res = var - val;					\
+	if (res > var)						\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
@@ -2690,15 +2707,15 @@ static inline int update_cfs_rq_load_avg
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-		sa->load_avg = max_t(long, sa->load_avg - r, 0);
-		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->load_avg, r);
+		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
 		removed = 1;
 	}
 
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-		sa->util_avg = max_t(long, sa->util_avg - r, 0);
-		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->util_avg, r);
+		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 	}
 
 	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2764,10 +2781,10 @@ static void detach_entity_load_avg(struc
 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
 
-	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-	cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-	cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 }
 
 /* Add the load generated by se into cfs_rq's load average */