// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/fixmap.h>

int __read_mostly alternatives_patched;

EXPORT_SYMBOL_GPL(alternatives_patched);

#define MAX_PATCH_LEN (255-1)

static int __initdata_or_module debug_alternative;

static int __init debug_alt(char *str)
{
        debug_alternative = 1;
        return 1;
}
__setup("debug-alternative", debug_alt);

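/*
 * Descriptive note: set via the "noreplace-smp" boot parameter, this keeps
 * the SMP lock prefixes in place even when only one CPU is present
 * (see alternative_instructions() below).
 */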
static int noreplace_smp;

static int __init setup_noreplace_smp(char *str)
{
        noreplace_smp = 1;
        return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

#define DPRINTK(fmt, args...)                                           \
do {                                                                    \
        if (debug_alternative)                                          \
                printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);   \
} while (0)

#define DUMP_BYTES(buf, len, fmt, args...)                              \
do {                                                                    \
        if (unlikely(debug_alternative)) {                              \
                int j;                                                  \
                                                                        \
                if (!(len))                                             \
                        break;                                          \
                                                                        \
                printk(KERN_DEBUG fmt, ##args);                         \
                for (j = 0; j < (len) - 1; j++)                         \
                        printk(KERN_CONT "%02hhx ", buf[j]);            \
                printk(KERN_CONT "%02hhx\n", buf[j]);                   \
        }                                                               \
} while (0)

/*
 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
 * that correspond to that nop. Getting from one nop to the next, we
 * add to the array the offset that is equal to the sum of all sizes of
 * nops preceding the one we are after.
 *
 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
 * nice symmetry of sizes of the previous nops.
 */
#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char intelnops[] =
{
        GENERIC_NOP1,
        GENERIC_NOP2,
        GENERIC_NOP3,
        GENERIC_NOP4,
        GENERIC_NOP5,
        GENERIC_NOP6,
        GENERIC_NOP7,
        GENERIC_NOP8,
        GENERIC_NOP5_ATOMIC
};
static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
{
        NULL,
        intelnops,
        intelnops + 1,
        intelnops + 1 + 2,
        intelnops + 1 + 2 + 3,
        intelnops + 1 + 2 + 3 + 4,
        intelnops + 1 + 2 + 3 + 4 + 5,
        intelnops + 1 + 2 + 3 + 4 + 5 + 6,
        intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
        intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef K8_NOP1
static const unsigned char k8nops[] =
{
        K8_NOP1,
        K8_NOP2,
        K8_NOP3,
        K8_NOP4,
        K8_NOP5,
        K8_NOP6,
        K8_NOP7,
        K8_NOP8,
        K8_NOP5_ATOMIC
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
{
        NULL,
        k8nops,
        k8nops + 1,
        k8nops + 1 + 2,
        k8nops + 1 + 2 + 3,
        k8nops + 1 + 2 + 3 + 4,
        k8nops + 1 + 2 + 3 + 4 + 5,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char k7nops[] =
{
        K7_NOP1,
        K7_NOP2,
        K7_NOP3,
        K7_NOP4,
        K7_NOP5,
        K7_NOP6,
        K7_NOP7,
        K7_NOP8,
        K7_NOP5_ATOMIC
};
static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
{
        NULL,
        k7nops,
        k7nops + 1,
        k7nops + 1 + 2,
        k7nops + 1 + 2 + 3,
        k7nops + 1 + 2 + 3 + 4,
        k7nops + 1 + 2 + 3 + 4 + 5,
        k7nops + 1 + 2 + 3 + 4 + 5 + 6,
        k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
        k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef P6_NOP1
static const unsigned char p6nops[] =
{
        P6_NOP1,
        P6_NOP2,
        P6_NOP3,
        P6_NOP4,
        P6_NOP5,
        P6_NOP6,
        P6_NOP7,
        P6_NOP8,
        P6_NOP5_ATOMIC
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
{
        NULL,
        p6nops,
        p6nops + 1,
        p6nops + 1 + 2,
        p6nops + 1 + 2 + 3,
        p6nops + 1 + 2 + 3 + 4,
        p6nops + 1 + 2 + 3 + 4 + 5,
        p6nops + 1 + 2 + 3 + 4 + 5 + 6,
        p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
        p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

/* Initialize these to a safe default */
#ifdef CONFIG_X86_64
const unsigned char * const *ideal_nops = p6_nops;
#else
const unsigned char * const *ideal_nops = intel_nops;
#endif

void __init arch_init_ideal_nops(void)
{
        switch (boot_cpu_data.x86_vendor) {
        case X86_VENDOR_INTEL:
                /*
                 * Due to a decoder implementation quirk, some
                 * specific Intel CPUs actually perform better with
                 * the "k8_nops" than with the SDM-recommended NOPs.
                 */
                if (boot_cpu_data.x86 == 6 &&
                    boot_cpu_data.x86_model >= 0x0f &&
                    boot_cpu_data.x86_model != 0x1c &&
                    boot_cpu_data.x86_model != 0x26 &&
                    boot_cpu_data.x86_model != 0x27 &&
                    boot_cpu_data.x86_model < 0x30) {
                        ideal_nops = k8_nops;
                } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
                        ideal_nops = p6_nops;
                } else {
#ifdef CONFIG_X86_64
                        ideal_nops = k8_nops;
#else
                        ideal_nops = intel_nops;
#endif
                }
                break;

        case X86_VENDOR_HYGON:
                ideal_nops = p6_nops;
                return;

        case X86_VENDOR_AMD:
                if (boot_cpu_data.x86 > 0xf) {
                        ideal_nops = p6_nops;
                        return;
                }

                /* fall through */

        default:
#ifdef CONFIG_X86_64
                ideal_nops = k8_nops;
#else
                if (boot_cpu_has(X86_FEATURE_K8))
                        ideal_nops = k8_nops;
                else if (boot_cpu_has(X86_FEATURE_K7))
                        ideal_nops = k7_nops;
                else
                        ideal_nops = intel_nops;
#endif
        }
}

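/*
 * Illustrative note (not from the original source): ideal_nops is indexed by
 * NOP length, so ideal_nops[n] points at the preferred n-byte NOP sequence
 * for the boot CPU (1 <= n <= ASM_NOP_MAX); slot ASM_NOP_MAX + 1 holds the
 * *_NOP5_ATOMIC variant listed last in the tables above.
 */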
/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
        while (len > 0) {
                unsigned int noplen = len;
                if (noplen > ASM_NOP_MAX)
                        noplen = ASM_NOP_MAX;
                memcpy(insns, ideal_nops[noplen], noplen);
                insns += noplen;
                len -= noplen;
        }
}

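/*
 * Minimal usage sketch (illustration only, mirroring apply_alternatives()
 * below): copy the replacement into a local buffer, pad the remainder with
 * ideal NOPs, then patch the whole site in one go:
 *
 *      memcpy(insnbuf, replacement, a->replacementlen);
 *      add_nops(insnbuf + a->replacementlen, a->instrlen - a->replacementlen);
 *      text_poke_early(instr, insnbuf, a->instrlen);
 */
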
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);

/*
 * Are we looking at a near JMP with a 1 or 4-byte displacement?
 */
static inline bool is_jmp(const u8 opcode)
{
        return opcode == 0xeb || opcode == 0xe9;
}

static void __init_or_module
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
{
        u8 *next_rip, *tgt_rip;
        s32 n_dspl, o_dspl;
        int repl_len;

        if (a->replacementlen != 5)
                return;

        o_dspl = *(s32 *)(insnbuf + 1);

        /* next_rip of the replacement JMP */
        next_rip = repl_insn + a->replacementlen;
        /* target rip of the replacement JMP */
        tgt_rip = next_rip + o_dspl;
        n_dspl = tgt_rip - orig_insn;

        DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);

        if (tgt_rip - orig_insn >= 0) {
                if (n_dspl - 2 <= 127)
                        goto two_byte_jmp;
                else
                        goto five_byte_jmp;
        /* negative offset */
        } else {
                if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
                        goto two_byte_jmp;
                else
                        goto five_byte_jmp;
        }

two_byte_jmp:
        n_dspl -= 2;

        insnbuf[0] = 0xeb;
        insnbuf[1] = (s8)n_dspl;
        add_nops(insnbuf + 2, 3);

        repl_len = 2;
        goto done;

five_byte_jmp:
        n_dspl -= 5;

        insnbuf[0] = 0xe9;
        *(s32 *)&insnbuf[1] = n_dspl;

        repl_len = 5;

done:

        DPRINTK("final displ: 0x%08x, JMP 0x%lx",
                n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}

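/*
 * Illustrative example (not from the original source): if the replacement is
 * a 5-byte "e9 <rel32>" JMP whose target ends up within signed 8-bit range of
 * the original site, recompute_jump() shrinks it to a 2-byte "eb <rel8>" JMP
 * and pads the remaining 3 bytes with NOPs; otherwise the 5-byte form is kept
 * with the displacement re-based against the original instruction.
 */
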
/*
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
        unsigned long flags;
        int i;

        for (i = 0; i < a->padlen; i++) {
                if (instr[i] != 0x90)
                        return;
        }

        local_irq_save(flags);
        add_nops(instr + (a->instrlen - a->padlen), a->padlen);
        local_irq_restore(flags);

        DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
                   instr, a->instrlen - a->padlen, a->padlen);
}

/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have fewer capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
                                                  struct alt_instr *end)
{
        struct alt_instr *a;
        u8 *instr, *replacement;
        u8 insnbuf[MAX_PATCH_LEN];

        DPRINTK("alt table %px, -> %px", start, end);
        /*
         * The scan order should be from start to end. An alternative scanned
         * later can overwrite alternative code that was scanned and patched
         * earlier.
         * Some kernel functions (e.g. memcpy, memset, etc) use this order to
         * patch code.
         *
         * So be careful if you want to change the scan order to any other
         * order.
         */
        for (a = start; a < end; a++) {
                int insnbuf_sz = 0;

                instr = (u8 *)&a->instr_offset + a->instr_offset;
                replacement = (u8 *)&a->repl_offset + a->repl_offset;
                BUG_ON(a->instrlen > sizeof(insnbuf));
                BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
                if (!boot_cpu_has(a->cpuid)) {
                        if (a->padlen > 1)
                                optimize_nops(a, instr);

                        continue;
                }

                DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
                        a->cpuid >> 5,
                        a->cpuid & 0x1f,
                        instr, instr, a->instrlen,
                        replacement, a->replacementlen, a->padlen);

                DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
                DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);

                memcpy(insnbuf, replacement, a->replacementlen);
                insnbuf_sz = a->replacementlen;

                /*
                 * 0xe8 is a relative CALL; fix up the offset.
                 *
                 * Instruction length is checked before the opcode to avoid
                 * accessing uninitialized bytes for zero-length replacements.
                 */
                if (a->replacementlen == 5 && *insnbuf == 0xe8) {
                        *(s32 *)(insnbuf + 1) += replacement - instr;
                        DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
                                *(s32 *)(insnbuf + 1),
                                (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
                }

                if (a->replacementlen && is_jmp(replacement[0]))
                        recompute_jump(a, instr, replacement, insnbuf);

                if (a->instrlen > a->replacementlen) {
                        add_nops(insnbuf + a->replacementlen,
                                 a->instrlen - a->replacementlen);
                        insnbuf_sz += a->instrlen - a->replacementlen;
                }
                DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);

                text_poke_early(instr, insnbuf, insnbuf_sz);
        }
}

#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
                                  u8 *text, u8 *text_end)
{
        const s32 *poff;

        for (poff = start; poff < end; poff++) {
                u8 *ptr = (u8 *)poff + *poff;

                if (!*poff || ptr < text || ptr >= text_end)
                        continue;
                /* turn DS segment override prefix into lock prefix */
                if (*ptr == 0x3e)
                        text_poke(ptr, ((unsigned char []){0xf0}), 1);
        }
}

static void alternatives_smp_unlock(const s32 *start, const s32 *end,
                                    u8 *text, u8 *text_end)
{
        const s32 *poff;

        for (poff = start; poff < end; poff++) {
                u8 *ptr = (u8 *)poff + *poff;

                if (!*poff || ptr < text || ptr >= text_end)
                        continue;
                /* turn lock prefix into DS segment override prefix */
                if (*ptr == 0xf0)
                        text_poke(ptr, ((unsigned char []){0x3E}), 1);
        }
}

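/*
 * Hypothetical example of the byte swap above (illustration only): on a UP
 * kernel the LOCK prefix (0xf0) in front of, say, a cmpxchg is rewritten to a
 * DS segment override (0x3e), which the CPU treats as a harmless prefix:
 *
 *      f0 0f b1 17     lock cmpxchg %edx,(%rdi)
 *      3e 0f b1 17     ds   cmpxchg %edx,(%rdi)
 *
 * alternatives_smp_lock() performs the reverse substitution when switching
 * back to SMP.
 */
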
struct smp_alt_module {
        /* the module owning these lock ranges; NULL for the core kernel */
        struct module *mod;
        char *name;

        /* ptrs to lock prefixes */
        const s32 *locks;
        const s32 *locks_end;

        /* .text segment, needed to avoid patching init code ;) */
        u8 *text;
        u8 *text_end;

        struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static bool uniproc_patched = false;    /* protected by text_mutex */

void __init_or_module alternatives_smp_module_add(struct module *mod,
                                                  char *name,
                                                  void *locks, void *locks_end,
                                                  void *text, void *text_end)
{
        struct smp_alt_module *smp;

        mutex_lock(&text_mutex);
        if (!uniproc_patched)
                goto unlock;

        if (num_possible_cpus() == 1)
                /* Don't bother remembering, we'll never have to undo it. */
                goto smp_unlock;

        smp = kzalloc(sizeof(*smp), GFP_KERNEL);
        if (NULL == smp)
                /* we'll run the (safe but slow) SMP code then ... */
                goto unlock;

        smp->mod = mod;
        smp->name = name;
        smp->locks = locks;
        smp->locks_end = locks_end;
        smp->text = text;
        smp->text_end = text_end;
        DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
                smp->locks, smp->locks_end,
                smp->text, smp->text_end, smp->name);

        list_add_tail(&smp->next, &smp_alt_modules);
smp_unlock:
        alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
        mutex_unlock(&text_mutex);
}

void __init_or_module alternatives_smp_module_del(struct module *mod)
{
        struct smp_alt_module *item;

        mutex_lock(&text_mutex);
        list_for_each_entry(item, &smp_alt_modules, next) {
                if (mod != item->mod)
                        continue;
                list_del(&item->next);
                kfree(item);
                break;
        }
        mutex_unlock(&text_mutex);
}

void alternatives_enable_smp(void)
{
        struct smp_alt_module *mod;

        /* Why bother if there are no other CPUs? */
        BUG_ON(num_possible_cpus() == 1);

        mutex_lock(&text_mutex);

        if (uniproc_patched) {
                pr_info("switching to SMP code\n");
                BUG_ON(num_online_cpus() != 1);
                clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
                clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
                list_for_each_entry(mod, &smp_alt_modules, next)
                        alternatives_smp_lock(mod->locks, mod->locks_end,
                                              mod->text, mod->text_end);
                uniproc_patched = false;
        }
        mutex_unlock(&text_mutex);
}

/*
 * Return 1 if the address range is reserved for SMP-alternatives.
 * Must hold text_mutex.
 */
int alternatives_text_reserved(void *start, void *end)
{
        struct smp_alt_module *mod;
        const s32 *poff;
        u8 *text_start = start;
        u8 *text_end = end;

        lockdep_assert_held(&text_mutex);

        list_for_each_entry(mod, &smp_alt_modules, next) {
                if (mod->text > text_end || mod->text_end < text_start)
                        continue;
                for (poff = mod->locks; poff < mod->locks_end; poff++) {
                        const u8 *ptr = (const u8 *)poff + *poff;

                        if (text_start <= ptr && text_end > ptr)
                                return 1;
                }
        }

        return 0;
}
#endif /* CONFIG_SMP */

#ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
                                     struct paravirt_patch_site *end)
{
        struct paravirt_patch_site *p;
        char insnbuf[MAX_PATCH_LEN];

        for (p = start; p < end; p++) {
                unsigned int used;

                BUG_ON(p->len > MAX_PATCH_LEN);
                /* prep the buffer with the original instructions */
                memcpy(insnbuf, p->instr, p->len);
                used = pv_ops.init.patch(p->instrtype, insnbuf,
                                         (unsigned long)p->instr, p->len);

                BUG_ON(used > p->len);

                /* Pad the rest with nops */
                add_nops(insnbuf + used, p->len - used);
                text_poke_early(p->instr, insnbuf, p->len);
        }
}
extern struct paravirt_patch_site __start_parainstructions[],
        __stop_parainstructions[];
#endif /* CONFIG_PARAVIRT */

void __init alternative_instructions(void)
{
        /* The patching is not fully atomic, so try to avoid local
           interruptions that might execute the code being patched.
           Other CPUs are not running. */
        stop_nmi();

        /*
         * Don't stop machine check exceptions while patching.
         * MCEs only happen when something got corrupted and in this
         * case we must do something about the corruption.
         * Ignoring it is worse than an unlikely patching race.
         * Also machine checks tend to be broadcast and if one CPU
         * goes into machine check the others follow quickly, so we don't
         * expect a machine check to cause undue problems during code
         * patching.
         */

        apply_alternatives(__alt_instructions, __alt_instructions_end);

#ifdef CONFIG_SMP
        /* Patch to UP if other CPUs are not imminent. */
        if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
                uniproc_patched = true;
                alternatives_smp_module_add(NULL, "core kernel",
                                            __smp_locks, __smp_locks_end,
                                            _text, _etext);
        }

        if (!uniproc_patched || num_possible_cpus() == 1)
                free_init_pages("SMP alternatives",
                                (unsigned long)__smp_locks,
                                (unsigned long)__smp_locks_end);
#endif

        apply_paravirt(__parainstructions, __parainstructions_end);

        restart_nmi();
        alternatives_patched = 1;
}

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI or
 * MCE handlers seeing an inconsistent instruction while you patch.
 */
void __init_or_module text_poke_early(void *addr, const void *opcode,
                                      size_t len)
{
        unsigned long flags;

        if (boot_cpu_has(X86_FEATURE_NX) &&
            is_module_text_address((unsigned long)addr)) {
                /*
                 * Module text is initially marked non-executable, so the
                 * code cannot be running and speculative code-fetches are
                 * prevented. Just change the code.
                 */
                memcpy(addr, opcode, len);
        } else {
                local_irq_save(flags);
                memcpy(addr, opcode, len);
                local_irq_restore(flags);
                sync_core();

                /*
                 * Could also do a CLFLUSH here to speed up CPU recovery; but
                 * that causes hangs on some VIA CPUs.
                 */
        }
}

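/*
 * Descriptive note: the temporary mm and virtual address used by
 * __text_poke() below to map the page(s) being patched; they are expected to
 * be set up during boot (the comments in __text_poke() refer to
 * poking_init()).
 */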
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;

static void *__text_poke(void *addr, const void *opcode, size_t len)
{
        bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
        struct page *pages[2] = {NULL};
        temp_mm_state_t prev;
        unsigned long flags;
        pte_t pte, *ptep;
        spinlock_t *ptl;
        pgprot_t pgprot;

        /*
         * While the boot memory allocator is running we cannot use struct
         * pages as they are not yet initialized. There is no way to recover.
         */
        BUG_ON(!after_bootmem);

        if (!core_kernel_text((unsigned long)addr)) {
                pages[0] = vmalloc_to_page(addr);
                if (cross_page_boundary)
                        pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
        } else {
                pages[0] = virt_to_page(addr);
                WARN_ON(!PageReserved(pages[0]));
                if (cross_page_boundary)
                        pages[1] = virt_to_page(addr + PAGE_SIZE);
        }
        /*
         * If something went wrong, crash and burn since recovery paths are not
         * implemented.
         */
        BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

        local_irq_save(flags);

        /*
         * Map the page without the global bit, as TLB flushing is done with
         * flush_tlb_mm_range(), which is intended for non-global PTEs.
         */
        pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

        /*
         * The lock is not really needed, but this allows us to avoid
         * open-coding.
         */
        ptep = get_locked_pte(poking_mm, poking_addr, &ptl);

        /*
         * This must not fail; preallocated in poking_init().
         */
        VM_BUG_ON(!ptep);

        pte = mk_pte(pages[0], pgprot);
        set_pte_at(poking_mm, poking_addr, ptep, pte);

        if (cross_page_boundary) {
                pte = mk_pte(pages[1], pgprot);
                set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
        }

        /*
         * Loading the temporary mm behaves as a compiler barrier, which
         * guarantees that the PTE will be set at the time memcpy() is done.
         */
        prev = use_temporary_mm(poking_mm);

        kasan_disable_current();
        memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
        kasan_enable_current();

        /*
         * Ensure that the PTE is only cleared after the instructions of memcpy
         * were issued by using a compiler barrier.
         */
        barrier();

        pte_clear(poking_mm, poking_addr, ptep);
        if (cross_page_boundary)
                pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);

        /*
         * Loading the previous page-table hierarchy requires a serializing
         * instruction that already allows the core to see the updated version.
         * Xen-PV is assumed to serialize execution in a similar manner.
         */
        unuse_temporary_mm(prev);

        /*
         * Flushing the TLB might involve IPIs, which would require enabled
         * IRQs, but not when the mm is unused, as it is at this point.
         */
        flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
                           (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
                           PAGE_SHIFT, false);

        /*
         * If the text does not match what we just wrote then something is
         * fundamentally screwy; there's nothing we can really do about that.
         */
        BUG_ON(memcmp(addr, opcode, len));

        pte_unmap_unlock(ptep, ptl);
        local_irq_restore(flags);
        return addr;
}

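/*
 * Summary of the scheme above (descriptive note): __text_poke() pins the
 * page(s) backing @addr, maps them writable at poking_addr inside the
 * dedicated poking_mm, switches to that mm, performs the memcpy(), tears the
 * mapping down again, flushes the TLB of poking_mm and finally verifies that
 * the bytes landed. The regular kernel mapping of the text is never made
 * writable; all writes go through the temporary alias.
 */
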
/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Note that the caller must ensure that if the modified code is part of a
 * module, the module would not be removed during poking. This can be achieved
 * by registering a module notifier, and ordering module removal and patching
 * through a mutex.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
        lockdep_assert_held(&text_mutex);

        return __text_poke(addr, opcode, len);
}

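/*
 * Minimal usage sketch (illustration only; "new_byte" is a hypothetical
 * local): callers other than kgdb are expected to hold text_mutex, as the
 * lockdep assertion above enforces:
 *
 *      unsigned char new_byte = 0x90;
 *
 *      mutex_lock(&text_mutex);
 *      text_poke(addr, &new_byte, 1);
 *      mutex_unlock(&text_mutex);
 */
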
/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 * despite the fact it does not hold the text_mutex.
 */
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
        return __text_poke(addr, opcode, len);
}

static void do_sync_core(void *info)
{
        sync_core();
}

static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;

int poke_int3_handler(struct pt_regs *regs)
{
        /*
         * Having observed our INT3 instruction, we now must observe
         * bp_patching_in_progress.
         *
         *      in_progress = TRUE              INT3
         *      WMB                             RMB
         *      write INT3                      if (in_progress)
         *
         * Idem for bp_int3_handler.
         */
        smp_rmb();

        if (likely(!bp_patching_in_progress))
                return 0;

        if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
                return 0;

        /* set up the specified breakpoint handler */
        regs->ip = (unsigned long) bp_int3_handler;

        return 1;
}
NOKPROBE_SYMBOL(poke_int3_handler);

fd4363ff
JK
881/**
882 * text_poke_bp() -- update instructions on live kernel on SMP
883 * @addr: address to patch
884 * @opcode: opcode of new instruction
885 * @len: length to copy
886 * @handler: address to jump to when the temporary breakpoint is hit
887 *
888 * Modify multi-byte instruction by using int3 breakpoint on SMP.
ea8596bb
MH
889 * We completely avoid stop_machine() here, and achieve the
890 * synchronization using int3 breakpoint.
fd4363ff
JK
891 *
892 * The way it is done:
893 * - add a int3 trap to the address that will be patched
894 * - sync cores
895 * - update all but the first byte of the patched range
896 * - sync cores
897 * - replace the first byte (int3) by the first byte of
898 * replacing opcode
899 * - sync cores
fd4363ff 900 */
0a203df5 901void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
fd4363ff
JK
902{
903 unsigned char int3 = 0xcc;
904
905 bp_int3_handler = handler;
906 bp_int3_addr = (u8 *)addr + sizeof(int3);
907 bp_patching_in_progress = true;
9222f606
JK
908
909 lockdep_assert_held(&text_mutex);
910
fd4363ff 911 /*
01651324
PZ
912 * Corresponding read barrier in int3 notifier for making sure the
913 * in_progress and handler are correctly ordered wrt. patching.
fd4363ff
JK
914 */
915 smp_wmb();
916
917 text_poke(addr, &int3, sizeof(int3));
918
919 on_each_cpu(do_sync_core, NULL, 1);
920
921 if (len - sizeof(int3) > 0) {
922 /* patch all but the first byte */
923 text_poke((char *)addr + sizeof(int3),
924 (const char *) opcode + sizeof(int3),
925 len - sizeof(int3));
926 /*
927 * According to Intel, this core syncing is very likely
928 * not necessary and we'd be safe even without it. But
929 * better safe than sorry (plus there's not only Intel).
930 */
931 on_each_cpu(do_sync_core, NULL, 1);
932 }
933
934 /* patch the first byte */
935 text_poke(addr, opcode, sizeof(int3));
936
937 on_each_cpu(do_sync_core, NULL, 1);
01651324
PZ
938 /*
939 * sync_core() implies an smp_mb() and orders this store against
940 * the writing of the new instruction.
941 */
fd4363ff 942 bp_patching_in_progress = false;
fd4363ff
JK
943}
944
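/*
 * Minimal usage sketch (illustration only; "site" and "resume" are
 * hypothetical): replace a 5-byte instruction with a 5-byte JMP while other
 * CPUs may be executing it, holding text_mutex as required:
 *
 *      unsigned char jmp5[5] = { 0xe9, 0x00, 0x00, 0x00, 0x00 };
 *
 *      mutex_lock(&text_mutex);
 *      text_poke_bp(site, jmp5, sizeof(jmp5), resume);
 *      mutex_unlock(&text_mutex);
 *
 * A CPU that hits the temporary int3 during the transition is redirected to
 * @handler (here "resume") by poke_int3_handler() above.
 */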