// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "SMP alternatives: " fmt

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/fixmap.h>

int __read_mostly alternatives_patched;

EXPORT_SYMBOL_GPL(alternatives_patched);

#define MAX_PATCH_LEN (255-1)

static int __initdata_or_module debug_alternative;

static int __init debug_alt(char *str)
{
	debug_alternative = 1;
	return 1;
}
__setup("debug-alternative", debug_alt);

static int noreplace_smp;

static int __init setup_noreplace_smp(char *str)
{
	noreplace_smp = 1;
	return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

#define DPRINTK(fmt, args...)						\
do {									\
	if (debug_alternative)						\
		printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);	\
} while (0)

#define DUMP_BYTES(buf, len, fmt, args...)				\
do {									\
	if (unlikely(debug_alternative)) {				\
		int j;							\
									\
		if (!(len))						\
			break;						\
									\
		printk(KERN_DEBUG fmt, ##args);				\
		for (j = 0; j < (len) - 1; j++)				\
			printk(KERN_CONT "%02hhx ", buf[j]);		\
		printk(KERN_CONT "%02hhx\n", buf[j]);			\
	}								\
} while (0)

/*
 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
 * that correspond to that nop. Getting from one nop to the next, we
 * add to the array the offset that is equal to the sum of all sizes of
 * nops preceding the one we are after.
 *
 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
 * nice symmetry of sizes of the previous nops.
 */
#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char intelnops[] =
{
	GENERIC_NOP1,
	GENERIC_NOP2,
	GENERIC_NOP3,
	GENERIC_NOP4,
	GENERIC_NOP5,
	GENERIC_NOP6,
	GENERIC_NOP7,
	GENERIC_NOP8,
	GENERIC_NOP5_ATOMIC
};
static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
{
	NULL,
	intelnops,
	intelnops + 1,
	intelnops + 1 + 2,
	intelnops + 1 + 2 + 3,
	intelnops + 1 + 2 + 3 + 4,
	intelnops + 1 + 2 + 3 + 4 + 5,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef K8_NOP1
static const unsigned char k8nops[] =
{
	K8_NOP1,
	K8_NOP2,
	K8_NOP3,
	K8_NOP4,
	K8_NOP5,
	K8_NOP6,
	K8_NOP7,
	K8_NOP8,
	K8_NOP5_ATOMIC
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
{
	NULL,
	k8nops,
	k8nops + 1,
	k8nops + 1 + 2,
	k8nops + 1 + 2 + 3,
	k8nops + 1 + 2 + 3 + 4,
	k8nops + 1 + 2 + 3 + 4 + 5,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char k7nops[] =
{
	K7_NOP1,
	K7_NOP2,
	K7_NOP3,
	K7_NOP4,
	K7_NOP5,
	K7_NOP6,
	K7_NOP7,
	K7_NOP8,
	K7_NOP5_ATOMIC
};
static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
{
	NULL,
	k7nops,
	k7nops + 1,
	k7nops + 1 + 2,
	k7nops + 1 + 2 + 3,
	k7nops + 1 + 2 + 3 + 4,
	k7nops + 1 + 2 + 3 + 4 + 5,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef P6_NOP1
static const unsigned char p6nops[] =
{
	P6_NOP1,
	P6_NOP2,
	P6_NOP3,
	P6_NOP4,
	P6_NOP5,
	P6_NOP6,
	P6_NOP7,
	P6_NOP8,
	P6_NOP5_ATOMIC
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
{
	NULL,
	p6nops,
	p6nops + 1,
	p6nops + 1 + 2,
	p6nops + 1 + 2 + 3,
	p6nops + 1 + 2 + 3 + 4,
	p6nops + 1 + 2 + 3 + 4 + 5,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

/* Initialize these to a safe default */
#ifdef CONFIG_X86_64
const unsigned char * const *ideal_nops = p6_nops;
#else
const unsigned char * const *ideal_nops = intel_nops;
#endif
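
/*
 * Illustrative sketch, not part of the original file: the array layout
 * documented above means ideal_nops[len] always points at a "len"-byte NOP
 * sequence for this CPU (for 1 <= len <= ASM_NOP_MAX), which is exactly the
 * property add_nops() below relies on. The helper is hypothetical.
 */
static inline const unsigned char *example_ideal_nop(unsigned int len)
{
	/* Callers are expected to keep len within 1..ASM_NOP_MAX. */
	return ideal_nops[len];
}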

void __init arch_init_ideal_nops(void)
{
	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		/*
		 * Due to a decoder implementation quirk, some
		 * specific Intel CPUs actually perform better with
		 * the "k8_nops" than with the SDM-recommended NOPs.
		 */
		if (boot_cpu_data.x86 == 6 &&
		    boot_cpu_data.x86_model >= 0x0f &&
		    boot_cpu_data.x86_model != 0x1c &&
		    boot_cpu_data.x86_model != 0x26 &&
		    boot_cpu_data.x86_model != 0x27 &&
		    boot_cpu_data.x86_model < 0x30) {
			ideal_nops = k8_nops;
		} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
			ideal_nops = p6_nops;
		} else {
#ifdef CONFIG_X86_64
			ideal_nops = k8_nops;
#else
			ideal_nops = intel_nops;
#endif
		}
		break;

	case X86_VENDOR_HYGON:
		ideal_nops = p6_nops;
		return;

	case X86_VENDOR_AMD:
		if (boot_cpu_data.x86 > 0xf) {
			ideal_nops = p6_nops;
			return;
		}

		/* fall through */

	default:
#ifdef CONFIG_X86_64
		ideal_nops = k8_nops;
#else
		if (boot_cpu_has(X86_FEATURE_K8))
			ideal_nops = k8_nops;
		else if (boot_cpu_has(X86_FEATURE_K7))
			ideal_nops = k7_nops;
		else
			ideal_nops = intel_nops;
#endif
	}
}

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
	while (len > 0) {
		unsigned int noplen = len;
		if (noplen > ASM_NOP_MAX)
			noplen = ASM_NOP_MAX;
		memcpy(insns, ideal_nops[noplen], noplen);
		insns += noplen;
		len -= noplen;
	}
}
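
/*
 * Illustrative sketch, not part of the original file: a caller typically
 * copies a (shorter) replacement into a patch buffer and lets add_nops()
 * pad it out to the length of the instructions being replaced, before
 * handing the whole buffer to text_poke_early() or text_poke(). The helper
 * below is hypothetical.
 */
static void __init_or_module __maybe_unused
example_build_padded_buffer(const u8 *insn, unsigned int insn_len,
			    u8 *buf, unsigned int buf_len)
{
	/* Copy the replacement, then NOP-fill the remainder. */
	memcpy(buf, insn, insn_len);
	add_nops(buf + insn_len, buf_len - insn_len);
}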

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void text_poke_early(void *addr, const void *opcode, size_t len);

/*
 * Are we looking at a near JMP with a 1 or 4-byte displacement.
 */
static inline bool is_jmp(const u8 opcode)
{
	return opcode == 0xeb || opcode == 0xe9;
}

static void __init_or_module
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
{
	u8 *next_rip, *tgt_rip;
	s32 n_dspl, o_dspl;
	int repl_len;

	if (a->replacementlen != 5)
		return;

	o_dspl = *(s32 *)(insnbuf + 1);

	/* next_rip of the replacement JMP */
	next_rip = repl_insn + a->replacementlen;
	/* target rip of the replacement JMP */
	tgt_rip  = next_rip + o_dspl;
	n_dspl = tgt_rip - orig_insn;

	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);

	if (tgt_rip - orig_insn >= 0) {
		if (n_dspl - 2 <= 127)
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	/* negative offset */
	} else {
		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	}

two_byte_jmp:
	n_dspl -= 2;

	insnbuf[0] = 0xeb;
	insnbuf[1] = (s8)n_dspl;
	add_nops(insnbuf + 2, 3);

	repl_len = 2;
	goto done;

five_byte_jmp:
	n_dspl -= 5;

	insnbuf[0] = 0xe9;
	*(s32 *)&insnbuf[1] = n_dspl;

	repl_len = 5;

done:

	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}

/*
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
	unsigned long flags;
	int i;

	for (i = 0; i < a->padlen; i++) {
		if (instr[i] != 0x90)
			return;
	}

	local_irq_save(flags);
	add_nops(instr + (a->instrlen - a->padlen), a->padlen);
	local_irq_restore(flags);

	DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
		   instr, a->instrlen - a->padlen, a->padlen);
}

/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have fewer capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
						  struct alt_instr *end)
{
	struct alt_instr *a;
	u8 *instr, *replacement;
	u8 insnbuf[MAX_PATCH_LEN];

	DPRINTK("alt table %px, -> %px", start, end);
	/*
	 * The scan order should be from start to end. A later scanned
	 * alternative code can overwrite previously scanned alternative code.
	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
	 * patch code.
	 *
	 * So be careful if you want to change the scan order to any other
	 * order.
	 */
	for (a = start; a < end; a++) {
		int insnbuf_sz = 0;

		instr = (u8 *)&a->instr_offset + a->instr_offset;
		replacement = (u8 *)&a->repl_offset + a->repl_offset;
		BUG_ON(a->instrlen > sizeof(insnbuf));
		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
		if (!boot_cpu_has(a->cpuid)) {
			if (a->padlen > 1)
				optimize_nops(a, instr);

			continue;
		}

		DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
			a->cpuid >> 5,
			a->cpuid & 0x1f,
			instr, instr, a->instrlen,
			replacement, a->replacementlen, a->padlen);

		DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
		DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);

		memcpy(insnbuf, replacement, a->replacementlen);
		insnbuf_sz = a->replacementlen;

		/*
		 * 0xe8 is a relative CALL; fix the offset.
		 *
		 * Instruction length is checked before the opcode to avoid
		 * accessing uninitialized bytes for zero-length replacements.
		 */
		if (a->replacementlen == 5 && *insnbuf == 0xe8) {
			*(s32 *)(insnbuf + 1) += replacement - instr;
			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
				*(s32 *)(insnbuf + 1),
				(unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
		}

		if (a->replacementlen && is_jmp(replacement[0]))
			recompute_jump(a, instr, replacement, insnbuf);

		if (a->instrlen > a->replacementlen) {
			add_nops(insnbuf + a->replacementlen,
				 a->instrlen - a->replacementlen);
			insnbuf_sz += a->instrlen - a->replacementlen;
		}
		DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);

		text_poke_early(instr, insnbuf, insnbuf_sz);
	}
}
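
/*
 * Illustrative sketch, not part of the original file: the __alt_instructions
 * entries walked above are normally emitted by the ALTERNATIVE()/alternative()
 * macros from <asm/alternative.h>. A hypothetical user looks like this; the
 * chosen instruction and feature bit are only an example.
 */
static inline void example_alternative_site(void)
{
	/* Emits an empty "old" sequence, NOP-padded by the macro, and lets
	 * apply_alternatives() patch in LFENCE on CPUs with SSE2. */
	alternative("", "lfence", X86_FEATURE_XMM2);
}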

#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
				  u8 *text, u8 *text_end)
{
	const s32 *poff;

	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		if (!*poff || ptr < text || ptr >= text_end)
			continue;
		/* turn DS segment override prefix into lock prefix */
		if (*ptr == 0x3e)
			text_poke(ptr, ((unsigned char []){0xf0}), 1);
	}
}

static void alternatives_smp_unlock(const s32 *start, const s32 *end,
				    u8 *text, u8 *text_end)
{
	const s32 *poff;

	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		if (!*poff || ptr < text || ptr >= text_end)
			continue;
		/* turn lock prefix into DS segment override prefix */
		if (*ptr == 0xf0)
			text_poke(ptr, ((unsigned char []){0x3E}), 1);
	}
}
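
/*
 * Illustrative sketch, not part of the original file: the __smp_locks[]
 * offsets toggled above are recorded by the LOCK_PREFIX macro from
 * <asm/alternative.h>. The hypothetical helper below shows the usual pattern;
 * the real atomic ops in <asm/atomic.h> are written this way.
 */
static __always_inline void example_lock_prefix_site(atomic_t *v)
{
	/* LOCK_PREFIX adds this site to .smp_locks so the LOCK byte can be
	 * rewritten to a DS prefix when running on a UP kernel. */
	asm volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter));
}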

struct smp_alt_module {
	/* what is this ??? */
	struct module	*mod;
	char		*name;

	/* ptrs to lock prefixes */
	const s32	*locks;
	const s32	*locks_end;

	/* .text segment, needed to avoid patching init code ;) */
	u8		*text;
	u8		*text_end;

	struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static bool uniproc_patched = false;	/* protected by text_mutex */

void __init_or_module alternatives_smp_module_add(struct module *mod,
						  char *name,
						  void *locks, void *locks_end,
						  void *text,  void *text_end)
{
	struct smp_alt_module *smp;

	mutex_lock(&text_mutex);
	if (!uniproc_patched)
		goto unlock;

	if (num_possible_cpus() == 1)
		/* Don't bother remembering, we'll never have to undo it. */
		goto smp_unlock;

	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
	if (NULL == smp)
		/* we'll run the (safe but slow) SMP code then ... */
		goto unlock;

	smp->mod	= mod;
	smp->name	= name;
	smp->locks	= locks;
	smp->locks_end	= locks_end;
	smp->text	= text;
	smp->text_end	= text_end;
	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
		smp->locks, smp->locks_end,
		smp->text, smp->text_end, smp->name);

	list_add_tail(&smp->next, &smp_alt_modules);
smp_unlock:
	alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
	mutex_unlock(&text_mutex);
}

void __init_or_module alternatives_smp_module_del(struct module *mod)
{
	struct smp_alt_module *item;

	mutex_lock(&text_mutex);
	list_for_each_entry(item, &smp_alt_modules, next) {
		if (mod != item->mod)
			continue;
		list_del(&item->next);
		kfree(item);
		break;
	}
	mutex_unlock(&text_mutex);
}

void alternatives_enable_smp(void)
{
	struct smp_alt_module *mod;

	/* Why bother if there are no other CPUs? */
	BUG_ON(num_possible_cpus() == 1);

	mutex_lock(&text_mutex);

	if (uniproc_patched) {
		pr_info("switching to SMP code\n");
		BUG_ON(num_online_cpus() != 1);
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
		list_for_each_entry(mod, &smp_alt_modules, next)
			alternatives_smp_lock(mod->locks, mod->locks_end,
					      mod->text, mod->text_end);
		uniproc_patched = false;
	}
	mutex_unlock(&text_mutex);
}

/*
 * Return 1 if the address range is reserved for SMP-alternatives.
 * Must hold text_mutex.
 */
int alternatives_text_reserved(void *start, void *end)
{
	struct smp_alt_module *mod;
	const s32 *poff;
	u8 *text_start = start;
	u8 *text_end = end;

	lockdep_assert_held(&text_mutex);

	list_for_each_entry(mod, &smp_alt_modules, next) {
		if (mod->text > text_end || mod->text_end < text_start)
			continue;
		for (poff = mod->locks; poff < mod->locks_end; poff++) {
			const u8 *ptr = (const u8 *)poff + *poff;

			if (text_start <= ptr && text_end > ptr)
				return 1;
		}
	}

	return 0;
}
#endif /* CONFIG_SMP */

#ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
				     struct paravirt_patch_site *end)
{
	struct paravirt_patch_site *p;
	char insnbuf[MAX_PATCH_LEN];

	for (p = start; p < end; p++) {
		unsigned int used;

		BUG_ON(p->len > MAX_PATCH_LEN);
		/* prep the buffer with the original instructions */
		memcpy(insnbuf, p->instr, p->len);
		used = pv_ops.init.patch(p->instrtype, insnbuf,
					 (unsigned long)p->instr, p->len);

		BUG_ON(used > p->len);

		/* Pad the rest with nops */
		add_nops(insnbuf + used, p->len - used);
		text_poke_early(p->instr, insnbuf, p->len);
	}
}
extern struct paravirt_patch_site __start_parainstructions[],
	__stop_parainstructions[];
#endif	/* CONFIG_PARAVIRT */

/*
 * Self-test for the INT3 based CALL emulation code.
 *
 * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
 * properly and that there is a stack gap between the INT3 frame and the
 * previous context. Without this gap doing a virtual PUSH on the interrupted
 * stack would corrupt the INT3 IRET frame.
 *
 * See entry_{32,64}.S for more details.
 */
static void __init int3_magic(unsigned int *ptr)
{
	*ptr = 1;
}

extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */

static int __init
int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
{
	struct die_args *args = data;
	struct pt_regs *regs = args->regs;

	if (!regs || user_mode(regs))
		return NOTIFY_DONE;

	if (val != DIE_INT3)
		return NOTIFY_DONE;

	if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip)
		return NOTIFY_DONE;

	int3_emulate_call(regs, (unsigned long)&int3_magic);
	return NOTIFY_STOP;
}

static void __init int3_selftest(void)
{
	static __initdata struct notifier_block int3_exception_nb = {
		.notifier_call	= int3_exception_notify,
		.priority	= INT_MAX-1, /* last */
	};
	unsigned int val = 0;

	BUG_ON(register_die_notifier(&int3_exception_nb));

	/*
	 * Basically: int3_magic(&val); but really complicated :-)
	 *
	 * Stick the address of the INT3 instruction into int3_selftest_ip,
	 * then trigger the INT3, padded with NOPs to match a CALL instruction
	 * length.
	 */
	asm volatile ("1: int3; nop; nop; nop; nop\n\t"
		      ".pushsection .init.data,\"aw\"\n\t"
		      ".align " __ASM_SEL(4, 8) "\n\t"
		      ".type int3_selftest_ip, @object\n\t"
		      ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t"
		      "int3_selftest_ip:\n\t"
		      __ASM_SEL(.long, .quad) " 1b\n\t"
		      ".popsection\n\t"
		      : : __ASM_SEL_RAW(a, D) (&val) : "memory");

	BUG_ON(val != 1);

	unregister_die_notifier(&int3_exception_nb);
}

void __init alternative_instructions(void)
{
	int3_selftest();

	/*
	 * The patching is not fully atomic, so try to avoid local
	 * interruptions that might execute the to be patched code.
	 * Other CPUs are not running.
	 */
	stop_nmi();

	/*
	 * Don't stop machine check exceptions while patching.
	 * MCEs only happen when something got corrupted and in this
	 * case we must do something about the corruption.
	 * Ignoring it is worse than an unlikely patching race.
	 * Also machine checks tend to be broadcast and if one CPU
	 * goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during code
	 * patching.
	 */

	apply_alternatives(__alt_instructions, __alt_instructions_end);

#ifdef CONFIG_SMP
	/* Patch to UP if other cpus not imminent. */
	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
		uniproc_patched = true;
		alternatives_smp_module_add(NULL, "core kernel",
					    __smp_locks, __smp_locks_end,
					    _text, _etext);
	}

	if (!uniproc_patched || num_possible_cpus() == 1) {
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);
	}
#endif

	apply_paravirt(__parainstructions, __parainstructions_end);

	restart_nmi();
	alternatives_patched = 1;
}

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI or
 * MCE handlers seeing an inconsistent instruction while you patch.
 */
void __init_or_module text_poke_early(void *addr, const void *opcode,
				      size_t len)
{
	unsigned long flags;

	if (boot_cpu_has(X86_FEATURE_NX) &&
	    is_module_text_address((unsigned long)addr)) {
		/*
		 * Modules text is marked initially as non-executable, so the
		 * code cannot be running and speculative code-fetches are
		 * prevented. Just change the code.
		 */
		memcpy(addr, opcode, len);
	} else {
		local_irq_save(flags);
		memcpy(addr, opcode, len);
		local_irq_restore(flags);
		sync_core();

		/*
		 * Could also do a CLFLUSH here to speed up CPU recovery; but
		 * that causes hangs on some VIA CPUs.
		 */
	}
}
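
/*
 * Illustrative sketch, not part of the original file: a typical boot-time
 * caller NOPs out a call site before other CPUs come up. The helper and its
 * 5-byte patch size are hypothetical.
 */
static void __init __maybe_unused example_text_poke_early(void *call_site)
{
	u8 nops[5];

	add_nops(nops, sizeof(nops));
	text_poke_early(call_site, nops, sizeof(nops));
}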

__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;

static void *__text_poke(void *addr, const void *opcode, size_t len)
{
	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
	struct page *pages[2] = {NULL};
	temp_mm_state_t prev;
	unsigned long flags;
	pte_t pte, *ptep;
	spinlock_t *ptl;
	pgprot_t pgprot;

	/*
	 * While boot memory allocator is running we cannot use struct pages as
	 * they are not yet initialized. There is no way to recover.
	 */
	BUG_ON(!after_bootmem);

	if (!core_kernel_text((unsigned long)addr)) {
		pages[0] = vmalloc_to_page(addr);
		if (cross_page_boundary)
			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
	} else {
		pages[0] = virt_to_page(addr);
		WARN_ON(!PageReserved(pages[0]));
		if (cross_page_boundary)
			pages[1] = virt_to_page(addr + PAGE_SIZE);
	}
	/*
	 * If something went wrong, crash and burn since recovery paths are not
	 * implemented.
	 */
	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

	local_irq_save(flags);

	/*
	 * Map the page without the global bit, as TLB flushing is done with
	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
	 */
	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

	/*
	 * The lock is not really needed, but this allows us to avoid
	 * open-coding.
	 */
	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);

	/*
	 * This must not fail; preallocated in poking_init().
	 */
	VM_BUG_ON(!ptep);

	pte = mk_pte(pages[0], pgprot);
	set_pte_at(poking_mm, poking_addr, ptep, pte);

	if (cross_page_boundary) {
		pte = mk_pte(pages[1], pgprot);
		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
	}

	/*
	 * Loading the temporary mm behaves as a compiler barrier, which
	 * guarantees that the PTE will be set at the time memcpy() is done.
	 */
	prev = use_temporary_mm(poking_mm);

	kasan_disable_current();
	memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
	kasan_enable_current();

	/*
	 * Ensure that the PTE is only cleared after the instructions of memcpy
	 * were issued by using a compiler barrier.
	 */
	barrier();

	pte_clear(poking_mm, poking_addr, ptep);
	if (cross_page_boundary)
		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);

	/*
	 * Loading the previous page-table hierarchy requires a serializing
	 * instruction that already allows the core to see the updated version.
	 * Xen-PV is assumed to serialize execution in a similar manner.
	 */
	unuse_temporary_mm(prev);

	/*
	 * Flushing the TLB might involve IPIs, which would require enabled
	 * IRQs, but not if the mm is not used, as it is at this point.
	 */
	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
			   PAGE_SHIFT, false);

	/*
	 * If the text does not match what we just wrote then something is
	 * fundamentally screwy; there's nothing we can really do about that.
	 */
	BUG_ON(memcmp(addr, opcode, len));

	pte_unmap_unlock(ptep, ptl);
	local_irq_restore(flags);
	return addr;
}

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Note that the caller must ensure that if the modified code is part of a
 * module, the module would not be removed during poking. This can be achieved
 * by registering a module notifier, and ordering module removal and patching
 * through a mutex.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
	lockdep_assert_held(&text_mutex);

	return __text_poke(addr, opcode, len);
}
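
/*
 * Illustrative sketch, not part of the original file: callers take text_mutex
 * around the poke, as the kernel's other text-patching paths do. The one-byte
 * patch and the helper itself are hypothetical.
 */
static void __maybe_unused example_text_poke(void *addr, u8 byte)
{
	mutex_lock(&text_mutex);
	text_poke(addr, &byte, sizeof(byte));
	mutex_unlock(&text_mutex);
}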

/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 * despite the fact it does not hold the text_mutex.
 */
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
	return __text_poke(addr, opcode, len);
}
918
fd4363ff
JK
919static void do_sync_core(void *info)
920{
921 sync_core();
922}
923
924static bool bp_patching_in_progress;
925static void *bp_int3_handler, *bp_int3_addr;
926
17f41571 927int poke_int3_handler(struct pt_regs *regs)
fd4363ff 928{
01651324
PZ
929 /*
930 * Having observed our INT3 instruction, we now must observe
931 * bp_patching_in_progress.
932 *
933 * in_progress = TRUE INT3
934 * WMB RMB
935 * write INT3 if (in_progress)
936 *
937 * Idem for bp_int3_handler.
938 */
fd4363ff
JK
939 smp_rmb();
940
941 if (likely(!bp_patching_in_progress))
17f41571 942 return 0;
fd4363ff 943
f39b6f0e 944 if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
17f41571 945 return 0;
fd4363ff
JK
946
947 /* set up the specified breakpoint handler */
17f41571
JK
948 regs->ip = (unsigned long) bp_int3_handler;
949
950 return 1;
fd4363ff 951}
c13324a5 952NOKPROBE_SYMBOL(poke_int3_handler);

/**
 * text_poke_bp() -- update instructions on live kernel on SMP
 * @addr:	address to patch
 * @opcode:	opcode of new instruction
 * @len:	length to copy
 * @handler:	address to jump to when the temporary breakpoint is hit
 *
 * Modify multi-byte instruction by using int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using int3 breakpoint.
 *
 * The way it is done:
 *	- add an int3 trap to the address that will be patched
 *	- sync cores
 *	- update all but the first byte of the patched range
 *	- sync cores
 *	- replace the first byte (int3) by the first byte of
 *	  replacing opcode
 *	- sync cores
 */
void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
	unsigned char int3 = 0xcc;

	bp_int3_handler = handler;
	bp_int3_addr = (u8 *)addr + sizeof(int3);
	bp_patching_in_progress = true;

	lockdep_assert_held(&text_mutex);

	/*
	 * Corresponding read barrier in int3 notifier for making sure the
	 * in_progress and handler are correctly ordered wrt. patching.
	 */
	smp_wmb();

	text_poke(addr, &int3, sizeof(int3));

	on_each_cpu(do_sync_core, NULL, 1);

	if (len - sizeof(int3) > 0) {
		/* patch all but the first byte */
		text_poke((char *)addr + sizeof(int3),
			  (const char *) opcode + sizeof(int3),
			  len - sizeof(int3));
		/*
		 * According to Intel, this core syncing is very likely
		 * not necessary and we'd be safe even without it. But
		 * better safe than sorry (plus there's not only Intel).
		 */
		on_each_cpu(do_sync_core, NULL, 1);
	}

	/* patch the first byte */
	text_poke(addr, opcode, sizeof(int3));

	on_each_cpu(do_sync_core, NULL, 1);
	/*
	 * sync_core() implies an smp_mb() and orders this store against
	 * the writing of the new instruction.
	 */
	bp_patching_in_progress = false;
}
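
/*
 * Illustrative sketch, not part of the original file: turning a 5-byte NOP
 * into a 5-byte JMP with the INT3 protocol above. The helper, the target
 * computation and the lack of error handling are all hypothetical.
 */
static void __maybe_unused example_text_poke_bp(void *addr, void *target)
{
	u8 jmp[5];
	s32 disp = (s32)((long)target - (long)addr - (long)sizeof(jmp));

	jmp[0] = 0xe9;				/* JMP rel32 */
	memcpy(&jmp[1], &disp, sizeof(disp));

	mutex_lock(&text_mutex);
	/* While the INT3 byte is live, a CPU hitting this site is sent
	 * straight to "target", matching the instruction being written. */
	text_poke_bp(addr, jmp, sizeof(jmp), target);
	mutex_unlock(&text_mutex);
}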