#define pr_fmt(fmt) "SMP alternatives: " fmt

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/mmu_context.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/fixmap.h>

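/* Set to 1 once boot-time alternatives patching has completed (see alternative_instructions()). */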
int __read_mostly alternatives_patched;

EXPORT_SYMBOL_GPL(alternatives_patched);

#define MAX_PATCH_LEN (255-1)

static int __initdata_or_module debug_alternative;

static int __init debug_alt(char *str)
{
	debug_alternative = 1;
	return 1;
}
__setup("debug-alternative", debug_alt);

static int noreplace_smp;

static int __init setup_noreplace_smp(char *str)
{
	noreplace_smp = 1;
	return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

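/* Debug output for the patching code, enabled with the "debug-alternative" boot option. */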
#define DPRINTK(fmt, args...)						\
do {									\
	if (debug_alternative)						\
		printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);	\
} while (0)

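/* Hex-dump @len bytes of @buf to the kernel log, also gated on debug_alternative. */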
#define DUMP_BYTES(buf, len, fmt, args...)				\
do {									\
	if (unlikely(debug_alternative)) {				\
		int j;							\
									\
		if (!(len))						\
			break;						\
									\
		printk(KERN_DEBUG fmt, ##args);				\
		for (j = 0; j < (len) - 1; j++)				\
			printk(KERN_CONT "%02hhx ", buf[j]);		\
		printk(KERN_CONT "%02hhx\n", buf[j]);			\
	}								\
} while (0)

/*
 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
 * that correspond to that nop. Getting from one nop to the next, we
 * add to the array the offset that is equal to the sum of all sizes of
 * nops preceding the one we are after.
 *
 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
 * nice symmetry of sizes of the previous nops.
 */
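/*
 * Example: intel_nops[5] below points at offset 1 + 2 + 3 + 4 into intelnops[],
 * i.e. at the start of GENERIC_NOP5, and index ASM_NOP_MAX + 1 is the 5-byte
 * atomic NOP.
 */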
#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char intelnops[] =
{
	GENERIC_NOP1,
	GENERIC_NOP2,
	GENERIC_NOP3,
	GENERIC_NOP4,
	GENERIC_NOP5,
	GENERIC_NOP6,
	GENERIC_NOP7,
	GENERIC_NOP8,
	GENERIC_NOP5_ATOMIC
};
static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
{
	NULL,
	intelnops,
	intelnops + 1,
	intelnops + 1 + 2,
	intelnops + 1 + 2 + 3,
	intelnops + 1 + 2 + 3 + 4,
	intelnops + 1 + 2 + 3 + 4 + 5,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef K8_NOP1
static const unsigned char k8nops[] =
{
	K8_NOP1,
	K8_NOP2,
	K8_NOP3,
	K8_NOP4,
	K8_NOP5,
	K8_NOP6,
	K8_NOP7,
	K8_NOP8,
	K8_NOP5_ATOMIC
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
{
	NULL,
	k8nops,
	k8nops + 1,
	k8nops + 1 + 2,
	k8nops + 1 + 2 + 3,
	k8nops + 1 + 2 + 3 + 4,
	k8nops + 1 + 2 + 3 + 4 + 5,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char k7nops[] =
{
	K7_NOP1,
	K7_NOP2,
	K7_NOP3,
	K7_NOP4,
	K7_NOP5,
	K7_NOP6,
	K7_NOP7,
	K7_NOP8,
	K7_NOP5_ATOMIC
};
static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
{
	NULL,
	k7nops,
	k7nops + 1,
	k7nops + 1 + 2,
	k7nops + 1 + 2 + 3,
	k7nops + 1 + 2 + 3 + 4,
	k7nops + 1 + 2 + 3 + 4 + 5,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef P6_NOP1
static const unsigned char p6nops[] =
{
	P6_NOP1,
	P6_NOP2,
	P6_NOP3,
	P6_NOP4,
	P6_NOP5,
	P6_NOP6,
	P6_NOP7,
	P6_NOP8,
	P6_NOP5_ATOMIC
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
{
	NULL,
	p6nops,
	p6nops + 1,
	p6nops + 1 + 2,
	p6nops + 1 + 2 + 3,
	p6nops + 1 + 2 + 3 + 4,
	p6nops + 1 + 2 + 3 + 4 + 5,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

/* Initialize these to a safe default */
#ifdef CONFIG_X86_64
const unsigned char * const *ideal_nops = p6_nops;
#else
const unsigned char * const *ideal_nops = intel_nops;
#endif

void __init arch_init_ideal_nops(void)
{
	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		/*
		 * Due to a decoder implementation quirk, some
		 * specific Intel CPUs actually perform better with
		 * the "k8_nops" than with the SDM-recommended NOPs.
		 */
		if (boot_cpu_data.x86 == 6 &&
		    boot_cpu_data.x86_model >= 0x0f &&
		    boot_cpu_data.x86_model != 0x1c &&
		    boot_cpu_data.x86_model != 0x26 &&
		    boot_cpu_data.x86_model != 0x27 &&
		    boot_cpu_data.x86_model < 0x30) {
			ideal_nops = k8_nops;
		} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
			ideal_nops = p6_nops;
		} else {
#ifdef CONFIG_X86_64
			ideal_nops = k8_nops;
#else
			ideal_nops = intel_nops;
#endif
		}
		break;

	case X86_VENDOR_HYGON:
		ideal_nops = p6_nops;
		return;

	case X86_VENDOR_AMD:
		if (boot_cpu_data.x86 > 0xf) {
			ideal_nops = p6_nops;
			return;
		}

		/* fall through */

	default:
#ifdef CONFIG_X86_64
		ideal_nops = k8_nops;
#else
		if (boot_cpu_has(X86_FEATURE_K8))
			ideal_nops = k8_nops;
		else if (boot_cpu_has(X86_FEATURE_K7))
			ideal_nops = k7_nops;
		else
			ideal_nops = intel_nops;
#endif
	}
}

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
	while (len > 0) {
		unsigned int noplen = len;
		if (noplen > ASM_NOP_MAX)
			noplen = ASM_NOP_MAX;
		memcpy(insns, ideal_nops[noplen], noplen);
		insns += noplen;
		len -= noplen;
	}
}
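/*
 * Example: with ASM_NOP_MAX == 8, add_nops(buf, 12) emits an 8-byte ideal NOP
 * followed by a 4-byte one, rather than twelve single-byte NOPs.
 */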

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void *text_poke_early(void *addr, const void *opcode, size_t len);

/*
 * Are we looking at a near JMP with a 1 or 4-byte displacement?
 */
static inline bool is_jmp(const u8 opcode)
{
	return opcode == 0xeb || opcode == 0xe9;
}

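/*
 * A 5-byte JMP in the replacement is relative to the replacement's location.
 * Recompute its displacement so it is relative to the original instruction,
 * and shrink it to a 2-byte JMP when the new displacement fits in 8 bits.
 */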
static void __init_or_module
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
{
	u8 *next_rip, *tgt_rip;
	s32 n_dspl, o_dspl;
	int repl_len;

	if (a->replacementlen != 5)
		return;

	o_dspl = *(s32 *)(insnbuf + 1);

	/* next_rip of the replacement JMP */
	next_rip = repl_insn + a->replacementlen;
	/* target rip of the replacement JMP */
	tgt_rip = next_rip + o_dspl;
	n_dspl = tgt_rip - orig_insn;

	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);

	if (tgt_rip - orig_insn >= 0) {
		if (n_dspl - 2 <= 127)
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	/* negative offset */
	} else {
		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	}

two_byte_jmp:
	n_dspl -= 2;

	insnbuf[0] = 0xeb;
	insnbuf[1] = (s8)n_dspl;
	add_nops(insnbuf + 2, 3);

	repl_len = 2;
	goto done;

five_byte_jmp:
	n_dspl -= 5;

	insnbuf[0] = 0xe9;
	*(s32 *)&insnbuf[1] = n_dspl;

	repl_len = 5;

done:

	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}

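/*
 * Rewrite the single-byte 0x90 NOP padding of a not-applied alternative with
 * this CPU's ideal_nops sequence so that it executes as fewer, longer NOPs.
 */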
/*
 * "noinline" to cause control flow change and thus invalidate I$ and
 * cause refetch after modification.
 */
static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
	unsigned long flags;
	int i;

	for (i = 0; i < a->padlen; i++) {
		if (instr[i] != 0x90)
			return;
	}

	local_irq_save(flags);
	add_nops(instr + (a->instrlen - a->padlen), a->padlen);
	local_irq_restore(flags);

	DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
		   instr, a->instrlen - a->padlen, a->padlen);
}

/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have fewer capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 *
 * Marked "noinline" to cause control flow change and thus insn cache
 * to refetch changed I$ lines.
 */
void __init_or_module noinline apply_alternatives(struct alt_instr *start,
						  struct alt_instr *end)
{
	struct alt_instr *a;
	u8 *instr, *replacement;
	u8 insnbuf[MAX_PATCH_LEN];

	DPRINTK("alt table %px, -> %px", start, end);
	/*
	 * The scan order should be from start to end. A later scanned
	 * alternative code can overwrite previously scanned alternative code.
	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
	 * patch code.
	 *
	 * So be careful if you want to change the scan order to any other
	 * order.
	 */
	for (a = start; a < end; a++) {
		int insnbuf_sz = 0;

		instr = (u8 *)&a->instr_offset + a->instr_offset;
		replacement = (u8 *)&a->repl_offset + a->repl_offset;
		BUG_ON(a->instrlen > sizeof(insnbuf));
		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
		if (!boot_cpu_has(a->cpuid)) {
			if (a->padlen > 1)
				optimize_nops(a, instr);

			continue;
		}

		DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
			a->cpuid >> 5,
			a->cpuid & 0x1f,
			instr, instr, a->instrlen,
			replacement, a->replacementlen, a->padlen);

		DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
		DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);

		memcpy(insnbuf, replacement, a->replacementlen);
		insnbuf_sz = a->replacementlen;

		/*
		 * 0xe8 is a relative CALL; fix the offset.
		 *
		 * Instruction length is checked before the opcode to avoid
		 * accessing uninitialized bytes for zero-length replacements.
		 */
		if (a->replacementlen == 5 && *insnbuf == 0xe8) {
			*(s32 *)(insnbuf + 1) += replacement - instr;
			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
				*(s32 *)(insnbuf + 1),
				(unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
		}

		if (a->replacementlen && is_jmp(replacement[0]))
			recompute_jump(a, instr, replacement, insnbuf);

		if (a->instrlen > a->replacementlen) {
			add_nops(insnbuf + a->replacementlen,
				 a->instrlen - a->replacementlen);
			insnbuf_sz += a->instrlen - a->replacementlen;
		}
		DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr);

		text_poke_early(instr, insnbuf, insnbuf_sz);
	}
}

#ifdef CONFIG_SMP
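/*
 * On SMP, LOCK prefixes are needed for atomicity; on UP they are pointless
 * overhead. A table of s32 offsets (__smp_locks for the core kernel, a
 * matching section for modules) records where those prefix bytes live so they
 * can be flipped between LOCK (0xf0) and a harmless DS segment override (0x3e).
 */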
static void alternatives_smp_lock(const s32 *start, const s32 *end,
				  u8 *text, u8 *text_end)
{
	const s32 *poff;

	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		if (!*poff || ptr < text || ptr >= text_end)
			continue;
		/* turn DS segment override prefix into lock prefix */
		if (*ptr == 0x3e)
			text_poke(ptr, ((unsigned char []){0xf0}), 1);
	}
}

static void alternatives_smp_unlock(const s32 *start, const s32 *end,
				    u8 *text, u8 *text_end)
{
	const s32 *poff;

	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		if (!*poff || ptr < text || ptr >= text_end)
			continue;
		/* turn lock prefix into DS segment override prefix */
		if (*ptr == 0xf0)
			text_poke(ptr, ((unsigned char []){0x3E}), 1);
	}
}

struct smp_alt_module {
	/* what is this ??? */
	struct module *mod;
	char *name;

	/* ptrs to lock prefixes */
	const s32 *locks;
	const s32 *locks_end;

	/* .text segment, needed to avoid patching init code ;) */
	u8 *text;
	u8 *text_end;

	struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static bool uniproc_patched = false;	/* protected by text_mutex */

void __init_or_module alternatives_smp_module_add(struct module *mod,
						  char *name,
						  void *locks, void *locks_end,
						  void *text, void *text_end)
{
	struct smp_alt_module *smp;

	mutex_lock(&text_mutex);
	if (!uniproc_patched)
		goto unlock;

	if (num_possible_cpus() == 1)
		/* Don't bother remembering, we'll never have to undo it. */
		goto smp_unlock;

	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
	if (NULL == smp)
		/* we'll run the (safe but slow) SMP code then ... */
		goto unlock;

	smp->mod = mod;
	smp->name = name;
	smp->locks = locks;
	smp->locks_end = locks_end;
	smp->text = text;
	smp->text_end = text_end;
	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
		smp->locks, smp->locks_end,
		smp->text, smp->text_end, smp->name);

	list_add_tail(&smp->next, &smp_alt_modules);
smp_unlock:
	alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
	mutex_unlock(&text_mutex);
}

void __init_or_module alternatives_smp_module_del(struct module *mod)
{
	struct smp_alt_module *item;

	mutex_lock(&text_mutex);
	list_for_each_entry(item, &smp_alt_modules, next) {
		if (mod != item->mod)
			continue;
		list_del(&item->next);
		kfree(item);
		break;
	}
	mutex_unlock(&text_mutex);
}

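/* Switch back to SMP-safe code (restore the LOCK prefixes) once a second CPU may run. */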
void alternatives_enable_smp(void)
{
	struct smp_alt_module *mod;

	/* Why bother if there are no other CPUs? */
	BUG_ON(num_possible_cpus() == 1);

	mutex_lock(&text_mutex);

	if (uniproc_patched) {
		pr_info("switching to SMP code\n");
		BUG_ON(num_online_cpus() != 1);
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
		list_for_each_entry(mod, &smp_alt_modules, next)
			alternatives_smp_lock(mod->locks, mod->locks_end,
					      mod->text, mod->text_end);
		uniproc_patched = false;
	}
	mutex_unlock(&text_mutex);
}

/*
 * Return 1 if the address range is reserved for SMP-alternatives.
 * Must hold text_mutex.
 */
int alternatives_text_reserved(void *start, void *end)
{
	struct smp_alt_module *mod;
	const s32 *poff;
	u8 *text_start = start;
	u8 *text_end = end;

	lockdep_assert_held(&text_mutex);

	list_for_each_entry(mod, &smp_alt_modules, next) {
		if (mod->text > text_end || mod->text_end < text_start)
			continue;
		for (poff = mod->locks; poff < mod->locks_end; poff++) {
			const u8 *ptr = (const u8 *)poff + *poff;

			if (text_start <= ptr && text_end > ptr)
				return 1;
		}
	}

	return 0;
}
#endif /* CONFIG_SMP */

#ifdef CONFIG_PARAVIRT
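/*
 * Patch each paravirt call site with the sequence chosen by pv_ops.init.patch()
 * and pad whatever remains of the original instruction with NOPs.
 */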
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
				     struct paravirt_patch_site *end)
{
	struct paravirt_patch_site *p;
	char insnbuf[MAX_PATCH_LEN];

	for (p = start; p < end; p++) {
		unsigned int used;

		BUG_ON(p->len > MAX_PATCH_LEN);
		/* prep the buffer with the original instructions */
		memcpy(insnbuf, p->instr, p->len);
		used = pv_ops.init.patch(p->instrtype, insnbuf,
					 (unsigned long)p->instr, p->len);

		BUG_ON(used > p->len);

		/* Pad the rest with nops */
		add_nops(insnbuf + used, p->len - used);
		text_poke_early(p->instr, insnbuf, p->len);
	}
}
extern struct paravirt_patch_site __start_parainstructions[],
	__stop_parainstructions[];
#endif /* CONFIG_PARAVIRT */

void __init alternative_instructions(void)
{
	/*
	 * The patching is not fully atomic, so try to avoid local
	 * interruptions that might execute the code that is about to be
	 * patched. Other CPUs are not running.
	 */
	stop_nmi();

	/*
	 * Don't stop machine check exceptions while patching.
	 * MCEs only happen when something got corrupted and in this
	 * case we must do something about the corruption.
	 * Ignoring it is worse than an unlikely patching race.
	 * Also machine checks tend to be broadcast and if one CPU
	 * goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during code
	 * patching.
	 */

	apply_alternatives(__alt_instructions, __alt_instructions_end);

#ifdef CONFIG_SMP
	/* Patch to UP if other cpus not imminent. */
	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
		uniproc_patched = true;
		alternatives_smp_module_add(NULL, "core kernel",
					    __smp_locks, __smp_locks_end,
					    _text, _etext);
	}

	if (!uniproc_patched || num_possible_cpus() == 1)
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);
#endif

	apply_paravirt(__parainstructions, __parainstructions_end);

	restart_nmi();
	alternatives_patched = 1;
}

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI or
 * MCE handlers seeing an inconsistent instruction while you patch.
 */
void *__init_or_module text_poke_early(void *addr, const void *opcode,
				       size_t len)
{
	unsigned long flags;
	local_irq_save(flags);
	memcpy(addr, opcode, len);
	local_irq_restore(flags);
	sync_core();
	/* Could also do a CLFLUSH here to speed up CPU recovery; but
	   that causes hangs on some VIA CPUs. */
	return addr;
}

__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;

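/*
 * Patch text through a temporary mm: map the target page(s) at poking_addr in
 * poking_mm, write through that alias, then unmap and flush, so the kernel's
 * own text mapping never has to be made writable.
 */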
static void *__text_poke(void *addr, const void *opcode, size_t len)
{
	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
	struct page *pages[2] = {NULL};
	temp_mm_state_t prev;
	unsigned long flags;
	pte_t pte, *ptep;
	spinlock_t *ptl;
	pgprot_t pgprot;

	/*
	 * While boot memory allocator is running we cannot use struct pages as
	 * they are not yet initialized. There is no way to recover.
	 */
	BUG_ON(!after_bootmem);

	if (!core_kernel_text((unsigned long)addr)) {
		pages[0] = vmalloc_to_page(addr);
		if (cross_page_boundary)
			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
	} else {
		pages[0] = virt_to_page(addr);
		WARN_ON(!PageReserved(pages[0]));
		if (cross_page_boundary)
			pages[1] = virt_to_page(addr + PAGE_SIZE);
	}
	/*
	 * If something went wrong, crash and burn since recovery paths are not
	 * implemented.
	 */
	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

	local_irq_save(flags);

	/*
	 * Map the page without the global bit, as TLB flushing is done with
	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
	 */
	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

	/*
	 * The lock is not really needed, but this allows to avoid open-coding.
	 */
	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);

	/*
	 * This must not fail; preallocated in poking_init().
	 */
	VM_BUG_ON(!ptep);

	pte = mk_pte(pages[0], pgprot);
	set_pte_at(poking_mm, poking_addr, ptep, pte);

	if (cross_page_boundary) {
		pte = mk_pte(pages[1], pgprot);
		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
	}

	/*
	 * Loading the temporary mm behaves as a compiler barrier, which
	 * guarantees that the PTE will be set at the time memcpy() is done.
	 */
	prev = use_temporary_mm(poking_mm);

	kasan_disable_current();
	memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len);
	kasan_enable_current();

	/*
	 * Ensure that the PTE is only cleared after the instructions of memcpy
	 * were issued by using a compiler barrier.
	 */
	barrier();

	pte_clear(poking_mm, poking_addr, ptep);
	if (cross_page_boundary)
		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);

	/*
	 * Loading the previous page-table hierarchy requires a serializing
	 * instruction that already allows the core to see the updated version.
	 * Xen-PV is assumed to serialize execution in a similar manner.
	 */
	unuse_temporary_mm(prev);

	/*
	 * Flushing the TLB might involve IPIs, which would require enabled
	 * IRQs, but not if the mm is not used, as is the case at this point.
	 */
	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
			   PAGE_SHIFT, false);

	/*
	 * If the text does not match what we just wrote then something is
	 * fundamentally screwy; there's nothing we can really do about that.
	 */
	BUG_ON(memcmp(addr, opcode, len));

	pte_unmap_unlock(ptep, ptl);
	local_irq_restore(flags);
	return addr;
}

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
	lockdep_assert_held(&text_mutex);

	return __text_poke(addr, opcode, len);
}

/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 *	    despite the fact it does not hold the text_mutex.
 */
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
	return __text_poke(addr, opcode, len);
}

static void do_sync_core(void *info)
{
	sync_core();
}

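/* State shared between text_poke_bp() and poke_int3_handler(). */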
static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;

int poke_int3_handler(struct pt_regs *regs)
{
	/*
	 * Having observed our INT3 instruction, we now must observe
	 * bp_patching_in_progress.
	 *
	 *	in_progress = TRUE		INT3
	 *	WMB				RMB
	 *	write INT3			if (in_progress)
	 *
	 * Idem for bp_int3_handler.
	 */
	smp_rmb();

	if (likely(!bp_patching_in_progress))
		return 0;

	if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
		return 0;

	/* set up the specified breakpoint handler */
	regs->ip = (unsigned long) bp_int3_handler;

	return 1;
}
NOKPROBE_SYMBOL(poke_int3_handler);

/**
 * text_poke_bp() -- update instructions on live kernel on SMP
 * @addr:	address to patch
 * @opcode:	opcode of new instruction
 * @len:	length to copy
 * @handler:	address to jump to when the temporary breakpoint is hit
 *
 * Modify multi-byte instructions by using an int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using the int3 breakpoint.
 *
 * The way it is done:
 *	- add an int3 trap to the address that will be patched
 *	- sync cores
 *	- update all but the first byte of the patched range
 *	- sync cores
 *	- replace the first byte (int3) by the first byte of the
 *	  replacing opcode
 *	- sync cores
 */
void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
	unsigned char int3 = 0xcc;

	bp_int3_handler = handler;
	bp_int3_addr = (u8 *)addr + sizeof(int3);
	bp_patching_in_progress = true;

	lockdep_assert_held(&text_mutex);

	/*
	 * Corresponding read barrier in int3 notifier for making sure the
	 * in_progress and handler are correctly ordered wrt. patching.
	 */
	smp_wmb();

	text_poke(addr, &int3, sizeof(int3));

	on_each_cpu(do_sync_core, NULL, 1);

	if (len - sizeof(int3) > 0) {
		/* patch all but the first byte */
		text_poke((char *)addr + sizeof(int3),
			  (const char *) opcode + sizeof(int3),
			  len - sizeof(int3));
		/*
		 * According to Intel, this core syncing is very likely
		 * not necessary and we'd be safe even without it. But
		 * better safe than sorry (plus there's not only Intel).
		 */
		on_each_cpu(do_sync_core, NULL, 1);
	}

	/* patch the first byte */
	text_poke(addr, opcode, sizeof(int3));

	on_each_cpu(do_sync_core, NULL, 1);
	/*
	 * sync_core() implies an smp_mb() and orders this store against
	 * the writing of the new instruction.
	 */
	bp_patching_in_progress = false;

	return addr;
}