arch/x86/kernel/uprobes.c

   1 /*
   2  * Userspace Probes (UProbes) for x86
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17  *
  18  * Copyright (C) IBM Corporation, 2008-2011
  19  * Authors:
  20  *      Srikar Dronamraju
  21  *      Jim Keniston
  22  */
  23
  24 #include <linux/kernel.h>
  25 #include <linux/sched.h>
  26 #include <linux/ptrace.h>
  27 #include <linux/uprobes.h>
  28
  29 #include <linux/kdebug.h>
  30 #include <asm/insn.h>
  31
  /*
   * Post-execution fixups.  These are bit flags that are combined in
   * uprobe->arch_info.fixups.
   */

  /* No fixup needed */
  #define UPROBES_FIX_NONE        0x0
  /* Adjust IP back to vicinity of actual insn */
  #define UPROBES_FIX_IP          0x1
  /* Adjust the return address of a call insn */
  #define UPROBES_FIX_CALL        0x2

  /*
   * A rip-relative insn was rewritten to go through a scratch register:
   * %rax (UPROBES_FIX_RIP_AX) or %rcx (UPROBES_FIX_RIP_CX).
   */
  #define UPROBES_FIX_RIP_AX      0x8000
  #define UPROBES_FIX_RIP_CX      0x4000

  /*
   * Adaptations for mhiramat x86 decoder v14.
   *
   * OPCODE1..3() yield the first, second and third opcode byte of a decoded
   * instruction; MODRM_REG() yields the reg field of its ModRM byte.  All of
   * them take a pointer expression, which is parenthesized so that arguments
   * such as "p + 1" or "&insn" expand correctly.
   */
  #define OPCODE1(insn) ((insn)->opcode.bytes[0])
  #define OPCODE2(insn) ((insn)->opcode.bytes[1])
  #define OPCODE3(insn) ((insn)->opcode.bytes[2])
  #define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value)
  49
  /*
   * W() - build one 16-opcode row of a good-instruction bitmap.
   * @row:     opcode of the row's first column (0x00, 0x10, ... 0xf0).
   * @b0..@bf: 0 or 1 for each of the 16 opcodes in the row.  Each flag is
   *           token-pasted with UL, so it must be a bare integer literal.
   *
   * Flag bk lands in bit k, and the whole row is then shifted left by
   * (row % 32).  Two consecutive rows can therefore be OR'ed together to
   * fill one 32-bit table element.  @row is parenthesized so that an
   * expression argument is reduced modulo 32 as a whole.
   */
  #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
          (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
            (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
            (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
            (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
           << ((row) % 32))
  56
  57 #ifdef CONFIG_X86_64
     /*
      * One-byte opcode bitmap used for 64-bit apps.  validate_insn_64bits()
      * accepts an instruction outright when the bit indexed by its first
      * opcode byte is set here.  Each W() row describes 16 opcodes (the
      * column headers give the low nibble) and two rows are OR'ed together
      * to form each u32 element.
      */
  58 static volatile u32 good_insns_64[256 / 32] = {
  59         /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
  60         /*      ----------------------------------------------         */
  61         W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
  62         W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
  63         W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
  64         W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
  65         W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
  66         W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
  67         W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
  68         W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
  69         W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
  70         W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
  71         W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
  72         W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
  73         W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
  74         W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
  75         W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
  76         W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
  77         /*      ----------------------------------------------         */
  78         /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
  79 };
  80 #endif
  81
  82 /* Good-instruction tables for 32-bit apps */
  83
     /*
      * One-byte opcode bitmap consulted by validate_insn_32bits(): an
      * instruction is accepted outright when the bit indexed by its first
      * opcode byte is set.  Each W() row describes 16 opcodes (the column
      * headers give the low nibble) and two rows are OR'ed together to form
      * each u32 element.
      */
  84 static volatile u32 good_insns_32[256 / 32] = {
  85         /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
  86         /*      ----------------------------------------------         */
  87         W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
  88         W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
  89         W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
  90         W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
  91         W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
  92         W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
  93         W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
  94         W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
  95         W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
  96         W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
  97         W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
  98         W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
  99         W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
 100         W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
 101         W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
 102         W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1)   /* f0 */
 103         /*      ----------------------------------------------         */
 104         /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 105 };
 106
 107 /* Using this for both 64-bit and 32-bit apps */
     /*
      * Two-byte opcode bitmap, indexed by the second opcode byte.  Both
      * validate_insn_32bits() and validate_insn_64bits() consult it only
      * when the decoded opcode is exactly two bytes long and the first
      * byte was not accepted by the one-byte table.  Row layout is the
      * same as in the one-byte tables.
      */
 108 static volatile u32 good_2byte_insns[256 / 32] = {
 109         /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 110         /*      ----------------------------------------------         */
 111         W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
 112         W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
 113         W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
 114         W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
 115         W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
 116         W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
 117         W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
 118         W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
 119         W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
 120         W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
 121         W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
 122         W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
 123         W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
 124         W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
 125         W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
 126         W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* f0 */
 127         /*      ----------------------------------------------         */
 128         /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */
 129 };
 130
     /* W() is only meant for building the tables above. */
 131 #undef W
 132
 133 /*
 134  * opcodes we'll probably never support:
 135  * 6c-6d, e4-e5, ec-ed - in
 136  * 6e-6f, e6-e7, ee-ef - out
 137  * cc, cd - int3, int
 138  * cf - iret
 139  * d6 - illegal instruction
 140  * f1 - int1/icebp
 141  * f4 - hlt
 142  * fa, fb - cli, sti
 143  * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
 144  *
 145  * invalid opcodes in 64-bit mode:
 146  * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
 147  *
 148  * 63 - we support this opcode in x86_64 but not in i386.
 149  *
 150  * opcodes we may need to refine support for:
 151  * 0f - 2-byte instructions: For many of these instructions, the validity
 152  * depends on the prefix and/or the reg field.  On such instructions, we
 153  * just consider the opcode combination valid if it corresponds to any
 154  * valid instruction.
 155  * 8f - Group 1A - only reg = 0 is OK
 156  * c6-c7 - Group 11 - only reg = 0 is OK
 157  * d9-df - fpu insns with some illegal encodings
 158  * f2, f3 - repnz, repz prefixes.  These are also the first byte for
 159  * certain floating-point instructions, such as addsd.
 160  * fe - Group 4 - only reg = 0 or 1 is OK
 161  * ff - Group 5 - only reg = 0-6 is OK
 162  *
 163  * others -- Do we need to support these?
 164  * 0f - (floating-point?) prefetch instructions
 165  * 07, 17, 1f - pop es, pop ss, pop ds
 166  * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
 167  *      but 64 and 65 (fs: and gs:) seem to be used, so we support them
 168  * 67 - addr16 prefix
 169  * ce - into
 170  * f0 - lock prefix
 171  */
 172
 173 /*
 174  * TODO:
 175  * - Where necessary, examine the modrm byte and allow only valid instructions
 176  * in the different Groups and fpu instructions.
 177  */
 178
 179 static bool is_prefix_bad(struct insn *insn)
 180 {
 181         int i;
 182
 183         for (i = 0; i < insn->prefixes.nbytes; i++) {
 184                 switch (insn->prefixes.bytes[i]) {
 185                 case 0x26:      /*INAT_PFX_ES   */
 186                 case 0x2E:      /*INAT_PFX_CS   */
 187                 case 0x36:      /*INAT_PFX_DS   */
 188                 case 0x3E:      /*INAT_PFX_SS   */
 189                 case 0xF0:      /*INAT_PFX_LOCK */
 190                         return true;
 191                 }
 192         }
 193         return false;
 194 }
 195
 196 static int validate_insn_32bits(struct uprobe *uprobe, struct insn *insn)
 197 {
 198         insn_init(insn, uprobe->insn, false);
 199
 200         /* Skip good instruction prefixes; reject "bad" ones. */
 201         insn_get_opcode(insn);
 202         if (is_prefix_bad(insn))
 203                 return -ENOTSUPP;
 204         if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32))
 205                 return 0;
 206         if (insn->opcode.nbytes == 2) {
 207                 if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
 208                         return 0;
 209         }
 210         return -ENOTSUPP;
 211 }
 212
 213 /*
 214  * Figure out which fixups post_xol() will need to perform, and annotate
 215  * uprobe->arch_info.fixups accordingly.  To start with,
 216  * uprobe->arch_info.fixups is either zero or it reflects rip-related
 217  * fixups.
 218  */
 219 static void prepare_fixups(struct uprobe *uprobe, struct insn *insn)
 220 {
 221         bool fix_ip = true, fix_call = false;   /* defaults */
 222         int reg;
 223
 224         insn_get_opcode(insn);  /* should be a nop */
 225
 226         switch (OPCODE1(insn)) {
 227         case 0xc3:              /* ret/lret */
 228         case 0xcb:
 229         case 0xc2:
 230         case 0xca:
 231                 /* ip is correct */
 232                 fix_ip = false;
 233                 break;
 234         case 0xe8:              /* call relative - Fix return addr */
 235                 fix_call = true;
 236                 break;
 237         case 0x9a:              /* call absolute - Fix return addr, not ip */
 238                 fix_call = true;
 239                 fix_ip = false;
 240                 break;
 241         case 0xff:
 242                 insn_get_modrm(insn);
 243                 reg = MODRM_REG(insn);
 244                 if (reg == 2 || reg == 3) {
 245                         /* call or lcall, indirect */
 246                         /* Fix return addr; ip is correct. */
 247                         fix_call = true;
 248                         fix_ip = false;
 249                 } else if (reg == 4 || reg == 5) {
 250                         /* jmp or ljmp, indirect */
 251                         /* ip is correct. */
 252                         fix_ip = false;
 253                 }
 254                 break;
 255         case 0xea:              /* jmp absolute -- ip is correct */
 256                 fix_ip = false;
 257                 break;
 258         default:
 259                 break;
 260         }
 261         if (fix_ip)
 262                 uprobe->arch_info.fixups |= UPROBES_FIX_IP;
 263         if (fix_call)
 264                 uprobe->arch_info.fixups |= UPROBES_FIX_CALL;
 265 }
 266
 267 #ifdef CONFIG_X86_64
 268 /*
 269  * If uprobe->insn doesn't use rip-relative addressing, return
 270  * immediately.  Otherwise, rewrite the instruction so that it accesses
 271  * its memory operand indirectly through a scratch register.  Set
 272  * uprobe->arch_info.fixups and uprobe->arch_info.rip_rela_target_address
 273  * accordingly.  (The contents of the scratch register will be saved
 274  * before we single-step the modified instruction, and restored
 275  * afterward.)
 276  *
 277  * We do this because a rip-relative instruction can access only a
 278  * relatively small area (+/- 2 GB from the instruction), and the XOL
 279  * area typically lies beyond that area.  At least for instructions
 280  * that store to memory, we can't execute the original instruction
 281  * and "fix things up" later, because the misdirected store could be
 282  * disastrous.
 283  *
 284  * Some useful facts about rip-relative instructions:
 285  * - There's always a modrm byte.
 286  * - There's never a SIB byte.
 287  * - The displacement is always 4 bytes.
 288  */
 289 static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe,
 290                                                         struct insn *insn)
 291 {
 292         u8 *cursor;
 293         u8 reg;
 294
 295         if (mm->context.ia32_compat)
 296                 return;
 297
 298         uprobe->arch_info.rip_rela_target_address = 0x0;
 299         if (!insn_rip_relative(insn))
 300                 return;
 301
 302         /*
 303          * insn_rip_relative() would have decoded rex_prefix, modrm.
 304          * Clear REX.b bit (extension of MODRM.rm field):
 305          * we want to encode rax/rcx, not r8/r9.
 306          */
 307         if (insn->rex_prefix.nbytes) {
 308                 cursor = uprobe->insn + insn_offset_rex_prefix(insn);
 309                 *cursor &= 0xfe;        /* Clearing REX.B bit */
 310         }
 311
 312         /*
 313          * Point cursor at the modrm byte.  The next 4 bytes are the
 314          * displacement.  Beyond the displacement, for some instructions,
 315          * is the immediate operand.
 316          */
 317         cursor = uprobe->insn + insn_offset_modrm(insn);
 318         insn_get_length(insn);
 319
 320         /*
 321          * Convert from rip-relative addressing to indirect addressing
 322          * via a scratch register.  Change the r/m field from 0x5 (%rip)
 323          * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
 324          */
 325         reg = MODRM_REG(insn);
 326         if (reg == 0) {
 327                 /*
 328                  * The register operand (if any) is either the A register
 329                  * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
 330                  * REX prefix) %r8.  In any case, we know the C register
 331                  * is NOT the register operand, so we use %rcx (register
 332                  * #1) for the scratch register.
 333                  */
 334                 uprobe->arch_info.fixups = UPROBES_FIX_RIP_CX;
 335                 /* Change modrm from 00 000 101 to 00 000 001. */
 336                 *cursor = 0x1;
 337         } else {
 338                 /* Use %rax (register #0) for the scratch register. */
 339                 uprobe->arch_info.fixups = UPROBES_FIX_RIP_AX;
 340                 /* Change modrm from 00 xxx 101 to 00 xxx 000 */
 341                 *cursor = (reg << 3);
 342         }
 343
 344         /* Target address = address of next instruction + (signed) offset */
 345         uprobe->arch_info.rip_rela_target_address = (long)insn->length
 346                                         + insn->displacement.value;
 347         /* Displacement field is gone; slide immediate field (if any) over. */
 348         if (insn->immediate.nbytes) {
 349                 cursor++;
 350                 memmove(cursor, cursor + insn->displacement.nbytes,
 351                                                 insn->immediate.nbytes);
 352         }
 353         return;
 354 }
 355
 356 static int validate_insn_64bits(struct uprobe *uprobe, struct insn *insn)
 357 {
 358         insn_init(insn, uprobe->insn, true);
 359
 360         /* Skip good instruction prefixes; reject "bad" ones. */
 361         insn_get_opcode(insn);
 362         if (is_prefix_bad(insn))
 363                 return -ENOTSUPP;
 364         if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64))
 365                 return 0;
 366         if (insn->opcode.nbytes == 2) {
 367                 if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
 368                         return 0;
 369         }
 370         return -ENOTSUPP;
 371 }
 372
 373 static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe,
 374                                 struct insn *insn)
 375 {
 376         if (mm->context.ia32_compat)
 377                 return validate_insn_32bits(uprobe, insn);
 378         return validate_insn_64bits(uprobe, insn);
 379 }
 380 #else
 /* Stub for kernels built without CONFIG_X86_64: there is nothing to do. */
 static void handle_riprel_insn(struct mm_struct *mm, struct uprobe *uprobe,
                                                         struct insn *insn)
 {
         /* rip-relative addressing exists only on x86-64: nothing to rewrite. */
 }
 386
 /*
  * Without CONFIG_X86_64 every probed instruction is validated with the
  * 32-bit rules; @mm is not consulted.
  */
 static int validate_insn_bits(struct mm_struct *mm, struct uprobe *uprobe,
                                 struct insn *insn)
 {
         int ret = validate_insn_32bits(uprobe, insn);

         return ret;
 }
 392 #endif /* CONFIG_X86_64 */
 393
 394 /**
 395  * analyze_insn - instruction analysis including validity and fixups.
 396  * @mm: the probed address space.
 397  * @uprobe: the probepoint information.
 398  * Return 0 on success or a -ve number on error.
 399  */
 400 int analyze_insn(struct mm_struct *mm, struct uprobe *uprobe)
 401 {
 402         int ret;
 403         struct insn insn;
 404
 405         uprobe->arch_info.fixups = 0;
 406         ret = validate_insn_bits(mm, uprobe, &insn);
 407         if (ret != 0)
 408                 return ret;
 409         handle_riprel_insn(mm, uprobe, &insn);
 410         prepare_fixups(uprobe, &insn);
 411         return 0;
 412 }