1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Kernel Probes (KProbes)
4 *
5 * Copyright (C) IBM Corporation, 2002, 2004
6 *
7 * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
8 * Probes initial implementation ( includes contributions from
9 * Rusty Russell).
10 * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
11 * interface to access function arguments.
12 * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
13 * <prasanna@in.ibm.com> adapted for x86_64 from i386.
14 * 2005-Mar Roland McGrath <roland@redhat.com>
15 * Fixed to handle %rip-relative addressing mode correctly.
16 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
17 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
18 * <prasanna@in.ibm.com> added function-return probes.
19 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
20 * Added function return probes functionality
21 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
22 * kprobe-booster and kretprobe-booster for i386.
23 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
24 * and kretprobe-booster for x86-64
25 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
26 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
27 * unified x86 kprobes code.
28 */
29 #include <linux/kprobes.h>
30 #include <linux/ptrace.h>
31 #include <linux/string.h>
32 #include <linux/slab.h>
33 #include <linux/hardirq.h>
34 #include <linux/preempt.h>
35 #include <linux/sched/debug.h>
36 #include <linux/perf_event.h>
37 #include <linux/extable.h>
38 #include <linux/kdebug.h>
39 #include <linux/kallsyms.h>
40 #include <linux/ftrace.h>
41 #include <linux/kasan.h>
42 #include <linux/moduleloader.h>
43 #include <linux/objtool.h>
44 #include <linux/vmalloc.h>
45 #include <linux/pgtable.h>
46
47 #include <asm/text-patching.h>
48 #include <asm/cacheflush.h>
49 #include <asm/desc.h>
50 #include <linux/uaccess.h>
51 #include <asm/alternative.h>
52 #include <asm/insn.h>
53 #include <asm/debugreg.h>
54 #include <asm/set_memory.h>
55 #include <asm/ibt.h>
56
57 #include "common.h"
58
/*
 * Per-CPU pointer to the kprobe being handled on this CPU. It is written by
 * set_current_kprobe() and restore_previous_kprobe() in this file.
 */
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
/*
 * Per-CPU kprobe control block (status, saved flags and the save area for a
 * previous kprobe); presumably the object get_kprobe_ctlblk() hands out.
 */
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
61
/*
 * W() packs the sixteen 0/1 flags of one table row (opcodes row+0x0 ..
 * row+0xf) into a 16-bit mask and shifts it left by (row % 32). Two
 * consecutive rows OR-ed together therefore form one 32-bit element of the
 * bitmap below. The macro is only needed for the table and is #undef'd
 * right after it.
 */
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
	 << (row % 32))
	/*
	 * Bitmap of boostable two-byte opcodes, indexed by the second opcode
	 * byte: a set bit means "boostable". can_boost() looks it up with
	 * test_bit().
	 *
	 * Undefined/reserved opcodes, conditional jump, Opcode Extension
	 * Groups, and some special opcodes can not boost.
	 * This is non-const and volatile to keep gcc from statically
	 * optimizing it out, as variable_test_bit makes gcc think only
	 * *(unsigned long*) is used.
	 */
static volatile u32 twobyte_is_boostable[256 / 32] = {
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
	/*      ----------------------------------------------          */
	W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
	W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
	W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
	W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
	W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
	W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
	W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
	W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
	W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
	W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
	W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
	W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0)   /* f0 */
	/*      -----------------------------------------------         */
	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
};
#undef W
98
/*
 * Symbols blacklisted for kretprobes on x86. The array ends with a
 * {NULL, NULL} terminator entry.
 */
struct kretprobe_blackpoint kretprobe_blacklist[] = {
	{"__switch_to", }, /* This function switches only current task, but
			      doesn't switch kernel stack.*/
	{NULL, NULL}	/* Terminator */
};

/* Number of entries in kretprobe_blacklist[], terminator included. */
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
106
107 static nokprobe_inline void
__synthesize_relative_insn(void * dest,void * from,void * to,u8 op)108 __synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
109 {
110 struct __arch_relative_insn {
111 u8 op;
112 s32 raddr;
113 } __packed *insn;
114
115 insn = (struct __arch_relative_insn *)dest;
116 insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
117 insn->op = op;
118 }
119
/*
 * Write a relative jump (JMP32_INSN_OPCODE + rel32) into @dest. The offset
 * is computed as if the instruction were located at address 'from' and
 * jumped to address 'to'.
 */
void synthesize_reljump(void *dest, void *from, void *to)
{
	__synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_reljump);
126
/*
 * Write a relative call (CALL_INSN_OPCODE + rel32) into @dest. The offset
 * is computed as if the instruction were located at address 'from' and
 * called address 'to'.
 */
void synthesize_relcall(void *dest, void *from, void *to)
{
	__synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_relcall);
133
134 /*
135 * Returns non-zero if INSN is boostable.
136 * RIP relative instructions are adjusted at copying time in 64 bits mode
137 */
can_boost(struct insn * insn,void * addr)138 int can_boost(struct insn *insn, void *addr)
139 {
140 kprobe_opcode_t opcode;
141 insn_byte_t prefix;
142 int i;
143
144 if (search_exception_tables((unsigned long)addr))
145 return 0; /* Page fault may occur on this address. */
146
147 /* 2nd-byte opcode */
148 if (insn->opcode.nbytes == 2)
149 return test_bit(insn->opcode.bytes[1],
150 (unsigned long *)twobyte_is_boostable);
151
152 if (insn->opcode.nbytes != 1)
153 return 0;
154
155 for_each_insn_prefix(insn, i, prefix) {
156 insn_attr_t attr;
157
158 attr = inat_get_opcode_attribute(prefix);
159 /* Can't boost Address-size override prefix and CS override prefix */
160 if (prefix == 0x2e || inat_is_address_size_prefix(attr))
161 return 0;
162 }
163
164 opcode = insn->opcode.bytes[0];
165
166 switch (opcode) {
167 case 0x62: /* bound */
168 case 0x70 ... 0x7f: /* Conditional jumps */
169 case 0x9a: /* Call far */
170 case 0xc0 ... 0xc1: /* Grp2 */
171 case 0xcc ... 0xce: /* software exceptions */
172 case 0xd0 ... 0xd3: /* Grp2 */
173 case 0xd6: /* (UD) */
174 case 0xd8 ... 0xdf: /* ESC */
175 case 0xe0 ... 0xe3: /* LOOP*, JCXZ */
176 case 0xe8 ... 0xe9: /* near Call, JMP */
177 case 0xeb: /* Short JMP */
178 case 0xf0 ... 0xf4: /* LOCK/REP, HLT */
179 case 0xf6 ... 0xf7: /* Grp3 */
180 case 0xfe: /* Grp4 */
181 /* ... are not boostable */
182 return 0;
183 case 0xff: /* Grp5 */
184 /* Only indirect jmp is boostable */
185 return X86_MODRM_REG(insn->modrm.bytes[0]) == 4;
186 default:
187 return 1;
188 }
189 }
190
191 static unsigned long
__recover_probed_insn(kprobe_opcode_t * buf,unsigned long addr)192 __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
193 {
194 struct kprobe *kp;
195 bool faddr;
196
197 kp = get_kprobe((void *)addr);
198 faddr = ftrace_location(addr) == addr;
199 /*
200 * Use the current code if it is not modified by Kprobe
201 * and it cannot be modified by ftrace.
202 */
203 if (!kp && !faddr)
204 return addr;
205
206 /*
207 * Basically, kp->ainsn.insn has an original instruction.
208 * However, RIP-relative instruction can not do single-stepping
209 * at different place, __copy_instruction() tweaks the displacement of
210 * that instruction. In that case, we can't recover the instruction
211 * from the kp->ainsn.insn.
212 *
213 * On the other hand, in case on normal Kprobe, kp->opcode has a copy
214 * of the first byte of the probed instruction, which is overwritten
215 * by int3. And the instruction at kp->addr is not modified by kprobes
216 * except for the first byte, we can recover the original instruction
217 * from it and kp->opcode.
218 *
219 * In case of Kprobes using ftrace, we do not have a copy of
220 * the original instruction. In fact, the ftrace location might
221 * be modified at anytime and even could be in an inconsistent state.
222 * Fortunately, we know that the original code is the ideal 5-byte
223 * long NOP.
224 */
225 if (copy_from_kernel_nofault(buf, (void *)addr,
226 MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
227 return 0UL;
228
229 if (faddr)
230 memcpy(buf, x86_nops[5], 5);
231 else
232 buf[0] = kp->opcode;
233 return (unsigned long)buf;
234 }
235
236 /*
237 * Recover the probed instruction at addr for further analysis.
238 * Caller must lock kprobes by kprobe_mutex, or disable preemption
239 * for preventing to release referencing kprobes.
240 * Returns zero if the instruction can not get recovered (or access failed).
241 */
recover_probed_instruction(kprobe_opcode_t * buf,unsigned long addr)242 unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
243 {
244 unsigned long __addr;
245
246 __addr = __recover_optprobed_insn(buf, addr);
247 if (__addr != addr)
248 return __addr;
249
250 return __recover_probed_insn(buf, addr);
251 }
252
253 /* Check if paddr is at an instruction boundary */
can_probe(unsigned long paddr)254 static int can_probe(unsigned long paddr)
255 {
256 unsigned long addr, __addr, offset = 0;
257 struct insn insn;
258 kprobe_opcode_t buf[MAX_INSN_SIZE];
259
260 if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
261 return 0;
262
263 /* Decode instructions */
264 addr = paddr - offset;
265 while (addr < paddr) {
266 int ret;
267
268 /*
269 * Check if the instruction has been modified by another
270 * kprobe, in which case we replace the breakpoint by the
271 * original instruction in our buffer.
272 * Also, jump optimization will change the breakpoint to
273 * relative-jump. Since the relative-jump itself is
274 * normally used, we just go through if there is no kprobe.
275 */
276 __addr = recover_probed_instruction(buf, addr);
277 if (!__addr)
278 return 0;
279
280 ret = insn_decode_kernel(&insn, (void *)__addr);
281 if (ret < 0)
282 return 0;
283
284 /*
285 * Another debugging subsystem might insert this breakpoint.
286 * In that case, we can't recover it.
287 */
288 if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
289 return 0;
290 addr += insn.length;
291 }
292
293 return (addr == paddr);
294 }
295
296 /* If x86 supports IBT (ENDBR) it must be skipped. */
arch_adjust_kprobe_addr(unsigned long addr,unsigned long offset,bool * on_func_entry)297 kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset,
298 bool *on_func_entry)
299 {
300 if (is_endbr(*(u32 *)addr)) {
301 *on_func_entry = !offset || offset == 4;
302 if (*on_func_entry)
303 offset = 4;
304
305 } else {
306 *on_func_entry = !offset;
307 }
308
309 return (kprobe_opcode_t *)(addr + offset);
310 }
311
312 /*
313 * Copy an instruction with recovering modified instruction by kprobes
314 * and adjust the displacement if the instruction uses the %rip-relative
315 * addressing mode. Note that since @real will be the final place of copied
316 * instruction, displacement must be adjust by @real, not @dest.
317 * This returns the length of copied instruction, or 0 if it has an error.
318 */
__copy_instruction(u8 * dest,u8 * src,u8 * real,struct insn * insn)319 int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
320 {
321 kprobe_opcode_t buf[MAX_INSN_SIZE];
322 unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src);
323 int ret;
324
325 if (!recovered_insn || !insn)
326 return 0;
327
328 /* This can access kernel text if given address is not recovered */
329 if (copy_from_kernel_nofault(dest, (void *)recovered_insn,
330 MAX_INSN_SIZE))
331 return 0;
332
333 ret = insn_decode_kernel(insn, dest);
334 if (ret < 0)
335 return 0;
336
337 /* We can not probe force emulate prefixed instruction */
338 if (insn_has_emulate_prefix(insn))
339 return 0;
340
341 /* Another subsystem puts a breakpoint, failed to recover */
342 if (insn->opcode.bytes[0] == INT3_INSN_OPCODE)
343 return 0;
344
345 /* We should not singlestep on the exception masking instructions */
346 if (insn_masking_exception(insn))
347 return 0;
348
349 #ifdef CONFIG_X86_64
350 /* Only x86_64 has RIP relative instructions */
351 if (insn_rip_relative(insn)) {
352 s64 newdisp;
353 u8 *disp;
354 /*
355 * The copied instruction uses the %rip-relative addressing
356 * mode. Adjust the displacement for the difference between
357 * the original location of this instruction and the location
358 * of the copy that will actually be run. The tricky bit here
359 * is making sure that the sign extension happens correctly in
360 * this calculation, since we need a signed 32-bit result to
361 * be sign-extended to 64 bits when it's added to the %rip
362 * value and yield the same 64-bit result that the sign-
363 * extension of the original signed 32-bit displacement would
364 * have given.
365 */
366 newdisp = (u8 *) src + (s64) insn->displacement.value
367 - (u8 *) real;
368 if ((s64) (s32) newdisp != newdisp) {
369 pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
370 return 0;
371 }
372 disp = (u8 *) dest + insn_offset_displacement(insn);
373 *(s32 *) disp = (s32) newdisp;
374 }
375 #endif
376 return insn->length;
377 }
378
379 /* Prepare reljump or int3 right after instruction */
prepare_singlestep(kprobe_opcode_t * buf,struct kprobe * p,struct insn * insn)380 static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p,
381 struct insn *insn)
382 {
383 int len = insn->length;
384
385 if (!IS_ENABLED(CONFIG_PREEMPTION) &&
386 !p->post_handler && can_boost(insn, p->addr) &&
387 MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) {
388 /*
389 * These instructions can be executed directly if it
390 * jumps back to correct address.
391 */
392 synthesize_reljump(buf + len, p->ainsn.insn + len,
393 p->addr + insn->length);
394 len += JMP32_INSN_SIZE;
395 p->ainsn.boostable = 1;
396 } else {
397 /* Otherwise, put an int3 for trapping singlestep */
398 if (MAX_INSN_SIZE - len < INT3_INSN_SIZE)
399 return -ENOSPC;
400
401 buf[len] = INT3_INSN_OPCODE;
402 len += INT3_INSN_SIZE;
403 }
404
405 return len;
406 }
407
408 /* Make page to RO mode when allocate it */
alloc_insn_page(void)409 void *alloc_insn_page(void)
410 {
411 void *page;
412
413 page = module_alloc(PAGE_SIZE);
414 if (!page)
415 return NULL;
416
417 set_vm_flush_reset_perms(page);
418 /*
419 * First make the page read-only, and only then make it executable to
420 * prevent it from being W+X in between.
421 */
422 set_memory_ro((unsigned long)page, 1);
423
424 /*
425 * TODO: Once additional kernel code protection mechanisms are set, ensure
426 * that the page was not maliciously altered and it is still zeroed.
427 */
428 set_memory_x((unsigned long)page, 1);
429
430 return page;
431 }
432
433 /* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
434
kprobe_emulate_ifmodifiers(struct kprobe * p,struct pt_regs * regs)435 static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
436 {
437 switch (p->ainsn.opcode) {
438 case 0xfa: /* cli */
439 regs->flags &= ~(X86_EFLAGS_IF);
440 break;
441 case 0xfb: /* sti */
442 regs->flags |= X86_EFLAGS_IF;
443 break;
444 case 0x9c: /* pushf */
445 int3_emulate_push(regs, regs->flags);
446 break;
447 case 0x9d: /* popf */
448 regs->flags = int3_emulate_pop(regs);
449 break;
450 }
451 regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
452 }
453 NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers);
454
/*
 * Emulate a return by delegating to int3_emulate_ret(). @p is unused.
 * prepare_emulation() installs this handler for opcodes 0xc2, 0xc3, 0xca
 * and 0xcb alike.
 */
static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs)
{
	int3_emulate_ret(regs);
}
NOKPROBE_SYMBOL(kprobe_emulate_ret);
460
kprobe_emulate_call(struct kprobe * p,struct pt_regs * regs)461 static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs)
462 {
463 unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
464
465 func += p->ainsn.rel32;
466 int3_emulate_call(regs, func);
467 }
468 NOKPROBE_SYMBOL(kprobe_emulate_call);
469
470 static nokprobe_inline
__kprobe_emulate_jmp(struct kprobe * p,struct pt_regs * regs,bool cond)471 void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond)
472 {
473 unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
474
475 if (cond)
476 ip += p->ainsn.rel32;
477 int3_emulate_jmp(regs, ip);
478 }
479
/* Emulate an unconditional relative jump: the branch is always taken. */
static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
{
	__kprobe_emulate_jmp(p, regs, true);
}
NOKPROBE_SYMBOL(kprobe_emulate_jmp);
485
/*
 * EFLAGS bits tested by the conditional jumps whose condition code is below
 * 0xc. kprobe_emulate_jcc() indexes this table with (condition code >> 1);
 * the low bit of the condition code selects the inverted form.
 */
static const unsigned long jcc_mask[6] = {
	[0] = X86_EFLAGS_OF,			/* codes 0x0/0x1 */
	[1] = X86_EFLAGS_CF,			/* codes 0x2/0x3 */
	[2] = X86_EFLAGS_ZF,			/* codes 0x4/0x5 */
	[3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,	/* codes 0x6/0x7 */
	[4] = X86_EFLAGS_SF,			/* codes 0x8/0x9 */
	[5] = X86_EFLAGS_PF,			/* codes 0xa/0xb */
};
494
kprobe_emulate_jcc(struct kprobe * p,struct pt_regs * regs)495 static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)
496 {
497 bool invert = p->ainsn.jcc.type & 1;
498 bool match;
499
500 if (p->ainsn.jcc.type < 0xc) {
501 match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1];
502 } else {
503 match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
504 ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
505 if (p->ainsn.jcc.type >= 0xe)
506 match = match || (regs->flags & X86_EFLAGS_ZF);
507 }
508 __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert));
509 }
510 NOKPROBE_SYMBOL(kprobe_emulate_jcc);
511
kprobe_emulate_loop(struct kprobe * p,struct pt_regs * regs)512 static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
513 {
514 bool match;
515
516 if (p->ainsn.loop.type != 3) { /* LOOP* */
517 if (p->ainsn.loop.asize == 32)
518 match = ((*(u32 *)®s->cx)--) != 0;
519 #ifdef CONFIG_X86_64
520 else if (p->ainsn.loop.asize == 64)
521 match = ((*(u64 *)®s->cx)--) != 0;
522 #endif
523 else
524 match = ((*(u16 *)®s->cx)--) != 0;
525 } else { /* JCXZ */
526 if (p->ainsn.loop.asize == 32)
527 match = *(u32 *)(®s->cx) == 0;
528 #ifdef CONFIG_X86_64
529 else if (p->ainsn.loop.asize == 64)
530 match = *(u64 *)(®s->cx) == 0;
531 #endif
532 else
533 match = *(u16 *)(®s->cx) == 0;
534 }
535
536 if (p->ainsn.loop.type == 0) /* LOOPNE */
537 match = match && !(regs->flags & X86_EFLAGS_ZF);
538 else if (p->ainsn.loop.type == 1) /* LOOPE */
539 match = match && (regs->flags & X86_EFLAGS_ZF);
540
541 __kprobe_emulate_jmp(p, regs, match);
542 }
543 NOKPROBE_SYMBOL(kprobe_emulate_loop);
544
/*
 * Offsets of the general purpose registers inside struct pt_regs, indexed
 * by the register number that prepare_emulation() stores in
 * p->ainsn.indirect.reg (ModRM r/m field, plus 8 when REX.B is set on
 * x86-64). Used by the indirect call/jmp emulation to fetch the target.
 */
static const int addrmode_regoffs[] = {
	offsetof(struct pt_regs, ax),
	offsetof(struct pt_regs, cx),
	offsetof(struct pt_regs, dx),
	offsetof(struct pt_regs, bx),
	offsetof(struct pt_regs, sp),
	offsetof(struct pt_regs, bp),
	offsetof(struct pt_regs, si),
	offsetof(struct pt_regs, di),
#ifdef CONFIG_X86_64
	offsetof(struct pt_regs, r8),
	offsetof(struct pt_regs, r9),
	offsetof(struct pt_regs, r10),
	offsetof(struct pt_regs, r11),
	offsetof(struct pt_regs, r12),
	offsetof(struct pt_regs, r13),
	offsetof(struct pt_regs, r14),
	offsetof(struct pt_regs, r15),
#endif
};
565
kprobe_emulate_call_indirect(struct kprobe * p,struct pt_regs * regs)566 static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs)
567 {
568 unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
569
570 int3_emulate_call(regs, regs_get_register(regs, offs));
571 }
572 NOKPROBE_SYMBOL(kprobe_emulate_call_indirect);
573
kprobe_emulate_jmp_indirect(struct kprobe * p,struct pt_regs * regs)574 static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs)
575 {
576 unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
577
578 int3_emulate_jmp(regs, regs_get_register(regs, offs));
579 }
580 NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect);
581
prepare_emulation(struct kprobe * p,struct insn * insn)582 static int prepare_emulation(struct kprobe *p, struct insn *insn)
583 {
584 insn_byte_t opcode = insn->opcode.bytes[0];
585
586 switch (opcode) {
587 case 0xfa: /* cli */
588 case 0xfb: /* sti */
589 case 0x9c: /* pushfl */
590 case 0x9d: /* popf/popfd */
591 /*
592 * IF modifiers must be emulated since it will enable interrupt while
593 * int3 single stepping.
594 */
595 p->ainsn.emulate_op = kprobe_emulate_ifmodifiers;
596 p->ainsn.opcode = opcode;
597 break;
598 case 0xc2: /* ret/lret */
599 case 0xc3:
600 case 0xca:
601 case 0xcb:
602 p->ainsn.emulate_op = kprobe_emulate_ret;
603 break;
604 case 0x9a: /* far call absolute -- segment is not supported */
605 case 0xea: /* far jmp absolute -- segment is not supported */
606 case 0xcc: /* int3 */
607 case 0xcf: /* iret -- in-kernel IRET is not supported */
608 return -EOPNOTSUPP;
609 break;
610 case 0xe8: /* near call relative */
611 p->ainsn.emulate_op = kprobe_emulate_call;
612 if (insn->immediate.nbytes == 2)
613 p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
614 else
615 p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
616 break;
617 case 0xeb: /* short jump relative */
618 case 0xe9: /* near jump relative */
619 p->ainsn.emulate_op = kprobe_emulate_jmp;
620 if (insn->immediate.nbytes == 1)
621 p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
622 else if (insn->immediate.nbytes == 2)
623 p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
624 else
625 p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
626 break;
627 case 0x70 ... 0x7f:
628 /* 1 byte conditional jump */
629 p->ainsn.emulate_op = kprobe_emulate_jcc;
630 p->ainsn.jcc.type = opcode & 0xf;
631 p->ainsn.rel32 = *(char *)insn->immediate.bytes;
632 break;
633 case 0x0f:
634 opcode = insn->opcode.bytes[1];
635 if ((opcode & 0xf0) == 0x80) {
636 /* 2 bytes Conditional Jump */
637 p->ainsn.emulate_op = kprobe_emulate_jcc;
638 p->ainsn.jcc.type = opcode & 0xf;
639 if (insn->immediate.nbytes == 2)
640 p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
641 else
642 p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
643 } else if (opcode == 0x01 &&
644 X86_MODRM_REG(insn->modrm.bytes[0]) == 0 &&
645 X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) {
646 /* VM extensions - not supported */
647 return -EOPNOTSUPP;
648 }
649 break;
650 case 0xe0: /* Loop NZ */
651 case 0xe1: /* Loop */
652 case 0xe2: /* Loop */
653 case 0xe3: /* J*CXZ */
654 p->ainsn.emulate_op = kprobe_emulate_loop;
655 p->ainsn.loop.type = opcode & 0x3;
656 p->ainsn.loop.asize = insn->addr_bytes * 8;
657 p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
658 break;
659 case 0xff:
660 /*
661 * Since the 0xff is an extended group opcode, the instruction
662 * is determined by the MOD/RM byte.
663 */
664 opcode = insn->modrm.bytes[0];
665 if ((opcode & 0x30) == 0x10) {
666 if ((opcode & 0x8) == 0x8)
667 return -EOPNOTSUPP; /* far call */
668 /* call absolute, indirect */
669 p->ainsn.emulate_op = kprobe_emulate_call_indirect;
670 } else if ((opcode & 0x30) == 0x20) {
671 if ((opcode & 0x8) == 0x8)
672 return -EOPNOTSUPP; /* far jmp */
673 /* jmp near absolute indirect */
674 p->ainsn.emulate_op = kprobe_emulate_jmp_indirect;
675 } else
676 break;
677
678 if (insn->addr_bytes != sizeof(unsigned long))
679 return -EOPNOTSUPP; /* Don't support different size */
680 if (X86_MODRM_MOD(opcode) != 3)
681 return -EOPNOTSUPP; /* TODO: support memory addressing */
682
683 p->ainsn.indirect.reg = X86_MODRM_RM(opcode);
684 #ifdef CONFIG_X86_64
685 if (X86_REX_B(insn->rex_prefix.value))
686 p->ainsn.indirect.reg += 8;
687 #endif
688 break;
689 default:
690 break;
691 }
692 p->ainsn.size = insn->length;
693
694 return 0;
695 }
696
arch_copy_kprobe(struct kprobe * p)697 static int arch_copy_kprobe(struct kprobe *p)
698 {
699 struct insn insn;
700 kprobe_opcode_t buf[MAX_INSN_SIZE];
701 int ret, len;
702
703 /* Copy an instruction with recovering if other optprobe modifies it.*/
704 len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
705 if (!len)
706 return -EINVAL;
707
708 /* Analyze the opcode and setup emulate functions */
709 ret = prepare_emulation(p, &insn);
710 if (ret < 0)
711 return ret;
712
713 /* Add int3 for single-step or booster jmp */
714 len = prepare_singlestep(buf, p, &insn);
715 if (len < 0)
716 return len;
717
718 /* Also, displacement change doesn't affect the first byte */
719 p->opcode = buf[0];
720
721 p->ainsn.tp_len = len;
722 perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len);
723
724 /* OK, write back the instruction(s) into ROX insn buffer */
725 text_poke(p->ainsn.insn, buf, len);
726
727 return 0;
728 }
729
arch_prepare_kprobe(struct kprobe * p)730 int arch_prepare_kprobe(struct kprobe *p)
731 {
732 int ret;
733
734 if (alternatives_text_reserved(p->addr, p->addr))
735 return -EINVAL;
736
737 if (!can_probe((unsigned long)p->addr))
738 return -EILSEQ;
739
740 memset(&p->ainsn, 0, sizeof(p->ainsn));
741
742 /* insn: must be on special executable page on x86. */
743 p->ainsn.insn = get_insn_slot();
744 if (!p->ainsn.insn)
745 return -ENOMEM;
746
747 ret = arch_copy_kprobe(p);
748 if (ret) {
749 free_insn_slot(p->ainsn.insn, 0);
750 p->ainsn.insn = NULL;
751 }
752
753 return ret;
754 }
755
arch_arm_kprobe(struct kprobe * p)756 void arch_arm_kprobe(struct kprobe *p)
757 {
758 u8 int3 = INT3_INSN_OPCODE;
759
760 text_poke(p->addr, &int3, 1);
761 text_poke_sync();
762 perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
763 }
764
arch_disarm_kprobe(struct kprobe * p)765 void arch_disarm_kprobe(struct kprobe *p)
766 {
767 u8 int3 = INT3_INSN_OPCODE;
768
769 perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
770 text_poke(p->addr, &p->opcode, 1);
771 text_poke_sync();
772 }
773
arch_remove_kprobe(struct kprobe * p)774 void arch_remove_kprobe(struct kprobe *p)
775 {
776 if (p->ainsn.insn) {
777 /* Record the perf event before freeing the slot */
778 perf_event_text_poke(p->ainsn.insn, p->ainsn.insn,
779 p->ainsn.tp_len, NULL, 0);
780 free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
781 p->ainsn.insn = NULL;
782 }
783 }
784
785 static nokprobe_inline void
save_previous_kprobe(struct kprobe_ctlblk * kcb)786 save_previous_kprobe(struct kprobe_ctlblk *kcb)
787 {
788 kcb->prev_kprobe.kp = kprobe_running();
789 kcb->prev_kprobe.status = kcb->kprobe_status;
790 kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
791 kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
792 }
793
794 static nokprobe_inline void
restore_previous_kprobe(struct kprobe_ctlblk * kcb)795 restore_previous_kprobe(struct kprobe_ctlblk *kcb)
796 {
797 __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
798 kcb->kprobe_status = kcb->prev_kprobe.status;
799 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
800 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
801 }
802
803 static nokprobe_inline void
set_current_kprobe(struct kprobe * p,struct pt_regs * regs,struct kprobe_ctlblk * kcb)804 set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
805 struct kprobe_ctlblk *kcb)
806 {
807 __this_cpu_write(current_kprobe, p);
808 kcb->kprobe_saved_flags = kcb->kprobe_old_flags
809 = (regs->flags & X86_EFLAGS_IF);
810 }
811
kprobe_post_process(struct kprobe * cur,struct pt_regs * regs,struct kprobe_ctlblk * kcb)812 static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs,
813 struct kprobe_ctlblk *kcb)
814 {
815 /* Restore back the original saved kprobes variables and continue. */
816 if (kcb->kprobe_status == KPROBE_REENTER) {
817 /* This will restore both kcb and current_kprobe */
818 restore_previous_kprobe(kcb);
819 } else {
820 /*
821 * Always update the kcb status because
822 * reset_curent_kprobe() doesn't update kcb.
823 */
824 kcb->kprobe_status = KPROBE_HIT_SSDONE;
825 if (cur->post_handler)
826 cur->post_handler(cur, regs, 0);
827 reset_current_kprobe();
828 }
829 }
830 NOKPROBE_SYMBOL(kprobe_post_process);
831
setup_singlestep(struct kprobe * p,struct pt_regs * regs,struct kprobe_ctlblk * kcb,int reenter)832 static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
833 struct kprobe_ctlblk *kcb, int reenter)
834 {
835 if (setup_detour_execution(p, regs, reenter))
836 return;
837
838 #if !defined(CONFIG_PREEMPTION)
839 if (p->ainsn.boostable) {
840 /* Boost up -- we can execute copied instructions directly */
841 if (!reenter)
842 reset_current_kprobe();
843 /*
844 * Reentering boosted probe doesn't reset current_kprobe,
845 * nor set current_kprobe, because it doesn't use single
846 * stepping.
847 */
848 regs->ip = (unsigned long)p->ainsn.insn;
849 return;
850 }
851 #endif
852 if (reenter) {
853 save_previous_kprobe(kcb);
854 set_current_kprobe(p, regs, kcb);
855 kcb->kprobe_status = KPROBE_REENTER;
856 } else
857 kcb->kprobe_status = KPROBE_HIT_SS;
858
859 if (p->ainsn.emulate_op) {
860 p->ainsn.emulate_op(p, regs);
861 kprobe_post_process(p, regs, kcb);
862 return;
863 }
864
865 /* Disable interrupt, and set ip register on trampoline */
866 regs->flags &= ~X86_EFLAGS_IF;
867 regs->ip = (unsigned long)p->ainsn.insn;
868 }
869 NOKPROBE_SYMBOL(setup_singlestep);
870
871 /*
872 * Called after single-stepping. p->addr is the address of the
873 * instruction whose first byte has been replaced by the "int3"
874 * instruction. To avoid the SMP problems that can occur when we
875 * temporarily put back the original opcode to single-step, we
876 * single-stepped a copy of the instruction. The address of this
877 * copy is p->ainsn.insn. We also doesn't use trap, but "int3" again
878 * right after the copied instruction.
879 * Different from the trap single-step, "int3" single-step can not
880 * handle the instruction which changes the ip register, e.g. jmp,
881 * call, conditional jmp, and the instructions which changes the IF
882 * flags because interrupt must be disabled around the single-stepping.
883 * Such instructions are software emulated, but others are single-stepped
884 * using "int3".
885 *
886 * When the 2nd "int3" handled, the regs->ip and regs->flags needs to
887 * be adjusted, so that we can resume execution on correct code.
888 */
resume_singlestep(struct kprobe * p,struct pt_regs * regs,struct kprobe_ctlblk * kcb)889 static void resume_singlestep(struct kprobe *p, struct pt_regs *regs,
890 struct kprobe_ctlblk *kcb)
891 {
892 unsigned long copy_ip = (unsigned long)p->ainsn.insn;
893 unsigned long orig_ip = (unsigned long)p->addr;
894
895 /* Restore saved interrupt flag and ip register */
896 regs->flags |= kcb->kprobe_saved_flags;
897 /* Note that regs->ip is executed int3 so must be a step back */
898 regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE;
899 }
900 NOKPROBE_SYMBOL(resume_singlestep);
901
902 /*
903 * We have reentered the kprobe_handler(), since another probe was hit while
904 * within the handler. We save the original kprobes variables and just single
905 * step on the instruction of the new probe without calling any user handlers.
906 */
reenter_kprobe(struct kprobe * p,struct pt_regs * regs,struct kprobe_ctlblk * kcb)907 static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
908 struct kprobe_ctlblk *kcb)
909 {
910 switch (kcb->kprobe_status) {
911 case KPROBE_HIT_SSDONE:
912 case KPROBE_HIT_ACTIVE:
913 case KPROBE_HIT_SS:
914 kprobes_inc_nmissed_count(p);
915 setup_singlestep(p, regs, kcb, 1);
916 break;
917 case KPROBE_REENTER:
918 /* A probe has been hit in the codepath leading up to, or just
919 * after, single-stepping of a probed instruction. This entire
920 * codepath should strictly reside in .kprobes.text section.
921 * Raise a BUG or we'll continue in an endless reentering loop
922 * and eventually a stack overflow.
923 */
924 pr_err("Unrecoverable kprobe detected.\n");
925 dump_kprobe(p);
926 BUG();
927 default:
928 /* impossible cases */
929 WARN_ON(1);
930 return 0;
931 }
932
933 return 1;
934 }
935 NOKPROBE_SYMBOL(reenter_kprobe);
936
kprobe_is_ss(struct kprobe_ctlblk * kcb)937 static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb)
938 {
939 return (kcb->kprobe_status == KPROBE_HIT_SS ||
940 kcb->kprobe_status == KPROBE_REENTER);
941 }
942
943 /*
944 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
945 * remain disabled throughout this function.
946 */
kprobe_int3_handler(struct pt_regs * regs)947 int kprobe_int3_handler(struct pt_regs *regs)
948 {
949 kprobe_opcode_t *addr;
950 struct kprobe *p;
951 struct kprobe_ctlblk *kcb;
952
953 if (user_mode(regs))
954 return 0;
955
956 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
957 /*
958 * We don't want to be preempted for the entire duration of kprobe
959 * processing. Since int3 and debug trap disables irqs and we clear
960 * IF while singlestepping, it must be no preemptible.
961 */
962
963 kcb = get_kprobe_ctlblk();
964 p = get_kprobe(addr);
965
966 if (p) {
967 if (kprobe_running()) {
968 if (reenter_kprobe(p, regs, kcb))
969 return 1;
970 } else {
971 set_current_kprobe(p, regs, kcb);
972 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
973
974 /*
975 * If we have no pre-handler or it returned 0, we
976 * continue with normal processing. If we have a
977 * pre-handler and it returned non-zero, that means
978 * user handler setup registers to exit to another
979 * instruction, we must skip the single stepping.
980 */
981 if (!p->pre_handler || !p->pre_handler(p, regs))
982 setup_singlestep(p, regs, kcb, 0);
983 else
984 reset_current_kprobe();
985 return 1;
986 }
987 } else if (kprobe_is_ss(kcb)) {
988 p = kprobe_running();
989 if ((unsigned long)p->ainsn.insn < regs->ip &&
990 (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) {
991 /* Most provably this is the second int3 for singlestep */
992 resume_singlestep(p, regs, kcb);
993 kprobe_post_process(p, regs, kcb);
994 return 1;
995 }
996 }
997
998 if (*addr != INT3_INSN_OPCODE) {
999 /*
1000 * The breakpoint instruction was removed right
1001 * after we hit it. Another cpu has removed
1002 * either a probepoint or a debugger breakpoint
1003 * at this address. In either case, no further
1004 * handling of this interrupt is appropriate.
1005 * Back up over the (now missing) int3 and run
1006 * the original instruction.
1007 */
1008 regs->ip = (unsigned long)addr;
1009 return 1;
1010 } /* else: not a kprobe fault; let the kernel handle it */
1011
1012 return 0;
1013 }
1014 NOKPROBE_SYMBOL(kprobe_int3_handler);
1015
kprobe_fault_handler(struct pt_regs * regs,int trapnr)1016 int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
1017 {
1018 struct kprobe *cur = kprobe_running();
1019 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1020
1021 if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) {
1022 /* This must happen on single-stepping */
1023 WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS &&
1024 kcb->kprobe_status != KPROBE_REENTER);
1025 /*
1026 * We are here because the instruction being single
1027 * stepped caused a page fault. We reset the current
1028 * kprobe and the ip points back to the probe address
1029 * and allow the page fault handler to continue as a
1030 * normal page fault.
1031 */
1032 regs->ip = (unsigned long)cur->addr;
1033
1034 /*
1035 * If the IF flag was set before the kprobe hit,
1036 * don't touch it:
1037 */
1038 regs->flags |= kcb->kprobe_old_flags;
1039
1040 if (kcb->kprobe_status == KPROBE_REENTER)
1041 restore_previous_kprobe(kcb);
1042 else
1043 reset_current_kprobe();
1044 }
1045
1046 return 0;
1047 }
1048 NOKPROBE_SYMBOL(kprobe_fault_handler);
1049
arch_populate_kprobe_blacklist(void)1050 int __init arch_populate_kprobe_blacklist(void)
1051 {
1052 return kprobe_add_area_blacklist((unsigned long)__entry_text_start,
1053 (unsigned long)__entry_text_end);
1054 }
1055
arch_init_kprobes(void)1056 int __init arch_init_kprobes(void)
1057 {
1058 return 0;
1059 }
1060
/* Always returns 0 regardless of @p, which is not inspected. */
int arch_trampoline_kprobe(struct kprobe *p)
{
	return 0;
}
1065