// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/syscalls.h>

#include <asm/processor.h>
#include <asm/fpu/internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/resctrl.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
#endif

#include "process.h"

/* Also prints some state that isn't saved in pt_regs. */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
		 const char *log_lvl)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, es;

	show_iret_regs(regs, log_lvl);

	if (regs->orig_ax != -1)
		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
	else
		pr_cont("\n");

	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
	       log_lvl, regs->ax, regs->bx, regs->cx);
	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
	       log_lvl, regs->dx, regs->si, regs->di);
	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
	       log_lvl, regs->bp, regs->r8, regs->r9);
	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
	       log_lvl, regs->r10, regs->r11, regs->r12);
	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
	       log_lvl, regs->r13, regs->r14, regs->r15);

	if (mode == SHOW_REGS_SHORT)
		return;

	if (mode == SHOW_REGS_USER) {
		rdmsrl(MSR_FS_BASE, fs);
		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
		printk("%sFS:  %016lx GS:  %016lx\n",
		       log_lvl, fs, shadowgs);
		return;
	}

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = __read_cr3();
	cr4 = __read_cr4();

	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
	printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
		log_lvl, regs->cs, ds, es, cr0);
	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
		log_lvl, cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
		       log_lvl, d0, d1, d2);
		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
		       log_lvl, d3, d6, d7);
	}

	if (boot_cpu_has(X86_FEATURE_OSPKE))
		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
}

void release_thread(struct task_struct *dead_task)
{
	WARN_ON(dead_task->mm);
}

enum which_selector {
	FS,
	GS
};

/*
 * Out of line to be protected from kprobes and tracing. If this were
 * traced or probed, any access to a per-CPU variable would happen with
 * the wrong GS.
 *
 * It is not used on Xen paravirt. When paravirt support is needed, it
 * needs to be renamed with the native_ prefix.
 */
static noinstr unsigned long __rdgsbase_inactive(void)
{
	unsigned long gsbase;

	lockdep_assert_irqs_disabled();

	if (!static_cpu_has(X86_FEATURE_XENPV)) {
		native_swapgs();
		gsbase = rdgsbase();
		native_swapgs();
	} else {
		instrumentation_begin();
		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}

	return gsbase;
}

/*
 * Out of line to be protected from kprobes and tracing. If this were
 * traced or probed, any access to a per-CPU variable would happen with
 * the wrong GS.
 *
 * It is not used on Xen paravirt. When paravirt support is needed, it
 * needs to be renamed with the native_ prefix.
 */
static noinstr void __wrgsbase_inactive(unsigned long gsbase)
{
	lockdep_assert_irqs_disabled();

	if (!static_cpu_has(X86_FEATURE_XENPV)) {
		native_swapgs();
		wrgsbase(gsbase);
		native_swapgs();
	} else {
		instrumentation_begin();
		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
		instrumentation_end();
	}
}

/*
 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
 * It's forcibly inlined because it'll generate better code and this function
 * is hot.
 */
static __always_inline void save_base_legacy(struct task_struct *prev_p,
					     unsigned short selector,
					     enum which_selector which)
{
	if (likely(selector == 0)) {
		/*
		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
		 * be the pre-existing saved base or it could be zero.  On AMD
		 * (with X86_BUG_NULL_SEG), the segment base could be almost
		 * anything.
		 *
		 * This branch is very hot (it's hit twice on almost every
		 * context switch between 64-bit programs), and avoiding
		 * the RDMSR helps a lot, so we just assume that whatever
		 * value is already saved is correct.  This matches historical
		 * Linux behavior, so it won't break existing applications.
		 *
		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
		 * report that the base is zero, it needs to actually be zero:
		 * see the corresponding logic in load_seg_legacy.
		 */
	} else {
		/*
		 * If the selector is 1, 2, or 3, then the base is zero on
		 * !X86_BUG_NULL_SEG CPUs and could be anything on
		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
		 * has never attempted to preserve the base across context
		 * switches.
		 *
		 * If selector > 3, then it refers to a real segment, and
		 * saving the base isn't necessary.
		 */
		if (which == FS)
			prev_p->thread.fsbase = 0;
		else
			prev_p->thread.gsbase = 0;
	}
}

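/*
 * Save the outgoing task's FS/GS selectors and bases.  With FSGSBASE the
 * bases are read directly; otherwise save_base_legacy() decides what can
 * be preserved.  Must run with interrupts disabled.
 */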
static __always_inline void save_fsgs(struct task_struct *task)
{
	savesegment(fs, task->thread.fsindex);
	savesegment(gs, task->thread.gsindex);
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/*
		 * If FSGSBASE is enabled, we can't make any useful guesses
		 * about the base, and user code expects us to save the current
		 * value.  Fortunately, reading the base directly is efficient.
		 */
		task->thread.fsbase = rdfsbase();
		task->thread.gsbase = __rdgsbase_inactive();
	} else {
		save_base_legacy(task, task->thread.fsindex, FS);
		save_base_legacy(task, task->thread.gsindex, GS);
	}
}

/*
 * While a process is running, current->thread.fsbase and current->thread.gsbase
 * may not match the corresponding CPU registers (see save_base_legacy()).
 */
void current_save_fsgs(void)
{
	unsigned long flags;

	/* Interrupts need to be off for FSGSBASE */
	local_irq_save(flags);
	save_fsgs(current);
	local_irq_restore(flags);
}
#if IS_ENABLED(CONFIG_KVM)
EXPORT_SYMBOL_GPL(current_save_fsgs);
#endif

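/*
 * Load a user FS or GS selector.  GS has to go through load_gs_index(),
 * which swaps GS around the selector load so the kernel's per-CPU GS base
 * is preserved.
 */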
static __always_inline void loadseg(enum which_selector which,
				    unsigned short sel)
{
	if (which == FS)
		loadsegment(fs, sel);
	else
		load_gs_index(sel);
}

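/*
 * Restore FS or GS for the incoming task on CPUs without FSGSBASE, skipping
 * redundant selector loads and MSR writes where that is provably safe.
 */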
static __always_inline void load_seg_legacy(unsigned short prev_index,
					    unsigned long prev_base,
					    unsigned short next_index,
					    unsigned long next_base,
					    enum which_selector which)
{
	if (likely(next_index <= 3)) {
		/*
		 * The next task is using 64-bit TLS, is not using this
		 * segment at all, or is having fun with arcane CPU features.
		 */
		if (next_base == 0) {
			/*
			 * Nasty case: on AMD CPUs, we need to forcibly zero
			 * the base.
			 */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				loadseg(which, __USER_DS);
				loadseg(which, next_index);
			} else {
				/*
				 * We could try to exhaustively detect cases
				 * under which we can skip the segment load,
				 * but there's really only one case that matters
				 * for performance: if both the previous and
				 * next states are fully zeroed, we can skip
				 * the load.
				 *
				 * (This assumes that prev_base == 0 has no
				 * false positives.  This is the case on
				 * Intel-style CPUs.)
				 */
				if (likely(prev_index | next_index | prev_base))
					loadseg(which, next_index);
			}
		} else {
			if (prev_index != next_index)
				loadseg(which, next_index);
			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
			       next_base);
		}
	} else {
		/*
		 * The next task is using a real segment.  Loading the selector
		 * is sufficient.
		 */
		loadseg(which, next_index);
	}
}

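/*
 * Load the incoming task's FS/GS selectors and bases, using the FSGSBASE
 * instructions when available and the legacy selector/MSR path otherwise.
 */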
static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
					      struct thread_struct *next)
{
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		/* Update the FS and GS selectors if they could have changed. */
		if (unlikely(prev->fsindex || next->fsindex))
			loadseg(FS, next->fsindex);
		if (unlikely(prev->gsindex || next->gsindex))
			loadseg(GS, next->gsindex);

		/* Update the bases. */
		wrfsbase(next->fsbase);
		__wrgsbase_inactive(next->gsbase);
	} else {
		load_seg_legacy(prev->fsindex, prev->fsbase,
				next->fsindex, next->fsbase, FS);
		load_seg_legacy(prev->gsindex, prev->gsbase,
				next->gsindex, next->gsbase, GS);
	}
}

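/*
 * Look up the segment base that a given FS or GS selector refers to for
 * @task, by decoding the descriptor from the GDT TLS slots or the task's LDT.
 */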
unsigned long x86_fsgsbase_read_task(struct task_struct *task,
				     unsigned short selector)
{
	unsigned short idx = selector >> 3;
	unsigned long base;

	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
		if (unlikely(idx >= GDT_ENTRIES))
			return 0;

		/*
		 * There are no user segments in the GDT with nonzero bases
		 * other than the TLS segments.
		 */
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			return 0;

		idx -= GDT_ENTRY_TLS_MIN;
		base = get_desc_base(&task->thread.tls_array[idx]);
	} else {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		struct ldt_struct *ldt;

		/*
		 * If performance here mattered, we could protect the LDT
		 * with RCU.  This is a slow path, though, so we can just
		 * take the mutex.
		 */
		mutex_lock(&task->mm->context.lock);
		ldt = task->mm->context.ldt;
		if (unlikely(!ldt || idx >= ldt->nr_entries))
			base = 0;
		else
			base = get_desc_base(ldt->entries + idx);
		mutex_unlock(&task->mm->context.lock);
#else
		base = 0;
#endif
	}

	return base;
}

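/* Read the inactive (user) GS base of the current CPU. */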
unsigned long x86_gsbase_read_cpu_inactive(void)
{
	unsigned long gsbase;

	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		local_irq_save(flags);
		gsbase = __rdgsbase_inactive();
		local_irq_restore(flags);
	} else {
		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
	}

	return gsbase;
}

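/* Write the inactive (user) GS base of the current CPU. */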
void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		local_irq_save(flags);
		__wrgsbase_inactive(gsbase);
		local_irq_restore(flags);
	} else {
		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
	}
}

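/*
 * Return a task's FS base: read it from the CPU if the task is current,
 * otherwise take it from the saved thread state or the descriptor tables.
 */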
unsigned long x86_fsbase_read_task(struct task_struct *task)
{
	unsigned long fsbase;

	if (task == current)
		fsbase = x86_fsbase_read_cpu();
	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
		 (task->thread.fsindex == 0))
		fsbase = task->thread.fsbase;
	else
		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);

	return fsbase;
}

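/* Same as x86_fsbase_read_task(), but for the GS base. */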
unsigned long x86_gsbase_read_task(struct task_struct *task)
{
	unsigned long gsbase;

	if (task == current)
		gsbase = x86_gsbase_read_cpu_inactive();
	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
		 (task->thread.gsindex == 0))
		gsbase = task->thread.gsbase;
	else
		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);

	return gsbase;
}

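/*
 * Update the saved FS/GS base of a task that is not currently running;
 * writing the current task's base has to go through the CPU instead.
 */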
void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.fsbase = fsbase;
}

void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.gsbase = gsbase;
}

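/*
 * Reset segment and register state for a thread that is starting a new
 * program: clear FS/GS, load DS/ES, and set up the new user CS, SS, IP,
 * SP and flags in pt_regs.
 */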
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	WARN_ON_ONCE(regs != current_pt_regs());

	if (static_cpu_has(X86_BUG_NULL_SEG)) {
		/* Loading zero below won't clear the base. */
		loadsegment(fs, __USER_DS);
		load_gs_index(__USER_DS);
	}

	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);

	regs->ip		= new_ip;
	regs->sp		= new_sp;
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);

#ifdef CONFIG_COMPAT
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported either.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	struct fpu *prev_fpu = &prev->fpu;
	struct fpu *next_fpu = &next->fpu;
	int cpu = smp_processor_id();

	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(irq_count) != -1);

	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_prepare(prev_fpu, cpu);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	save_fsgs(prev_p);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.  This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them.
	 */
	arch_end_context_switch(next_p);

	/* Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT.  The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	x86_fsgsbase_load(prev, next);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	this_cpu_write(current_task, next_p);
	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

	switch_fpu_finish(next_fpu);

	/* Reload sp0. */
	update_task_stack(next_p);

	switch_to_extra(prev_p, next_p);

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor.  As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths.  Instead, we ensure that SS is never NULL in
		 * system call context.  We do this by replacing NULL SS
		 * selectors at every context switch.  SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt.  Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Load the Intel cache allocation PQR MSR. */
	resctrl_sched_in();

	return prev_p;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);
	/* Pretend that this comes from a 64bit execve */
	task_pt_regs(current)->orig_ax = __NR_execve;
	current_thread_info()->status &= ~TS_COMPAT;

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32
	clear_thread_flag(TIF_IA32);
	set_thread_flag(TIF_X32);
	if (current->mm)
		current->mm->context.ia32_compat = TIF_X32;
	current->personality &= ~READ_IMPLIES_EXEC;
	/*
	 * in_32bit_syscall() uses the presence of the x32 syscall bit
	 * flag to determine compat status.  The x86 mmap() code relies on
	 * the syscall bitness so set x32 syscall bit right here to make
	 * in_32bit_syscall() work during exec().
	 *
	 * Pretend to come from a x32 execve.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}

static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	set_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_X32);
	if (current->mm)
		current->mm->context.ia32_compat = TIF_IA32;
	current->personality |= force_personality32;
	/* Prepare the first "return" to user space */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}

void set_personality_ia32(bool x32)
{
	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	if (x32)
		__set_personality_x32();
	else
		__set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

#ifdef CONFIG_CHECKPOINT_RESTORE
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int ret;

	ret = map_vdso_once(image, addr);
	if (ret)
		return ret;

	return (long)image->size;
}
#endif

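/*
 * arch_prctl() backend for the 64-bit options: ARCH_SET_FS/GS,
 * ARCH_GET_FS/GS and the ARCH_MAP_VDSO_* requests.
 */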
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
	int ret = 0;

	switch (option) {
	case ARCH_SET_GS: {
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * ARCH_SET_GS has always overwritten the index
		 * and the base. Zero is the most sensible value
		 * to put in the index, and is the only value that
		 * makes any sense if FSGSBASE is unavailable.
		 */
		if (task == current) {
			loadseg(GS, 0);
			x86_gsbase_write_cpu_inactive(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.gsbase.
			 */
			task->thread.gsbase = arg2;

		} else {
			task->thread.gsindex = 0;
			x86_gsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_SET_FS: {
		/*
		 * Not strictly needed for %fs, but do it for symmetry
		 * with %gs
		 */
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * Set the selector to 0 for the same reason
		 * as %gs above.
		 */
		if (task == current) {
			loadseg(FS, 0);
			x86_fsbase_write_cpu(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.fsbase.
			 */
			task->thread.fsbase = arg2;
		} else {
			task->thread.fsindex = 0;
			x86_fsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_GET_FS: {
		unsigned long base = x86_fsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base = x86_gsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
	case ARCH_MAP_VDSO_X32:
		return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	case ARCH_MAP_VDSO_32:
		return prctl_map_vdso(&vdso_image_32, arg2);
# endif
	case ARCH_MAP_VDSO_64:
		return prctl_map_vdso(&vdso_image_64, arg2);
#endif

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

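/*
 * 64-bit arch_prctl() entry point: try the 64-bit-only options first and
 * fall back to the options shared with 32-bit.
 */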
SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	long ret;

	ret = do_arch_prctl_64(current, option, arg2);
	if (ret == -EINVAL)
		ret = do_arch_prctl_common(current, option, arg2);

	return ret;
}

#ifdef CONFIG_IA32_EMULATION
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	return do_arch_prctl_common(current, option, arg2);
}
#endif

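/* Return the user stack pointer of a task, as saved in its pt_regs. */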
unsigned long KSTK_ESP(struct task_struct *task)
{
	return task_pt_regs(task)->sp;
}