/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Asm versions of Xen pv-ops, suitable for direct use.
 *
 * We only bother with direct forms (ie, vcpu in pda) of the
 * operations here; the indirect forms are better handled in C.
 */

#include <asm/thread_info.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <asm/asm.h>

#include <xen/interface/xen.h>

#include <linux/linkage.h>

/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI  0x80000000
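/*
 * Bit 31 of EFLAGS is architecturally reserved and reads as zero on
 * real hardware, so it can safely carry this software-only marker in
 * the saved flags image.
 */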

/*
 * This is run where a normal iret would be run, with the same stack setup:
 *	8: eflags
 *	4: cs
 *	esp-> 0: eip
 *
 * This attempts to make sure that any pending events are dealt with
 * on return to usermode, but there is a small window in which an
 * event can happen just before entering usermode.  If the nested
 * interrupt ends up setting one of the TIF_WORK_MASK pending work
 * flags, they will not be tested again before returning to
 * usermode. This means that a process can end up with pending work,
 * which will be unprocessed until the process enters and leaves the
 * kernel again, which could be an unbounded amount of time.  This
 * means that a pending signal or reschedule event could be
 * indefinitely delayed.
 *
 * The fix is to notice a nested interrupt in the critical window, and
 * if one occurs, then fold the nested interrupt into the current
 * interrupt stack frame, and re-process it iteratively rather than
 * recursively.  This means that it will exit via the normal path, and
 * all pending work will be dealt with appropriately.
 *
 * Because the nested interrupt handler needs to deal with the current
 * stack state in whatever form it's in, we keep things simple by only
 * using a single register which is pushed/popped on the stack.
 */

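/*
 * Pop the saved %fs selector.  If reloading it faults (e.g. the
 * selector is no longer valid), the exception table fixup below
 * overwrites the value on the stack with a null selector and retries
 * the pop, which then cannot fault.
 */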
.macro POP_FS
1:
	popw %fs
.pushsection .fixup, "ax"
2:	movw $0, (%esp)
	jmp 1b
.popsection
	_ASM_EXTABLE(1b,2b)
.endm

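/*
 * For orientation, the fast path below behaves roughly like the
 * following C-style sketch (illustrative only; the real code works on
 * the raw iret frame and this CPU's vcpu_info, reached via xen_vcpu):
 *
 *	vcpu->evtchn_upcall_mask = !(frame->eflags & X86_EFLAGS_IF);
 *	if (vcpu->evtchn_upcall_pending && !vcpu->evtchn_upcall_mask) {
 *		vcpu->evtchn_upcall_mask = 1;	// re-mask events
 *		goto xen_hypervisor_callback;	// deliver them first
 *	}
 *	iret;					// nothing pending
 */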
ENTRY(xen_iret)
	/* test eflags for special cases */
	testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
	jnz hyper_iret

	push %eax
	ESP_OFFSET=4	# bytes pushed onto stack

	/* Store vcpu_info pointer for easy access */
#ifdef CONFIG_SMP
	pushw %fs
	movl $(__KERNEL_PERCPU), %eax
	movl %eax, %fs
	movl %fs:xen_vcpu, %eax
	POP_FS
#else
	movl %ss:xen_vcpu, %eax
#endif

	/* check IF state we're restoring */
	testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
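	/*
	 * IF is bit 9 of eflags, i.e. bit 1 of the byte at eflags+1;
	 * hence the >>8 on the mask and the extra +1 in the offset.
	 * ZF is now set iff the frame being restored has interrupts
	 * disabled, which is what the setz below consumes.
	 */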

	/*
	 * Maybe enable events.  Once this happens we could get a
	 * recursive event, so the critical region starts immediately
	 * afterwards.  However, if that happens we don't end up
	 * resuming the code, so we don't have to worry about being
	 * preempted to another CPU.
	 */
	setz %ss:XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:

	/* check for unmasked and pending */
	cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax)
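	/*
	 * evtchn_upcall_pending and evtchn_upcall_mask are adjacent
	 * bytes in vcpu_info, so the word compare above checks
	 * "pending == 1 && mask == 0" in a single test.
	 */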

	/*
	 * If there's something pending, mask events again so we can
	 * jump back into xen_hypervisor_callback. Otherwise do not
	 * touch XEN_vcpu_info_mask.
	 */
	jne 1f
	movb $1, %ss:XEN_vcpu_info_mask(%eax)

1:	popl %eax

	/*
	 * From this point on the registers are restored and the stack
	 * updated, so we don't need to worry about it if we're
	 * preempted.
	 */
iret_restore_end:

	/*
	 * Jump to hypervisor_callback after fixing up the stack.
	 * Events are masked, so jumping out of the critical region is
	 * OK.
	 */
	je xen_hypervisor_callback

1:	iret
xen_iret_end_crit:
	_ASM_EXTABLE(1b, iret_exc)

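/*
 * Each hypercall has a 32-byte stub in the hypercall page, so the
 * direct entry point for the iret hypercall is
 * hypercall_page + __HYPERVISOR_iret * 32.
 */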
hyper_iret:
	/* put this out of line since it's very rarely used */
	jmp hypercall_page + __HYPERVISOR_iret * 32

	.globl xen_iret_start_crit, xen_iret_end_crit

/*
 * This is called by xen_hypervisor_callback in entry.S when it sees
 * that the EIP at the time of interrupt was between
 * xen_iret_start_crit and xen_iret_end_crit.  We're passed the EIP in
 * %eax so we can do a more refined determination of what to do.
 *
 * The stack format at this point is:
 *	----------------
 *	 ss		: (ss/esp may be present if we came from usermode)
 *	 esp		:
 *	 eflags		}  outer exception info
 *	 cs		}
 *	 eip		}
 *	---------------- <- edi (copy dest)
 *	 eax		:  outer eax if it hasn't been restored
 *	----------------
 *	 eflags		}  nested exception info
 *	 cs		}   (no ss/esp because we're nested
 *	 eip		}    from the same ring)
 *	 orig_eax	}<- esi (copy src)
 *	 - - - - - - - -
 *	 fs		}
 *	 es		}
 *	 ds		}  SAVE_ALL state
 *	 eax		}
 *	  :		:
 *	 ebx		}<- esp
 *	----------------
 *
 * In order to deliver the nested exception properly, we need to shift
 * everything from the return addr up to the error code so it sits
 * just under the outer exception info.  This means that when we
 * handle the exception, we do it in the context of the outer
 * exception rather than starting a new one.
 *
 * The only caveat is that if the outer eax hasn't been restored yet
 * (ie, it's still on the stack), we need to insert its value into the
 * SAVE_ALL state before going on, since it's usermode state which we
 * eventually need to restore.
 */
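/*
 * In effect, the fixup below does roughly the following (illustrative
 * sketch only; dst/src/n/new_esp are just names mirroring the roles of
 * edi/esi/ecx/esp, and PT_* are the usual pt_regs asm offsets):
 *
 *	long *src = <orig_eax slot of the nested SAVE_ALL frame>;
 *	long *dst = <slot directly below the outer eip/cs/eflags>;
 *	int n = PT_EIP / 4;		// ebx .. orig_eax, in dwords
 *	while (n--)			// dst lies above src and the
 *		*dst-- = *src--;	// regions overlap, so copy
 *					// top-down (std / rep movsl)
 *	new_esp = dst + 1;		// base of the shifted frame
 *
 * i.e. the SAVE_ALL state slides up over the nested iret frame so that
 * it sits directly below the outer exception info.
 */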
ENTRY(xen_iret_crit_fixup)
	/*
	 * Paranoia: Make sure we're really coming from kernel space.
	 * One could imagine a case where userspace jumps into the
	 * critical range address, but just before the CPU delivers a
	 * GP, it decides to deliver an interrupt instead.  Unlikely?
	 * Definitely.  Easy to avoid?  Yes.  The Intel documents
	 * explicitly say that the reported EIP for a bad jump is the
	 * jump instruction itself, not the destination, but some
	 * virtual environments get this wrong.
	 */
	movl PT_CS(%esp), %ecx
	andl $SEGMENT_RPL_MASK, %ecx
	cmpl $USER_RPL, %ecx
	je 2f

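	/* esi/edi: copy source and (initial) destination, per the diagram above */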
	lea PT_ORIG_EAX(%esp), %esi
	lea PT_EFLAGS(%esp), %edi

	/*
	 * If eip is before iret_restore_end then the stack
	 * hasn't been restored yet.
	 */
	cmp $iret_restore_end, %eax
	jae 1f

	movl 0+4(%edi), %eax		/* copy EAX (just above top of frame) */
	movl %eax, PT_EAX(%esp)

	lea ESP_OFFSET(%edi), %edi	/* move dest up over saved regs */

	/* set up the copy */
1:	std
	mov $PT_EIP / 4, %ecx		/* saved regs up to orig_eax */
	rep movsl
	cld

	lea 4(%edi), %esp		/* point esp to new frame */
2:	jmp xen_do_upcall