1 /*
2  * Copyright (c) 2017, Intel Corporation
3  *
4  * SPDX-License-Identifier: Apache-2.0
5  */
6 
7 #ifndef ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_ASM2_S_H
8 #define ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_ASM2_S_H
9 
10 #include <zephyr/zsr.h>
11 #include "xtensa_asm2_context.h"
12 
13 #include <zephyr/offsets.h>
14 
15 /* Assembler header!  This file contains macros designed to be included
16  * only by the assembler.
17  */
18 
19 #if defined(CONFIG_XTENSA_HIFI_SHARING)
20 .extern _xtensa_hifi_save
21 #endif
22 
23 /*
24  * SPILL_ALL_WINDOWS
25  *
26  * Spills all windowed registers (i.e. registers not visible as
27  * A0-A15) to their ABI-defined spill regions on the stack.
28  *
29  * Unlike the Xtensa HAL implementation, this code requires that the
30  * EXCM and WOE bit be enabled in PS, and relies on repeated hardware
31  * exception handling to do the register spills.  The trick is to do a
32  * noop write to the high registers, which the hardware will trap
33  * (into an overflow exception) in the case where those registers are
34  * already used by an existing call frame.  Then it rotates the window
35  * and repeats until all but the A0-A3 registers of the original frame
36  * are guaranteed to be spilled, eventually rotating back around into
37  * the original frame.  Advantages:
38  *
39  * - Vastly smaller code size
40  *
41  * - More easily maintained if changes are needed to window over/underflow
42  *   exception handling.
43  *
44  * - Requires no scratch registers to do its work, so can be used safely in any
45  *   context.
46  *
47  * - If the WOE bit is not enabled (for example, in code written for
48  *   the CALL0 ABI), this becomes a silent noop and operates compatibly.
49  *
50  * - In memory protection situations, this relies on the existing
51  *   exception handlers (and thus their use of the L/S32E
52  *   instructions) to execute stores in the protected space.  AFAICT,
53  *   the HAL routine does not handle this situation and isn't safe: it
54  *   will happily write through the "stack pointers" found in
55  *   registers regardless of where they might point.
56  *
57  * - Hilariously it's ACTUALLY FASTER than the HAL routine.  And not
58  *   just a little bit, it's MUCH faster.  With a mostly full register
59  *   file on an LX6 core (ESP-32) I'm measuring 145 cycles to spill
60  *   registers with this vs. 279 (!) to do it with
61  *   xthal_spill_windows().  Apparently Xtensa exception handling is
62  *   really fast, and no one told their software people.
63  *
64  * Note that as with the Xtensa HAL spill routine, and unlike context
65  * switching code on most sane architectures, the intermediate states
66  * here will have an invalid stack pointer.  That means that this code
67  * must not be preempted in any context (i.e. all Zephyr situations)
68  * where the interrupt code will need to use the stack to save the
69  * context.  But unlike the HAL, which runs with exceptions masked via
 * EXCM, this will not: it needs the overflow handlers unmasked.  Use
71  * INTLEVEL instead (which, happily, is what Zephyr's locking does
72  * anyway).
73  */
.macro SPILL_ALL_WINDOWS
	/* Every "and aN, aN, aN" below rewrites a high register with its
	 * own value: architecturally a no-op, but the write makes the
	 * hardware take a window overflow exception if that register
	 * still belongs to a live caller frame.  The ROTW that follows
	 * moves the window on so the next group can be probed.  No
	 * scratch register is consumed.
	 */
#if XCHAL_NUM_AREGS == 64
	/* Rotation total: 4 * 3 + 4 = 16, which walks the whole
	 * 64-register file and lands back on the starting frame.
	 */
.rept 4
	and	a12, a12, a12
	rotw	3
.endr
	and	a12, a12, a12
	rotw	4
#elif XCHAL_NUM_AREGS == 32
	/* Rotation total: 2 * 3 + 2 = 8, which walks the whole
	 * 32-register file.  The last probe only needs to reach A4-A7.
	 */
.rept 2
	and	a12, a12, a12
	rotw	3
.endr
	and	a4, a4, a4
	rotw	2
#else
#error Unrecognized XCHAL_NUM_AREGS
#endif
.endm
97 
98 #if XCHAL_HAVE_FP && defined(CONFIG_CPU_HAS_FPU) && defined(CONFIG_FPU_SHARING)
99 /*
100  * FPU_REG_SAVE
101  *
102  * Saves the Float Point Unit context registers in the base save
103  * area pointed to by the current stack pointer A1. The Floating-Point
104  * Coprocessor Option adds the FR register file and two User Registers
 * called FCR and FSR. The FR register file consists of 16 registers of
106  * 32 bits each and is used for all data computation.
107  */
.macro FPU_REG_SAVE
	/* Store the sixteen FR data registers into their BSA slots.
	 * A1 is the base save area pointer and is not modified.
	 */
	ssi	f0, a1, ___xtensa_irq_bsa_t_fpu0_OFFSET
	ssi	f1, a1, ___xtensa_irq_bsa_t_fpu1_OFFSET
	ssi	f2, a1, ___xtensa_irq_bsa_t_fpu2_OFFSET
	ssi	f3, a1, ___xtensa_irq_bsa_t_fpu3_OFFSET
	ssi	f4, a1, ___xtensa_irq_bsa_t_fpu4_OFFSET
	ssi	f5, a1, ___xtensa_irq_bsa_t_fpu5_OFFSET
	ssi	f6, a1, ___xtensa_irq_bsa_t_fpu6_OFFSET
	ssi	f7, a1, ___xtensa_irq_bsa_t_fpu7_OFFSET
	ssi	f8, a1, ___xtensa_irq_bsa_t_fpu8_OFFSET
	ssi	f9, a1, ___xtensa_irq_bsa_t_fpu9_OFFSET
	ssi	f10, a1, ___xtensa_irq_bsa_t_fpu10_OFFSET
	ssi	f11, a1, ___xtensa_irq_bsa_t_fpu11_OFFSET
	ssi	f12, a1, ___xtensa_irq_bsa_t_fpu12_OFFSET
	ssi	f13, a1, ___xtensa_irq_bsa_t_fpu13_OFFSET
	ssi	f14, a1, ___xtensa_irq_bsa_t_fpu14_OFFSET
	ssi	f15, a1, ___xtensa_irq_bsa_t_fpu15_OFFSET

	/* The FCR/FSR user registers have to travel through a general
	 * register; A0 is used as the scratch and is clobbered (it
	 * holds the FSR value on exit).
	 */
	rur.fcr	a0
	s32i	a0, a1, ___xtensa_irq_bsa_t_fcr_OFFSET
	rur.fsr	a0
	s32i	a0, a1, ___xtensa_irq_bsa_t_fsr_OFFSET
.endm
130 
/*
 * FPU_REG_RESTORE
 *
 * Counterpart of FPU_REG_SAVE: reloads the FR register file and the
 * FCR/FSR user registers from the base save area addressed by A1.
 * A0 is used as scratch and is clobbered.
 */
.macro FPU_REG_RESTORE
	/* Reload the sixteen FR data registers from their BSA slots. */
	lsi	f0, a1, ___xtensa_irq_bsa_t_fpu0_OFFSET
	lsi	f1, a1, ___xtensa_irq_bsa_t_fpu1_OFFSET
	lsi	f2, a1, ___xtensa_irq_bsa_t_fpu2_OFFSET
	lsi	f3, a1, ___xtensa_irq_bsa_t_fpu3_OFFSET
	lsi	f4, a1, ___xtensa_irq_bsa_t_fpu4_OFFSET
	lsi	f5, a1, ___xtensa_irq_bsa_t_fpu5_OFFSET
	lsi	f6, a1, ___xtensa_irq_bsa_t_fpu6_OFFSET
	lsi	f7, a1, ___xtensa_irq_bsa_t_fpu7_OFFSET
	lsi	f8, a1, ___xtensa_irq_bsa_t_fpu8_OFFSET
	lsi	f9, a1, ___xtensa_irq_bsa_t_fpu9_OFFSET
	lsi	f10, a1, ___xtensa_irq_bsa_t_fpu10_OFFSET
	lsi	f11, a1, ___xtensa_irq_bsa_t_fpu11_OFFSET
	lsi	f12, a1, ___xtensa_irq_bsa_t_fpu12_OFFSET
	lsi	f13, a1, ___xtensa_irq_bsa_t_fpu13_OFFSET
	lsi	f14, a1, ___xtensa_irq_bsa_t_fpu14_OFFSET
	lsi	f15, a1, ___xtensa_irq_bsa_t_fpu15_OFFSET

	/* Control (FCR) and status (FSR) go back through A0. */
	l32i.n	a0, a1, ___xtensa_irq_bsa_t_fcr_OFFSET
	wur.fcr	a0
	l32i.n	a0, a1, ___xtensa_irq_bsa_t_fsr_OFFSET
	wur.fsr	a0
.endm
153 #endif
154 
155 /*
156  * ODD_REG_SAVE
157  *
158  * Stashes the oddball shift/loop context registers in the base save
159  * area pointed to by the current stack pointer.  On exit, A0 will
160  * have been modified but A2/A3 have not, and the shift/loop
161  * instructions can be used freely (though note loops don't work in
162  * exceptions for other reasons!).
163  *
164  * Does not populate or modify the PS/PC save locations.
165  */
.macro ODD_REG_SAVE
	/* Each special/user register is read into A0 and then stored
	 * into its slot of the base save area addressed by A1.  A0 is
	 * the only general register written.
	 */

	/* Shift amount register */
	rsr.sar	a0
	s32i	a0, a1, ___xtensa_irq_bsa_t_sar_OFFSET

#if XCHAL_HAVE_LOOPS
	/* Zero-overhead loop state: begin, end, remaining count */
	rsr.lbeg	a0
	s32i	a0, a1, ___xtensa_irq_bsa_t_lbeg_OFFSET
	rsr.lend	a0
	s32i	a0, a1, ___xtensa_irq_bsa_t_lend_OFFSET
	rsr.lcount	a0
	s32i	a0, a1, ___xtensa_irq_bsa_t_lcount_OFFSET
#endif

	/* Exception cause as seen at this point */
	rsr.exccause	a0
	s32i	a0, a1, ___xtensa_irq_bsa_t_exccause_OFFSET

#if XCHAL_HAVE_S32C1I
	/* Comparison operand of the S32C1I compare-and-swap */
	rsr.scompare1	a0
	s32i	a0, a1, ___xtensa_irq_bsa_t_scompare1_OFFSET
#endif

#if XCHAL_HAVE_THREADPTR && \
	(defined(CONFIG_USERSPACE) || defined(CONFIG_THREAD_LOCAL_STORAGE))
	/* Thread pointer, only saved when userspace or TLS uses it */
	rur.THREADPTR	a0
	s32i	a0, a1, ___xtensa_irq_bsa_t_threadptr_OFFSET
#endif

#if XCHAL_HAVE_FP && defined(CONFIG_CPU_HAS_FPU) && defined(CONFIG_FPU_SHARING)
	/* Floating point register file plus FCR/FSR */
	FPU_REG_SAVE
#endif
.endm
193 
194 #ifdef CONFIG_XTENSA_MMU
195 /*
196  * CALC_PTEVADDR_BASE
197  *
198  * This calculates the virtual address of the first PTE page
199  * (PTEVADDR base, the one mapping 0x00000000) so that we can
200  * use this to obtain the virtual address of the PTE page we are
201  * interested in. This can be obtained via
202  * (1 << CONFIG_XTENSA_MMU_PTEVADDR_SHIFT).
203  *
 * Note that it is computed this way to avoid the TLB miss that
 * could occur if we were to use l32r to load the PTEVADDR base.
 * If the page containing the PTEVADDR base address is
 * not in TLB, we would need to handle the TLB miss, which
 * is what we are trying to avoid here.
209  *
210  * @param ADDR_REG Register to store the calculated
211  *                 PTEVADDR base address.
212  *
213  * @note The content of ADDR_REG will be modified.
214  *       Save and restore it around this macro usage.
215  */
.macro CALC_PTEVADDR_BASE ADDR_REG
	/* Build (1 << CONFIG_XTENSA_MMU_PTEVADDR_SHIFT) with a small
	 * immediate and a shift, so that no literal has to be fetched
	 * from memory to produce the value.
	 */
	movi	\ADDR_REG, 1
	slli	\ADDR_REG, \ADDR_REG, CONFIG_XTENSA_MMU_PTEVADDR_SHIFT
.endm
220 
221 /*
222  * PRELOAD_PTEVADDR
223  *
224  * This preloads the page table entries for a 4MB region to avoid TLB
225  * misses. This 4MB region is mapped via a page (4KB) of page table
226  * entries (PTE). Each entry is 4 bytes mapping a 4KB region. Each page,
227  * then, has 1024 entries mapping a 4MB region. Filling TLB entries is
228  * automatically done via hardware, as long as the PTE page associated
229  * with a particular address is also in TLB. If the PTE page is not in
230  * TLB, an exception will be raised that must be handled. This TLB miss
231  * is problematic when we are in the middle of dealing with another
232  * exception or handling an interrupt. So we need to put the PTE page
233  * into TLB by simply do a load operation.
234  *
235  * @param ADDR_REG Register containing the target address
236  * @param PTEVADDR_BASE_REG Register containing the PTEVADDR base
237  *
 * @note The content of ADDR_REG will be modified.
239  *       Save and restore it around this macro usage.
240  */
.macro PRELOAD_PTEVADDR ADDR_REG, PTEVADDR_BASE_REG
	/*
	 * Goal: touch the PTE page that maps the address held in
	 * ADDR_REG so that this PTE page is resident in the TLB.
	 *
	 * Layout recap: a PTE is 4 bytes and maps one 4KB page, so
	 * one 4KB PTE page holds 1024 entries and covers 4MB.
	 *
	 * 1. Offset of the wanted PTE page from the first PTE page
	 *    (the one mapping 0x00000000):
	 *    a. Dropping the low 12 bits of the address gives the
	 *       4KB page it lives in.
	 *    b. Dropping 10 more bits (1024 entries per PTE page)
	 *       gives the index of the 4MB region, which is also the
	 *       index of the PTE page.
	 *    Both are done at once by shifting right 22 bits.
	 *    Shifting the index left 12 bits then scales it by the
	 *    4KB PTE page size, yielding a byte offset.
	 *
	 * 2. Add PTEVADDR_BASE_REG to that offset to obtain the
	 *    virtual address of the PTE page.
	 *
	 * 3. Load one word from that address to force the PTE page
	 *    into the TLB.  The loaded value itself is not used; it
	 *    simply overwrites ADDR_REG.
	 */

	/* Step 1: PTE page offset = (addr >> 22) << 12 */
	srli	\ADDR_REG, \ADDR_REG, 22
	slli	\ADDR_REG, \ADDR_REG, 12

	/* Step 2: PTE page virtual address */
	add	\ADDR_REG, \ADDR_REG, \PTEVADDR_BASE_REG

	/* Step 3: dummy read to pull the PTE page into the TLB */
	l32i	\ADDR_REG, \ADDR_REG, 0
.endm
286 #endif /* CONFIG_XTENSA_MMU */
287 
288 /*
289  * CROSS_STACK_CALL
290  *
291  * Sets the stack up carefully such that a "cross stack" call can spill
292  * correctly, then invokes an immediate handler.  Note that:
293  *
294  * 0. When spilling a frame, functions find their callEE's stack pointer
295  *    (to save A0-A3) from registers.  But they find their
296  *    already-spilled callER's stack pointer (to save higher GPRs) from
297  *    their own stack memory.
298  *
299  * 1. The function that was interrupted ("interruptee") does not need to
300  *    be spilled, because it already has been as part of the context
301  *    save.  So it doesn't need registers allocated for it anywhere.
302  *
303  * 2. Interruptee's caller needs to spill into the space below the
304  *    interrupted stack frame, which means that the A1 register it finds
305  *    below it needs to contain the old/interrupted stack and not the
306  *    context saved one.
307  *
308  * 3. The ISR dispatcher (called "underneath" interruptee) needs to spill
309  *    high registers into the space immediately above its own stack frame,
310  *    so it needs to find a caller with the "new" stack pointer instead.
311  *
312  * We make this work by inserting TWO 4-register frames between
313  * "interruptee's caller" and "ISR dispatcher".  The top one (which
314  * occupies the slot formerly held by "interruptee", whose registers
315  * were saved via external means) holds the "interrupted A1" and the
316  * bottom has the "top of the interrupt stack" which can be either the
317  * word above a new memory area (when handling an interrupt from user
318  * mode) OR the existing "post-context-save" stack pointer (when
319  * handling a nested interrupt).  The code works either way.  Because
320  * these are both only 4-registers, neither needs its own caller for
321  * spilling.
322  *
323  * The net cost is 32 wasted bytes on the interrupt stack frame to
324  * spill our two "phantom frames" (actually not quite, as we'd need a
325  * few of those words used somewhere for tracking the stack pointers
326  * anyway).  But the benefit is that NO REGISTER FRAMES NEED TO BE
327  * SPILLED on interrupt entry.  And if we return back into the same
328  * context we interrupted (a common case) no windows need to be
329  * explicitly spilled at all.  And in fact in the case where the ISR
330  * uses significant depth on its own stack, the interrupted frames
331  * will be spilled naturally as a standard cost of a function call,
332  * giving register windows something like "zero cost interrupts".
333  *
334  * FIXME: a terrible awful really nifty idea to fix the stack waste
335  * problem would be to use a SINGLE frame between the two stacks,
336  * pre-spill it with one stack pointer for the "lower" call to see and
337  * leave the register SP in place for the "upper" frame to use.
338  * Would require modifying the Window{Over|Under}flow4 exceptions to
339  * know not to spill/fill these special frames, but that's not too
340  * hard, maybe...
341  *
342  * Enter this macro with a valid "context saved" pointer (i.e. SP
343  * should point to a stored pointer which points to one BSA below the
344  * interrupted/old stack) in A1, a handler function in A2, and a "new"
345  * stack pointer (i.e. a pointer to the word ABOVE the allocated stack
346  * area) in A3.  Exceptions should be enabled via PS.EXCM, but
347  * PS.INTLEVEL must (!) be set such that no nested interrupts can
348  * arrive (we restore the natural INTLEVEL from the value in ZSR_EPS
349  * just before entering the call).  On return A0/1 will be unchanged,
350  * A2 has the return value of the called function, and A3 is
351  * clobbered.  A4-A15 become part of called frames and MUST NOT BE IN
352  * USE by the code that expands this macro.  The called function gets
 * the context save handle in A1 as its first argument.
354  */
.macro CROSS_STACK_CALL
	/* Argument staging.  Each CALL4/ENTRY pair shifts the window by
	 * four registers, so our A6 is the first phantom frame's A2,
	 * and our A10/A11 are that frame's A6/A7 (A10 ends up as the
	 * handler's A2 once the handler has executed its own ENTRY).
	 */
	mov	a6, a3		/* "new sp" for the first phantom frame */
	mov	a10, a1		/* context handle: handler's first argument */
	mov	a3, a1		/* keep the handle so A1 can be restored */
	mov	a11, a2		/* handler address, A7 one frame down */

	/* The handle points at a stored BSA pointer.  Follow it, pick
	 * up the interrupted A0, and step over the BSA so that A0/A1
	 * look the way they did in the interrupted frame.
	 */
	l32i	a1, a1, 0
	l32i	a0, a1, ___xtensa_irq_bsa_t_a0_OFFSET
	addi	a1, a1, ___xtensa_irq_bsa_t_SIZEOF

	call4	_xstack_call0_\@

	/* Back from the handler */
	mov	a1, a3		/* handle becomes the stack pointer again */
	mov	a2, a6		/* hand the return value to the expander */
	j	_xstack_returned_\@

.align 4
_xstack_call0_\@:
	/* ENTRY is executed only for its side effects (the windowstart
	 * bit and the rotation); the stack pointer it computes is
	 * replaced right away with the "new sp" passed in A2.  From
	 * here on the frame is valid, so PS is reloaded from ZSR_EPS,
	 * which re-enables interrupts.
	 */
	entry	a1, 16
	mov	a1, a2
	rsr.ZSR_EPS a2
	wsr.ps	a2
	call4	_xstack_call1_\@
	mov	a2, a6		/* propagate the return value upwards */
	retw

.align 4
_xstack_call1_\@:
	/* No ENTRY is executed here because the handler performs it
	 * itself.  The window has therefore not rotated for this
	 * second CALL4 yet, and the handler address is still in A7.
	 */
	jx	a7

_xstack_returned_\@:
.endm
392 
393 /* Entry setup for all exceptions and interrupts.  Arrive here with
394  * the stack pointer decremented across a base save area, A0-A3 and
395  * PS/PC already spilled to the stack in the BSA, and A2 containing a
396  * level-specific C handler function.
397  *
 * This is a macro (to allow for unit testing) that expands to a
 * handler body to which the vectors can jump.  It takes two static
 * (!) arguments: the offsets, within the per-CPU record struct that
 * the ZSR_CPU special register points to, of the "nest count" word
 * (NEST_OFF) and of the interrupt stack top (INTSTACK_OFF).
404  */
.macro EXCINT_HANDLER NEST_OFF, INTSTACK_OFF
	/* Parameters (both are applied to the pointer read from ZSR_CPU):
	 *   NEST_OFF     - offset of the interrupt nest count word
	 *   INTSTACK_OFF - offset of the interrupt stack top pointer
	 *
	 * Ends by jumping to _restore_context; it does not fall through.
	 */

	/* A2 contains our handler function which will get clobbered
	 * by the save.  Stash it into the "scratch" slot of the BSA
	 * and recover it immediately after.  Kind of a hack.
	 */
	s32i a2, a1, ___xtensa_irq_bsa_t_scratch_OFFSET

	ODD_REG_SAVE

#if defined(CONFIG_XTENSA_HIFI_SHARING)
	call0 _xtensa_hifi_save    /* Save HiFi registers */
#endif

	call0 xtensa_save_high_regs

	/* A1 is now the context handle: the word it points at holds
	 * the BSA pointer, through which the stashed handler is read
	 * back into A2.
	 */
	l32i a2, a1, 0
	l32i a2, a2, ___xtensa_irq_bsa_t_scratch_OFFSET

#if XCHAL_HAVE_THREADPTR && defined(CONFIG_USERSPACE)
	/* Clear the threadptr because it is used to check whether a
	 * thread is running in user mode.  Since we are handling an
	 * interrupt we don't want the system to think it is possibly
	 * running in user mode.
	 */
	movi.n a0, 0
	wur.THREADPTR a0
#endif /* XCHAL_HAVE_THREADPTR && CONFIG_USERSPACE */

#ifdef CONFIG_XTENSA_INTERRUPT_NONPREEMPTABLE

	/* Setting the interrupt mask to the max non-debug level
	 * to prevent lower priority interrupts being preempted by
	 * high level interrupts until processing of that lower level
	 * interrupt has completed.
	 */
	rsr.ps a0
	movi a3, ~(PS_INTLEVEL_MASK)
	and a0, a0, a3
	movi a3, PS_INTLEVEL(ZSR_RFI_LEVEL)
	or a0, a0, a3
	wsr.ps a0

#else

	/* There's a gotcha with level 1 handlers: the INTLEVEL field
	 * gets left at zero and not set like high priority interrupts
	 * do.  That works fine for exceptions, but for L1 interrupts,
	 * when we unmask EXCM below, the CPU will just fire the
	 * interrupt again and get stuck in a loop blasting save
	 * frames down the stack to the bottom of memory.  It would be
	 * good to put this code into the L1 handler only, but there's
	 * not enough room in the vector without some work there to
	 * squash it some.  Next choice would be to make this a macro
	 * argument and expand two versions of this handler.  An
	 * optimization FIXME, I guess.
	 */
	rsr.ps a0
	movi a3, PS_INTLEVEL_MASK
	and a0, a0, a3
	bnez a0, _not_l1_\@
	rsr.ps a0
	movi a3, PS_INTLEVEL(1)
	or a0, a0, a3
	wsr.ps a0

	/* Label carries the \@ expansion counter like every other
	 * label in this macro, so that the macro can be expanded more
	 * than once in a translation unit.
	 */
_not_l1_\@:
#endif /* CONFIG_XTENSA_INTERRUPT_NONPREEMPTABLE */

	/* Setting up the cross stack call below has states where the
	 * resulting frames are invalid/non-reentrant, so we can't
	 * allow nested interrupts.  But we do need EXCM unmasked, as
	 * we use CALL/ENTRY instructions in the process and need to
	 * handle exceptions to spill caller/interruptee frames.  Use
	 * PS.INTLEVEL at maximum to mask all interrupts and stash the
	 * current value in our designated EPS register (which is
	 * guaranteed unused across the call)
	 */
	rsil a0, 0xf

	/* Since we are unmasking EXCM, we need to set RING bits to kernel
	 * mode, otherwise we won't be able to run the exception handler in C.
	 */
	movi a3, ~(PS_EXCM_MASK) & ~(PS_RING_MASK)
	and a0, a0, a3

	/* ZSR_EPS receives the PS value with the natural INTLEVEL;
	 * CROSS_STACK_CALL loads it into PS once it runs on a valid
	 * frame.
	 */
	wsr.ZSR_EPS a0

	/* The PS value written now must keep INTLEVEL at maximum.  A0
	 * still carries the pre-RSIL level, and writing it back as-is
	 * would unmask higher priority interrupts while
	 * CROSS_STACK_CALL has A1 pointing at the interrupted stack,
	 * letting a nested vector overwrite the BSA just saved.
	 */
	movi a3, PS_INTLEVEL_MASK
	or a0, a0, a3
	wsr.ps a0
	rsync

	/* A1 already contains our saved stack, and A2 our handler.
	 * So all that's needed for CROSS_STACK_CALL is to put the
	 * "new" stack into A3.  This is either a copy of A1 (nest
	 * count already non-zero, i.e. we are nested) or the interrupt
	 * stack top (nest count zero).  The nest count is incremented
	 * in both cases.
	 */
	rsr.ZSR_CPU a3
	l32i a0, a3, \NEST_OFF
	beqz a0, _switch_stacks_\@

	/* Use the same stack, just copy A1 to A3 after incrementing NEST */
	addi a0, a0, 1
	s32i a0, a3, \NEST_OFF
	mov a3, a1
	j _do_call_\@

_switch_stacks_\@:
	addi a0, a0, 1
	s32i a0, a3, \NEST_OFF
	l32i a3, a3, \INTSTACK_OFF

_do_call_\@:
	CROSS_STACK_CALL

	/* Mask interrupts (which have been unmasked during the handler
	 * execution) while we muck with the windows and decrement the nested
	 * count.  The restore will unmask them correctly.
	 */
	rsil a0, XCHAL_NUM_INTLEVELS

	/* Decrement nest count */
	rsr.ZSR_CPU a3
	l32i a0, a3, \NEST_OFF
	addi a0, a0, -1
	s32i a0, a3, \NEST_OFF

	/* Last trick: the called function returned the "next" handle
	 * to restore to in A6 (the call4'd function's A2).  If this
	 * is not the same handle as we started with, we need to do a
	 * register spill before restoring, for obvious reasons.
	 * Remember to restore the A1 stack pointer as it existed at
	 * interrupt time so the caller of the interrupted function
	 * spills to the right place.
	 */
	beq a6, a1, _restore_\@

#ifndef CONFIG_USERSPACE
	l32i a1, a1, 0
	l32i a0, a1, ___xtensa_irq_bsa_t_a0_OFFSET
	addi a1, a1, ___xtensa_irq_bsa_t_SIZEOF
#ifndef CONFIG_KERNEL_COHERENCE
	/* When using coherence, the registers of the interrupted
	 * context got spilled upstream in arch_cohere_stacks()
	 */
	SPILL_ALL_WINDOWS
#endif

	/* Restore A1 stack pointer from "next" handle. */
	mov a1, a6
#else
	/* With userspace, we cannot simply restore the A1 stack
	 * pointer at this point because we need to swap page tables
	 * to the incoming thread, and we do not want to call that
	 * function with the thread's stack.  So we stash the new
	 * stack pointer into A2 first, then move it to A1 after we
	 * have swapped the page table.
	 */
	mov a2, a6

	/* Need to switch page tables because the "next" handle
	 * returned above is not the same handle as we started
	 * with. This means we are being restored to another
	 * thread.  The current thread pointer from the per-CPU
	 * struct is passed in A6, i.e. the call4'd function's A2.
	 */
	rsr a6, ZSR_CPU
	l32i a6, a6, ___cpu_t_current_OFFSET

#ifdef CONFIG_XTENSA_MMU
	call4 xtensa_swap_update_page_tables
#endif
#ifdef CONFIG_XTENSA_MPU
	call4 xtensa_mpu_map_write
#endif
	l32i a1, a1, 0
	l32i a0, a1, ___xtensa_irq_bsa_t_a0_OFFSET
	addi a1, a1, ___xtensa_irq_bsa_t_SIZEOF

	SPILL_ALL_WINDOWS

	/* Move the stashed stack pointer to A1 to restore the stack. */
	mov a1, a2
#endif

_restore_\@:
	j _restore_context
.endm
589 
590 /* Defines an exception/interrupt vector for a specified level.  Saves
591  * off the interrupted A0-A3 registers and the per-level PS/PC
592  * registers to the stack before jumping to a handler (defined with
593  * EXCINT_HANDLER) to do the rest of the work.
594  *
595  * Arguments are a numeric interrupt level and symbol names for the
596  * entry code (defined via EXCINT_HANDLER) and a C handler for this
597  * particular level.
598  *
599  * Note that the linker sections for some levels get special names for
600  * no particularly good reason.  Only level 1 has any code generation
601  * difference, because it is the legacy exception level that predates
602  * the EPS/EPC registers.  It also lives in the "iram0.text" segment
603  * (which is linked immediately after the vectors) so that an assembly
604  * stub can be loaded into the vector area instead and reach this code
605  * with a simple jump instruction.
606  */
.macro DEF_EXCINT LVL, ENTRY_SYM, C_HANDLER_SYM
	/* Choose where the vector body is emitted.  With small vector
	 * table entries the body becomes a helper in .iram.text and
	 * the vector proper (emitted at the end of this macro) is
	 * just a jump to it.  Otherwise the body is the vector.
	 */
#if defined(CONFIG_XTENSA_SMALL_VECTOR_TABLE_ENTRY)
.pushsection .iram.text, "ax"
.global _Level\LVL\()VectorHelper
_Level\LVL\()VectorHelper:
#else
.if \LVL == 1
.pushsection .iram0.text, "ax"
.elseif \LVL == XCHAL_DEBUGLEVEL
.pushsection .DebugExceptionVector.text, "ax"
.elseif \LVL == XCHAL_NMILEVEL
.pushsection .NMIExceptionVector.text, "ax"
.else
.pushsection .Level\LVL\()InterruptVector.text, "ax"
.endif
.global _Level\LVL\()Vector
_Level\LVL\()Vector:
#endif

#ifdef CONFIG_XTENSA_MMU
.if \LVL == 1
	/* A TLB miss taken while an interrupt is being handled goes
	 * through the user/kernel/double exception vector, which
	 * overwrites DEPC and EXCCAUSE before execution resumes at
	 * the point of the miss.  The C handler would then no longer
	 * see the original cause, so EXCCAUSE is stashed here for it.
	 *
	 * For double exceptions, DEPC is saved in earlier vector
	 * code.
	 *
	 * No general register is free yet, so A0 is first parked in
	 * ZSR_EXCCAUSE_SAVE to be usable as scratch.
	 */
	wsr	a0, ZSR_EXCCAUSE_SAVE

	esync

	rsr	a0, ZSR_DEPC_SAVE
	beqz	a0, _not_triple_fault

	/* A non-zero stashed DEPC means a double exception was being
	 * serviced and yet another exception has arrived (through
	 * user/kernel if PS.EXCM is cleared, or through double if
	 * PS.EXCM is set).  Xtensa has no real triple fault: once
	 * PS.EXCM is set, every new exception goes through the double
	 * exception vector.  Our exception code, however, unmasks
	 * PS.EXCM to enable register window operations, after which
	 * new exceptions go through the kernel or user vector
	 * depending on PS.UM.  With continuous faults, execution may
	 * ping-pong between the double and kernel/user vectors
	 * without ever resolving.  Since the stashed DEPC is only
	 * cleared once the double exception has been processed, it
	 * serves to detect this "triple fault" condition.  When it is
	 * detected, give A0 back and jump to _TripleFault (infinite
	 * loop, simulator exit, or debugger).
	 */
	rsr	a0, ZSR_EXCCAUSE_SAVE
	j	_TripleFault

_not_triple_fault:
	/* Swap: ZSR_EXCCAUSE_SAVE receives the current EXCCAUSE and
	 * A0 gets its parked value back.
	 */
	rsr.exccause	a0

	xsr	a0, ZSR_EXCCAUSE_SAVE

	esync
.endif
#endif

	/* Carve a base save area out of the interrupted stack and
	 * store A0, A2 and A3 in it (A1 is implied by the BSA address).
	 */
	addi	a1, a1, -___xtensa_irq_bsa_t_SIZEOF
	s32i	a0, a1, ___xtensa_irq_bsa_t_a0_OFFSET
	s32i	a2, a1, ___xtensa_irq_bsa_t_a2_OFFSET
	s32i	a3, a1, ___xtensa_irq_bsa_t_a3_OFFSET

	/* Level "1" is the exception handler, which uses a different
	 * calling convention.  No special register holds the
	 * interrupted PS; we just assume that the CPU has turned on
	 * the EXCM bit and set INTLEVEL, and derive the saved PS from
	 * the live one.
	 */
.if \LVL == 1
	rsr.ps	a0
#ifdef CONFIG_XTENSA_MMU
	/* TLB misses also come through level 1 interrupts.
	 * We do not want to unconditionally unmask interrupts:
	 * execution continues after a TLB miss is handled, and the
	 * interrupt mask must be preserved.  For level 1 interrupts
	 * that are not TLB misses, the interrupt mask is cleared
	 * later in the handler code.
	 */
	movi	a2, ~PS_EXCM_MASK
#else
	movi	a2, ~(PS_EXCM_MASK | PS_INTLEVEL_MASK)
#endif
	and	a0, a0, a2
	s32i	a0, a1, ___xtensa_irq_bsa_t_ps_OFFSET
.else
	/* Higher levels have a dedicated EPSn register. */
	rsr.eps\LVL a0
	s32i	a0, a1, ___xtensa_irq_bsa_t_ps_OFFSET
.endif

	/* Interrupted PC from the per-level EPC register */
	rsr.epc\LVL a0
	s32i	a0, a1, ___xtensa_irq_bsa_t_pc_OFFSET

	/* Why the jump over two data words: L32R, which loads a full
	 * 32 bit immediate, can only address memory at a negative
	 * offset from PC.  Normally the assembler arranges this by
	 * placing a "literal pool" at the start of the section.
	 * Vectors, however, start at a fixed address in their own
	 * section and (in our current linker setup) have nowhere
	 * "definitely before vectors" to hold immediates.  Some
	 * platforms and apps will link by dumb luck, others won't.
	 * So the two immediates are placed inline and jumped over,
	 * which puts them at a location known to be legal.
	 *
	 * The right way to fix this would be to use a
	 * "literal_prefix" to put the literals into a per-vector
	 * section, then link that section into the PREVIOUS vector's
	 * area right after the vector code.  Requires touching a lot
	 * of linker scripts though.
	 */
	j	_after_imms\LVL
.align 4
_handle_excint_imm\LVL:
	.word \ENTRY_SYM
_c_handler_imm\LVL:
	.word \C_HANDLER_SYM
_after_imms\LVL:
	/* A2 = C handler for this level, then tail-jump to the entry
	 * code named by ENTRY_SYM.
	 */
	l32r	a2, _c_handler_imm\LVL
	l32r	a0, _handle_excint_imm\LVL
	jx	a0
.popsection

#if defined(CONFIG_XTENSA_SMALL_VECTOR_TABLE_ENTRY)
	/* Small vector entry: the vector itself only jumps to the
	 * helper emitted above.
	 */
.if \LVL == 1
.pushsection .iram0.text, "ax"
.elseif \LVL == XCHAL_DEBUGLEVEL
.pushsection .DebugExceptionVector.text, "ax"
.elseif \LVL == XCHAL_NMILEVEL
.pushsection .NMIExceptionVector.text, "ax"
.else
.pushsection .Level\LVL\()InterruptVector.text, "ax"
.endif
.global _Level\LVL\()Vector
_Level\LVL\()Vector:
	j	_Level\LVL\()VectorHelper
.popsection
#endif

.endm
758 
759 #endif	/* ZEPHYR_ARCH_XTENSA_INCLUDE_XTENSA_ASM2_S_H */
760