xtensa: add support for the xtensa processor architecture [2/2]

The Xtensa processor architecture is a configurable, extensible,
and synthesizable 32-bit RISC processor core provided by Tensilica, Inc.

This is the second part of the basic architecture port, adding the
'arch/xtensa' directory and a readme file.

Signed-off-by: Chris Zankel <chris@zankel.net>
Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
Reviewed-by: Simon Glass <sjg@chromium.org>
Reviewed-by: Tom Rini <trini@konsulko.com>
diff --git a/arch/xtensa/cpu/start.S b/arch/xtensa/cpu/start.S
new file mode 100644
index 0000000..8e4bc99
--- /dev/null
+++ b/arch/xtensa/cpu/start.S
@@ -0,0 +1,677 @@
+/*
+ * (C) Copyright 2008 - 2013 Tensilica Inc.
+ * (C) Copyright 2014 - 2016 Cadence Design Systems Inc.
+ *
+ * SPDX-License-Identifier:	GPL-2.0+
+ */
+
+#include <config.h>
+#include <asm/asmmacro.h>
+#include <asm/cacheasm.h>
+#include <asm/regs.h>
+#include <asm/arch/tie.h>
+#include <asm-offsets.h>
+
+/*
+ * Offsets into the pt_regs structure.
+ * Make sure these always match with the structure defined in ptrace.h!
+ * Slots without a note are not accessed by the code in this file.
+ */
+#define PT_PC		0	/* EPC1, saved and restored by ExceptionHandler */
+#define PT_PS		4	/* PS as found on entry to ExceptionHandler */
+#define PT_DEPC		8
+#define PT_EXCCAUSE	12	/* EXCCAUSE */
+#define PT_EXCVADDR	16	/* EXCVADDR */
+#define PT_DEBUGCAUSE	20
+#define PT_WMASK	24
+#define PT_LBEG		28	/* LBEG (XCHAL_HAVE_LOOPS only) */
+#define PT_LEND		32	/* LEND (XCHAL_HAVE_LOOPS only) */
+#define PT_LCOUNT	36	/* LCOUNT (XCHAL_HAVE_LOOPS only) */
+#define PT_SAR		40	/* SAR */
+#define PT_WINDOWBASE	44
+#define PT_WINDOWSTART	48	/* WINDOWSTART (XCHAL_HAVE_WINDOWED only) */
+#define PT_SYSCALL	52
+#define PT_ICOUNTLEVEL	56
+#define PT_RESERVED	60
+#define PT_AREG		64	/* a0..a15, 4 bytes each */
+#define PT_SIZE		(64 + 64)	/* 64 bytes of SR slots + 16 ARs */
+
+/*
+ * Cache attributes are different for full MMU and region protection.
+ * _start puts CA_WRITEBACK into the low 4 bits of spanning-way TLB entries.
+ */
+#if XCHAL_HAVE_PTP_MMU	/* full MMU */
+#define CA_WRITEBACK	(0x7)
+#else			/* region protection */
+#define CA_WRITEBACK	(0x4)
+#endif
+
+/*
+ * Reset vector.
+ * Only a trampoline to jump to _start: the address of _start is held in
+ * a literal right behind the initial jump and is fetched with l32r.
+ * (Note that we have to mark the section writable as the section contains
+ *  a relocatable literal)
+ */
+	.section .ResetVector.text, "awx"
+	.global _ResetVector
+_ResetVector:
+
+	j	1f			/* hop over the literal below */
+	.align 4			/* word-align the literal for l32r */
+2:	.long	_start			/* literal: address of _start */
+1:	l32r	a2, 2b			/* a2 = _start */
+	jx	a2			/* continue at _start */
+
+
+/*
+ * Processor initialization. We still run in ROM space.
+ * a0 is held at zero and used as the source value for clearing SRs.
+ *
+ * NOTE: Running in ROM
+ *  For Xtensa, we currently don't allow to run some code from ROM but
+ *  unpack the data immediately to memory. This requires, for example,
+ *  that DDR has been set up before running U-Boot. (See also comments
+ *  inline for ways to change it)
+ */
+	.section .reset.text, "ax"
+	.global _start
+	.align 4
+_start:
+	/* Keep a0 = 0 for various initializations */
+
+	movi	a0, 0
+
+	/*
+	 * For full MMU cores, put page table at unmapped virtual address.
+	 * This ensures that accesses outside the static maps result
+	 * in miss exceptions rather than random behaviour.
+	 */
+
+#if XCHAL_HAVE_PTP_MMU
+	wsr	a0, PTEVADDR			/* PTEVADDR = 0 */
+#endif
+
+	/* Disable dbreak debug exceptions */
+
+#if XCHAL_HAVE_DEBUG && XCHAL_NUM_DBREAK > 0
+	.set	_index, 0
+	.rept	XCHAL_NUM_DBREAK		/* one wsr per DBREAKC register */
+	wsr	a0, DBREAKC + _index		/* DBREAKC register _index = 0 */
+	.set	_index, _index + 1
+	.endr
+#endif
+
+	/* Reset windowbase and windowstart */
+
+#if XCHAL_HAVE_WINDOWED
+	movi	a3, 1
+	wsr	a3, windowstart			/* windowstart = 1 (bit 0 only) */
+	wsr	a0, windowbase			/* windowbase = 0 */
+	rsync
+	movi	a0, 0			/* windowbase might have changed */
+#endif
+
+	/*
+	 * Vecbase in bitstream may differ from header files:
+	 * set it to the value the header files expect.
+	 */
+
+#if XCHAL_HAVE_VECBASE
+	movi	a3, XCHAL_VECBASE_RESET_VADDR	/* VECBASE reset value */
+	wsr	a3, VECBASE
+#endif
+
+#if XCHAL_HAVE_LOOPS
+	/* Disable loops */
+
+	wsr	a0, LCOUNT			/* LCOUNT = 0 */
+#endif
+
+	/* Set PS.WOE = 0, PS.EXCM = 0 (for loop), PS.INTLEVEL = EXCM level */
+	/* (XEA1 cores get the literal value 1 instead of XCHAL_EXCM_LEVEL) */
+#if XCHAL_HAVE_XEA1
+	movi	a2, 1
+#else
+	movi	a2, XCHAL_EXCM_LEVEL
+#endif
+	wsr	a2, PS
+	rsync
+
+	/* Unlock and invalidate caches */
+	/* NOTE(review): a2/a3 presumably scratch for the cacheasm.h macros */
+	___unlock_dcache_all a2, a3
+	___invalidate_dcache_all a2, a3
+	___unlock_icache_all a2, a3
+	___invalidate_icache_all a2, a3
+
+	isync
+
+	/* Unpack data sections. The relocation table between the two symbols */
+	/* below holds 12-byte entries: { dest start, dest end, source start }. */
+	movi	a2, __reloc_table_start		/* a2 = current table entry */
+	movi	a3, __reloc_table_end		/* a3 = end of table */
+
+1:	beq	a2, a3, 3f	# no more entries?
+	l32i	a4, a2, 0	# start destination (in RAM)
+	l32i	a5, a2, 4	# end destination (in RAM)
+	l32i	a6, a2, 8	# start source (in ROM)
+	addi	a2, a2, 12	# next entry
+	beq	a4, a5, 1b	# skip, empty entry
+	beq	a4, a6, 1b	# skip, source and destination are the same
+
+	/* If there's memory protection option with 512MB TLB regions and
+	 * cache attributes in TLB entries and caching is not inhibited,
+	 * enable data/instruction cache for relocated image.
+	 */
+#if XCHAL_HAVE_SPANNING_WAY && \
+	(!defined(CONFIG_SYS_DCACHE_OFF) || \
+	 !defined(CONFIG_SYS_ICACHE_OFF))
+	srli	a7, a4, 29
+	slli	a7, a7, 29			/* a7 = dest start & 0xe0000000 */
+	addi	a7, a7, XCHAL_SPANNING_WAY	/* a7 = TLB entry selector */
+#ifndef CONFIG_SYS_DCACHE_OFF
+	rdtlb1	a8, a7				/* read data TLB entry */
+	srli	a8, a8, 4
+	slli	a8, a8, 4			/* clear its low 4 bits */
+	addi	a8, a8, CA_WRITEBACK		/* insert CA_WRITEBACK */
+	wdtlb	a8, a7				/* write the entry back */
+#endif
+#ifndef CONFIG_SYS_ICACHE_OFF
+	ritlb1	a8, a7				/* same for the instruction TLB */
+	srli	a8, a8, 4
+	slli	a8, a8, 4
+	addi	a8, a8, CA_WRITEBACK
+	witlb	a8, a7
+#endif
+	isync
+#endif
+	/* Copy one 32-bit word per pass until a4 is no longer below a5 */
+2:	l32i	a7, a6, 0
+	addi	a6, a6, 4
+	s32i	a7, a4, 0
+	addi	a4, a4, 4
+	bltu	a4, a5, 2b
+	j	1b				/* next table entry */
+
+3:	/* All code and initialized data segments have been copied */
+
+	/* Setup PS: PS.EXCM = 0, PS.INTLEVEL = EXCM level. */
+	/* PS.WOE = 1 only for the windowed ABI; call0 leaves it cleared. */
+#ifdef __XTENSA_CALL0_ABI__
+	movi	a2, XCHAL_EXCM_LEVEL
+#else
+	movi	a2, (1<<PS_WOE_BIT) | XCHAL_EXCM_LEVEL
+#endif
+	wsr	a2, PS
+	rsync
+
+	/* Writeback (flush the whole data cache after the copy above) */
+
+	___flush_dcache_all a2, a3
+
+#ifdef __XTENSA_WINDOWED_ABI__
+	/*
+	 * In windowed ABI caller and call target need to be within the same
+	 * gigabyte. Put the rest of the code into the text segment and jump
+	 * there.
+	 */
+
+	movi	a4, .Lboard_init_code
+	jx	a4
+
+	.text
+	.align	4
+.Lboard_init_code:
+#endif
+
+	movi	a0, 0				/* clear the return address */
+	movi	sp, (CONFIG_SYS_TEXT_ADDR - 16) & 0xfffffff0
+	/* sp: 16-byte aligned, at least 16 bytes below CONFIG_SYS_TEXT_ADDR */
+#ifdef CONFIG_DEBUG_UART
+	movi	a4, debug_uart_init
+#ifdef __XTENSA_CALL0_ABI__
+	callx0	a4
+#else
+	callx4	a4
+#endif
+#endif /* CONFIG_DEBUG_UART */
+	/* sp = board_init_f_alloc_reserve(sp) */
+	movi	a4, board_init_f_alloc_reserve
+
+#ifdef __XTENSA_CALL0_ABI__
+	mov	a2, sp				/* argument in a2 */
+	callx0	a4
+	mov	sp, a2				/* result taken from a2 */
+#else
+	mov	a6, sp				/* argument in a6 for callx4 */
+	callx4	a4
+	movsp	sp, a6				/* result taken from a6 */
+#endif
+	/* board_init_f_init_reserve() gets the value just returned (a2/a6) */
+	movi	a4, board_init_f_init_reserve
+
+#ifdef __XTENSA_CALL0_ABI__
+	callx0	a4
+#else
+	callx4	a4
+#endif
+
+	/*
+	 * Call board initialization routine board_init_f(0) (never returns).
+	 */
+
+	movi	a4, board_init_f
+
+#ifdef __XTENSA_CALL0_ABI__
+	movi	a2, 0
+	callx0	a4
+#else
+	movi	a6, 0
+	callx4	a4
+#endif
+	/* Never Returns; the illegal instruction below traps if it does */
+	ill
+
+/*
+ * void relocate_code (addr_sp, gd, addr_moni)
+ *
+ * This "function" does not return, instead it continues in RAM
+ * after relocating the monitor code: it switches to the stack at
+ * addr_sp and calls board_init_r(gd, destination address).
+ * a2 = addr_sp
+ * a3 = gd
+ * a4 = destination address
+ */
+	.text
+	.globl relocate_code
+	.align 4
+relocate_code:
+	abi_entry			/* macro, presumably from <asm/asmmacro.h> */
+
+#ifdef __XTENSA_CALL0_ABI__
+	mov	a1, a2				/* sp = addr_sp */
+	mov	a2, a3				/* first argument = gd */
+	mov	a3, a4				/* second argument = destination */
+	movi	a0, board_init_r
+	callx0	a0
+#else
+	/* We can't movsp here, because the chain of stack frames may cross
+	 * the now reserved memory. We need to toss all window frames except
+	 * the current, create new pristine stack frame and start from scratch.
+	 */
+	rsr	a0, windowbase
+	ssl	a0				/* shift amount = windowbase */
+	movi	a0, 1
+	sll	a0, a0				/* a0 = 1 << windowbase */
+	wsr	a0, windowstart			/* keep only the current frame */
+	rsync
+
+	movi	a0, 0
+
+	/* Reserve 16-byte save area */
+	addi	sp, a2, -16			/* sp = addr_sp - 16 */
+	mov	a6, a3				/* first argument = gd */
+	mov	a7, a4				/* second argument = destination */
+	movi	a4, board_init_r
+	callx4	a4
+#endif
+	ill					/* reached only if board_init_r returns */
+
+#if XCHAL_HAVE_EXCEPTIONS
+
+/*
+ * Exception vectors.
+ *
+ *  Various notes:
+ *   - We currently don't use the user exception vector (PS.UM is always 0),
+ *     but do define such a vector, just in case. They both jump to the
+ *     same exception handler, though.
+ *   - We currently only save the bare minimum number of registers:
+ *     a0...a15, sar, loop-registers, ps and the exception registers
+ *     (epc1, excvaddr, exccause); depc is not saved.
+ *   - WINDOWSTART is only saved to identify if registers have been spilled
+ *     to the wrong stack (exception stack) while executing the exception
+ *     handler.
+ */
+
+	.section .KernelExceptionVector.text, "ax"
+	.global _KernelExceptionVector
+_KernelExceptionVector:
+	/* Park a2 in EXCSAVE1 so it can carry the handler address */
+	wsr	a2, EXCSAVE1
+	movi	a2, ExceptionHandler
+	jx	a2				/* continue in ExceptionHandler */
+
+	.section .UserExceptionVector.text, "ax"
+	.global _UserExceptionVector
+_UserExceptionVector:
+	/* Same sequence as _KernelExceptionVector: a2 goes to EXCSAVE1 */
+	wsr	a2, EXCSAVE1
+	movi	a2, ExceptionHandler
+	jx	a2				/* continue in ExceptionHandler */
+
+#if !XCHAL_HAVE_XEA1
+	.section .DoubleExceptionVector.text, "ax"
+	.global _DoubleExceptionVector
+_DoubleExceptionVector:
+	/* No recovery is attempted: park one register and call hang */
+#ifdef __XTENSA_CALL0_ABI__
+	wsr	a0, EXCSAVE1
+	movi    a0, hang                # report and ask user to reset board
+	callx0	a0
+#else
+	wsr	a4, EXCSAVE1
+	movi    a4, hang                # report and ask user to reset board
+	callx4	a4
+#endif
+#endif /* !XCHAL_HAVE_XEA1 */
+	/* The call to hang does not return here */
+/* Common handler for the kernel and user vectors (original a2 is in EXCSAVE1): */
+/* saves state in a pt_regs frame, calls exc_table[EXCCAUSE](frame), returns. */
+	.text
+	.align 4
+ExceptionHandler:
+
+	rsr	a2, EXCCAUSE		# find handler
+
+#if XCHAL_HAVE_WINDOWED
+	/* Special case for alloca handler */
+
+	bnei	a2, 5, 1f		# jump if not alloca exception
+
+	addi	a1, a1, -16 - 4		# create a small stack frame
+	s32i	a3, a1, 0		# and save a3 (a2 still in excsave1)
+	movi	a2, fast_alloca_exception
+	jx	a2			# jump to fast_alloca_exception
+#endif
+	/* All other exceptions go here: */
+
+	/* Create ptrace stack and save a0...a3 */
+	/* (the frame starts PT_SIZE + 16 bytes below the interrupted sp) */
+1:	addi	a2, a1, - PT_SIZE - 16
+	s32i	a0, a2, PT_AREG + 0 * 4
+	s32i	a1, a2, PT_AREG + 1 * 4
+	s32i	a3, a2, PT_AREG + 3 * 4
+	rsr	a3, EXCSAVE1			/* original a2, parked by the vector */
+	s32i	a3, a2, PT_AREG + 2 * 4
+	mov	a1, a2				/* sp = pt_regs frame */
+
+	/* Save remaining AR registers */
+
+	s32i	a4, a1, PT_AREG + 4 * 4
+	s32i	a5, a1, PT_AREG + 5 * 4
+	s32i	a6, a1, PT_AREG + 6 * 4
+	s32i	a7, a1, PT_AREG + 7 * 4
+	s32i	a8, a1, PT_AREG + 8 * 4
+	s32i	a9, a1, PT_AREG + 9 * 4
+	s32i	a10, a1, PT_AREG + 10 * 4
+	s32i	a11, a1, PT_AREG + 11 * 4
+	s32i	a12, a1, PT_AREG + 12 * 4
+	s32i	a13, a1, PT_AREG + 13 * 4
+	s32i	a14, a1, PT_AREG + 14 * 4
+	s32i	a15, a1, PT_AREG + 15 * 4
+
+	/* Save SRs */
+
+#if XCHAL_HAVE_WINDOWED
+	rsr	a2, WINDOWSTART
+	s32i	a2, a1, PT_WINDOWSTART
+#endif
+
+	rsr	a2, SAR
+	rsr	a3, EPC1
+	rsr	a4, EXCVADDR
+	s32i	a2, a1, PT_SAR
+	s32i	a3, a1, PT_PC
+	s32i	a4, a1, PT_EXCVADDR
+
+#if XCHAL_HAVE_LOOPS
+	movi	a2, 0
+	rsr	a3, LBEG
+	xsr	a2, LCOUNT			/* a2 = old LCOUNT, LCOUNT = 0 */
+	s32i	a3, a1, PT_LBEG
+	rsr	a3, LEND
+	s32i	a2, a1, PT_LCOUNT
+	s32i	a3, a1, PT_LEND
+#endif
+
+	/* Set up C environment and call registered handler */
+	/* Setup stack, PS.WOE = 1, PS.EXCM = 0, PS.INTLEVEL = EXCM level. */
+	/* (non-XEA1 cores built for the call0 ABI leave PS.WOE cleared) */
+	rsr	a2, EXCCAUSE
+#if XCHAL_HAVE_XEA1
+	movi	a3, (1<<PS_WOE_BIT) | 1
+#elif defined(__XTENSA_CALL0_ABI__)
+	movi	a3, XCHAL_EXCM_LEVEL
+#else
+	movi	a3, (1<<PS_WOE_BIT) | XCHAL_EXCM_LEVEL
+#endif
+	xsr	a3, PS				/* install new PS, a3 = previous PS */
+	rsync
+	s32i	a2, a1, PT_EXCCAUSE
+	s32i	a3, a1, PT_PS
+
+	movi	a0, exc_table
+	addx4	a0, a2, a0			/* a0 = exc_table + 4 * EXCCAUSE */
+	l32i	a0, a0, 0			/* a0 = registered handler */
+#ifdef __XTENSA_CALL0_ABI__
+	mov	a2, a1			# Provide stack frame as only argument
+	callx0	a0
+	l32i	a3, a1, PT_PS			/* reload the saved PS after the call */
+#else
+	mov	a6, a1			# Provide stack frame as only argument
+	callx4	a0
+#endif
+
+	/* Restore PS and go to exception mode (PS.EXCM=1) */
+
+	wsr	a3, PS
+
+	/* Restore SR registers */
+
+#if XCHAL_HAVE_LOOPS
+	l32i	a2, a1, PT_LBEG
+	l32i	a3, a1, PT_LEND
+	l32i	a4, a1, PT_LCOUNT
+	wsr	a2, LBEG
+	wsr	a3, LEND
+	wsr	a4, LCOUNT
+#endif
+
+	l32i	a2, a1, PT_SAR
+	l32i	a3, a1, PT_PC
+	wsr	a2, SAR
+	wsr	a3, EPC1
+
+#if XCHAL_HAVE_WINDOWED
+	/* Do we need to simulate a MOVSP? */
+	/* (x & (x - 1)) is zero when x has at most one bit set */
+	l32i	a2, a1, PT_WINDOWSTART
+	addi	a3, a2, -1
+	and	a2, a2, a3
+	beqz	a2, 1f			# Skip if regs were spilled before exc.
+
+	rsr	a2, WINDOWSTART
+	addi	a3, a2, -1
+	and	a2, a2, a3
+	bnez	a2, 1f			# Skip if registers aren't spilled now
+	/* Copy the 16 bytes below a1 to the 16 bytes above the pt_regs frame */
+	addi	a2, a1, -16
+	l32i	a4, a2, 0
+	l32i	a5, a2, 4
+	s32i	a4, a1, PT_SIZE + 0
+	s32i	a5, a1, PT_SIZE + 4
+	l32i	a4, a2, 8
+	l32i	a5, a2, 12
+	s32i	a4, a1, PT_SIZE + 8
+	s32i	a5, a1, PT_SIZE + 12
+#endif
+
+	/* Restore address register */
+
+1:	l32i	a15, a1, PT_AREG + 15 * 4
+	l32i	a14, a1, PT_AREG + 14 * 4
+	l32i	a13, a1, PT_AREG + 13 * 4
+	l32i	a12, a1, PT_AREG + 12 * 4
+	l32i	a11, a1, PT_AREG + 11 * 4
+	l32i	a10, a1, PT_AREG + 10 * 4
+	l32i	a9, a1, PT_AREG + 9 * 4
+	l32i	a8, a1, PT_AREG + 8 * 4
+	l32i	a7, a1, PT_AREG + 7 * 4
+	l32i	a6, a1, PT_AREG + 6 * 4
+	l32i	a5, a1, PT_AREG + 5 * 4
+	l32i	a4, a1, PT_AREG + 4 * 4
+	l32i	a3, a1, PT_AREG + 3 * 4
+	l32i	a2, a1, PT_AREG + 2 * 4
+	l32i	a0, a1, PT_AREG + 0 * 4
+
+	l32i	a1, a1, PT_AREG + 1 * 4 # Remove ptrace stack frame
+
+	rfe
+
+#endif /* XCHAL_HAVE_EXCEPTIONS */
+
+#if XCHAL_HAVE_WINDOWED
+
+/*
+ * Window overflow and underflow handlers.
+ * The handlers must be 64 bytes apart and are laid out in this order:
+ * overflow-4, underflow-4, overflow-8, underflow-8, overflow-12,
+ * underflow-12.
+ *
+ * Note: We rerun the underflow handlers if we hit an exception, so
+ *	 we try to access any page that would cause a page fault early.
+ */
+
+	.section .WindowVectors.text, "ax"
+
+/* 4-Register Window Overflow Vector (Handler) */
+/* Stores a0..a3 into the 16 bytes just below the address held in a5. */
+	.align 64
+.global _WindowOverflow4
+_WindowOverflow4:
+	s32e	a0, a5, -16
+	s32e	a1, a5, -12
+	s32e	a2, a5,  -8
+	s32e	a3, a5,  -4
+	rfwo
+
+
+/* 4-Register Window Underflow Vector (Handler) */
+/* Reloads a0..a3 from the 16 bytes just below the address held in a5. */
+	.align 64
+.global _WindowUnderflow4
+_WindowUnderflow4:
+	l32e	a0, a5, -16
+	l32e	a1, a5, -12
+	l32e	a2, a5,  -8
+	l32e	a3, a5,  -4
+	rfwu				/* bytes 12..14; code at +16 follows below */
+
+/*
+ * fast_alloca_exception: reached from ExceptionHandler when EXCCAUSE is 5.
+ * On entry: a0 unchanged; a1 = interrupted sp - 16 - 4 with the original
+ * a3 stored at *a1; the original a2 is in EXCSAVE1.  The code sets PS.OWB
+ * to WINDOWBASE, restores a1..a3 and branches into the underflow handler
+ * selected by the top two bits of the a0 found one or two frames back.
+ */
+/* 15*/	.byte	0xff			/* pad _WindowUnderflow4 to 16 bytes */
+
+fast_alloca_exception:	/* must be at _WindowUnderflow4 + 16 */
+/* PS ^= (PS.OWB ^ WINDOWBASE) << PS_OWB_SHIFT, i.e. PS.OWB = WINDOWBASE */
+/* 16*/	rsr	a2, PS
+/* 19*/	rsr	a3, WINDOWBASE
+/* 22*/	extui	a2, a2, PS_OWB_SHIFT, 4		/* PS.OWB is 4 bits wide */
+/* 25*/	xor	a2, a2, a3
+/* 28*/	rsr	a3, PS
+/* 31*/	slli	a2, a2, PS_OWB_SHIFT
+/* 34*/	xor	a2, a3, a2
+/* 37*/	wsr	a2, PS
+/* Undo the small frame made by ExceptionHandler and restore a3, a2 */
+/* 40*/	_l32i	a3, a1, 0
+/* 43*/	addi	a1, a1, 16 + 4
+/* 46*/	rsr	a2, EXCSAVE1
+
+/* 49*/	rotw	-1
+/* 52*/	_bbci.l	a4, 31, _WindowUnderflow4	/* 0x: call4 */
+/* 55*/	rotw	-1
+/* 58*/	_bbci.l	a8, 30, _WindowUnderflow8	/* 10: call8 */
+/* 61*/ _j	__WindowUnderflow12		/* 11: call12 */
+/* 64*/
+
+/* 8-Register Window Overflow Vector (Handler) */
+/* a0..a3 go below a9; a4..a7 go below the pointer loaded from [a1 - 12]. */
+	.align 64
+.global _WindowOverflow8
+_WindowOverflow8:
+	s32e	a0, a9, -16
+	l32e	a0, a1, -12		/* a0 already stored: reuse it as base */
+	s32e	a2, a9,  -8
+	s32e	a1, a9, -12
+	s32e	a3, a9,  -4
+	s32e	a4, a0, -32
+	s32e	a5, a0, -28
+	s32e	a6, a0, -24
+	s32e	a7, a0, -20
+	rfwo
+
+/* 8-Register Window Underflow Vector (Handler) */
+/* a0..a3 come from below a9; a4..a7 from below the pointer at [a1 - 12]. */
+	.align 64
+.global _WindowUnderflow8
+_WindowUnderflow8:
+	l32e	a1, a9, -12
+	l32e	a0, a9, -16
+	l32e	a7, a1, -12		/* a7 serves as base until loaded last */
+	l32e	a2, a9,  -8
+	l32e	a4, a7, -32
+	l32e	a3, a9,  -4
+	l32e	a5, a7, -28
+	l32e	a6, a7, -24
+	l32e	a7, a7, -20
+	rfwu
+
+/* 12-Register Window Overflow Vector (Handler) */
+/* a0..a3 go below a13; a4..a11 go below the pointer loaded from [a1 - 12]. */
+	.align 64
+.global _WindowOverflow12
+_WindowOverflow12:
+	s32e	a0,  a13, -16
+	l32e	a0,  a1,  -12		/* a0 already stored: reuse it as base */
+	s32e	a1,  a13, -12
+	s32e	a2,  a13,  -8
+	s32e	a3,  a13,  -4
+	s32e	a4,  a0,  -48
+	s32e	a5,  a0,  -44
+	s32e	a6,  a0,  -40
+	s32e	a7,  a0,  -36
+	s32e	a8,  a0,  -32
+	s32e	a9,  a0,  -28
+	s32e	a10, a0,  -24
+	s32e	a11, a0,  -20
+	rfwo
+
+/* 12-Register Window Underflow Vector (Handler) */
+/* __WindowUnderflow12 (one rotw earlier) is the fast_alloca_exception entry. */
+	.org _WindowOverflow12 + 64 - 3	/* 3 bytes ahead of the vector for rotw */
+__WindowUnderflow12:
+	rotw	-1
+.global _WindowUnderflow12
+_WindowUnderflow12:
+	l32e	a1,  a13, -12
+	l32e	a0,  a13, -16
+	l32e	a11, a1,  -12		/* a11 serves as base until loaded last */
+	l32e	a2,  a13,  -8
+	l32e	a4,  a11, -48
+	l32e	a8,  a11, -32
+	l32e	a3,  a13,  -4
+	l32e	a5,  a11, -44
+	l32e	a6,  a11, -40
+	l32e	a7,  a11, -36
+	l32e	a9,  a11, -28
+	l32e	a10, a11, -24
+	l32e	a11, a11, -20
+	rfwu
+
+#endif /* XCHAL_HAVE_WINDOWED */