armv8/cache: Change cache invalidate and flush function

When the SoC first boots up, we should invalidate the cache but not flush it.
We can mostly use the same function for invalidate and flush, with a wrapper.

Invalidating a large cache can be slow on an emulator, so we postpone doing
so until I-cache is enabled, and before enabling D-cache.

Signed-off-by: York Sun <yorksun@freescale.com>
CC: David Feng <fenghua@phytium.com.cn>
diff --git a/arch/arm/cpu/armv8/cache.S b/arch/arm/cpu/armv8/cache.S
index 546a83e..249799c 100644
--- a/arch/arm/cpu/armv8/cache.S
+++ b/arch/arm/cpu/armv8/cache.S
@@ -19,11 +19,12 @@
  * clean and invalidate one level cache.
  *
  * x0: cache level
- * x1~x9: clobbered
+ * x1: 0 flush & invalidate, 1 invalidate only
+ * x2~x9, x12: clobbered
  */
 ENTRY(__asm_flush_dcache_level)
-	lsl	x1, x0, #1
-	msr	csselr_el1, x1		/* select cache level */
+	lsl	x12, x0, #1
+	msr	csselr_el1, x12		/* select cache level */
 	isb				/* sync change of cssidr_el1 */
 	mrs	x6, ccsidr_el1		/* read the new cssidr_el1 */
 	and	x2, x6, #7		/* x2 <- log2(cache line size)-4 */
@@ -35,7 +36,7 @@
 	clz	w5, w4			/* bit position of #ways */
 	mov	x4, #0x7fff
 	and	x4, x4, x6, lsr #13	/* x4 <- max number of #sets */
-	/* x1 <- cache level << 1 */
+	/* x12 <- cache level << 1 */
 	/* x2 <- line length offset */
 	/* x3 <- number of cache ways - 1 */
 	/* x4 <- number of cache sets - 1 */
@@ -45,11 +46,14 @@
 	mov	x6, x3			/* x6 <- working copy of #ways */
 loop_way:
 	lsl	x7, x6, x5
-	orr	x9, x1, x7		/* map way and level to cisw value */
+	orr	x9, x12, x7		/* map way and level to cisw value */
 	lsl	x7, x4, x2
 	orr	x9, x9, x7		/* map set number to cisw value */
-	dc	cisw, x9		/* clean & invalidate by set/way */
-	subs	x6, x6, #1		/* decrement the way */
+	tbz	w1, #0, 1f
+	dc	isw, x9
+	b	2f
+1:	dc	cisw, x9		/* clean & invalidate by set/way */
+2:	subs	x6, x6, #1		/* decrement the way */
 	b.ge	loop_way
 	subs	x4, x4, #1		/* decrement the set */
 	b.ge	loop_set
@@ -58,11 +62,14 @@
 ENDPROC(__asm_flush_dcache_level)
 
 /*
- * void __asm_flush_dcache_all(void)
+ * void __asm_dcache_all(int invalidate_only)
+ *
+ * x0: 0 flush & invalidate, 1 invalidate only
  *
  * clean and invalidate all data cache by SET/WAY.
  */
-ENTRY(__asm_flush_dcache_all)
+ENTRY(__asm_dcache_all)
+	mov	x1, x0
 	dsb	sy
 	mrs	x10, clidr_el1		/* read clidr_el1 */
 	lsr	x11, x10, #24
@@ -76,13 +83,13 @@
 	/* x15 <- return address */
 
 loop_level:
-	lsl	x1, x0, #1
-	add	x1, x1, x0		/* x0 <- tripled cache level */
-	lsr	x1, x10, x1
-	and	x1, x1, #7		/* x1 <- cache type */
-	cmp	x1, #2
+	lsl	x12, x0, #1
+	add	x12, x12, x0		/* x0 <- tripled cache level */
+	lsr	x12, x10, x12
+	and	x12, x12, #7		/* x12 <- cache type */
+	cmp	x12, #2
 	b.lt	skip			/* skip if no cache or icache */
-	bl	__asm_flush_dcache_level
+	bl	__asm_flush_dcache_level	/* x1 = 0 flush, 1 invalidate */
 skip:
 	add	x0, x0, #1		/* increment cache level */
 	cmp	x11, x0
@@ -96,8 +103,24 @@
 
 finished:
 	ret
+ENDPROC(__asm_dcache_all)
+
+ENTRY(__asm_flush_dcache_all)
+	mov	x16, lr			/* stash return address; bl overwrites lr */
+	mov	x0, #0			/* x0 = 0: clean & invalidate */
+	bl	__asm_dcache_all	/* must leave x16 intact */
+	mov	lr, x16			/* recover return address */
+	ret
 ENDPROC(__asm_flush_dcache_all)
 
+ENTRY(__asm_invalidate_dcache_all)
+	mov	x16, lr			/* stash return address; bl overwrites lr */
+	mov	x0, #0xffff		/* bit 0 set: invalidate only */
+	bl	__asm_dcache_all	/* must leave x16 intact */
+	mov	lr, x16			/* recover return address */
+	ret
+ENDPROC(__asm_invalidate_dcache_all)
+
 /*
  * void __asm_flush_dcache_range(start, end)
  *