armv8/fsl-lsch3: Convert flushing L3 to assembly to avoid using stack

Flushing L3 cache in CCN-504 requries d-cache to be disabled. Using
assembly function to guarantee stack is not used before flushing is
completed. Timeout is needed for simualtor on which CCN-504 is not
implemented. Return value can be checked for timeout situation.

Change bootm.c to disable dcache instead of simply flushing, required
by flushing L3.

Signed-off-by: York Sun <yorksun@freescale.com>
diff --git a/arch/arm/cpu/armv8/cache.S b/arch/arm/cpu/armv8/cache.S
index 9c6e824..fa447bc 100644
--- a/arch/arm/cpu/armv8/cache.S
+++ b/arch/arm/cpu/armv8/cache.S
@@ -155,3 +155,9 @@
 	isb	sy
 	ret
 ENDPROC(__asm_invalidate_icache_all)
+
+ENTRY(__asm_flush_l3_cache)
+	mov	x0, #0			/* return status as success */
+	ret
+ENDPROC(__asm_flush_l3_cache)
+	.weak	__asm_flush_l3_cache
diff --git a/arch/arm/cpu/armv8/cache_v8.c b/arch/arm/cpu/armv8/cache_v8.c
index 9dbcdf2..c5ec529 100644
--- a/arch/arm/cpu/armv8/cache_v8.c
+++ b/arch/arm/cpu/armv8/cache_v8.c
@@ -73,17 +73,21 @@
 	__asm_invalidate_dcache_all();
 }
 
-void __weak flush_l3_cache(void)
-{
-}
-
 /*
- * Performs a clean & invalidation of the entire data cache at all levels
+ * Performs a clean & invalidation of the entire data cache at all levels.
+ * This function needs to be inline to avoid using stack.
+ * __asm_flush_l3_cache return status of timeout
  */
-void flush_dcache_all(void)
+inline void flush_dcache_all(void)
 {
+	int ret;
+
 	__asm_flush_dcache_all();
-	flush_l3_cache();
+	ret = __asm_flush_l3_cache();
+	if (ret)
+		debug("flushing dcache returns 0x%x\n", ret);
+	else
+		debug("flushing dcache successfully.\n");
 }
 
 /*
diff --git a/arch/arm/cpu/armv8/fsl-lsch3/cpu.c b/arch/arm/cpu/armv8/fsl-lsch3/cpu.c
index ada1690..2aaac01 100644
--- a/arch/arm/cpu/armv8/fsl-lsch3/cpu.c
+++ b/arch/arm/cpu/armv8/fsl-lsch3/cpu.c
@@ -243,59 +243,6 @@
 }
 
 /*
- * flush_l3_cache
- * Dickens L3 cache can be flushed by transitioning from FAM to SFONLY power
- * state, by writing to HP-F P-state request register.
- * Fixme: This function should moved to a common file if other SoCs also use
- * the same Dickens.
- */
-#define HNF0_PSTATE_REQ 0x04200010
-#define HNF1_PSTATE_REQ 0x04210010
-#define HNF2_PSTATE_REQ 0x04220010
-#define HNF3_PSTATE_REQ 0x04230010
-#define HNF4_PSTATE_REQ 0x04240010
-#define HNF5_PSTATE_REQ 0x04250010
-#define HNF6_PSTATE_REQ 0x04260010
-#define HNF7_PSTATE_REQ 0x04270010
-#define HNFPSTAT_MASK (0xFFFFFFFFFFFFFFFC)
-#define HNFPSTAT_FAM	0x3
-#define HNFPSTAT_SFONLY 0x01
-
-static void hnf_pstate_req(u64 *ptr, u64 state)
-{
-	int timeout = 1000;
-	out_le64(ptr, (in_le64(ptr) & HNFPSTAT_MASK) | (state & 0x3));
-	ptr++;
-	/* checking if the transition is completed */
-	while (timeout > 0) {
-		if (((in_le64(ptr) & 0x0c) >> 2) == (state & 0x3))
-			break;
-		udelay(100);
-		timeout--;
-	}
-}
-
-void flush_l3_cache(void)
-{
-	hnf_pstate_req((u64 *)HNF0_PSTATE_REQ, HNFPSTAT_SFONLY);
-	hnf_pstate_req((u64 *)HNF1_PSTATE_REQ, HNFPSTAT_SFONLY);
-	hnf_pstate_req((u64 *)HNF2_PSTATE_REQ, HNFPSTAT_SFONLY);
-	hnf_pstate_req((u64 *)HNF3_PSTATE_REQ, HNFPSTAT_SFONLY);
-	hnf_pstate_req((u64 *)HNF4_PSTATE_REQ, HNFPSTAT_SFONLY);
-	hnf_pstate_req((u64 *)HNF5_PSTATE_REQ, HNFPSTAT_SFONLY);
-	hnf_pstate_req((u64 *)HNF6_PSTATE_REQ, HNFPSTAT_SFONLY);
-	hnf_pstate_req((u64 *)HNF7_PSTATE_REQ, HNFPSTAT_SFONLY);
-	hnf_pstate_req((u64 *)HNF0_PSTATE_REQ, HNFPSTAT_FAM);
-	hnf_pstate_req((u64 *)HNF1_PSTATE_REQ, HNFPSTAT_FAM);
-	hnf_pstate_req((u64 *)HNF2_PSTATE_REQ, HNFPSTAT_FAM);
-	hnf_pstate_req((u64 *)HNF3_PSTATE_REQ, HNFPSTAT_FAM);
-	hnf_pstate_req((u64 *)HNF4_PSTATE_REQ, HNFPSTAT_FAM);
-	hnf_pstate_req((u64 *)HNF5_PSTATE_REQ, HNFPSTAT_FAM);
-	hnf_pstate_req((u64 *)HNF6_PSTATE_REQ, HNFPSTAT_FAM);
-	hnf_pstate_req((u64 *)HNF7_PSTATE_REQ, HNFPSTAT_FAM);
-}
-
-/*
  * This function is called from lib/board.c.
  * It recreates MMU table in main memory. MMU and d-cache are enabled earlier.
  * There is no need to disable d-cache for this operation.
diff --git a/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S b/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S
index c283787..886576e 100644
--- a/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S
+++ b/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S
@@ -100,6 +100,84 @@
 	ret
 ENDPROC(lowlevel_init)
 
+hnf_pstate_poll:
+	/* x0 has the desired status, return 0 for success, 1 for timeout
+	 * clobber x1, x2, x3, x4, x6, x7
+	 */
+	mov	x1, x0
+	mov	x7, #0			/* flag for timeout */
+	mrs	x3, cntpct_el0		/* read timer */
+	add	x3, x3, #1200		/* timeout after 100 microseconds */
+	mov	x0, #0x18
+	movk	x0, #0x420, lsl #16	/* HNF0_PSTATE_STATUS */
+	mov	w6, #8			/* HN-F node count */
+1:
+	ldr	x2, [x0]
+	cmp	x2, x1			/* check status */
+	b.eq	2f
+	mrs	x4, cntpct_el0
+	cmp	x4, x3
+	b.ls	1b
+	mov	x7, #1			/* timeout */
+	b	3f
+2:
+	add	x0, x0, #0x10000	/* move to next node */
+	subs	w6, w6, #1
+	cbnz	w6, 1b
+3:
+	mov	x0, x7
+	ret
+
+hnf_set_pstate:
+	/* x0 has the desired state, clobber x1, x2, x6 */
+	mov	x1, x0
+	/* power state to SFONLY */
+	mov	w6, #8			/* HN-F node count */
+	mov	x0, #0x10
+	movk	x0, #0x420, lsl #16	/* HNF0_PSTATE_REQ */
+1:	/* set pstate to sfonly */
+	ldr	x2, [x0]
+	and	x2, x2, #0xfffffffffffffffc	/* & HNFPSTAT_MASK */
+	orr	x2, x2, x1
+	str	x2, [x0]
+	add	x0, x0, #0x10000	/* move to next node */
+	subs	w6, w6, #1
+	cbnz	w6, 1b
+
+	ret
+
+ENTRY(__asm_flush_l3_cache)
+	/*
+	 * Return status in x0
+	 *    success 0
+	 *    tmeout 1 for setting SFONLY, 2 for FAM, 3 for both
+	 */
+	mov	x29, lr
+	mov	x8, #0
+
+	dsb	sy
+	mov	x0, #0x1		/* HNFPSTAT_SFONLY */
+	bl	hnf_set_pstate
+
+	mov	x0, #0x4		/* SFONLY status */
+	bl	hnf_pstate_poll
+	cbz	x0, 1f
+	mov	x8, #1			/* timeout */
+1:
+	dsb	sy
+	mov	x0, #0x3		/* HNFPSTAT_FAM */
+	bl	hnf_set_pstate
+
+	mov	x0, #0xc		/* FAM status */
+	bl	hnf_pstate_poll
+	cbz	x0, 1f
+	add	x8, x8, #0x2
+1:
+	mov	x0, x8
+	mov	lr, x29
+	ret
+ENDPROC(__asm_flush_l3_cache)
+
 	/* Keep literals not used by the secondary boot code outside it */
 	.ltorg
 
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
index 7820486..2a5bed2 100644
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -70,6 +70,7 @@
 void __asm_flush_dcache_range(u64 start, u64 end);
 void __asm_invalidate_tlb_all(void);
 void __asm_invalidate_icache_all(void);
+int __asm_flush_l3_cache(void);
 
 void armv8_switch_to_el2(void);
 void armv8_switch_to_el1(void);
diff --git a/arch/arm/lib/bootm.c b/arch/arm/lib/bootm.c
index 0c1298a..2d6b676 100644
--- a/arch/arm/lib/bootm.c
+++ b/arch/arm/lib/bootm.c
@@ -191,7 +191,7 @@
 static void do_nonsec_virt_switch(void)
 {
 	smp_kick_all_cpus();
-	flush_dcache_all();	/* flush cache before swtiching to EL2 */
+	dcache_disable();	/* flush cache before swtiching to EL2 */
 	armv8_switch_to_el2();
 #ifdef CONFIG_ARMV8_SWITCH_TO_EL1
 	armv8_switch_to_el1();