/* arch/arc/lib/strcmp.S - github/trini/u-boot - Gitiles */

 /*
  * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
  *
  * SPDX-License-Identifier:	GPL-2.0+
  */

 /*
  * This is optimized primarily for the ARC700.
  * It would be possible to speed up the loops by one cycle / word
  * respective one cycle / byte by forcing double source 1 alignment, unrolling
  * by a factor of two, and speculatively loading the second word / byte of
  * source 1; however, that would increase the overhead for loop setup / finish,
  * and strcmp might often terminate early.
  */

 /*
  * strcmp - compare two NUL-terminated byte strings.
  *
  * C-equivalent (presumed from the symbol name):
  *	int strcmp(const char *s1, const char *s2)
  *
  * In:    r0 = address of the first string, r1 = address of the second string
  *        (both are used as load base registers below).
  * Out:   r0 = 0 if no difference is found up to and including the NUL,
  *        a value with bit 31 set (negative) if the first string holds the
  *        smaller byte at the first difference, a positive value otherwise.
  *        Bytes are compared as unsigned values.
  * Clobb: r1, r2, r3, r4, r5, r12 and the status flags; returns via blink.
  *
  * Strategy: if both pointers are 4-byte aligned, compare one 32-bit word
  * per iteration and use a bit trick to spot a NUL byte inside the word;
  * otherwise fall back to a byte-at-a-time loop. The word loop always loads
  * whole words, so bytes that follow the NUL inside the same aligned word
  * are read and then masked out of the comparison.
  *
  * NOTE(review): a second, whitespace-collapsed copy of this routine that
  * defines the same symbols appears later in this file; the two copies would
  * clash if assembled together.
  */
 .global strcmp
 .align 4
 strcmp:
 	/* r2 = (r0 | r1) & 3: non-zero when either pointer is not word aligned */
 	or	%r2, %r0, %r1
 	bmsk_s	%r2, %r2, 1
 	brne	%r2, 0, .Lcharloop
 	/* r12 = 0x01010101; rotating it right by one bit gives r5 = 0x80808080 */
 	mov_s	%r12, 0x01010101
 	ror	%r5, %r12
 .Lwordloop:
 	/* Load one word from each string and post-increment both pointers by 4 */
 	ld.ab	%r2, [%r0, 4]
 	ld.ab	%r3, [%r1, 4]
 	/* NOTE(review): presumably a scheduling filler after the loads - confirm */
 	nop_s
 	/*
 	 * r4 = (r2 - 0x01010101) & ~r2 & 0x80808080
 	 * This is non-zero when some byte of r2 is zero; bit 7 of a byte in r4
 	 * flags that byte as a (possible) NUL.
 	 */
 	sub	%r4, %r2, %r12
 	bic	%r4, %r4, %r2
 	and	%r4, %r4, %r5
 	brne	%r4, 0, .Lfound0
 	/* No NUL in this word of the first string: keep going while words match */
 	breq	%r2 ,%r3, .Lwordloop
 	/*
 	 * The words differ and r2 holds no NUL. On little-endian the earliest
 	 * character is the least significant byte, so both words are reduced to
 	 * the lowest differing byte before the unsigned compare. On big-endian
 	 * the earliest character is the most significant byte, so the words can
 	 * be compared as they are.
 	 */
 #ifdef	__LITTLE_ENDIAN__
 	xor	%r0, %r2, %r3	/* mask for difference */
 	sub_s	%r1, %r0, 1
 	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
 	sub	%r1, %r5, %r0
 	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
 	and_s	%r2, %r2, %r0
 	and_s	%r3, %r3, %r0
 #endif /* _ENDIAN__ */
 	/*
 	 * Return 1, or 0x80000001 when r2 is unsigned-lower than r3. The bset.lo
 	 * sits in the delay slot of the jump and tests the flags set by cmp_s.
 	 */
 	cmp_s	%r2, %r3
 	mov_s	%r0, 1
 	j_s.d	[%blink]
 	bset.lo	%r0, %r0, 31

 	.balign	4
 #ifdef __LITTLE_ENDIAN__
 	/*
 	 * A NUL was flagged in r2 (r4 != 0). Take the lowest set bit among the
 	 * difference bits and the zero indicator bits, and build a mask running
 	 * from that bit up to bit 7 of the same byte, so that everything after
 	 * the first difference or NUL is ignored.
 	 */
 .Lfound0:
 	xor	%r0, %r2, %r3	/* mask for difference */
 	or	%r0, %r0, %r4	/* or in zero indicator */
 	sub_s	%r1, %r0, 1
 	bic_s	%r0, %r0, %r1	/* mask for least significant difference bit */
 	sub	%r1, %r5, %r0
 	xor	%r0, %r5, %r1	/* mask for least significant difference byte */
 	and_s	%r2, %r2, %r0
 	and_s	%r3, %r3, %r0
 	/*
 	 * r0 = r2 - r3 with flags: stays 0 when equal, is replaced by 1 when r2
 	 * is unsigned-higher, and gets bit 31 set (in the delay slot) when r2 is
 	 * unsigned-lower.
 	 */
 	sub.f	%r0, %r2, %r3
 	mov.hi	%r0, 1
 	j_s.d	[%blink]
 	bset.lo	%r0, %r0, 31
 #else /* __BIG_ENDIAN__ */
 	/*
 	 * The zero-detection above can mis-detect 0x01 bytes as zeroes
 	 * because of carry-propagation from a lower significant zero byte.
 	 * We can compensate for this by checking that bit0 is zero.
 	 * This compensation is not necessary in the step where we
 	 * get a low estimate for r2, because in any affected bytes
 	 * we already have 0x00 or 0x01, which will remain unchanged
 	 * when bit 7 is cleared.
 	 */
 	.balign	4
 .Lfound0:
 	/*
 	 * r0 = zero indicator shifted down by one byte, i.e. onto bit 7 of the
 	 * byte that follows each flagged byte in string order.
 	 * r1 = r2 >> 1, which lines bit 0 of every byte of r2 up with the
 	 * shifted indicator bit derived from that same byte.
 	 */
 	lsr	%r0, %r4, 8
 	lsr_s	%r1, %r2
 	bic_s	%r2, %r2, %r0	/* get low estimate for r2 and get ... */
 	bic_s	%r0, %r0, %r1	/* <this is the adjusted mask for zeros> */
 	or_s	%r3, %r3, %r0	/* ... high estimate r3 so that r2 > r3 will */
 	cmp_s	%r3, %r2	/* ... be independent of trailing garbage */
 	or_s	%r2, %r2, %r0	/* likewise for r3 > r2 */
 	bic_s	%r3, %r3, %r0
 	/* rlc picks up the carry left by "cmp_s r3, r2" above */
 	rlc	%r0, 0		/* r0 := r2 > r3 ? 1 : 0 */
 	/* Second compare: set bit 31 (in the delay slot) when r2 is unsigned-lower */
 	cmp_s	%r2, %r3
 	j_s.d	[%blink]
 	bset.lo	%r0, %r0, 31
 #endif /* _ENDIAN__ */

 	.balign	4
 	/*
 	 * Byte-at-a-time fallback, used when either pointer is not word aligned.
 	 * Loads one byte from each string with post-increment and stops at a NUL
 	 * in the first string or at the first mismatch.
 	 */
 .Lcharloop:
 	ldb.ab	%r2,[%r0,1]
 	ldb.ab	%r3,[%r1,1]
 	/* NOTE(review): presumably a scheduling filler after the loads - confirm */
 	nop_s
 	breq	%r2, 0, .Lcmpend
 	breq	%r2, %r3, .Lcharloop
 .Lcmpend:
 	/* Return the difference of the last byte pair; the sub runs in the delay slot */
 	j_s.d	[%blink]
 	sub	%r0, %r2, %r3
	/*
	* Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
	*
	* SPDX-License-Identifier: GPL-2.0+
	*/

	/*
	* This is optimized primarily for the ARC700.
	* It would be possible to speed up the loops by one cycle / word
	* respective one cycle / byte by forcing double source 1 alignment, unrolling
	* by a factor of two, and speculatively loading the second word / byte of
	* source 1; however, that would increase the overhead for loop setup / finish,
	* and strcmp might often terminate early.
	*/

	/*
	* strcmp - compare two NUL-terminated byte strings.
	*
	* C-equivalent (presumed from the symbol name):
	*	int strcmp(const char *s1, const char *s2)
	*
	* In:    r0 = address of the first string, r1 = address of the second string
	*        (both are used as load base registers below).
	* Out:   r0 = 0 if no difference is found up to and including the NUL,
	*        a value with bit 31 set (negative) if the first string holds the
	*        smaller byte at the first difference, a positive value otherwise.
	*        Bytes are compared as unsigned values.
	* Clobb: r1, r2, r3, r4, r5, r12 and the status flags; returns via blink.
	*
	* Strategy: if both pointers are 4-byte aligned, compare one 32-bit word
	* per iteration and use a bit trick to spot a NUL byte inside the word;
	* otherwise fall back to a byte-at-a-time loop. The word loop always loads
	* whole words, so bytes that follow the NUL inside the same aligned word
	* are read and then masked out of the comparison.
	*
	* NOTE(review): the symbol strcmp and the .L* labels used here are also
	* defined earlier in this file by an identically coded copy of this
	* routine; the two copies would clash if assembled together.
	*/
	.global strcmp
	.align 4
	strcmp:
	/* r2 = (r0 | r1) & 3: non-zero when either pointer is not word aligned */
	or %r2, %r0, %r1
	bmsk_s %r2, %r2, 1
	brne %r2, 0, .Lcharloop
	/* r12 = 0x01010101; rotating it right by one bit gives r5 = 0x80808080 */
	mov_s %r12, 0x01010101
	ror %r5, %r12
	.Lwordloop:
	/* Load one word from each string and post-increment both pointers by 4 */
	ld.ab %r2, [%r0, 4]
	ld.ab %r3, [%r1, 4]
	/* NOTE(review): presumably a scheduling filler after the loads - confirm */
	nop_s
	/*
	* r4 = (r2 - 0x01010101) & ~r2 & 0x80808080
	* This is non-zero when some byte of r2 is zero; bit 7 of a byte in r4
	* flags that byte as a (possible) NUL.
	*/
	sub %r4, %r2, %r12
	bic %r4, %r4, %r2
	and %r4, %r4, %r5
	brne %r4, 0, .Lfound0
	/* No NUL in this word of the first string: keep going while words match */
	breq %r2 ,%r3, .Lwordloop
	/*
	* The words differ and r2 holds no NUL. On little-endian the earliest
	* character is the least significant byte, so both words are reduced to
	* the lowest differing byte before the unsigned compare. On big-endian
	* the earliest character is the most significant byte, so the words can
	* be compared as they are.
	*/
	#ifdef __LITTLE_ENDIAN__
	xor %r0, %r2, %r3 /* mask for difference */
	sub_s %r1, %r0, 1
	bic_s %r0, %r0, %r1 /* mask for least significant difference bit */
	sub %r1, %r5, %r0
	xor %r0, %r5, %r1 /* mask for least significant difference byte */
	and_s %r2, %r2, %r0
	and_s %r3, %r3, %r0
	#endif /* _ENDIAN__ */
	/*
	* Return 1, or 0x80000001 when r2 is unsigned-lower than r3. The bset.lo
	* sits in the delay slot of the jump and tests the flags set by cmp_s.
	*/
	cmp_s %r2, %r3
	mov_s %r0, 1
	j_s.d [%blink]
	bset.lo %r0, %r0, 31

	.balign 4
	#ifdef __LITTLE_ENDIAN__
	/*
	* A NUL was flagged in r2 (r4 != 0). Take the lowest set bit among the
	* difference bits and the zero indicator bits, and build a mask running
	* from that bit up to bit 7 of the same byte, so that everything after
	* the first difference or NUL is ignored.
	*/
	.Lfound0:
	xor %r0, %r2, %r3 /* mask for difference */
	or %r0, %r0, %r4 /* or in zero indicator */
	sub_s %r1, %r0, 1
	bic_s %r0, %r0, %r1 /* mask for least significant difference bit */
	sub %r1, %r5, %r0
	xor %r0, %r5, %r1 /* mask for least significant difference byte */
	and_s %r2, %r2, %r0
	and_s %r3, %r3, %r0
	/*
	* r0 = r2 - r3 with flags: stays 0 when equal, is replaced by 1 when r2
	* is unsigned-higher, and gets bit 31 set (in the delay slot) when r2 is
	* unsigned-lower.
	*/
	sub.f %r0, %r2, %r3
	mov.hi %r0, 1
	j_s.d [%blink]
	bset.lo %r0, %r0, 31
	#else /* __BIG_ENDIAN__ */
	/*
	* The zero-detection above can mis-detect 0x01 bytes as zeroes
	* because of carry-propagation from a lower significant zero byte.
	* We can compensate for this by checking that bit0 is zero.
	* This compensation is not necessary in the step where we
	* get a low estimate for r2, because in any affected bytes
	* we already have 0x00 or 0x01, which will remain unchanged
	* when bit 7 is cleared.
	*/
	.balign 4
	.Lfound0:
	/*
	* r0 = zero indicator shifted down by one byte, i.e. onto bit 7 of the
	* byte that follows each flagged byte in string order.
	* r1 = r2 >> 1, which lines bit 0 of every byte of r2 up with the
	* shifted indicator bit derived from that same byte.
	*/
	lsr %r0, %r4, 8
	lsr_s %r1, %r2
	bic_s %r2, %r2, %r0 /* get low estimate for r2 and get ... */
	bic_s %r0, %r0, %r1 /* <this is the adjusted mask for zeros> */
	or_s %r3, %r3, %r0 /* ... high estimate r3 so that r2 > r3 will */
	cmp_s %r3, %r2 /* ... be independent of trailing garbage */
	or_s %r2, %r2, %r0 /* likewise for r3 > r2 */
	bic_s %r3, %r3, %r0
	/* rlc picks up the carry left by "cmp_s r3, r2" above */
	rlc %r0, 0 /* r0 := r2 > r3 ? 1 : 0 */
	/* Second compare: set bit 31 (in the delay slot) when r2 is unsigned-lower */
	cmp_s %r2, %r3
	j_s.d [%blink]
	bset.lo %r0, %r0, 31
	#endif /* _ENDIAN__ */

	.balign 4
	/*
	* Byte-at-a-time fallback, used when either pointer is not word aligned.
	* Loads one byte from each string with post-increment and stops at a NUL
	* in the first string or at the first mismatch.
	*/
	.Lcharloop:
	ldb.ab %r2,[%r0,1]
	ldb.ab %r3,[%r1,1]
	/* NOTE(review): presumably a scheduling filler after the loads - confirm */
	nop_s
	breq %r2, 0, .Lcmpend
	breq %r2, %r3, .Lcharloop
	.Lcmpend:
	/* Return the difference of the last byte pair; the sub runs in the delay slot */
	j_s.d [%blink]
	sub %r0, %r2, %r3