blob: 8cb7d2f18c662832c929ad758d77164e80984481 [file] [log] [blame]
Alexey Brodkin22723822014-02-04 12:56:15 +04001/*
2 * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved.
3 *
4 * SPDX-License-Identifier: GPL-2.0+
5 */
6
7/*
8 * This is optimized primarily for the ARC700.
9 * It would be possible to speed up the loops by one cycle / word or,
10 * respectively, one cycle / byte by forcing double source 1 alignment,
11 * unrolling by a factor of two, and speculatively loading the second word /
12 * byte of source 1; however, that would increase the overhead for loop
13 * setup / finish, and strcmp might often terminate early.
14 */
15
/*
 * strcmp - compare the NUL-terminated strings addressed by %r0 and %r1.
 *
 * In:     %r0, %r1 - string addresses (both are used as load bases below).
 * Out:    %r0 - 0 if the strings are equal; otherwise non-zero, with bit 31
 *         set when the first differing byte of the %r0 string is the
 *         smaller one (unsigned) and clear when it is the larger one.
 * Writes: %r0-%r5, %r12 and the condition flags.
 * Returns by jumping through %blink.
 */
16.global strcmp
17.align 4
18strcmp:
/* Use the byte loop unless bits 1:0 of both addresses are clear. */
19 or %r2, %r0, %r1
20 bmsk_s %r2, %r2, 1
21 brne %r2, 0, .Lcharloop
/* %r12 = 0x01010101; rotated right by one bit it yields %r5 = 0x80808080. */
22 mov_s %r12, 0x01010101
23 ror %r5, %r12
/* Word loop: load one word per string, advancing each address by 4. */
24.Lwordloop:
25 ld.ab %r2, [%r0, 4]
26 ld.ab %r3, [%r1, 4]
/* NOTE(review): presumably covers load latency / keeps alignment - confirm. */
27 nop_s
/*
 * %r4 = (%r2 - 0x01010101) & ~%r2 & 0x80808080, which is non-zero when the
 * word loaded from the first string contains a zero byte.
 */
28 sub %r4, %r2, %r12
29 bic %r4, %r4, %r2
30 and %r4, %r4, %r5
31 brne %r4, 0, .Lfound0
/* No terminator in this word: keep looping while the two words match. */
32 breq %r2 ,%r3, .Lwordloop
/*
 * The words differ and %r2 holds no zero byte. On little-endian, both
 * words are first reduced to their least significant differing byte (the
 * one at the lowest address) so that the word compare below is decided by
 * that byte; on big-endian the plain word compare already is.
 */
33#ifdef __LITTLE_ENDIAN__
34 xor %r0, %r2, %r3 /* mask for difference */
35 sub_s %r1, %r0, 1
36 bic_s %r0, %r0, %r1 /* mask for least significant difference bit */
37 sub %r1, %r5, %r0
38 xor %r0, %r5, %r1 /* mask for least significant difference byte */
39 and_s %r2, %r2, %r0
40 and_s %r3, %r3, %r0
41#endif /* __LITTLE_ENDIAN__ */
/*
 * Result is 1, or 1 with bit 31 set when %r2 < %r3 (unsigned): the
 * conditional bset runs in the delay slot of the jump and uses the flags
 * produced by cmp_s.
 */
42 cmp_s %r2, %r3
43 mov_s %r0, 1
44 j_s.d [%blink]
45 bset.lo %r0, %r0, 31
46
47 .balign 4
48#ifdef __LITTLE_ENDIAN__
/*
 * A zero byte was flagged in %r2. Same byte isolation as in the mismatch
 * path, except that the zero indicator in %r4 is OR-ed into the difference
 * mask, so the selected byte is the least significant of: first difference,
 * flagged terminator.
 */
49.Lfound0:
50 xor %r0, %r2, %r3 /* mask for difference */
51 or %r0, %r0, %r4 /* or in zero indicator */
52 sub_s %r1, %r0, 1
53 bic_s %r0, %r0, %r1 /* mask for least significant difference bit */
54 sub %r1, %r5, %r0
55 xor %r0, %r5, %r1 /* mask for least significant difference byte */
56 and_s %r2, %r2, %r0
57 and_s %r3, %r3, %r0
/*
 * %r0 = %r2 - %r3, which is 0 when the selected bytes are equal; replaced
 * by 1 when %r2 is higher, and bit 31 is set (delay slot) when it is lower.
 */
58 sub.f %r0, %r2, %r3
59 mov.hi %r0, 1
60 j_s.d [%blink]
61 bset.lo %r0, %r0, 31
62#else /* __BIG_ENDIAN__ */
63 /*
64 * The zero-detection above can mis-detect 0x01 bytes as zeroes
65 * because of carry-propagation from a lower significant zero byte.
66 * We can compensate for this by checking that bit0 is zero.
67 * This compensation is not necessary in the step where we
68 * get a low estimate for r2, because in any affected bytes
69 * we already have 0x00 or 0x01, which will remain unchanged
70 * when bit 7 is cleared.
71 */
72 .balign 4
73.Lfound0:
/* %r0 = zero indicator shifted right by 8, %r1 = %r2 shifted right by 1. */
74 lsr %r0, %r4, 8
75 lsr_s %r1, %r2
76 bic_s %r2, %r2, %r0 /* get low estimate for r2 and get ... */
77 bic_s %r0, %r0, %r1 /* <this is the adjusted mask for zeros> */
78 or_s %r3, %r3, %r0 /* ... high estimate r3 so that r2 > r3 will */
79 cmp_s %r3, %r2 /* ... be independent of trailing garbage */
80 or_s %r2, %r2, %r0 /* likewise for r3 > r2 */
81 bic_s %r3, %r3, %r0
82 rlc %r0, 0 /* r0 := r2 > r3 ? 1 : 0 */
/* Second compare: bit 31 of the result is set (delay slot) when r2 < r3. */
83 cmp_s %r2, %r3
84 j_s.d [%blink]
85 bset.lo %r0, %r0, 31
86#endif /* __LITTLE_ENDIAN__ */
87
88 .balign 4
/* Byte loop: load one byte per string, advancing each address by 1. */
89.Lcharloop:
90 ldb.ab %r2,[%r0,1]
91 ldb.ab %r3,[%r1,1]
/* NOTE(review): presumably covers load latency / keeps alignment - confirm. */
92 nop_s
/* Leave on a zero byte in the first string or on the first mismatch. */
93 breq %r2, 0, .Lcmpend
94 breq %r2, %r3, .Lcharloop
95.Lcmpend:
/* Result is the byte difference; the sub sits in the jump's delay slot. */
96 j_s.d [%blink]
97 sub %r0, %r2, %r3