]>
Commit | Line | Data |
---|---|---|
c86cd1e1 AB |
1 | //\r |
2 | // Copyright (c) 2013, Linaro Limited\r | |
3 | // All rights reserved.\r | |
aa1b377e | 4 | // SPDX-License-Identifier: BSD-2-Clause-Patent\r |
c86cd1e1 AB |
5 | //\r |
6 | \r | |
7 | // Assumptions:\r | |
8 | //\r | |
9 | // ARMv8-a, AArch64, little-endian data layout\r | |
10 | //\r | |
11 | \r | |
12 | \r | |
13 | // Parameters and result. src1 and src2 are the two buffer addresses, limit is the byte count, and result reuses x0 (the register that holds src1 on entry).\r | |
14 | #define src1 x0\r | |
15 | #define src2 x1\r | |
16 | #define limit x2\r | |
17 | #define result x0\r | |
18 | \r | |
19 | // Internal variables. data1w and data2w are the 32-bit W views of data1 and data2.\r | |
20 | #define data1 x3\r | |
21 | #define data1w w3\r | |
22 | #define data2 x4\r | |
23 | #define data2w w4\r | |
24 | #define diff x6\r | |
25 | #define endloop x7\r | |
26 | #define tmp1 x8\r | |
27 | #define tmp2 x9\r | |
28 | #define pos x11\r | |
29 | #define limit_wd x12\r | |
30 | #define mask x13\r | |
31 | \r | |
32 | .p2align 6\r | |
33 | ASM_GLOBAL ASM_PFX(InternalMemCompareMem)\r | |
34 | ASM_PFX(InternalMemCompareMem):\r | |
35 | eor tmp1, src1, src2\r |
36 | tst tmp1, #7\r |
37 | b.ne .Lmisaligned8\r |
38 | ands tmp1, src1, #7\r |
39 | b.ne .Lmutual_align\r |
40 | add limit_wd, limit, #7\r |
41 | lsr limit_wd, limit_wd, #3\r |
42 | \r |
43 | // Start of performance-critical section -- one 64B cache line. Each pass compares one 8-byte Dword from each buffer, limit_wd is the Dword count (limit rounded up).\r |
44 | .Lloop_aligned:\r |
45 | ldr data1, [src1], #8\r |
46 | ldr data2, [src2], #8\r |
47 | .Lstart_realigned:\r |
48 | subs limit_wd, limit_wd, #1\r |
49 | eor diff, data1, data2 // Non-zero if differences found.\r |
50 | csinv endloop, diff, xzr, ne // All-ones on the last Dword (count reached zero), otherwise a copy of diff.\r |
51 | cbz endloop, .Lloop_aligned\r |
52 | // End of performance-critical section -- one 64B cache line.\r |
53 | \r |
54 | // Dwords remain (limit_wd != 0): the limit was not reached, so the loop must have exited on a difference.\r |
55 | cbnz limit_wd, .Lnot_limit\r |
56 | \r |
57 | // Last Dword. Limit % 8 == 0 => all eight bytes are significant.\r |
58 | ands limit, limit, #7\r |
59 | b.eq .Lnot_limit\r |
60 | \r |
61 | lsl limit, limit, #3 // Bytes -> bits. The mask built next covers the bytes past the limit: they are cleared from both data words and set in diff.\r |
62 | mov mask, #~0\r |
63 | lsl mask, mask, limit\r |
64 | bic data1, data1, mask\r |
65 | bic data2, data2, mask\r |
66 | \r |
67 | orr diff, diff, mask\r |
68 | \r |
69 | .Lnot_limit:\r |
70 | rev diff, diff\r |
71 | rev data1, data1\r |
72 | rev data2, data2\r |
73 | \r |
74 | // After the byte reversal above the earliest byte is the most significant one, so the MS-non-zero bit of DIFF marks either the first bit\r |
75 | // that is different, or the end of the significant data.\r |
76 | // Shifting left now will bring the critical information into the\r |
77 | // top bits.\r |
78 | clz pos, diff\r |
79 | lsl data1, data1, pos\r |
80 | lsl data2, data2, pos\r |
81 | \r |
82 | // But we need to zero-extend (char is unsigned) the top eight bits of each value and then\r |
83 | // subtract them (the subtraction is done on the full 64-bit X registers, and the sign of the result is what orders the buffers).\r |
84 | lsr data1, data1, #56\r |
85 | sub result, data1, data2, lsr #56\r |
86 | ret\r |
87 | \r |
88 | .Lmutual_align:\r |
89 | // Sources are mutually aligned, but are not currently at an\r |
90 | // alignment boundary. Round down the addresses and then neutralise\r |
91 | // the bytes that precede the start point (they are forced to 0xFF in both Dwords so they always compare equal).\r |
92 | bic src1, src1, #7\r |
93 | bic src2, src2, #7\r |
94 | add limit, limit, tmp1 // Adjust the limit for the extra leading bytes.\r |
95 | lsl tmp1, tmp1, #3 // Bytes beyond alignment -> bits.\r |
96 | ldr data1, [src1], #8\r |
97 | neg tmp1, tmp1 // Bits to alignment -64.\r |
98 | ldr data2, [src2], #8\r |
99 | mov tmp2, #~0\r |
100 | \r |
101 | // Little-endian. Early bytes are at LSB.\r |
102 | lsr tmp2, tmp2, tmp1 // Shift (tmp1 & 63), which leaves ones only in the bytes that precede the start point.\r |
103 | add limit_wd, limit, #7\r |
104 | orr data1, data1, tmp2\r |
105 | orr data2, data2, tmp2\r |
106 | lsr limit_wd, limit_wd, #3\r |
107 | b .Lstart_realigned\r |
108 | \r |
109 | .p2align 6\r |
110 | .Lmisaligned8:\r |
111 | sub limit, limit, #1\r |
112 | 1:\r |
113 | // Byte-at-a-time fallback for sources whose alignments differ. Perhaps we can do better than this. NOTE(review): limit == 0 does not look handled here (the count wraps), nor when the aligned loop is entered with limit_wd == 0 -- presumably callers guarantee limit > 0, confirm.\r |
114 | ldrb data1w, [src1], #1\r |
115 | ldrb data2w, [src2], #1\r |
116 | subs limit, limit, #1\r |
117 | ccmp data1w, data2w, #0, cs // Compare the bytes while the count has not borrowed, otherwise force NZCV = 0b0000 (reads as ne, ending the loop).\r |
118 | b.eq 1b\r |
119 | sub result, data1, data2\r |
120 | ret\r |