--- /dev/null
+//\r
+// Copyright (c) 2013, Linaro Limited\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted provided that the following conditions are met:\r
+// * Redistributions of source code must retain the above copyright\r
+// notice, this list of conditions and the following disclaimer.\r
+// * Redistributions in binary form must reproduce the above copyright\r
+// notice, this list of conditions and the following disclaimer in the\r
+// documentation and/or other materials provided with the distribution.\r
+// * Neither the name of the Linaro nor the\r
+// names of its contributors may be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+//\r
+\r
+// Assumptions:\r
+//\r
+// ARMv8-a, AArch64\r
+//\r
+\r
+\r
+// Parameters and result.\r
+#define src1 x0\r
+#define src2 x1\r
+#define limit x2\r
+#define result x0\r
+\r
+// Internal variables.\r
+#define data1 x3\r
+#define data1w w3\r
+#define data2 x4\r
+#define data2w w4\r
+#define diff x6\r
+#define endloop x7\r
+#define tmp1 x8\r
+#define tmp2 x9\r
+#define pos x11\r
+#define limit_wd x12\r
+#define mask x13\r
+\r
+ .p2align 6\r
+ASM_GLOBAL ASM_PFX(InternalMemCompareMem)\r
+ASM_PFX(InternalMemCompareMem):\r
+ eor tmp1, src1, src2\r
+ tst tmp1, #7\r
+ b.ne .Lmisaligned8\r
+ ands tmp1, src1, #7\r
+ b.ne .Lmutual_align\r
+ add limit_wd, limit, #7\r
+ lsr limit_wd, limit_wd, #3\r
+\r
+ // Start of performance-critical section -- one 64B cache line.\r
+.Lloop_aligned:\r
+ ldr data1, [src1], #8\r
+ ldr data2, [src2], #8\r
+.Lstart_realigned:\r
+ subs limit_wd, limit_wd, #1\r
+ eor diff, data1, data2 // Non-zero if differences found.\r
+ csinv endloop, diff, xzr, ne // Last Dword or differences.\r
+ cbz endloop, .Lloop_aligned\r
+ // End of performance-critical section -- one 64B cache line.\r
+\r
+ // Not reached the limit, must have found a diff.\r
+ cbnz limit_wd, .Lnot_limit\r
+\r
+ // Limit % 8 == 0 => all bytes significant.\r
+ ands limit, limit, #7\r
+ b.eq .Lnot_limit\r
+\r
+ lsl limit, limit, #3 // Bytes -> bits.\r
+ mov mask, #~0\r
+ lsl mask, mask, limit\r
+ bic data1, data1, mask\r
+ bic data2, data2, mask\r
+\r
+ orr diff, diff, mask\r
+\r
+.Lnot_limit:\r
+ rev diff, diff\r
+ rev data1, data1\r
+ rev data2, data2\r
+\r
+ // The most significant non-zero bit of DIFF marks either the first\r
+ // bit that is different, or the end of the significant data.\r
+ // Shifting left now will bring the critical information into the\r
+ // top bits.\r
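+ // Example: if the first differing bit (after the REVs) is bit 44,\r
+ // CLZ returns 19 and the LSLs move it to bit 63, so the top bytes\r
+ // extracted below compare in the original memory order.\r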
+ clz pos, diff\r
+ lsl data1, data1, pos\r
+ lsl data2, data2, pos\r
+\r
+ // But we need to zero-extend (char is unsigned) the value and then\r
+ // perform a 64-bit subtraction to get the byte difference.\r
+ lsr data1, data1, #56\r
+ sub result, data1, data2, lsr #56\r
+ ret\r
+\r
+.Lmutual_align:\r
+ // Sources are mutually aligned, but are not currently at an\r
+ // alignment boundary. Round down the addresses and then mask off\r
+ // the bytes that precede the start point.\r
+ bic src1, src1, #7\r
+ bic src2, src2, #7\r
+ add limit, limit, tmp1 // Adjust the limit for the extra.\r
+ lsl tmp1, tmp1, #3 // Bytes beyond alignment -> bits.\r
+ ldr data1, [src1], #8\r
+ neg tmp1, tmp1 // Bits to alignment -64.\r
+ ldr data2, [src2], #8\r
+ mov tmp2, #~0\r
+\r
+ // Little-endian. Early bytes are at LSB.\r
+ lsr tmp2, tmp2, tmp1 // Shift (tmp1 & 63).\r
+ add limit_wd, limit, #7\r
+ orr data1, data1, tmp2\r
+ orr data2, data2, tmp2\r
+ lsr limit_wd, limit_wd, #3\r
+ b .Lstart_realigned\r
+\r
+ .p2align 6\r
+.Lmisaligned8:\r
+ sub limit, limit, #1\r
+1:\r
+ // Perhaps we can do better than this.\r
+ ldrb data1w, [src1], #1\r
+ ldrb data2w, [src2], #1\r
+ subs limit, limit, #1\r
+ ccmp data1w, data2w, #0, cs // NZCV = 0b0000.\r
+ b.eq 1b\r
+ sub result, data1, data2\r
+ ret\r
--- /dev/null
+//\r
+// Copyright (c) 2012 - 2016, Linaro Limited\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted provided that the following conditions are met:\r
+// * Redistributions of source code must retain the above copyright\r
+// notice, this list of conditions and the following disclaimer.\r
+// * Redistributions in binary form must reproduce the above copyright\r
+// notice, this list of conditions and the following disclaimer in the\r
+// documentation and/or other materials provided with the distribution.\r
+// * Neither the name of the Linaro nor the\r
+// names of its contributors may be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+//\r
+\r
+//\r
+// Copyright (c) 2015 ARM Ltd\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted provided that the following conditions\r
+// are met:\r
+// 1. Redistributions of source code must retain the above copyright\r
+// notice, this list of conditions and the following disclaimer.\r
+// 2. Redistributions in binary form must reproduce the above copyright\r
+// notice, this list of conditions and the following disclaimer in the\r
+// documentation and/or other materials provided with the distribution.\r
+// 3. The name of the company may not be used to endorse or promote\r
+// products derived from this software without specific prior written\r
+// permission.\r
+//\r
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED\r
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF\r
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\r
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED\r
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\r
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\r
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\r
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\r
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+//\r
+\r
+// Assumptions:\r
+//\r
+// ARMv8-a, AArch64, unaligned accesses.\r
+//\r
+//\r
+\r
+#define dstin x0\r
+#define src x1\r
+#define count x2\r
+#define dst x3\r
+#define srcend x4\r
+#define dstend x5\r
+#define A_l x6\r
+#define A_lw w6\r
+#define A_h x7\r
+#define A_hw w7\r
+#define B_l x8\r
+#define B_lw w8\r
+#define B_h x9\r
+#define C_l x10\r
+#define C_h x11\r
+#define D_l x12\r
+#define D_h x13\r
+#define E_l x14\r
+#define E_h x15\r
+#define F_l srcend\r
+#define F_h dst\r
+#define tmp1 x9\r
+#define tmp2 x3\r
+\r
+#define L(l) .L ## l\r
+\r
+// Copies are split into 3 main cases: small copies of up to 16 bytes,\r
+// medium copies of 17..96 bytes, which are fully unrolled, and large\r
+// copies of more than 96 bytes, which align the destination and use an\r
+// unrolled loop processing 64 bytes per iteration.\r
+// Small and medium copies read all data before writing, allowing any\r
+// kind of overlap, and memmove tail-calls memcpy for these cases as\r
+// well as for non-overlapping copies.\r
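+//\r
+// Roughly, the dispatch below corresponds to this C-level sketch (an\r
+// illustration only, not part of the build):\r
+//\r
+//   if (count <= 16)      copy16();      // small, 0..16 bytes\r
+//   else if (count <= 96) copy_medium(); // fully unrolled\r
+//   else                  copy_long();   // aligned 64-byte loop\r
+//\r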
+\r
+__memcpy:\r
+ prfm PLDL1KEEP, [src]\r
+ add srcend, src, count\r
+ add dstend, dstin, count\r
+ cmp count, 16\r
+ b.ls L(copy16)\r
+ cmp count, 96\r
+ b.hi L(copy_long)\r
+\r
+ // Medium copies: 17..96 bytes.\r
+ sub tmp1, count, 1\r
+ ldp A_l, A_h, [src]\r
+ tbnz tmp1, 6, L(copy96)\r
+ ldp D_l, D_h, [srcend, -16]\r
+ tbz tmp1, 5, 1f\r
+ ldp B_l, B_h, [src, 16]\r
+ ldp C_l, C_h, [srcend, -32]\r
+ stp B_l, B_h, [dstin, 16]\r
+ stp C_l, C_h, [dstend, -32]\r
+1:\r
+ stp A_l, A_h, [dstin]\r
+ stp D_l, D_h, [dstend, -16]\r
+ ret\r
+\r
+ .p2align 4\r
+ // Small copies: 0..16 bytes.\r
+L(copy16):\r
+ cmp count, 8\r
+ b.lo 1f\r
+ ldr A_l, [src]\r
+ ldr A_h, [srcend, -8]\r
+ str A_l, [dstin]\r
+ str A_h, [dstend, -8]\r
+ ret\r
+ .p2align 4\r
+1:\r
+ tbz count, 2, 1f\r
+ ldr A_lw, [src]\r
+ ldr A_hw, [srcend, -4]\r
+ str A_lw, [dstin]\r
+ str A_hw, [dstend, -4]\r
+ ret\r
+\r
+ // Copy 0..3 bytes. Use a branchless sequence that copies the same\r
+ // byte 3 times if count==1, or the 2nd byte twice if count==2.\r
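+ // E.g. for count==3: tmp1==1, so the loads fetch src[0], src[2] and\r
+ // src[1], and the stores fill dst[0], dst[1] and dst[2]; for count==1\r
+ // all three loads and stores hit the same byte.\r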
+1:\r
+ cbz count, 2f\r
+ lsr tmp1, count, 1\r
+ ldrb A_lw, [src]\r
+ ldrb A_hw, [srcend, -1]\r
+ ldrb B_lw, [src, tmp1]\r
+ strb A_lw, [dstin]\r
+ strb B_lw, [dstin, tmp1]\r
+ strb A_hw, [dstend, -1]\r
+2: ret\r
+\r
+ .p2align 4\r
+ // Copy 65..96 bytes. Copy 64 bytes from the start and\r
+ // 32 bytes from the end.\r
+L(copy96):\r
+ ldp B_l, B_h, [src, 16]\r
+ ldp C_l, C_h, [src, 32]\r
+ ldp D_l, D_h, [src, 48]\r
+ ldp E_l, E_h, [srcend, -32]\r
+ ldp F_l, F_h, [srcend, -16]\r
+ stp A_l, A_h, [dstin]\r
+ stp B_l, B_h, [dstin, 16]\r
+ stp C_l, C_h, [dstin, 32]\r
+ stp D_l, D_h, [dstin, 48]\r
+ stp E_l, E_h, [dstend, -32]\r
+ stp F_l, F_h, [dstend, -16]\r
+ ret\r
+\r
+ // Align DST to a 16-byte boundary so that we don't cross cache line\r
+ // boundaries on both loads and stores. There are at least 96 bytes\r
+ // to copy, so copy 16 bytes unaligned and then align. The loop\r
+ // copies 64 bytes per iteration and prefetches one iteration ahead.\r
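+ // E.g. if dstin ends in 0xb, tmp1 is 11: the first 16 bytes are\r
+ // copied unaligned to dstin, then src and dst are rewound by 11\r
+ // bytes and count grows by 11 so the 64-byte loop runs aligned.\r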
+\r
+ .p2align 4\r
+L(copy_long):\r
+ and tmp1, dstin, 15\r
+ bic dst, dstin, 15\r
+ ldp D_l, D_h, [src]\r
+ sub src, src, tmp1\r
+ add count, count, tmp1 // Count is now 16 too large.\r
+ ldp A_l, A_h, [src, 16]\r
+ stp D_l, D_h, [dstin]\r
+ ldp B_l, B_h, [src, 32]\r
+ ldp C_l, C_h, [src, 48]\r
+ ldp D_l, D_h, [src, 64]!\r
+ subs count, count, 128 + 16 // Test and readjust count.\r
+ b.ls 2f\r
+1:\r
+ stp A_l, A_h, [dst, 16]\r
+ ldp A_l, A_h, [src, 16]\r
+ stp B_l, B_h, [dst, 32]\r
+ ldp B_l, B_h, [src, 32]\r
+ stp C_l, C_h, [dst, 48]\r
+ ldp C_l, C_h, [src, 48]\r
+ stp D_l, D_h, [dst, 64]!\r
+ ldp D_l, D_h, [src, 64]!\r
+ subs count, count, 64\r
+ b.hi 1b\r
+\r
+ // Write the last full set of 64 bytes. The remainder is at most 64\r
+ // bytes, so it is safe to always copy 64 bytes from the end even if\r
+ // there is just 1 byte left.\r
+2:\r
+ ldp E_l, E_h, [srcend, -64]\r
+ stp A_l, A_h, [dst, 16]\r
+ ldp A_l, A_h, [srcend, -48]\r
+ stp B_l, B_h, [dst, 32]\r
+ ldp B_l, B_h, [srcend, -32]\r
+ stp C_l, C_h, [dst, 48]\r
+ ldp C_l, C_h, [srcend, -16]\r
+ stp D_l, D_h, [dst, 64]\r
+ stp E_l, E_h, [dstend, -64]\r
+ stp A_l, A_h, [dstend, -48]\r
+ stp B_l, B_h, [dstend, -32]\r
+ stp C_l, C_h, [dstend, -16]\r
+ ret\r
+\r
+\r
+//\r
+// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.\r
+// Larger backwards copies are also handled by memcpy. The only remaining\r
+// case is forward large copies. The destination is aligned, and an\r
+// unrolled loop processes 64 bytes per iteration.\r
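+//\r
+// The CMP/CCMP pair below folds both tests into one branch: B.HS\r
+// tail-calls __memcpy when count <= 96 (the CCMP forces NZCV to 0b0010,\r
+// i.e. C set) or when dstin - src >= count as an unsigned value, which\r
+// covers both dst < src and fully disjoint buffers.\r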
+//\r
+\r
+ASM_GLOBAL ASM_PFX(InternalMemCopyMem)\r
+ASM_PFX(InternalMemCopyMem):\r
+ sub tmp2, dstin, src\r
+ cmp count, 96\r
+ ccmp tmp2, count, 2, hi\r
+ b.hs __memcpy\r
+\r
+ cbz tmp2, 3f\r
+ add dstend, dstin, count\r
+ add srcend, src, count\r
+\r
+ // Align dstend to a 16-byte boundary so that we don't cross cache\r
+ // line boundaries on both loads and stores. There are at least 96\r
+ // bytes to copy, so copy 16 bytes unaligned and then align. The loop\r
+ // copies 64 bytes per iteration and prefetches one iteration ahead.\r
+\r
+ and tmp2, dstend, 15\r
+ ldp D_l, D_h, [srcend, -16]\r
+ sub srcend, srcend, tmp2\r
+ sub count, count, tmp2\r
+ ldp A_l, A_h, [srcend, -16]\r
+ stp D_l, D_h, [dstend, -16]\r
+ ldp B_l, B_h, [srcend, -32]\r
+ ldp C_l, C_h, [srcend, -48]\r
+ ldp D_l, D_h, [srcend, -64]!\r
+ sub dstend, dstend, tmp2\r
+ subs count, count, 128\r
+ b.ls 2f\r
+ nop\r
+1:\r
+ stp A_l, A_h, [dstend, -16]\r
+ ldp A_l, A_h, [srcend, -16]\r
+ stp B_l, B_h, [dstend, -32]\r
+ ldp B_l, B_h, [srcend, -32]\r
+ stp C_l, C_h, [dstend, -48]\r
+ ldp C_l, C_h, [srcend, -48]\r
+ stp D_l, D_h, [dstend, -64]!\r
+ ldp D_l, D_h, [srcend, -64]!\r
+ subs count, count, 64\r
+ b.hi 1b\r
+\r
+ // Write the last full set of 64 bytes. The remainder is at most 64\r
+ // bytes, so it is safe to always copy 64 bytes from the start even if\r
+ // there is just 1 byte left.\r
+2:\r
+ ldp E_l, E_h, [src, 48]\r
+ stp A_l, A_h, [dstend, -16]\r
+ ldp A_l, A_h, [src, 32]\r
+ stp B_l, B_h, [dstend, -32]\r
+ ldp B_l, B_h, [src, 16]\r
+ stp C_l, C_h, [dstend, -48]\r
+ ldp C_l, C_h, [src]\r
+ stp D_l, D_h, [dstend, -64]\r
+ stp E_l, E_h, [dstin, 48]\r
+ stp A_l, A_h, [dstin, 32]\r
+ stp B_l, B_h, [dstin, 16]\r
+ stp C_l, C_h, [dstin]\r
+3: ret\r
--- /dev/null
+//\r
+// Copyright (c) 2014, ARM Limited\r
+// All rights Reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted provided that the following conditions are met:\r
+// * Redistributions of source code must retain the above copyright\r
+// notice, this list of conditions and the following disclaimer.\r
+// * Redistributions in binary form must reproduce the above copyright\r
+// notice, this list of conditions and the following disclaimer in the\r
+// documentation and/or other materials provided with the distribution.\r
+// * Neither the name of the company nor the names of its contributors\r
+// may be used to endorse or promote products derived from this\r
+// software without specific prior written permission.\r
+//\r
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+//\r
+\r
+// Assumptions:\r
+//\r
+// ARMv8-a, AArch64\r
+// Neon Available.\r
+//\r
+\r
+// Arguments and results.\r
+#define srcin x0\r
+#define cntin x1\r
+#define chrin w2\r
+\r
+#define result x0\r
+\r
+#define src x3\r
+#define tmp x4\r
+#define wtmp2 w5\r
+#define synd x6\r
+#define soff x9\r
+#define cntrem x10\r
+\r
+#define vrepchr v0\r
+#define vdata1 v1\r
+#define vdata2 v2\r
+#define vhas_chr1 v3\r
+#define vhas_chr2 v4\r
+#define vrepmask v5\r
+#define vend v6\r
+\r
+//\r
+// Core algorithm:\r
+//\r
+// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits\r
+// per byte. In each two-bit tuple, bit 0 is set if the relevant byte matched\r
+// the requested character and bit 1 is not used (faster than using a 32-bit\r
+// syndrome). Since the bits in the syndrome reflect exactly the order in\r
+// which things occur in the original string, counting trailing zeros allows\r
+// us to identify exactly which byte has matched.\r
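+//\r
+// Worked example: a match at byte offset 3 of a chunk sets bit 6 of the\r
+// syndrome (byte i maps to bit 2*i). RBIT + CLZ in .Ltail then return 6,\r
+// and the final "synd, lsr #1" recovers the byte index 3.\r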
+//\r
+\r
+ASM_GLOBAL ASM_PFX(InternalMemScanMem8)\r
+ASM_PFX(InternalMemScanMem8):\r
+ // Do not dereference srcin if no bytes to compare.\r
+ cbz cntin, .Lzero_length\r
+ //\r
+ // Magic constant 0x40100401 allows us to identify which lane matches\r
+ // the requested byte.\r
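+// 0x40100401 stores the bytes 0x01,0x04,0x10,0x40 in memory order, so\r
+// lane i of each 4-byte group keeps only bit 2*i of its 0xFF/0x00 CMEQ\r
+// result; the two ADDP reductions then sum (effectively OR, since the\r
+// set bits are disjoint) all 32 lanes into one 64-bit syndrome.\r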
+ //\r
+ mov wtmp2, #0x0401\r
+ movk wtmp2, #0x4010, lsl #16\r
+ dup vrepchr.16b, chrin\r
+ // Work with aligned 32-byte chunks\r
+ bic src, srcin, #31\r
+ dup vrepmask.4s, wtmp2\r
+ ands soff, srcin, #31\r
+ and cntrem, cntin, #31\r
+ b.eq .Lloop\r
+\r
+ //\r
+ // Input string is not 32-byte aligned. We calculate the syndrome\r
+ // value for the aligned 32-byte block containing the first bytes\r
+ // and mask off the irrelevant part.\r
+ //\r
+\r
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32\r
+ sub tmp, soff, #32\r
+ adds cntin, cntin, tmp\r
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b\r
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b\r
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b\r
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b\r
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128\r
+ addp vend.16b, vend.16b, vend.16b // 128->64\r
+ mov synd, vend.d[0]\r
+ // Clear the soff*2 lower bits\r
+ lsl tmp, soff, #1\r
+ lsr synd, synd, tmp\r
+ lsl synd, synd, tmp\r
+ // The first block can also be the last\r
+ b.ls .Lmasklast\r
+ // Have we found something already?\r
+ cbnz synd, .Ltail\r
+\r
+.Lloop:\r
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32\r
+ subs cntin, cntin, #32\r
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b\r
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b\r
+ // If we're out of data we finish regardless of the result\r
+ b.ls .Lend\r
+ // Use a fast check for the termination condition\r
+ orr vend.16b, vhas_chr1.16b, vhas_chr2.16b\r
+ addp vend.2d, vend.2d, vend.2d\r
+ mov synd, vend.d[0]\r
+ // We're not out of data, loop if we haven't found the character\r
+ cbz synd, .Lloop\r
+\r
+.Lend:\r
+ // Termination condition found, let's calculate the syndrome value\r
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b\r
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b\r
+ addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128\r
+ addp vend.16b, vend.16b, vend.16b // 128->64\r
+ mov synd, vend.d[0]\r
+ // Only do the clear for the last possible block\r
+ b.hi .Ltail\r
+\r
+.Lmasklast:\r
+ // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits\r
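+ // E.g. with 13 valid bytes in the block, tmp becomes (32 - 13) * 2 =\r
+ // 38 and the LSL/LSR pair clears the top 38 syndrome bits. When the\r
+ // sum is a multiple of 32 the shift amount is 64, which these shifts\r
+ // treat as 0 (mod 64), so a full block keeps all its bits.\r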
+ add tmp, cntrem, soff\r
+ and tmp, tmp, #31\r
+ sub tmp, tmp, #32\r
+ neg tmp, tmp, lsl #1\r
+ lsl synd, synd, tmp\r
+ lsr synd, synd, tmp\r
+\r
+.Ltail:\r
+ // Count the trailing zeros using bit reversal\r
+ rbit synd, synd\r
+ // Compensate for the last post-increment\r
+ sub src, src, #32\r
+ // Check that we have found a character\r
+ cmp synd, #0\r
+ // And count the leading zeros\r
+ clz synd, synd\r
+ // Compute the potential result\r
+ add result, src, synd, lsr #1\r
+ // Select result or NULL\r
+ csel result, xzr, result, eq\r
+ ret\r
+\r
+.Lzero_length:\r
+ mov result, #0\r
+ ret\r
--- /dev/null
+//\r
+// Copyright (c) 2012 - 2016, Linaro Limited\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted provided that the following conditions are met:\r
+// * Redistributions of source code must retain the above copyright\r
+// notice, this list of conditions and the following disclaimer.\r
+// * Redistributions in binary form must reproduce the above copyright\r
+// notice, this list of conditions and the following disclaimer in the\r
+// documentation and/or other materials provided with the distribution.\r
+// * Neither the name of the Linaro nor the\r
+// names of its contributors may be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+//\r
+\r
+//\r
+// Copyright (c) 2015 ARM Ltd\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted provided that the following conditions\r
+// are met:\r
+// 1. Redistributions of source code must retain the above copyright\r
+// notice, this list of conditions and the following disclaimer.\r
+// 2. Redistributions in binary form must reproduce the above copyright\r
+// notice, this list of conditions and the following disclaimer in the\r
+// documentation and/or other materials provided with the distribution.\r
+// 3. The name of the company may not be used to endorse or promote\r
+// products derived from this software without specific prior written\r
+// permission.\r
+//\r
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED\r
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF\r
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\r
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED\r
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\r
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\r
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\r
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\r
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+//\r
+\r
+// Assumptions:\r
+//\r
+// ARMv8-a, AArch64, unaligned accesses\r
+//\r
+//\r
+\r
+#define dstin x0\r
+#define count x1\r
+#define val x2\r
+#define valw w2\r
+#define dst x3\r
+#define dstend x4\r
+#define tmp1 x5\r
+#define tmp1w w5\r
+#define tmp2 x6\r
+#define tmp2w w6\r
+#define zva_len x7\r
+#define zva_lenw w7\r
+\r
+#define L(l) .L ## l\r
+\r
+ASM_GLOBAL ASM_PFX(InternalMemSetMem16)\r
+ASM_PFX(InternalMemSetMem16):\r
+ dup v0.8H, valw\r
+ b 0f\r
+\r
+ASM_GLOBAL ASM_PFX(InternalMemSetMem32)\r
+ASM_PFX(InternalMemSetMem32):\r
+ dup v0.4S, valw\r
+ b 0f\r
+\r
+ASM_GLOBAL ASM_PFX(InternalMemSetMem64)\r
+ASM_PFX(InternalMemSetMem64):\r
+ dup v0.2D, val\r
+ b 0f\r
+\r
+ASM_GLOBAL ASM_PFX(InternalMemZeroMem)\r
+ASM_PFX(InternalMemZeroMem):\r
+ movi v0.16B, #0\r
+ b 0f\r
+\r
+ASM_GLOBAL ASM_PFX(InternalMemSetMem)\r
+ASM_PFX(InternalMemSetMem):\r
+ dup v0.16B, valw\r
+0: add dstend, dstin, count\r
+ mov val, v0.D[0]\r
+\r
+ cmp count, 96\r
+ b.hi L(set_long)\r
+ cmp count, 16\r
+ b.hs L(set_medium)\r
+\r
+ // Set 0..15 bytes.\r
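+ // Each size class issues one store from the start and one from the\r
+ // end, letting the stores overlap instead of matching the length\r
+ // exactly; e.g. count==11 writes 8 bytes at [dstin] and 8 at [dstin+3].\r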
+ tbz count, 3, 1f\r
+ str val, [dstin]\r
+ str val, [dstend, -8]\r
+ ret\r
+ nop\r
+1: tbz count, 2, 2f\r
+ str valw, [dstin]\r
+ str valw, [dstend, -4]\r
+ ret\r
+2: cbz count, 3f\r
+ strb valw, [dstin]\r
+ tbz count, 1, 3f\r
+ strh valw, [dstend, -2]\r
+3: ret\r
+\r
+ // Set 16..96 bytes.\r
+L(set_medium):\r
+ str q0, [dstin]\r
+ tbnz count, 6, L(set96)\r
+ str q0, [dstend, -16]\r
+ tbz count, 5, 1f\r
+ str q0, [dstin, 16]\r
+ str q0, [dstend, -32]\r
+1: ret\r
+\r
+ .p2align 4\r
+ // Set 64..96 bytes. Write 64 bytes from the start and\r
+ // 32 bytes from the end.\r
+L(set96):\r
+ str q0, [dstin, 16]\r
+ stp q0, q0, [dstin, 32]\r
+ stp q0, q0, [dstend, -32]\r
+ ret\r
+\r
+ .p2align 3\r
+ nop\r
+L(set_long):\r
+ bic dst, dstin, 15\r
+ str q0, [dstin]\r
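+ // Take the DC ZVA path only when count >= 256 and val == 0: the CCMP\r
+ // tests val when count >= 256 (CS) and otherwise forces NZCV to 0,\r
+ // i.e. NE, so small or non-zero fills use plain stores.\r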
+ cmp count, 256\r
+ ccmp val, 0, 0, cs\r
+ b.eq L(try_zva)\r
+L(no_zva):\r
+ sub count, dstend, dst // Count is 16 too large.\r
+ add dst, dst, 16\r
+ sub count, count, 64 + 16 // Adjust count and bias for loop.\r
+1: stp q0, q0, [dst], 64\r
+ stp q0, q0, [dst, -32]\r
+L(tail64):\r
+ subs count, count, 64\r
+ b.hi 1b\r
+2: stp q0, q0, [dstend, -64]\r
+ stp q0, q0, [dstend, -32]\r
+ ret\r
+\r
+ .p2align 3\r
+L(try_zva):\r
+ mrs tmp1, dczid_el0\r
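+ // DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited, and bits 3:0\r
+ // give log2 of the ZVA block size in words (4 => 16 words = 64 bytes).\r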
+ tbnz tmp1w, 4, L(no_zva)\r
+ and tmp1w, tmp1w, 15\r
+ cmp tmp1w, 4 // ZVA size is 64 bytes.\r
+ b.ne L(zva_128)\r
+\r
+ // Write the first and last 64 byte aligned block using stp rather\r
+ // than using DC ZVA. This is faster on some cores.\r
+L(zva_64):\r
+ str q0, [dst, 16]\r
+ stp q0, q0, [dst, 32]\r
+ bic dst, dst, 63\r
+ stp q0, q0, [dst, 64]\r
+ stp q0, q0, [dst, 96]\r
+ sub count, dstend, dst // Count is now 128 too large.\r
+ sub count, count, 128+64+64 // Adjust count and bias for loop.\r
+ add dst, dst, 128\r
+ nop\r
+1: dc zva, dst\r
+ add dst, dst, 64\r
+ subs count, count, 64\r
+ b.hi 1b\r
+ stp q0, q0, [dst, 0]\r
+ stp q0, q0, [dst, 32]\r
+ stp q0, q0, [dstend, -64]\r
+ stp q0, q0, [dstend, -32]\r
+ ret\r
+\r
+ .p2align 3\r
+L(zva_128):\r
+ cmp tmp1w, 5 // ZVA size is 128 bytes.\r
+ b.ne L(zva_other)\r
+\r
+ str q0, [dst, 16]\r
+ stp q0, q0, [dst, 32]\r
+ stp q0, q0, [dst, 64]\r
+ stp q0, q0, [dst, 96]\r
+ bic dst, dst, 127\r
+ sub count, dstend, dst // Count is now 128 too large.\r
+ sub count, count, 128+128 // Adjust count and bias for loop.\r
+ add dst, dst, 128\r
+1: dc zva, dst\r
+ add dst, dst, 128\r
+ subs count, count, 128\r
+ b.hi 1b\r
+ stp q0, q0, [dstend, -128]\r
+ stp q0, q0, [dstend, -96]\r
+ stp q0, q0, [dstend, -64]\r
+ stp q0, q0, [dstend, -32]\r
+ ret\r
+\r
+L(zva_other):\r
+ mov tmp2w, 4\r
+ lsl zva_lenw, tmp2w, tmp1w\r
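+ // zva_len = 4 << BS is the block size in bytes, e.g. BS == 6 gives\r
+ // a 256-byte DC ZVA block.\r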
+ add tmp1, zva_len, 64 // Max alignment bytes written.\r
+ cmp count, tmp1\r
+ blo L(no_zva)\r
+\r
+ sub tmp2, zva_len, 1\r
+ add tmp1, dst, zva_len\r
+ add dst, dst, 16\r
+ subs count, tmp1, dst // Actual alignment bytes to write.\r
+ bic tmp1, tmp1, tmp2 // Aligned dc zva start address.\r
+ beq 2f\r
+1: stp q0, q0, [dst], 64\r
+ stp q0, q0, [dst, -32]\r
+ subs count, count, 64\r
+ b.hi 1b\r
+2: mov dst, tmp1\r
+ sub count, dstend, tmp1 // Remaining bytes to write.\r
+ subs count, count, zva_len\r
+ b.lo 4f\r
+3: dc zva, dst\r
+ add dst, dst, zva_len\r
+ subs count, count, zva_len\r
+ b.hs 3b\r
+4: add count, count, zva_len\r
+ b L(tail64)\r
\r
\r
#\r
-# VALID_ARCHITECTURES = IA32 X64 ARM\r
+# VALID_ARCHITECTURES = IA32 X64 ARM AARCH64\r
#\r
\r
[Sources]\r
Arm/CopyMem.asm |RVCT\r
Arm/CompareMem.asm |RVCT\r
\r
+[Sources.AARCH64]\r
+ AArch64/ScanMem.S\r
+ AArch64/SetMem.S\r
+ AArch64/CopyMem.S\r
+ AArch64/CompareMem.S\r
+\r
+[Sources.ARM, Sources.AARCH64]\r
Arm/ScanMemGeneric.c\r
\r
[Sources]\r