+//\r
+// Copyright (c) 2012 - 2016, Linaro Limited\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted provided that the following conditions are met:\r
+// * Redistributions of source code must retain the above copyright\r
+// notice, this list of conditions and the following disclaimer.\r
+// * Redistributions in binary form must reproduce the above copyright\r
+// notice, this list of conditions and the following disclaimer in the\r
+// documentation and/or other materials provided with the distribution.\r
+// * Neither the name of the Linaro nor the\r
+// names of its contributors may be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+//\r
+\r
+//\r
+// Copyright (c) 2015 ARM Ltd\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted provided that the following conditions\r
+// are met:\r
+// 1. Redistributions of source code must retain the above copyright\r
+// notice, this list of conditions and the following disclaimer.\r
+// 2. Redistributions in binary form must reproduce the above copyright\r
+// notice, this list of conditions and the following disclaimer in the\r
+// documentation and/or other materials provided with the distribution.\r
+// 3. The name of the company may not be used to endorse or promote\r
+// products derived from this software without specific prior written\r
+// permission.\r
+//\r
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED\r
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF\r
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\r
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED\r
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\r
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\r
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\r
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\r
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+//\r
+\r
+// Assumptions:\r
+//\r
+// ARMv8-A, AArch64, unaligned accesses.\r
+//\r
+//\r
+\r
+#define dstin x0\r
+#define src x1\r
+#define count x2\r
+#define dst x3\r
+#define srcend x4\r
+#define dstend x5\r
+#define A_l x6\r
+#define A_lw w6\r
+#define A_h x7\r
+#define A_hw w7\r
+#define B_l x8\r
+#define B_lw w8\r
+#define B_h x9\r
+#define C_l x10\r
+#define C_h x11\r
+#define D_l x12\r
+#define D_h x13\r
+#define E_l x14\r
+#define E_h x15\r
+#define F_l srcend\r
+#define F_h dst\r
+#define tmp1 x9\r
+#define tmp2 x3\r
+\r
+#define L(l) .L ## l\r
+\r
+// Copies are split into 3 main cases: small copies of up to 16 bytes,\r
+// medium copies of 17..96 bytes, which are fully unrolled, and large\r
+// copies of more than 96 bytes, which align the destination and use an\r
+// unrolled loop processing 64 bytes per iteration.\r
+// Small and medium copies read all data before writing, allowing any\r
+// kind of overlap, and memmove tail-calls memcpy for these cases as\r
+// well as for non-overlapping copies.\r
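+//\r
+// A hedged C sketch of this dispatch (illustrative only; the helper\r
+// names are hypothetical and do not appear in this file):\r
+//\r
+//   void *memcpy_sketch (void *dst, const void *src, size_t n) {\r
+//     if (n <= 16) return copy_0_16 (dst, src, n);   // small\r
+//     if (n <= 96) return copy_17_96 (dst, src, n);  // medium, unrolled\r
+//     return copy_large (dst, src, n);               // align dst, 64B loop\r
+//   }\r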
+\r
+__memcpy:\r
+ prfm PLDL1KEEP, [src]\r
+ add srcend, src, count\r
+ add dstend, dstin, count\r
+ cmp count, 16\r
+ b.ls L(copy16)\r
+ cmp count, 96\r
+ b.hi L(copy_long)\r
+\r
+ // Medium copies: 17..96 bytes.\r
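+ // These paths branch only on bits of count - 1 (bit 6: count >= 65,\r
+ // bit 5: count >= 33) and otherwise rely on loads from both ends whose\r
+ // stores may overlap in the middle. A hedged C sketch of the 17..32\r
+ // byte sub-case, with copy16 a hypothetical helper moving exactly\r
+ // 16 bytes:\r
+ //\r
+ //   copy16 (dst, src);                    // first 16 bytes\r
+ //   copy16 (dst + n - 16, src + n - 16);  // last 16; overlap is harmless\r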
+ sub tmp1, count, 1\r
+ ldp A_l, A_h, [src]\r
+ tbnz tmp1, 6, L(copy96)\r
+ ldp D_l, D_h, [srcend, -16]\r
+ tbz tmp1, 5, 1f\r
+ ldp B_l, B_h, [src, 16]\r
+ ldp C_l, C_h, [srcend, -32]\r
+ stp B_l, B_h, [dstin, 16]\r
+ stp C_l, C_h, [dstend, -32]\r
+1:\r
+ stp A_l, A_h, [dstin]\r
+ stp D_l, D_h, [dstend, -16]\r
+ ret\r
+\r
+ .p2align 4\r
+ // Small copies: 0..16 bytes.\r
+L(copy16):\r
+ cmp count, 8\r
+ b.lo 1f\r
+ ldr A_l, [src]\r
+ ldr A_h, [srcend, -8]\r
+ str A_l, [dstin]\r
+ str A_h, [dstend, -8]\r
+ ret\r
+ .p2align 4\r
+1:\r
+ tbz count, 2, 1f\r
+ ldr A_lw, [src]\r
+ ldr A_hw, [srcend, -4]\r
+ str A_lw, [dstin]\r
+ str A_hw, [dstend, -4]\r
+ ret\r
+\r
+ // Copy 0..3 bytes. Use a branchless sequence that copies the same\r
+ // byte 3 times if count==1, or the 2nd byte twice if count==2.\r
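+ // A hedged C sketch of the same idea, for n in 1..3:\r
+ //\r
+ //   dst[0]    = src[0];      // first byte\r
+ //   dst[n>>1] = src[n>>1];   // middle byte (== first if n == 1)\r
+ //   dst[n-1]  = src[n-1];    // last byte (== middle if n == 2)\r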
+1:\r
+ cbz count, 2f\r
+ lsr tmp1, count, 1\r
+ ldrb A_lw, [src]\r
+ ldrb A_hw, [srcend, -1]\r
+ ldrb B_lw, [src, tmp1]\r
+ strb A_lw, [dstin]\r
+ strb B_lw, [dstin, tmp1]\r
+ strb A_hw, [dstend, -1]\r
+2: ret\r
+\r
+ .p2align 4\r
+ // Copy 65..96 bytes. Copy 64 bytes from the start and\r
+ // 32 bytes from the end.\r
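+ //\r
+ // Hedged C sketch (copy32/copy64 are hypothetical fixed-size helpers):\r
+ //\r
+ //   copy64 (dst, src);                    // bytes 0..63\r
+ //   copy32 (dst + n - 32, src + n - 32);  // last 32; overlaps when n < 96\r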
+L(copy96):\r
+ ldp B_l, B_h, [src, 16]\r
+ ldp C_l, C_h, [src, 32]\r
+ ldp D_l, D_h, [src, 48]\r
+ ldp E_l, E_h, [srcend, -32]\r
+ ldp F_l, F_h, [srcend, -16]\r
+ stp A_l, A_h, [dstin]\r
+ stp B_l, B_h, [dstin, 16]\r
+ stp C_l, C_h, [dstin, 32]\r
+ stp D_l, D_h, [dstin, 48]\r
+ stp E_l, E_h, [dstend, -32]\r
+ stp F_l, F_h, [dstend, -16]\r
+ ret\r
+\r
+ // Align dst to 16 bytes so that the stores in the loop below do not\r
+ // cross cache line boundaries. There are at least 96 bytes to copy,\r
+ // so copy 16 bytes unaligned and then align. The loop copies 64 bytes\r
+ // per iteration, with the loads running one iteration ahead of the\r
+ // stores.\r
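+ //\r
+ // A hedged C sketch of the setup (uintptr_t arithmetic; variable names\r
+ // mirror the register aliases, illustrative only):\r
+ //\r
+ //   tmp1  = dstin & 15;       // destination misalignment\r
+ //   dst   = dstin & ~15;      // round dst down to 16 bytes\r
+ //   src   = src - tmp1;       // move src back by the same amount\r
+ //   count = count + tmp1;     // count now runs from the aligned base\r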
+\r
+ .p2align 4\r
+L(copy_long):\r
+ and tmp1, dstin, 15\r
+ bic dst, dstin, 15\r
+ ldp D_l, D_h, [src]\r
+ sub src, src, tmp1\r
+ add count, count, tmp1 // Count is now 16 too large.\r
+ ldp A_l, A_h, [src, 16]\r
+ stp D_l, D_h, [dstin]\r
+ ldp B_l, B_h, [src, 32]\r
+ ldp C_l, C_h, [src, 48]\r
+ ldp D_l, D_h, [src, 64]!\r
+ subs count, count, 128 + 16 // Test and readjust count.\r
+ b.ls 2f\r
+1:\r
+ stp A_l, A_h, [dst, 16]\r
+ ldp A_l, A_h, [src, 16]\r
+ stp B_l, B_h, [dst, 32]\r
+ ldp B_l, B_h, [src, 32]\r
+ stp C_l, C_h, [dst, 48]\r
+ ldp C_l, C_h, [src, 48]\r
+ stp D_l, D_h, [dst, 64]!\r
+ ldp D_l, D_h, [src, 64]!\r
+ subs count, count, 64\r
+ b.hi 1b\r
+\r
+ // Write the last full set of 64 bytes. The remainder is at most 64\r
+ // bytes, so it is safe to always copy 64 bytes from the end even if\r
+ // there is just 1 byte left.\r
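+ //\r
+ // Hedged C sketch (store64/copy64 are hypothetical helpers; store64\r
+ // writes the 64 bytes the loop already holds in registers):\r
+ //\r
+ //   store64 (dst + 16);                  // flush the last full set\r
+ //   copy64 (dstend - 64, srcend - 64);   // tail; may rewrite a few bytes\r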
+2:\r
+ ldp E_l, E_h, [srcend, -64]\r
+ stp A_l, A_h, [dst, 16]\r
+ ldp A_l, A_h, [srcend, -48]\r
+ stp B_l, B_h, [dst, 32]\r
+ ldp B_l, B_h, [srcend, -32]\r
+ stp C_l, C_h, [dst, 48]\r
+ ldp C_l, C_h, [srcend, -16]\r
+ stp D_l, D_h, [dst, 64]\r
+ stp E_l, E_h, [dstend, -64]\r
+ stp A_l, A_h, [dstend, -48]\r
+ stp B_l, B_h, [dstend, -32]\r
+ stp C_l, C_h, [dstend, -16]\r
+ ret\r
+\r
+\r
+//\r
+// All memmoves of up to 96 bytes are done by memcpy, as it supports\r
+// overlaps. Larger backward copies (dst below src) are also handled by\r
+// memcpy. The only remaining case is large forward copies, where dst\r
+// lies inside the source buffer; these are copied backwards. The\r
+// destination is aligned, and an unrolled loop processes 64 bytes per\r
+// iteration.\r
+//\r
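+// A hedged C sketch of the dispatch below (illustrative only):\r
+//\r
+//   if (count <= 96 || (uintptr_t)(dstin - src) >= count)\r
+//     return memcpy (dstin, src, count);  // small, or no forward overlap\r
+//   // otherwise dst is inside [src, src + count): copy backwards\r
+//\r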
+\r
+ASM_GLOBAL ASM_PFX(InternalMemCopyMem)\r
+ASM_PFX(InternalMemCopyMem):\r
+ sub tmp2, dstin, src // tmp2 = dst - src (mod 2^64)\r
+ cmp count, 96\r
+ ccmp tmp2, count, 2, hi // if count > 96, compare dst - src with count; else force C\r
+ b.hs __memcpy // count <= 96 or no forward overlap: memcpy is safe\r
+\r
+ cbz tmp2, 3f // dst == src: nothing to do\r
+ add dstend, dstin, count\r
+ add srcend, src, count\r
+\r
+ // Align dstend to 16 bytes so that the stores in the loop below do\r
+ // not cross cache line boundaries. There are at least 96 bytes to\r
+ // copy, so copy 16 bytes unaligned and then align. The loop copies\r
+ // 64 bytes per iteration, with the loads running one iteration ahead\r
+ // of the stores.\r
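+ //\r
+ // Hedged C sketch of the backward setup (mirror image of copy_long):\r
+ //\r
+ //   tmp2   = dstend & 15;     // misalignment of the end pointer\r
+ //   srcend = srcend - tmp2;   // move both ends back in lockstep\r
+ //   dstend = dstend - tmp2;   // dstend is now 16-byte aligned\r
+ //   count  = count - tmp2;\r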
+\r
+ and tmp2, dstend, 15\r
+ ldp D_l, D_h, [srcend, -16]\r
+ sub srcend, srcend, tmp2\r
+ sub count, count, tmp2\r
+ ldp A_l, A_h, [srcend, -16]\r
+ stp D_l, D_h, [dstend, -16]\r
+ ldp B_l, B_h, [srcend, -32]\r
+ ldp C_l, C_h, [srcend, -48]\r
+ ldp D_l, D_h, [srcend, -64]!\r
+ sub dstend, dstend, tmp2\r
+ subs count, count, 128\r
+ b.ls 2f\r
+ nop\r
+1:\r
+ stp A_l, A_h, [dstend, -16]\r
+ ldp A_l, A_h, [srcend, -16]\r
+ stp B_l, B_h, [dstend, -32]\r
+ ldp B_l, B_h, [srcend, -32]\r
+ stp C_l, C_h, [dstend, -48]\r
+ ldp C_l, C_h, [srcend, -48]\r
+ stp D_l, D_h, [dstend, -64]!\r
+ ldp D_l, D_h, [srcend, -64]!\r
+ subs count, count, 64\r
+ b.hi 1b\r
+\r
+ // Write the last full set of 64 bytes. The remainder is at most 64\r
+ // bytes, so it is safe to always copy 64 bytes from the start even if\r
+ // there is just 1 byte left.\r
+2:\r
+ ldp E_l, E_h, [src, 48]\r
+ stp A_l, A_h, [dstend, -16]\r
+ ldp A_l, A_h, [src, 32]\r
+ stp B_l, B_h, [dstend, -32]\r
+ ldp B_l, B_h, [src, 16]\r
+ stp C_l, C_h, [dstend, -48]\r
+ ldp C_l, C_h, [src]\r
+ stp D_l, D_h, [dstend, -64]\r
+ stp E_l, E_h, [dstin, 48]\r
+ stp A_l, A_h, [dstin, 32]\r
+ stp B_l, B_h, [dstin, 16]\r
+ stp C_l, C_h, [dstin]\r
+3: ret\r