//
// Copyright (c) 2012 - 2016, Linaro Limited
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the Linaro nor the
//       names of its contributors may be used to endorse or promote products
//       derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//
// Copyright (c) 2015 ARM Ltd
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the company may not be used to endorse or promote
//    products derived from this software without specific prior written
//    permission.
//
// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Assumptions:
//
// ARMv8-a, AArch64, unaligned accesses.
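//
// Note on the register aliases defined below: F_l/F_h share registers with
// srcend/dst, and tmp1/tmp2 share registers with B_h (x9) and dst (x3).
// This is safe only because the two users of each register are never live
// at the same time, e.g. F_l/F_h are loaded in L(copy96) after the last
// use of srcend as a base register.
//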
//
//

#define dstin     x0
#define src       x1
#define count     x2
#define dst       x3
#define srcend    x4
#define dstend    x5
#define A_l       x6
#define A_lw      w6
#define A_h       x7
#define A_hw      w7
#define B_l       x8
#define B_lw      w8
#define B_h       x9
#define C_l       x10
#define C_h       x11
#define D_l       x12
#define D_h       x13
#define E_l       x14
#define E_h       x15
#define F_l       srcend
#define F_h       dst
#define tmp1      x9
#define tmp2      x3

#define L(l) .L ## l

// Copies are split into 3 main cases: small copies of up to 16 bytes,
// medium copies of 17..96 bytes which are fully unrolled. Large copies
// of more than 96 bytes align the destination and use an unrolled loop
// processing 64 bytes per iteration.
// Small and medium copies read all data before writing, allowing any
// kind of overlap, and memmove tailcalls memcpy for these cases as
// well as non-overlapping copies.

__memcpy:
    prfm    PLDL1KEEP, [src]
    add     srcend, src, count
    add     dstend, dstin, count
    cmp     count, 16
    b.ls    L(copy16)
    cmp     count, 96
    b.hi    L(copy_long)

    // Medium copies: 17..96 bytes.
    sub     tmp1, count, 1
    ldp     A_l, A_h, [src]
    tbnz    tmp1, 6, L(copy96)
    ldp     D_l, D_h, [srcend, -16]
    tbz     tmp1, 5, 1f
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [srcend, -32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstend, -32]
1:
    stp     A_l, A_h, [dstin]
    stp     D_l, D_h, [dstend, -16]
    ret

    .p2align 4
    // Small copies: 0..16 bytes.
L(copy16):
    cmp     count, 8
    b.lo    1f
    ldr     A_l, [src]
    ldr     A_h, [srcend, -8]
    str     A_l, [dstin]
    str     A_h, [dstend, -8]
    ret

    .p2align 4
1:
    tbz     count, 2, 1f
    ldr     A_lw, [src]
    ldr     A_hw, [srcend, -4]
    str     A_lw, [dstin]
    str     A_hw, [dstend, -4]
    ret

    // Copy 0..3 bytes. Use a branchless sequence that copies the same
    // byte 3 times if count==1, or the 2nd byte twice if count==2.
1:
    cbz     count, 2f
    lsr     tmp1, count, 1
    ldrb    A_lw, [src]
    ldrb    A_hw, [srcend, -1]
    ldrb    B_lw, [src, tmp1]
    strb    A_lw, [dstin]
    strb    B_lw, [dstin, tmp1]
    strb    A_hw, [dstend, -1]
2:
    ret

    .p2align 4
    // Copy 64..96 bytes. Copy 64 bytes from the start and
    // 32 bytes from the end.
L(copy96):
    ldp     B_l, B_h, [src, 16]
    ldp     C_l, C_h, [src, 32]
    ldp     D_l, D_h, [src, 48]
    ldp     E_l, E_h, [srcend, -32]
    ldp     F_l, F_h, [srcend, -16]
    stp     A_l, A_h, [dstin]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin, 32]
    stp     D_l, D_h, [dstin, 48]
    stp     E_l, E_h, [dstend, -32]
    stp     F_l, F_h, [dstend, -16]
    ret

    // Align DST to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.
    .p2align 4
L(copy_long):
    and     tmp1, dstin, 15
    bic     dst, dstin, 15
    ldp     D_l, D_h, [src]
    sub     src, src, tmp1
    add     count, count, tmp1      // Count is now 16 too large.
    ldp     A_l, A_h, [src, 16]
    stp     D_l, D_h, [dstin]
    ldp     B_l, B_h, [src, 32]
    ldp     C_l, C_h, [src, 48]
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 128 + 16  // Test and readjust count.
    b.ls    2f
1:
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [src, 16]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [src, 32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [src, 48]
    stp     D_l, D_h, [dst, 64]!
    ldp     D_l, D_h, [src, 64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the end even if
    // there is just 1 byte left.
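    // (count was biased upwards by the destination's misalignment and by
    // the 64 bytes already loaded into A_l..D_h, so at most 64 bytes remain
    // past [dst, dst + 80) here. Because the original count exceeded 96,
    // dstend - 64 never falls below dstin: the final block below may overlap
    // earlier stores but never underruns the destination buffer.)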
2:
    ldp     E_l, E_h, [srcend, -64]
    stp     A_l, A_h, [dst, 16]
    ldp     A_l, A_h, [srcend, -48]
    stp     B_l, B_h, [dst, 32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dst, 48]
    ldp     C_l, C_h, [srcend, -16]
    stp     D_l, D_h, [dst, 64]
    stp     E_l, E_h, [dstend, -64]
    stp     A_l, A_h, [dstend, -48]
    stp     B_l, B_h, [dstend, -32]
    stp     C_l, C_h, [dstend, -16]
    ret

//
// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
// Larger backwards copies are also handled by memcpy. The only remaining
// case is forward large copies. The destination is aligned, and an
// unrolled loop processes 64 bytes per iteration.
//
ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
ASM_PFX(InternalMemCopyMem):
    sub     tmp2, dstin, src
    cmp     count, 96
    ccmp    tmp2, count, 2, hi
    b.hs    __memcpy

    cbz     tmp2, 3f
    add     dstend, dstin, count
    add     srcend, src, count

    // Align dstend to 16 byte alignment so that we don't cross cache line
    // boundaries on both loads and stores. There are at least 96 bytes
    // to copy, so copy 16 bytes unaligned and then align. The loop
    // copies 64 bytes per iteration and prefetches one iteration ahead.
    and     tmp2, dstend, 15
    ldp     D_l, D_h, [srcend, -16]
    sub     srcend, srcend, tmp2
    sub     count, count, tmp2
    ldp     A_l, A_h, [srcend, -16]
    stp     D_l, D_h, [dstend, -16]
    ldp     B_l, B_h, [srcend, -32]
    ldp     C_l, C_h, [srcend, -48]
    ldp     D_l, D_h, [srcend, -64]!
    sub     dstend, dstend, tmp2
    subs    count, count, 128
    b.ls    2f
    nop
1:
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [srcend, -16]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [srcend, -32]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [srcend, -48]
    stp     D_l, D_h, [dstend, -64]!
    ldp     D_l, D_h, [srcend, -64]!
    subs    count, count, 64
    b.hi    1b

    // Write the last full set of 64 bytes. The remainder is at most 64
    // bytes, so it is safe to always copy 64 bytes from the start even if
    // there is just 1 byte left.
2:
    ldp     E_l, E_h, [src, 48]
    stp     A_l, A_h, [dstend, -16]
    ldp     A_l, A_h, [src, 32]
    stp     B_l, B_h, [dstend, -32]
    ldp     B_l, B_h, [src, 16]
    stp     C_l, C_h, [dstend, -48]
    ldp     C_l, C_h, [src]
    stp     D_l, D_h, [dstend, -64]
    stp     E_l, E_h, [dstin, 48]
    stp     A_l, A_h, [dstin, 32]
    stp     B_l, B_h, [dstin, 16]
    stp     C_l, C_h, [dstin]
3:
    ret
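//
// For reference, the dispatch at the top of InternalMemCopyMem above can be
// read roughly as the following C (explanatory sketch only, not part of the
// build; copy_backwards is just a name for the backward-copy loop above):
//
//   if (count <= 96 || (UINTN)(dstin - src) >= count) {
//     __memcpy (dstin, src, count);        // small copy, or no forward overlap
//   } else if (dstin != src) {
//     copy_backwards (dstin, src, count);  // dstin lies inside [src, src + count)
//   }
//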