/*
 * Copyright (c) 2011 - 2013, ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

  .text
  .align 2
  ASM_GLOBAL ASM_PFX(memcpy)

//-----------------------------------------------------------------------
// void *memcpy(void *dst, const void *src, size_t nb)
//
// AArch64, GAS syntax. Taken from Newlib BSD implementation.
//
// In:    x0 = dst, x1 = src, x2 = nb (byte count)
// Out:   x0 = dst (original value, preserved for the caller)
// Uses:  x3 (alignment offset), x4, x5, x8-x13 (copy data),
//        x6 (working dst pointer), flags. Leaf function, no stack use.
//
// Strategy: for copies >= 64 bytes, first do one possibly-overlapping
// 16-byte store to bring the destination up to 16-byte alignment, then
// copy 64 bytes per iteration, then 16, then handle the 0-15 byte tail
// one power-of-two chunk at a time via bit tests on the count.
//-----------------------------------------------------------------------
ASM_PFX(memcpy):
  // Copy dst to x6, so we can preserve return value.
  mov   x6, x0

  // NOTE: although size_t is unsigned, this code uses signed
  // comparisons on x2 so relies on nb never having its top bit
  // set. In practice this is not going to be a real problem.

  // Require at least 64 bytes to be worth aligning.
  cmp   x2, #64
  blt   qwordcopy

  // Compute offset to align destination to 16 bytes:
  // x3 = (-dst) & 15 = number of bytes to reach the next 16-byte boundary.
  neg   x3, x0
  and   x3, x3, 15
  cbz   x3, blockcopy             // offset == 0 is likely

  // We know there is at least 64 bytes to be done, so we
  // do a 16 byte misaligned copy at first and then later do
  // all 16-byte aligned copies. Some bytes will be copied
  // twice, but there's no harm in that since memcpy does not
  // guarantee correctness on overlap.
  sub   x2, x2, x3                // nb -= offset
  ldp   x4, x5, [x1]              // copy 16 bytes unconditionally...
  add   x1, x1, x3                // ...but only advance by 'offset',
  stp   x4, x5, [x6]              // so the aligned loop below rewrites
  add   x6, x6, x3                // the overlapping 16 - offset bytes
  // The destination pointer is now qword (16 byte) aligned.
  // (The src pointer might be.)

blockcopy:
  // Copy 64 bytes at a time. The counter is pre-decremented so the
  // flags from 'subs' drive the loop branch directly.
  subs  x2, x2, #64
  blt   3f                        // fewer than 64 bytes left
2:
  subs  x2, x2, #64
  ldp   x4, x5, [x1,#0]
  ldp   x8, x9, [x1,#16]
  ldp   x10,x11,[x1,#32]
  ldp   x12,x13,[x1,#48]
  add   x1, x1, #64
  stp   x4, x5, [x6,#0]
  stp   x8, x9, [x6,#16]
  stp   x10,x11,[x6,#32]
  stp   x12,x13,[x6,#48]
  add   x6, x6, #64
  bge   2b                        // loop while >= 0 after the subtract

  // Unwind pre-decrement
3:
  add   x2, x2, #64

qwordcopy:
  // Copy 0-48 bytes, 16 bytes at a time.
  subs  x2, x2, #16
  blt   tailcopy
2:
  ldp   x4, x5, [x1],#16
  subs  x2, x2, #16
  stp   x4, x5, [x6],#16
  bge   2b

  // No need to unwind the pre-decrement, it would not change
  // the low 4 bits of the count. But how likely is it for the
  // byte count to be multiple of 16? Is it worth the overhead
  // of testing for x2 == -16?

tailcopy:
  // Copy trailing 0-15 bytes: test each bit of the residual count
  // and copy that power-of-two chunk if set.
  tbz   x2, #3, 1f
  ldr   x4, [x1],#8               // copy 8 bytes
  str   x4, [x6],#8
1:
  tbz   x2, #2, 1f
  ldr   w4, [x1],#4               // copy 4 bytes
  str   w4, [x6],#4
1:
  tbz   x2, #1, 1f
  ldrh  w4, [x1],#2               // copy 2 bytes
  strh  w4, [x6],#2
1:
  tbz   x2, #0, return
  ldrb  w4, [x1]                  // copy 1 byte
  strb  w4, [x6]

return:
  // This is the only return point of memcpy.
  // x0 still holds the original dst, satisfying memcpy's contract.
  ret