From: Ard Biesheuvel Date: Tue, 6 Sep 2016 14:06:38 +0000 (+0100) Subject: MdePkg/BaseMemoryLibOptDxe: add accelerated ARM routines X-Git-Tag: edk2-stable201903~5780 X-Git-Url: https://git.proxmox.com/?p=mirror_edk2.git;a=commitdiff_plain;h=a37f660599e8aefabf29a1ac9bef02ce55a3130c MdePkg/BaseMemoryLibOptDxe: add accelerated ARM routines This adds ARM support to BaseMemoryLibOptDxe, partially based on the cortex-strings library (ScanMem) and the existing CopyMem() implementation from BaseMemoryLibStm in ArmPkg. All string routines are accelerated except ScanMem16, ScanMem32, ScanMem64 and IsZeroBuffer, which can wait for another day. (Very few occurrences exist in the codebase) Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Ard Biesheuvel Reviewed-by: Liming Gao --- diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S new file mode 100644 index 0000000000..951d15777a --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S @@ -0,0 +1,138 @@ +// +// Copyright (c) 2013 - 2016, Linaro Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of the Linaro nor the +// names of its contributors may be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +// Parameters and result. +#define src1 r0 +#define src2 r1 +#define limit r2 +#define result r0 + +// Internal variables. +#define data1 r3 +#define data2 r4 +#define limit_wd r5 +#define diff r6 +#define tmp1 r7 +#define tmp2 r12 +#define pos r8 +#define mask r14 + + .text + .thumb + .syntax unified + .align 5 +ASM_GLOBAL ASM_PFX(InternalMemCompareMem) +ASM_PFX(InternalMemCompareMem): + push {r4-r8, lr} + eor tmp1, src1, src2 + tst tmp1, #3 + bne .Lmisaligned4 + ands tmp1, src1, #3 + bne .Lmutual_align + add limit_wd, limit, #3 + nop.w + lsr limit_wd, limit_wd, #2 + + // Start of performance-critical section -- one 32B cache line. +.Lloop_aligned: + ldr data1, [src1], #4 + ldr data2, [src2], #4 +.Lstart_realigned: + subs limit_wd, limit_wd, #1 + eor diff, data1, data2 // Non-zero if differences found. + cbnz diff, 0f + bne .Lloop_aligned + // End of performance-critical section -- one 32B cache line. + + // Not reached the limit, must have found a diff. +0: cbnz limit_wd, .Lnot_limit + + // Limit % 4 == 0 => all bytes significant. + ands limit, limit, #3 + beq .Lnot_limit + + lsl limit, limit, #3 // Bits -> bytes. 
+ mov mask, #~0 + lsl mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + orr diff, diff, mask + +.Lnot_limit: + rev diff, diff + rev data1, data1 + rev data2, data2 + + // The MS-non-zero bit of DIFF marks either the first bit + // that is different, or the end of the significant data. + // Shifting left now will bring the critical information into the + // top bits. + clz pos, diff + lsl data1, data1, pos + lsl data2, data2, pos + + // But we need to zero-extend (char is unsigned) the value and then + // perform a signed 32-bit subtraction. + lsr data1, data1, #28 + sub result, data1, data2, lsr #28 + pop {r4-r8, pc} + +.Lmutual_align: + // Sources are mutually aligned, but are not currently at an + // alignment boundary. Round down the addresses and then mask off + // the bytes that precede the start point. + bic src1, src1, #3 + bic src2, src2, #3 + add limit, limit, tmp1 // Adjust the limit for the extra. + lsl tmp1, tmp1, #2 // Bytes beyond alignment -> bits. + ldr data1, [src1], #4 + neg tmp1, tmp1 // Bits to alignment -32. + ldr data2, [src2], #4 + mov tmp2, #~0 + + // Little-endian. Early bytes are at LSB. + lsr tmp2, tmp2, tmp1 // Shift (tmp1 & 31). + add limit_wd, limit, #3 + orr data1, data1, tmp2 + orr data2, data2, tmp2 + lsr limit_wd, limit_wd, #2 + b .Lstart_realigned + +.Lmisaligned4: + sub limit, limit, #1 +1: + // Perhaps we can do better than this. + ldrb data1, [src1], #1 + ldrb data2, [src2], #1 + subs limit, limit, #1 + it cs + cmpcs data1, data2 + beq 1b + sub result, data1, data2 + pop {r4-r8, pc} diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm new file mode 100644 index 0000000000..47b49ee164 --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm @@ -0,0 +1,140 @@ +; +; Copyright (c) 2013 - 2016, Linaro Limited +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; * Neither the name of the Linaro nor the +; names of its contributors may be used to endorse or promote products +; derived from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +; + +; Parameters and result. +#define src1 r0 +#define src2 r1 +#define limit r2 +#define result r0 + +; Internal variables. 
+#define data1 r3 +#define data2 r4 +#define limit_wd r5 +#define diff r6 +#define tmp1 r7 +#define tmp2 r12 +#define pos r8 +#define mask r14 + + EXPORT InternalMemCompareMem + THUMB + AREA CompareMem, CODE, READONLY + +InternalMemCompareMem + push {r4-r8, lr} + eor tmp1, src1, src2 + tst tmp1, #3 + bne Lmisaligned4 + ands tmp1, src1, #3 + bne Lmutual_align + add limit_wd, limit, #3 + nop.w + lsr limit_wd, limit_wd, #2 + + ; Start of performance-critical section -- one 32B cache line. +Lloop_aligned + ldr data1, [src1], #4 + ldr data2, [src2], #4 +Lstart_realigned + subs limit_wd, limit_wd, #1 + eor diff, data1, data2 ; Non-zero if differences found. + cbnz diff, L0 + bne Lloop_aligned + ; End of performance-critical section -- one 32B cache line. + + ; Not reached the limit, must have found a diff. +L0 + cbnz limit_wd, Lnot_limit + + // Limit % 4 == 0 => all bytes significant. + ands limit, limit, #3 + beq Lnot_limit + + lsl limit, limit, #3 // Bits -> bytes. + mov mask, #~0 + lsl mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + orr diff, diff, mask + +Lnot_limit + rev diff, diff + rev data1, data1 + rev data2, data2 + + ; The MS-non-zero bit of DIFF marks either the first bit + ; that is different, or the end of the significant data. + ; Shifting left now will bring the critical information into the + ; top bits. + clz pos, diff + lsl data1, data1, pos + lsl data2, data2, pos + + ; But we need to zero-extend (char is unsigned) the value and then + ; perform a signed 32-bit subtraction. + lsr data1, data1, #28 + sub result, data1, data2, lsr #28 + pop {r4-r8, pc} + +Lmutual_align + ; Sources are mutually aligned, but are not currently at an + ; alignment boundary. Round down the addresses and then mask off + ; the bytes that precede the start point. + bic src1, src1, #3 + bic src2, src2, #3 + add limit, limit, tmp1 ; Adjust the limit for the extra. + lsl tmp1, tmp1, #2 ; Bytes beyond alignment -> bits. 
+ ldr data1, [src1], #4 + neg tmp1, tmp1 ; Bits to alignment -32. + ldr data2, [src2], #4 + mov tmp2, #~0 + + ; Little-endian. Early bytes are at LSB. + lsr tmp2, tmp2, tmp1 ; Shift (tmp1 & 31). + add limit_wd, limit, #3 + orr data1, data1, tmp2 + orr data2, data2, tmp2 + lsr limit_wd, limit_wd, #2 + b Lstart_realigned + +Lmisaligned4 + sub limit, limit, #1 +L1 + // Perhaps we can do better than this. + ldrb data1, [src1], #1 + ldrb data2, [src2], #1 + subs limit, limit, #1 + it cs + cmpcs data1, data2 + beq L1 + sub result, data1, data2 + pop {r4-r8, pc} + + END diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S new file mode 100644 index 0000000000..fb5293befc --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S @@ -0,0 +1,172 @@ +#------------------------------------------------------------------------------ +# +# CopyMem() worker for ARM +# +# This file started out as C code that did 64 bit moves if the buffer was +# 32-bit aligned, else it does a byte copy. It also does a byte copy for +# any trailing bytes. It was updated to do 32-byte copies using stm/ldm. +# +# Copyright (c) 2008 - 2010, Apple Inc. All rights reserved.
+# Copyright (c) 2016, Linaro Ltd. All rights reserved.
+# This program and the accompanying materials +# are licensed and made available under the terms and conditions of the BSD License +# which accompanies this distribution. The full text of the license may be found at +# http://opensource.org/licenses/bsd-license.php +# +# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. +# +#------------------------------------------------------------------------------ + + .text + .thumb + .syntax unified + +/** + Copy Length bytes from Source to Destination. Overlap is OK. + + This implementation + + @param Destination Target of copy + @param Source Place to copy from + @param Length Number of bytes to copy + + @return Destination + + +VOID * +EFIAPI +InternalMemCopyMem ( + OUT VOID *DestinationBuffer, + IN CONST VOID *SourceBuffer, + IN UINTN Length + ) +**/ +ASM_GLOBAL ASM_PFX(InternalMemCopyMem) +ASM_PFX(InternalMemCopyMem): + push {r4-r11, lr} + // Save the input parameters in extra registers (r11 = destination, r14 = source, r12 = length) + mov r11, r0 + mov r10, r0 + mov r12, r2 + mov r14, r1 + + cmp r11, r1 + // If (dest < source) + bcc memcopy_check_optim_default + + // If (source + length < dest) + rsb r3, r1, r11 + cmp r12, r3 + bcc memcopy_check_optim_default + b memcopy_check_optim_overlap + +memcopy_check_optim_default: + // Check if we can use an optimized path ((length >= 32) && destination word-aligned && source word-aligned) for the memcopy (optimized path if r0 == 1) + tst r0, #0xF + it ne + movne r0, #0 + bne memcopy_default + tst r1, #0xF + ite ne + movne r3, #0 + moveq r3, #1 + cmp r2, #31 + ite ls + movls r0, #0 + andhi r0, r3, #1 + b memcopy_default + +memcopy_check_optim_overlap: + // r10 = dest_end, r14 = source_end + add r10, r11, r12 + add r14, r12, r1 + + // Are we in the optimized case ((length >= 32) && dest_end word-aligned && source_end word-aligned) + cmp r2, #31 + ite ls + movls r0, #0 + movhi r0, 
#1 + tst r10, #0xF + it ne + movne r0, #0 + tst r14, #0xF + it ne + movne r0, #0 + b memcopy_overlapped + +memcopy_overlapped_non_optim: + // We read 1 byte from the end of the source buffer + sub r3, r14, #1 + sub r12, r12, #1 + ldrb r3, [r3, #0] + sub r2, r10, #1 + cmp r12, #0 + // We write 1 byte at the end of the dest buffer + sub r10, r10, #1 + sub r14, r14, #1 + strb r3, [r2, #0] + bne memcopy_overlapped_non_optim + b memcopy_end + +// r10 = dest_end, r14 = source_end +memcopy_overlapped: + // Are we in the optimized case ? + cmp r0, #0 + beq memcopy_overlapped_non_optim + + // Optimized Overlapped - Read 32 bytes + sub r14, r14, #32 + sub r12, r12, #32 + cmp r12, #31 + ldmia r14, {r2-r9} + + // If length is less than 32 then disable optim + it ls + movls r0, #0 + + cmp r12, #0 + + // Optimized Overlapped - Write 32 bytes + sub r10, r10, #32 + stmia r10, {r2-r9} + + // while (length != 0) + bne memcopy_overlapped + b memcopy_end + +memcopy_default_non_optim: + // Byte copy + ldrb r3, [r14], #1 + sub r12, r12, #1 + strb r3, [r10], #1 + +memcopy_default: + cmp r12, #0 + beq memcopy_end + +// r10 = dest, r14 = source +memcopy_default_loop: + cmp r0, #0 + beq memcopy_default_non_optim + + // Optimized memcopy - Read 32 Bytes + sub r12, r12, #32 + cmp r12, #31 + ldmia r14!, {r2-r9} + + // If length is less than 32 then disable optim + it ls + movls r0, #0 + + cmp r12, #0 + + // Optimized memcopy - Write 32 Bytes + stmia r10!, {r2-r9} + + // while (length != 0) + bne memcopy_default_loop + +memcopy_end: + mov r0, r11 + pop {r4-r11, pc} diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm new file mode 100644 index 0000000000..2034807954 --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm @@ -0,0 +1,147 @@ +;------------------------------------------------------------------------------ +; +; CopyMem() worker for ARM +; +; This file started out as C code that did 64 bit moves if the 
buffer was +; 32-bit aligned, else it does a byte copy. It also does a byte copy for +; any trailing bytes. It was updated to do 32-byte copies using stm/ldm. +; +; Copyright (c) 2008 - 2010, Apple Inc. All rights reserved.
+; Copyright (c) 2016, Linaro Ltd. All rights reserved.
+; This program and the accompanying materials +; are licensed and made available under the terms and conditions of the BSD License +; which accompanies this distribution. The full text of the license may be found at +; http://opensource.org/licenses/bsd-license.php +; +; THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS, +; WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. +; +;------------------------------------------------------------------------------ + + EXPORT InternalMemCopyMem + AREA SetMem, CODE, READONLY + THUMB + +InternalMemCopyMem + stmfd sp!, {r4-r11, lr} + // Save the input parameters in extra registers (r11 = destination, r14 = source, r12 = length) + mov r11, r0 + mov r10, r0 + mov r12, r2 + mov r14, r1 + +memcopy_check_overlapped + cmp r11, r1 + // If (dest < source) + bcc memcopy_check_optim_default + + // If (source + length < dest) + rsb r3, r1, r11 + cmp r12, r3 + bcc memcopy_check_optim_default + b memcopy_check_optim_overlap + +memcopy_check_optim_default + // Check if we can use an optimized path ((length >= 32) && destination word-aligned && source word-aligned) for the memcopy (optimized path if r0 == 1) + tst r0, #0xF + movne r0, #0 + bne memcopy_default + tst r1, #0xF + movne r3, #0 + moveq r3, #1 + cmp r2, #31 + movls r0, #0 + andhi r0, r3, #1 + b memcopy_default + +memcopy_check_optim_overlap + // r10 = dest_end, r14 = source_end + add r10, r11, r12 + add r14, r12, r1 + + // Are we in the optimized case ((length >= 32) && dest_end word-aligned && source_end word-aligned) + cmp r2, #31 + movls r0, #0 + movhi r0, #1 + tst r10, #0xF + movne r0, #0 + tst r14, #0xF + movne r0, #0 + b memcopy_overlapped + +memcopy_overlapped_non_optim + // We read 1 byte from the end of the source buffer + sub r3, r14, #1 + sub r12, r12, #1 + ldrb r3, [r3, #0] + sub r2, r10, #1 + cmp r12, #0 + // We write 1 byte at the end of the dest buffer + sub r10, r10, #1 + sub r14, r14, #1 + strb r3, [r2, #0] + bne 
memcopy_overlapped_non_optim + b memcopy_end + +// r10 = dest_end, r14 = source_end +memcopy_overlapped + // Are we in the optimized case ? + cmp r0, #0 + beq memcopy_overlapped_non_optim + + // Optimized Overlapped - Read 32 bytes + sub r14, r14, #32 + sub r12, r12, #32 + cmp r12, #31 + ldmia r14, {r2-r9} + + // If length is less than 32 then disable optim + movls r0, #0 + + cmp r12, #0 + + // Optimized Overlapped - Write 32 bytes + sub r10, r10, #32 + stmia r10, {r2-r9} + + // while (length != 0) + bne memcopy_overlapped + b memcopy_end + +memcopy_default_non_optim + // Byte copy + ldrb r3, [r14], #1 + sub r12, r12, #1 + strb r3, [r10], #1 + +memcopy_default + cmp r12, #0 + beq memcopy_end + +// r10 = dest, r14 = source +memcopy_default_loop + cmp r0, #0 + beq memcopy_default_non_optim + + // Optimized memcopy - Read 32 Bytes + sub r12, r12, #32 + cmp r12, #31 + ldmia r14!, {r2-r9} + + // If length is less than 32 then disable optim + movls r0, #0 + + cmp r12, #0 + + // Optimized memcopy - Write 32 Bytes + stmia r10!, {r2-r9} + + // while (length != 0) + bne memcopy_default_loop + +memcopy_end + mov r0, r11 + ldmfd sp!, {r4-r11, pc} + + END + diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S new file mode 100644 index 0000000000..dc0e74e865 --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S @@ -0,0 +1,146 @@ +// Copyright (c) 2010-2011, Linaro Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the name of Linaro Limited nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +// +// Written by Dave Gilbert +// +// This memchr routine is optimised on a Cortex-A9 and should work on +// all ARMv7 processors. It has a fast path for short sizes, and has +// an optimised path for large data sets; the worst case is finding the +// match early in a large data set. 
+// + + +// 2011-02-07 david.gilbert@linaro.org +// Extracted from local git a5b438d861 +// 2011-07-14 david.gilbert@linaro.org +// Import endianness fix from local git ea786f1b +// 2011-12-07 david.gilbert@linaro.org +// Removed unneeded cbz from align loop + +// this lets us check a flag in a 00/ff byte easily in either endianness +#define CHARTSTMASK(c) 1<<(c*8) + + .text + .thumb + .syntax unified + + .type ASM_PFX(InternalMemScanMem8), %function +ASM_GLOBAL ASM_PFX(InternalMemScanMem8) +ASM_PFX(InternalMemScanMem8): + // r0 = start of memory to scan + // r1 = length + // r2 = character to look for + // returns r0 = pointer to character or NULL if not found + uxtb r2, r2 // Don't think we can trust the caller to actually pass a char + + cmp r1, #16 // If it's short don't bother with anything clever + blt 20f + + tst r0, #7 // If it's already aligned skip the next bit + beq 10f + + // Work up to an aligned point +5: + ldrb r3, [r0],#1 + subs r1, r1, #1 + cmp r3, r2 + beq 50f // If it matches exit found + tst r0, #7 + bne 5b // If not aligned yet then do next byte + +10: + // At this point, we are aligned, we know we have at least 8 bytes to work with + push {r4-r7} + orr r2, r2, r2, lsl #8 // expand the match word across to all bytes + orr r2, r2, r2, lsl #16 + bic r4, r1, #7 // Number of double words to work with + mvns r7, #0 // all F's + movs r3, #0 + +15: + ldmia r0!, {r5,r6} + subs r4, r4, #8 + eor r5, r5, r2 // Get it so that r5,r6 have 00's where the bytes match the target + eor r6, r6, r2 + uadd8 r5, r5, r7 // Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r5, r3, r7 // bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + uadd8 r6, r6, r7 // Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r6, r5, r7 // chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + cbnz r6, 60f + bne 15b // (Flags from the subs above) If not run out of bytes then go around again + + pop 
{r4-r7} + and r2, r2, #0xff // Get r2 back to a single character from the expansion above + and r1, r1, #7 // Leave the count remaining as the number after the double words have been done + +20: + cbz r1, 40f // 0 length or hit the end already then not found + +21: // Post aligned section, or just a short call + ldrb r3, [r0], #1 + subs r1, r1, #1 + eor r3, r3, r2 // r3 = 0 if match - doesn't break flags from sub + cbz r3, 50f + bne 21b // on r1 flags + +40: + movs r0, #0 // not found + bx lr + +50: + subs r0, r0, #1 // found + bx lr + +60: // We're here because the fast path found a hit - now we have to track down exactly which word it was + // r0 points to the start of the double word after the one that was tested + // r5 has the 00/ff pattern for the first word, r6 has the chained value + cmp r5, #0 + itte eq + moveq r5, r6 // the end is in the 2nd word + subeq r0, r0, #3 // Points to 2nd byte of 2nd word + subne r0, r0, #7 // or 2nd byte of 1st word + + // r0 currently points to the 3rd byte of the word containing the hit + tst r5, #CHARTSTMASK(0) // 1st character + bne 61f + adds r0, r0, #1 + tst r5, #CHARTSTMASK(1) // 2nd character + ittt eq + addeq r0, r0 ,#1 + tsteq r5, #(3 << 15) // 2nd & 3rd character + // If not the 3rd must be the last one + addeq r0, r0, #1 + +61: + pop {r4-r7} + subs r0, r0, #1 + bx lr diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm new file mode 100644 index 0000000000..abda87320e --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm @@ -0,0 +1,147 @@ +; Copyright (c) 2010-2011, Linaro Limited +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; +; * Neither the name of Linaro Limited nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +; + +; +; Written by Dave Gilbert +; +; This memchr routine is optimised on a Cortex-A9 and should work on +; all ARMv7 processors. It has a fast path for short sizes, and has +; an optimised path for large data sets; the worst case is finding the +; match early in a large data set. 
+; + + +; 2011-02-07 david.gilbert@linaro.org +; Extracted from local git a5b438d861 +; 2011-07-14 david.gilbert@linaro.org +; Import endianness fix from local git ea786f1b +; 2011-12-07 david.gilbert@linaro.org +; Removed unneeded cbz from align loop + +; this lets us check a flag in a 00/ff byte easily in either endianness +#define CHARTSTMASK(c) 1<<(c*8) + + EXPORT InternalMemScanMem8 + AREA ScanMem, CODE, READONLY + THUMB + +InternalMemScanMem8 + ; r0 = start of memory to scan + ; r1 = length + ; r2 = character to look for + ; returns r0 = pointer to character or NULL if not found + uxtb r2, r2 ; Don't think we can trust the caller to actually pass a char + + cmp r1, #16 ; If it's short don't bother with anything clever + blt L20 + + tst r0, #7 ; If it's already aligned skip the next bit + beq L10 + + ; Work up to an aligned point +L5 + ldrb r3, [r0],#1 + subs r1, r1, #1 + cmp r3, r2 + beq L50 ; If it matches exit found + tst r0, #7 + bne L5 ; If not aligned yet then do next byte + +L10 + ; At this point, we are aligned, we know we have at least 8 bytes to work with + push {r4-r7} + orr r2, r2, r2, lsl #8 ; expand the match word across to all bytes + orr r2, r2, r2, lsl #16 + bic r4, r1, #7 ; Number of double words to work with + mvns r7, #0 ; all F's + movs r3, #0 + +L15 + ldmia r0!, {r5,r6} + subs r4, r4, #8 + eor r5, r5, r2 ; Get it so that r5,r6 have 00's where the bytes match the target + eor r6, r6, r2 + uadd8 r5, r5, r7 ; Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r5, r3, r7 ; bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + uadd8 r6, r6, r7 ; Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r6, r5, r7 ; chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + cbnz r6, L60 + bne L15 ; (Flags from the subs above) If not run out of bytes then go around again + + pop {r4-r7} + and r2, r2, #0xff ; Get r2 back to a single character from the expansion above + and r1, 
r1, #7 ; Leave the count remaining as the number after the double words have been done + +L20 + cbz r1, L40 ; 0 length or hit the end already then not found + +L21 ; Post aligned section, or just a short call + ldrb r3, [r0], #1 + subs r1, r1, #1 + eor r3, r3, r2 ; r3 = 0 if match - doesn't break flags from sub + cbz r3, L50 + bne L21 ; on r1 flags + +L40 + movs r0, #0 ; not found + bx lr + +L50 + subs r0, r0, #1 ; found + bx lr + +L60 ; We're here because the fast path found a hit - now we have to track down exactly which word it was + ; r0 points to the start of the double word after the one that was tested + ; r5 has the 00/ff pattern for the first word, r6 has the chained value + cmp r5, #0 + itte eq + moveq r5, r6 ; the end is in the 2nd word + subeq r0, r0, #3 ; Points to 2nd byte of 2nd word + subne r0, r0, #7 ; or 2nd byte of 1st word + + ; r0 currently points to the 3rd byte of the word containing the hit + tst r5, #CHARTSTMASK(0) ; 1st character + bne L61 + adds r0, r0, #1 + tst r5, #CHARTSTMASK(1) ; 2nd character + ittt eq + addeq r0, r0 ,#1 + tsteq r5, #(3 << 15) ; 2nd & 3rd character + ; If not the 3rd must be the last one + addeq r0, r0, #1 + +L61 + pop {r4-r7} + subs r0, r0, #1 + bx lr + + END + diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c new file mode 100644 index 0000000000..20fa7e9be6 --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c @@ -0,0 +1,142 @@ +/** @file + Architecture Independent Base Memory Library Implementation. + + The following BaseMemoryLib instances contain the same copy of this file: + BaseMemoryLib + PeiMemoryLib + UefiMemoryLib + + Copyright (c) 2006 - 2016, Intel Corporation. All rights reserved.
+ This program and the accompanying materials + are licensed and made available under the terms and conditions of the BSD License + which accompanies this distribution. The full text of the license may be found at + http://opensource.org/licenses/bsd-license.php. + + THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS, + WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. + +**/ + +#include "../MemLibInternals.h" + +/** + Scans a target buffer for a 16-bit value, and returns a pointer to the + matching 16-bit value in the target buffer. + + @param Buffer The pointer to the target buffer to scan. + @param Length The count of 16-bit value to scan. Must be non-zero. + @param Value The value to search for in the target buffer. + + @return The pointer to the first occurrence, or NULL if not found. + +**/ +CONST VOID * +EFIAPI +InternalMemScanMem16 ( + IN CONST VOID *Buffer, + IN UINTN Length, + IN UINT16 Value + ) +{ + CONST UINT16 *Pointer; + + Pointer = (CONST UINT16*)Buffer; + do { + if (*Pointer == Value) { + return Pointer; + } + ++Pointer; + } while (--Length != 0); + return NULL; +} + +/** + Scans a target buffer for a 32-bit value, and returns a pointer to the + matching 32-bit value in the target buffer. + + @param Buffer The pointer to the target buffer to scan. + @param Length The count of 32-bit value to scan. Must be non-zero. + @param Value The value to search for in the target buffer. + + @return The pointer to the first occurrence, or NULL if not found. + +**/ +CONST VOID * +EFIAPI +InternalMemScanMem32 ( + IN CONST VOID *Buffer, + IN UINTN Length, + IN UINT32 Value + ) +{ + CONST UINT32 *Pointer; + + Pointer = (CONST UINT32*)Buffer; + do { + if (*Pointer == Value) { + return Pointer; + } + ++Pointer; + } while (--Length != 0); + return NULL; +} + +/** + Scans a target buffer for a 64-bit value, and returns a pointer to the + matching 64-bit value in the target buffer. 
+ + @param Buffer The pointer to the target buffer to scan. + @param Length The count of 64-bit value to scan. Must be non-zero. + @param Value The value to search for in the target buffer. + + @return The pointer to the first occurrence, or NULL if not found. + +**/ +CONST VOID * +EFIAPI +InternalMemScanMem64 ( + IN CONST VOID *Buffer, + IN UINTN Length, + IN UINT64 Value + ) +{ + CONST UINT64 *Pointer; + + Pointer = (CONST UINT64*)Buffer; + do { + if (*Pointer == Value) { + return Pointer; + } + ++Pointer; + } while (--Length != 0); + return NULL; +} + +/** + Checks whether the contents of a buffer are all zeros. + + @param Buffer The pointer to the buffer to be checked. + @param Length The size of the buffer (in bytes) to be checked. + + @retval TRUE Contents of the buffer are all zeros. + @retval FALSE Contents of the buffer are not all zeros. + +**/ +BOOLEAN +EFIAPI +InternalMemIsZeroBuffer ( + IN CONST VOID *Buffer, + IN UINTN Length + ) +{ + CONST UINT8 *BufferData; + UINTN Index; + + BufferData = Buffer; + for (Index = 0; Index < Length; Index++) { + if (BufferData[Index] != 0) { + return FALSE; + } + } + return TRUE; +} diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S new file mode 100644 index 0000000000..c1755539d3 --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S @@ -0,0 +1,77 @@ +#------------------------------------------------------------------------------ +# +# Copyright (c) 2016, Linaro Ltd. All rights reserved.
+#
+# This program and the accompanying materials are licensed and made available
+# under the terms and conditions of the BSD License which accompanies this
+# distribution.  The full text of the license may be found at
+# http://opensource.org/licenses/bsd-license.php
+#
+# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+# WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+#
+#------------------------------------------------------------------------------
+
+// The entry points below build the 64-bit fill pattern in r2:r3 and then
+// FALL THROUGH into InternalMemSetMem64, which does the actual stores.
+// Register roles (visible from the code): r0 = destination, r1 = byte count,
+// r2/r3 = fill pattern. NOTE(review): a zero byte count still stores one
+// byte on the tail path at label 4 below — callers appear expected to
+// guarantee a non-zero length; confirm against the C wrappers.
+
+  .text
+  .thumb
+  .syntax unified
+  .align  5
+ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
+ASM_PFX(InternalMemZeroMem):
+  movs    r2, #0                  // fill byte = 0; fall through
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem)
+ASM_PFX(InternalMemSetMem):
+  uxtb    r2, r2
+  orr     r2, r2, r2, lsl #8      // replicate byte into r2[15:0]; fall through
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
+ASM_PFX(InternalMemSetMem16):
+  uxth    r2, r2
+  orr     r2, r2, r2, lsl #16     // replicate halfword into r2[31:16]; fall through
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
+ASM_PFX(InternalMemSetMem32):
+  mov     r3, r2                  // r3:r2 = 64-bit pattern; fall through
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
+ASM_PFX(InternalMemSetMem64):
+  push    {r4, lr}
+  cmp     r1, #16                 // fewer than 16 bytes of input?
+  add     r1, r1, r0              // r1 := dst + length
+  add     lr, r0, #16
+  blt     2f
+  bic     lr, lr, #15             // align output pointer
+
+  str     r2, [r0]                // potentially unaligned store of 4 bytes
+  str     r3, [r0, #4]            // potentially unaligned store of 4 bytes
+  str     r2, [r0, #8]            // potentially unaligned store of 4 bytes
+  str     r3, [r0, #12]           // potentially unaligned store of 4 bytes
+  beq     1f                      // flags still from 'cmp r1, #16': length == 16, done
+
+0:add     lr, lr, #16             // advance the output pointer by 16 bytes
+  subs    r4, r1, lr              // past the output?
+  blt     3f                      // break out of the loop
+  strd    r2, r3, [lr, #-16]      // aligned store of 16 bytes
+  strd    r2, r3, [lr, #-8]
+  bne     0b                      // goto beginning of loop
+1:pop     {r4, pc}
+
+2:subs    r4, r1, lr              // short input (< 16 bytes)
+3:adds    r4, r4, #16             // r4 := remaining byte count
+  subs    r1, r1, #8              // r1 := end - 8, base for end-anchored stores
+  cmp     r4, #4                  // between 4 and 15 bytes?
+  blt     4f
+  cmp     r4, #8                  // between 8 and 15 bytes?
+  str     r2, [lr, #-16]          // overlapping store of 4 + (4 + 4) + 4 bytes
+  itt     gt
+  strgt   r3, [lr, #-12]
+  strgt   r2, [r1]
+  str     r3, [r1, #4]
+  pop     {r4, pc}
+
+4:cmp     r4, #2                  // 2 or 3 bytes?
+  strb    r2, [lr, #-16]          // store 1 byte
+  it      ge
+  strhge  r2, [r1, #6]            // store 2 bytes
+  pop     {r4, pc}
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm
new file mode 100644
index 0000000000..2a8dc7d019
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm
@@ -0,0 +1,84 @@
+;------------------------------------------------------------------------------
+;
+; Copyright (c) 2016, Linaro Ltd. All rights reserved.
+;
+; This program and the accompanying materials are licensed and made available
+; under the terms and conditions of the BSD License which accompanies this
+; distribution.  The full text of the license may be found at
+; http://opensource.org/licenses/bsd-license.php
+;
+; THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+; WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+;
+;------------------------------------------------------------------------------
+
+; RVCT (armasm) port of Arm/SetMem.S. The entry points build the 64-bit fill
+; pattern in r2:r3 and FALL THROUGH into InternalMemSetMem64, which performs
+; the stores. Register roles (visible from the code): r0 = destination,
+; r1 = byte count, r2/r3 = fill pattern.
+
+    EXPORT  InternalMemZeroMem
+    EXPORT  InternalMemSetMem
+    EXPORT  InternalMemSetMem16
+    EXPORT  InternalMemSetMem32
+    EXPORT  InternalMemSetMem64
+
+    AREA    SetMem, CODE, READONLY, CODEALIGN, ALIGN=5
+    THUMB
+
+InternalMemZeroMem
+    movs    r2, #0                  ; fill byte = 0; fall through
+
+InternalMemSetMem
+    uxtb    r2, r2
+    orr     r2, r2, r2, lsl #8      ; replicate byte into r2[15:0]; fall through
+
+InternalMemSetMem16
+    uxth    r2, r2
+    orr     r2, r2, r2, lsl #16     ; BUGFIX: was 'lsr #16', which is a no-op
+                                    ; after uxth (top halfword already zero) and
+                                    ; left r2[31:16] = 0. 'lsl #16' replicates
+                                    ; the halfword, matching the GCC version in
+                                    ; Arm/SetMem.S.
+
+InternalMemSetMem32
+    mov     r3, r2                  ; r3:r2 = 64-bit pattern; fall through
+
+InternalMemSetMem64
+    push    {r4, lr}
+    cmp     r1, #16                 ; fewer than 16 bytes of input?
+    add     r1, r1, r0              ; r1 := dst + length
+    add     lr, r0, #16
+    blt     L2
+    bic     lr, lr, #15             ; align output pointer
+
+    str     r2, [r0]                ; potentially unaligned store of 4 bytes
+    str     r3, [r0, #4]            ; potentially unaligned store of 4 bytes
+    str     r2, [r0, #8]            ; potentially unaligned store of 4 bytes
+    str     r3, [r0, #12]           ; potentially unaligned store of 4 bytes
+    beq     L1                      ; flags still from 'cmp r1, #16': length == 16
+
+L0
+    add     lr, lr, #16             ; advance the output pointer by 16 bytes
+    subs    r4, r1, lr              ; past the output?
+    blt     L3                      ; break out of the loop
+    strd    r2, r3, [lr, #-16]      ; aligned store of 16 bytes
+    strd    r2, r3, [lr, #-8]
+    bne     L0                      ; goto beginning of loop
+L1
+    pop     {r4, pc}
+
+L2
+    subs    r4, r1, lr              ; short input (< 16 bytes)
+L3
+    adds    r4, r4, #16             ; r4 := remaining byte count
+    subs    r1, r1, #8              ; r1 := end - 8, base for end-anchored stores
+    cmp     r4, #4                  ; between 4 and 15 bytes?
+    blt     L4
+    cmp     r4, #8                  ; between 8 and 15 bytes?
+    str     r2, [lr, #-16]          ; overlapping store of 4 + (4 + 4) + 4 bytes
+    itt     gt
+    strgt   r3, [lr, #-12]
+    strgt   r2, [r1]
+    str     r3, [r1, #4]
+    pop     {r4, pc}
+
+L4
+    cmp     r4, #2                  ; 2 or 3 bytes?
+    strb    r2, [lr, #-16]          ; store 1 byte
+    it      ge
+    strhge  r2, [r1, #6]            ; store 2 bytes
+    pop     {r4, pc}
+
+    END
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
index 71691b9859..d95eb599ea 100644
--- a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
@@ -27,7 +27,7 @@
 #
-#  VALID_ARCHITECTURES           = IA32 X64
+#  VALID_ARCHITECTURES           = IA32 X64 ARM
 #
 
 [Sources]
@@ -79,19 +79,6 @@
   Ia32/CopyMem.nasm
   Ia32/CopyMem.asm
   Ia32/IsZeroBuffer.nasm
-  ScanMem64Wrapper.c
-  ScanMem32Wrapper.c
-  ScanMem16Wrapper.c
-  ScanMem8Wrapper.c
-  ZeroMemWrapper.c
-  CompareMemWrapper.c
-  SetMem64Wrapper.c
-  SetMem32Wrapper.c
-  SetMem16Wrapper.c
-  SetMemWrapper.c
-  CopyMemWrapper.c
-  IsZeroBufferWrapper.c
-  MemLibGuid.c
 
 [Sources.X64]
   X64/ScanMem64.nasm
@@ -128,6 +115,21 @@
   X64/CopyMem.asm
   X64/CopyMem.S
   X64/IsZeroBuffer.nasm
+
+[Sources.ARM]
+  Arm/ScanMem.S       |GCC
+  Arm/SetMem.S        |GCC
+  Arm/CopyMem.S       |GCC
+  Arm/CompareMem.S    |GCC
+
+  Arm/ScanMem.asm     |RVCT
+  Arm/SetMem.asm      |RVCT
+  Arm/CopyMem.asm     |RVCT
+  Arm/CompareMem.asm  |RVCT
+
+  Arm/ScanMemGeneric.c
+
+[Sources]
   ScanMem64Wrapper.c
   ScanMem32Wrapper.c
   ScanMem16Wrapper.c