+//\r
+// Copyright (c) 2012 - 2016, Linaro Limited\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted provided that the following conditions are met:\r
+// * Redistributions of source code must retain the above copyright\r
+// notice, this list of conditions and the following disclaimer.\r
+// * Redistributions in binary form must reproduce the above copyright\r
+// notice, this list of conditions and the following disclaimer in the\r
+// documentation and/or other materials provided with the distribution.\r
+// * Neither the name of the Linaro nor the\r
+// names of its contributors may be used to endorse or promote products\r
+// derived from this software without specific prior written permission.\r
+//\r
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\r
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\r
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\r
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\r
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\r
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\r
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\r
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\r
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\r
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+//\r
+\r
+//\r
+// Copyright (c) 2015 ARM Ltd\r
+// All rights reserved.\r
+//\r
+// Redistribution and use in source and binary forms, with or without\r
+// modification, are permitted provided that the following conditions\r
+// are met:\r
+// 1. Redistributions of source code must retain the above copyright\r
+// notice, this list of conditions and the following disclaimer.\r
+// 2. Redistributions in binary form must reproduce the above copyright\r
+// notice, this list of conditions and the following disclaimer in the\r
+// documentation and/or other materials provided with the distribution.\r
+// 3. The name of the company may not be used to endorse or promote\r
+// products derived from this software without specific prior written\r
+// permission.\r
+//\r
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED\r
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF\r
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.\r
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\r
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED\r
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\r
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\r
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\r
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\r
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\r
+//\r
+\r
+// Assumptions:\r
+//\r
+// ARMv8-A, AArch64, unaligned accesses.\r
+//\r
+//\r
+\r
+#define dstin x0\r
+#define src x1\r
+#define count x2\r
+#define dst x3\r
+#define srcend x4\r
+#define dstend x5\r
+#define A_l x6\r
+#define A_lw w6\r
+#define A_h x7\r
+#define A_hw w7\r
+#define B_l x8\r
+#define B_lw w8\r
+#define B_h x9\r
+#define C_l x10\r
+#define C_h x11\r
+#define D_l x12\r
+#define D_h x13\r
+#define E_l x14\r
+#define E_h x15\r
+#define F_l srcend\r
+#define F_h dst\r
+#define tmp1 x9\r
+#define tmp2 x3\r
+\r
+#define L(l) .L ## l\r
+\r
+// Copies are split into 3 main cases: small copies of up to 16 bytes,\r
+// medium copies of 17..96 bytes, which are fully unrolled, and large\r
+// copies of more than 96 bytes, which align the destination and use an\r
+// unrolled loop processing 64 bytes per iteration.\r
+// Small and medium copies read all data before writing, allowing any\r
+// kind of overlap, and memmove tail-calls memcpy for these cases as\r
+// well as for non-overlapping copies.\r
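+//\r
+// A hedged C sketch of this dispatch (illustrative only; the helper\r
+// names are hypothetical and do not appear in this file):\r
+//\r
+//   void *memcpy_sketch (void *dst, const void *src, size_t n) {\r
+//     if (n <= 16) return copy_0_16 (dst, src, n);   // small\r
+//     if (n <= 96) return copy_17_96 (dst, src, n);  // medium, unrolled\r
+//     return copy_large (dst, src, n);               // align dst, 64B loop\r
+//   }\r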
+\r
+__memcpy:\r
+ prfm PLDL1KEEP, [src]\r
+ add srcend, src, count\r
+ add dstend, dstin, count\r
+ cmp count, 16\r
+ b.ls L(copy16)\r
+ cmp count, 96\r
+ b.hi L(copy_long)\r
+\r
+ // Medium copies: 17..96 bytes.\r
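+ // These paths branch only on bits of count - 1 (bit 6: count >= 65,\r
+ // bit 5: count >= 33) and otherwise rely on loads from both ends whose\r
+ // stores may overlap in the middle. A hedged C sketch of the 17..32\r
+ // byte sub-case, with copy16 a hypothetical helper moving exactly\r
+ // 16 bytes:\r
+ //\r
+ //   copy16 (dst, src);                    // first 16 bytes\r
+ //   copy16 (dst + n - 16, src + n - 16);  // last 16; overlap is harmless\r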
+ sub tmp1, count, 1\r
+ ldp A_l, A_h, [src]\r
+ tbnz tmp1, 6, L(copy96)\r
+ ldp D_l, D_h, [srcend, -16]\r
+ tbz tmp1, 5, 1f\r
+ ldp B_l, B_h, [src, 16]\r
+ ldp C_l, C_h, [srcend, -32]\r
+ stp B_l, B_h, [dstin, 16]\r
+ stp C_l, C_h, [dstend, -32]\r
+1:\r
+ stp A_l, A_h, [dstin]\r
+ stp D_l, D_h, [dstend, -16]\r
+ ret\r
+\r
+ .p2align 4\r
+ // Small copies: 0..16 bytes.\r
+L(copy16):\r
+ cmp count, 8\r
+ b.lo 1f\r
+ ldr A_l, [src]\r
+ ldr A_h, [srcend, -8]\r
+ str A_l, [dstin]\r
+ str A_h, [dstend, -8]\r
+ ret\r
+ .p2align 4\r
+1:\r
+ tbz count, 2, 1f\r
+ ldr A_lw, [src]\r
+ ldr A_hw, [srcend, -4]\r
+ str A_lw, [dstin]\r
+ str A_hw, [dstend, -4]\r
+ ret\r
+\r
+ // Copy 0..3 bytes. Use a branchless sequence that copies the same\r
+ // byte 3 times if count==1, or the 2nd byte twice if count==2.\r
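+ // A hedged C sketch of the same idea, for n in 1..3:\r
+ //\r
+ //   dst[0]    = src[0];      // first byte\r
+ //   dst[n>>1] = src[n>>1];   // middle byte (== first if n == 1)\r
+ //   dst[n-1]  = src[n-1];    // last byte (== middle if n == 2)\r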
+1:\r
+ cbz count, 2f\r
+ lsr tmp1, count, 1\r
+ ldrb A_lw, [src]\r
+ ldrb A_hw, [srcend, -1]\r
+ ldrb B_lw, [src, tmp1]\r
+ strb A_lw, [dstin]\r
+ strb B_lw, [dstin, tmp1]\r
+ strb A_hw, [dstend, -1]\r
+2: ret\r
+\r
+ .p2align 4\r
+ // Copy 65..96 bytes. Copy 64 bytes from the start and\r
+ // 32 bytes from the end.\r
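+ //\r
+ // Hedged C sketch (copy32/copy64 are hypothetical fixed-size helpers):\r
+ //\r
+ //   copy64 (dst, src);                    // bytes 0..63\r
+ //   copy32 (dst + n - 32, src + n - 32);  // last 32; overlaps when n < 96\r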
+L(copy96):\r
+ ldp B_l, B_h, [src, 16]\r
+ ldp C_l, C_h, [src, 32]\r
+ ldp D_l, D_h, [src, 48]\r
+ ldp E_l, E_h, [srcend, -32]\r
+ ldp F_l, F_h, [srcend, -16]\r
+ stp A_l, A_h, [dstin]\r
+ stp B_l, B_h, [dstin, 16]\r
+ stp C_l, C_h, [dstin, 32]\r
+ stp D_l, D_h, [dstin, 48]\r
+ stp E_l, E_h, [dstend, -32]\r
+ stp F_l, F_h, [dstend, -16]\r
+ ret\r
+\r
+ // Align dst to 16 bytes so that the stores in the loop below do not\r
+ // cross cache line boundaries. There are at least 96 bytes to copy,\r
+ // so copy 16 bytes unaligned and then align. The loop copies 64 bytes\r
+ // per iteration, with the loads running one iteration ahead of the\r
+ // stores.\r
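+ //\r
+ // A hedged C sketch of the setup (uintptr_t arithmetic; variable names\r
+ // mirror the register aliases, illustrative only):\r
+ //\r
+ //   tmp1  = dstin & 15;       // destination misalignment\r
+ //   dst   = dstin & ~15;      // round dst down to 16 bytes\r
+ //   src   = src - tmp1;       // move src back by the same amount\r
+ //   count = count + tmp1;     // count now runs from the aligned base\r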
+\r
+ .p2align 4\r
+L(copy_long):\r
+ and tmp1, dstin, 15\r
+ bic dst, dstin, 15\r
+ ldp D_l, D_h, [src]\r
+ sub src, src, tmp1\r
+ add count, count, tmp1 // Count is now 16 too large.\r
+ ldp A_l, A_h, [src, 16]\r
+ stp D_l, D_h, [dstin]\r
+ ldp B_l, B_h, [src, 32]\r
+ ldp C_l, C_h, [src, 48]\r
+ ldp D_l, D_h, [src, 64]!\r
+ subs count, count, 128 + 16 // Test and readjust count.\r
+ b.ls 2f\r
+1:\r
+ stp A_l, A_h, [dst, 16]\r
+ ldp A_l, A_h, [src, 16]\r
+ stp B_l, B_h, [dst, 32]\r
+ ldp B_l, B_h, [src, 32]\r
+ stp C_l, C_h, [dst, 48]\r
+ ldp C_l, C_h, [src, 48]\r
+ stp D_l, D_h, [dst, 64]!\r
+ ldp D_l, D_h, [src, 64]!\r
+ subs count, count, 64\r
+ b.hi 1b\r
+\r
+ // Write the last full set of 64 bytes. The remainder is at most 64\r
+ // bytes, so it is safe to always copy 64 bytes from the end even if\r
+ // there is just 1 byte left.\r
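+ //\r
+ // Hedged C sketch (store64/copy64 are hypothetical helpers; store64\r
+ // writes the 64 bytes the loop already holds in registers):\r
+ //\r
+ //   store64 (dst + 16);                  // flush the last full set\r
+ //   copy64 (dstend - 64, srcend - 64);   // tail; may rewrite a few bytes\r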
+2:\r
+ ldp E_l, E_h, [srcend, -64]\r
+ stp A_l, A_h, [dst, 16]\r
+ ldp A_l, A_h, [srcend, -48]\r
+ stp B_l, B_h, [dst, 32]\r
+ ldp B_l, B_h, [srcend, -32]\r
+ stp C_l, C_h, [dst, 48]\r
+ ldp C_l, C_h, [srcend, -16]\r
+ stp D_l, D_h, [dst, 64]\r
+ stp E_l, E_h, [dstend, -64]\r
+ stp A_l, A_h, [dstend, -48]\r
+ stp B_l, B_h, [dstend, -32]\r
+ stp C_l, C_h, [dstend, -16]\r
+ ret\r
+\r
+\r
+//\r
+// All memmoves of up to 96 bytes are done by memcpy, as it supports\r
+// overlaps. Larger backward copies (dst below src) are also handled by\r
+// memcpy. The only remaining case is large forward copies, where dst\r
+// lies inside the source buffer; these are copied backwards. The\r
+// destination is aligned, and an unrolled loop processes 64 bytes per\r
+// iteration.\r
+//\r
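+// A hedged C sketch of the dispatch below (illustrative only):\r
+//\r
+//   if (count <= 96 || (uintptr_t)(dstin - src) >= count)\r
+//     return memcpy (dstin, src, count);  // small, or no forward overlap\r
+//   // otherwise dst is inside [src, src + count): copy backwards\r
+//\r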
+\r
+ASM_GLOBAL ASM_PFX(InternalMemCopyMem)\r
+ASM_PFX(InternalMemCopyMem):\r
+ sub tmp2, dstin, src // tmp2 = dst - src (mod 2^64)\r
+ cmp count, 96\r
+ ccmp tmp2, count, 2, hi // if count > 96, compare dst - src with count; else force C\r
+ b.hs __memcpy // count <= 96 or no forward overlap: memcpy is safe\r
+\r
+ cbz tmp2, 3f // dst == src: nothing to do\r
+ add dstend, dstin, count\r
+ add srcend, src, count\r
+\r
+ // Align dstend to 16 bytes so that the stores in the loop below do\r
+ // not cross cache line boundaries. There are at least 96 bytes to\r
+ // copy, so copy 16 bytes unaligned and then align. The loop copies\r
+ // 64 bytes per iteration, with the loads running one iteration ahead\r
+ // of the stores.\r
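+ //\r
+ // Hedged C sketch of the backward setup (mirror image of copy_long):\r
+ //\r
+ //   tmp2   = dstend & 15;     // misalignment of the end pointer\r
+ //   srcend = srcend - tmp2;   // move both ends back in lockstep\r
+ //   dstend = dstend - tmp2;   // dstend is now 16-byte aligned\r
+ //   count  = count - tmp2;\r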
+\r
+ and tmp2, dstend, 15\r
+ ldp D_l, D_h, [srcend, -16]\r
+ sub srcend, srcend, tmp2\r
+ sub count, count, tmp2\r
+ ldp A_l, A_h, [srcend, -16]\r
+ stp D_l, D_h, [dstend, -16]\r
+ ldp B_l, B_h, [srcend, -32]\r
+ ldp C_l, C_h, [srcend, -48]\r
+ ldp D_l, D_h, [srcend, -64]!\r
+ sub dstend, dstend, tmp2\r
+ subs count, count, 128\r
+ b.ls 2f\r
+ nop\r
+1:\r
+ stp A_l, A_h, [dstend, -16]\r
+ ldp A_l, A_h, [srcend, -16]\r
+ stp B_l, B_h, [dstend, -32]\r
+ ldp B_l, B_h, [srcend, -32]\r
+ stp C_l, C_h, [dstend, -48]\r
+ ldp C_l, C_h, [srcend, -48]\r
+ stp D_l, D_h, [dstend, -64]!\r
+ ldp D_l, D_h, [srcend, -64]!\r
+ subs count, count, 64\r
+ b.hi 1b\r
+\r
+ // Write the last full set of 64 bytes. The remainder is at most 64\r
+ // bytes, so it is safe to always copy 64 bytes from the start even if\r
+ // there is just 1 byte left.\r
+2:\r
+ ldp E_l, E_h, [src, 48]\r
+ stp A_l, A_h, [dstend, -16]\r
+ ldp A_l, A_h, [src, 32]\r
+ stp B_l, B_h, [dstend, -32]\r
+ ldp B_l, B_h, [src, 16]\r
+ stp C_l, C_h, [dstend, -48]\r
+ ldp C_l, C_h, [src]\r
+ stp D_l, D_h, [dstend, -64]\r
+ stp E_l, E_h, [dstin, 48]\r
+ stp A_l, A_h, [dstin, 32]\r
+ stp B_l, B_h, [dstin, 16]\r
+ stp C_l, C_h, [dstin]\r
+3: ret\r