From: Ard Biesheuvel
Date: Fri, 2 Sep 2016 11:34:22 +0000 (+0100)
Subject: MdePkg/BaseMemoryLibOptDxe: add accelerated AARCH64 routines
X-Git-Tag: edk2-stable201903~5779
X-Git-Url: https://git.proxmox.com/?p=mirror_edk2.git;a=commitdiff_plain;h=c86cd1e175fb3f3b545521c53fa751141abd1b2d

MdePkg/BaseMemoryLibOptDxe: add accelerated AARCH64 routines

This adds AARCH64 support to BaseMemoryLibOptDxe, based on the
cortex-strings library. All string routines are accelerated except
ScanMem16, ScanMem32, ScanMem64 and IsZeroBuffer, which can wait for
another day, since very few occurrences exist in the codebase.

Contributed-under: TianoCore Contribution Agreement 1.0
Signed-off-by: Ard Biesheuvel
Reviewed-by: Liming Gao
---

diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S
new file mode 100644
index 0000000000..a54de6948b
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CompareMem.S
@@ -0,0 +1,142 @@
+//
+// Copyright (c) 2013, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of the Linaro nor the
+//    names of its contributors may be used to endorse or promote products
+//    derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64
+//
+
+
+// Parameters and result.
+#define src1      x0
+#define src2      x1
+#define limit     x2
+#define result    x0
+
+// Internal variables.
+#define data1     x3
+#define data1w    w3
+#define data2     x4
+#define data2w    w4
+#define diff      x6
+#define endloop   x7
+#define tmp1      x8
+#define tmp2      x9
+#define pos       x11
+#define limit_wd  x12
+#define mask      x13
+
+    .p2align 6
+ASM_GLOBAL ASM_PFX(InternalMemCompareMem)
+ASM_PFX(InternalMemCompareMem):
+    eor     tmp1, src1, src2
+    tst     tmp1, #7
+    b.ne    .Lmisaligned8
+    ands    tmp1, src1, #7
+    b.ne    .Lmutual_align
+    add     limit_wd, limit, #7
+    lsr     limit_wd, limit_wd, #3
+
+    // Start of performance-critical section -- one 64B cache line.
+.Lloop_aligned:
+    ldr     data1, [src1], #8
+    ldr     data2, [src2], #8
+.Lstart_realigned:
+    subs    limit_wd, limit_wd, #1
+    eor     diff, data1, data2      // Non-zero if differences found.
+    csinv   endloop, diff, xzr, ne  // Last Dword or differences.
+    cbz     endloop, .Lloop_aligned
+    // End of performance-critical section -- one 64B cache line.
+
+    // Not reached the limit, must have found a diff.
+    cbnz    limit_wd, .Lnot_limit
+
+    // Limit % 8 == 0 => all bytes significant.
+    ands    limit, limit, #7
+    b.eq    .Lnot_limit
+
+    lsl     limit, limit, #3        // Bytes -> bits.
+    mov     mask, #~0
+    lsl     mask, mask, limit
+    bic     data1, data1, mask
+    bic     data2, data2, mask
+
+    orr     diff, diff, mask
+
+.Lnot_limit:
+    rev     diff, diff
+    rev     data1, data1
+    rev     data2, data2
+
+    // The MS-non-zero bit of DIFF marks either the first bit
+    // that is different, or the end of the significant data.
+    // Shifting left now will bring the critical information into the
+    // top bits.
+    clz     pos, diff
+    lsl     data1, data1, pos
+    lsl     data2, data2, pos
+
+    // But we need to zero-extend (char is unsigned) the value and then
+    // perform a signed 32-bit subtraction.
+    lsr     data1, data1, #56
+    sub     result, data1, data2, lsr #56
+    ret
+
+.Lmutual_align:
+    // Sources are mutually aligned, but are not currently at an
+    // alignment boundary. Round down the addresses and then mask off
+    // the bytes that precede the start point.
+    bic     src1, src1, #7
+    bic     src2, src2, #7
+    add     limit, limit, tmp1      // Adjust the limit for the extra.
+    lsl     tmp1, tmp1, #3          // Bytes beyond alignment -> bits.
+    ldr     data1, [src1], #8
+    neg     tmp1, tmp1              // Bits to alignment -64.
+    ldr     data2, [src2], #8
+    mov     tmp2, #~0
+
+    // Little-endian. Early bytes are at LSB.
+    lsr     tmp2, tmp2, tmp1        // Shift (tmp1 & 63).
+    add     limit_wd, limit, #7
+    orr     data1, data1, tmp2
+    orr     data2, data2, tmp2
+    lsr     limit_wd, limit_wd, #3
+    b       .Lstart_realigned
+
+    .p2align 6
+.Lmisaligned8:
+    sub     limit, limit, #1
+1:
+    // Perhaps we can do better than this.
+    ldrb    data1w, [src1], #1
+    ldrb    data2w, [src2], #1
+    subs    limit, limit, #1
+    ccmp    data1w, data2w, #0, cs  // NZCV = 0b0000.
+    b.eq    1b
+    sub     result, data1, data2
+    ret
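A note on the technique: the aligned loop above compares a doubleword at a
time and derives the return value from the XOR of the two words. A minimal C
sketch of the same idea (CompareWords is a hypothetical name, and
__builtin_ctzll stands in for the REV/CLZ pair used by the assembly):

    #include <stdint.h>

    // On a little-endian load, the byte at the lowest address is the least
    // significant byte, so the first differing byte is the lowest non-zero
    // byte of Data1 ^ Data2; counting trailing zeros locates it directly.
    static int CompareWords (uint64_t Data1, uint64_t Data2)
    {
      uint64_t  Diff;
      unsigned  Shift;

      Diff = Data1 ^ Data2;
      if (Diff == 0) {
        return 0;                    // all eight bytes equal
      }
      Shift = (__builtin_ctzll (Diff) / 8) * 8;
      return (int)((Data1 >> Shift) & 0xff) - (int)((Data2 >> Shift) & 0xff);
    }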
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S
new file mode 100644
index 0000000000..10b55b065c
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/CopyMem.S
@@ -0,0 +1,284 @@
+//
+// Copyright (c) 2012 - 2016, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of the Linaro nor the
+//    names of its contributors may be used to endorse or promote products
+//    derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+//
+// Copyright (c) 2015 ARM Ltd
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+// 3. The name of the company may not be used to endorse or promote
+//    products derived from this software without specific prior written
+//    permission.
+//
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64, unaligned accesses.
+//
+
+#define dstin   x0
+#define src     x1
+#define count   x2
+#define dst     x3
+#define srcend  x4
+#define dstend  x5
+#define A_l     x6
+#define A_lw    w6
+#define A_h     x7
+#define A_hw    w7
+#define B_l     x8
+#define B_lw    w8
+#define B_h     x9
+#define C_l     x10
+#define C_h     x11
+#define D_l     x12
+#define D_h     x13
+#define E_l     x14
+#define E_h     x15
+#define F_l     srcend
+#define F_h     dst
+#define tmp1    x9
+#define tmp2    x3
+
+#define L(l) .L ## l
+
+// Copies are split into 3 main cases: small copies of up to 16 bytes,
+// medium copies of 17..96 bytes, which are fully unrolled, and large
+// copies of more than 96 bytes, which align the destination and use an
+// unrolled loop processing 64 bytes per iteration.
+// Small and medium copies read all data before writing, allowing any
+// kind of overlap, and memmove tailcalls memcpy for these cases as
+// well as non-overlapping copies.
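The overlap guarantee described above comes purely from ordering: every load
is issued before any store. A minimal C sketch for the 8..16 byte case
(Copy8To16 is a hypothetical helper; 8 <= Count <= 16 is assumed):

    #include <stdint.h>
    #include <string.h>

    static void Copy8To16 (unsigned char *Dst, const unsigned char *Src, size_t Count)
    {
      uint64_t  First;
      uint64_t  Last;

      memcpy (&First, Src, 8);             // read all data first ...
      memcpy (&Last, Src + Count - 8, 8);
      memcpy (Dst, &First, 8);             // ... then write; the two stores
      memcpy (Dst + Count - 8, &Last, 8);  // may overlap in the middle
    }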
+__memcpy:
+    prfm    PLDL1KEEP, [src]
+    add     srcend, src, count
+    add     dstend, dstin, count
+    cmp     count, 16
+    b.ls    L(copy16)
+    cmp     count, 96
+    b.hi    L(copy_long)
+
+    // Medium copies: 17..96 bytes.
+    sub     tmp1, count, 1
+    ldp     A_l, A_h, [src]
+    tbnz    tmp1, 6, L(copy96)
+    ldp     D_l, D_h, [srcend, -16]
+    tbz     tmp1, 5, 1f
+    ldp     B_l, B_h, [src, 16]
+    ldp     C_l, C_h, [srcend, -32]
+    stp     B_l, B_h, [dstin, 16]
+    stp     C_l, C_h, [dstend, -32]
+1:
+    stp     A_l, A_h, [dstin]
+    stp     D_l, D_h, [dstend, -16]
+    ret
+
+    .p2align 4
+    // Small copies: 0..16 bytes.
+L(copy16):
+    cmp     count, 8
+    b.lo    1f
+    ldr     A_l, [src]
+    ldr     A_h, [srcend, -8]
+    str     A_l, [dstin]
+    str     A_h, [dstend, -8]
+    ret
+    .p2align 4
+1:
+    tbz     count, 2, 1f
+    ldr     A_lw, [src]
+    ldr     A_hw, [srcend, -4]
+    str     A_lw, [dstin]
+    str     A_hw, [dstend, -4]
+    ret
+
+    // Copy 0..3 bytes. Use a branchless sequence that copies the same
+    // byte 3 times if count==1, or the 2nd byte twice if count==2.
+1:
+    cbz     count, 2f
+    lsr     tmp1, count, 1
+    ldrb    A_lw, [src]
+    ldrb    A_hw, [srcend, -1]
+    ldrb    B_lw, [src, tmp1]
+    strb    A_lw, [dstin]
+    strb    B_lw, [dstin, tmp1]
+    strb    A_hw, [dstend, -1]
+2:  ret
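The 0..3 byte tail above is easier to follow in C (CopySmall is a
hypothetical name; Count <= 3 is assumed):

    #include <stddef.h>

    static void CopySmall (unsigned char *Dst, const unsigned char *Src, size_t Count)
    {
      unsigned char  First;
      unsigned char  Mid;
      unsigned char  Last;

      if (Count == 0) {
        return;
      }
      First = Src[0];
      Mid   = Src[Count >> 1];    // equals First when Count == 1
      Last  = Src[Count - 1];     // equals Mid when Count == 2
      Dst[0]          = First;
      Dst[Count >> 1] = Mid;
      Dst[Count - 1]  = Last;
    }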
+
+    .p2align 4
+    // Copy 64..96 bytes. Copy 64 bytes from the start and
+    // 32 bytes from the end.
+L(copy96):
+    ldp     B_l, B_h, [src, 16]
+    ldp     C_l, C_h, [src, 32]
+    ldp     D_l, D_h, [src, 48]
+    ldp     E_l, E_h, [srcend, -32]
+    ldp     F_l, F_h, [srcend, -16]
+    stp     A_l, A_h, [dstin]
+    stp     B_l, B_h, [dstin, 16]
+    stp     C_l, C_h, [dstin, 32]
+    stp     D_l, D_h, [dstin, 48]
+    stp     E_l, E_h, [dstend, -32]
+    stp     F_l, F_h, [dstend, -16]
+    ret
+
+    // Align DST to 16 byte alignment so that we don't cross cache line
+    // boundaries on both loads and stores. There are at least 96 bytes
+    // to copy, so copy 16 bytes unaligned and then align. The loop
+    // copies 64 bytes per iteration and prefetches one iteration ahead.
+
+    .p2align 4
+L(copy_long):
+    and     tmp1, dstin, 15
+    bic     dst, dstin, 15
+    ldp     D_l, D_h, [src]
+    sub     src, src, tmp1
+    add     count, count, tmp1      // Count is now 16 too large.
+    ldp     A_l, A_h, [src, 16]
+    stp     D_l, D_h, [dstin]
+    ldp     B_l, B_h, [src, 32]
+    ldp     C_l, C_h, [src, 48]
+    ldp     D_l, D_h, [src, 64]!
+    subs    count, count, 128 + 16  // Test and readjust count.
+    b.ls    2f
+1:
+    stp     A_l, A_h, [dst, 16]
+    ldp     A_l, A_h, [src, 16]
+    stp     B_l, B_h, [dst, 32]
+    ldp     B_l, B_h, [src, 32]
+    stp     C_l, C_h, [dst, 48]
+    ldp     C_l, C_h, [src, 48]
+    stp     D_l, D_h, [dst, 64]!
+    ldp     D_l, D_h, [src, 64]!
+    subs    count, count, 64
+    b.hi    1b
+
+    // Write the last full set of 64 bytes. The remainder is at most 64
+    // bytes, so it is safe to always copy 64 bytes from the end even if
+    // there is just 1 byte left.
+2:
+    ldp     E_l, E_h, [srcend, -64]
+    stp     A_l, A_h, [dst, 16]
+    ldp     A_l, A_h, [srcend, -48]
+    stp     B_l, B_h, [dst, 32]
+    ldp     B_l, B_h, [srcend, -32]
+    stp     C_l, C_h, [dst, 48]
+    ldp     C_l, C_h, [srcend, -16]
+    stp     D_l, D_h, [dst, 64]
+    stp     E_l, E_h, [dstend, -64]
+    stp     A_l, A_h, [dstend, -48]
+    stp     B_l, B_h, [dstend, -32]
+    stp     C_l, C_h, [dstend, -16]
+    ret
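The tail handling of L(copy_long) deserves a second look: after the 64-byte
main loop at most 64 bytes remain, so the code finishes with one
unconditional 64-byte block taken from the very end of the buffer, which may
overlap bytes already written. A C sketch under the same assumption
(Count > 96; CopyLongTail is a hypothetical rendering):

    #include <stddef.h>
    #include <string.h>

    static void CopyLongTail (unsigned char *Dst, const unsigned char *Src, size_t Count)
    {
      size_t  Done;

      Done = 0;
      while (Count - Done > 64) {          // whole 64-byte blocks
        memcpy (Dst + Done, Src + Done, 64);
        Done += 64;
      }
      // Final block: re-copies up to 63 bytes, but never reads past the end.
      memcpy (Dst + Count - 64, Src + Count - 64, 64);
    }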
+
+
+//
+// All memmoves up to 96 bytes are done by memcpy as it supports overlaps.
+// Larger backwards copies are also handled by memcpy. The only remaining
+// case is large forward copies. The destination is aligned, and an
+// unrolled loop processes 64 bytes per iteration.
+//
+
+ASM_GLOBAL ASM_PFX(InternalMemCopyMem)
+ASM_PFX(InternalMemCopyMem):
+    sub     tmp2, dstin, src
+    cmp     count, 96
+    ccmp    tmp2, count, 2, hi
+    b.hs    __memcpy
+
+    cbz     tmp2, 3f
+    add     dstend, dstin, count
+    add     srcend, src, count
+
+    // Align dstend to 16 byte alignment so that we don't cross cache line
+    // boundaries on both loads and stores. There are at least 96 bytes
+    // to copy, so copy 16 bytes unaligned and then align. The loop
+    // copies 64 bytes per iteration and prefetches one iteration ahead.
+
+    and     tmp2, dstend, 15
+    ldp     D_l, D_h, [srcend, -16]
+    sub     srcend, srcend, tmp2
+    sub     count, count, tmp2
+    ldp     A_l, A_h, [srcend, -16]
+    stp     D_l, D_h, [dstend, -16]
+    ldp     B_l, B_h, [srcend, -32]
+    ldp     C_l, C_h, [srcend, -48]
+    ldp     D_l, D_h, [srcend, -64]!
+    sub     dstend, dstend, tmp2
+    subs    count, count, 128
+    b.ls    2f
+    nop
+1:
+    stp     A_l, A_h, [dstend, -16]
+    ldp     A_l, A_h, [srcend, -16]
+    stp     B_l, B_h, [dstend, -32]
+    ldp     B_l, B_h, [srcend, -32]
+    stp     C_l, C_h, [dstend, -48]
+    ldp     C_l, C_h, [srcend, -48]
+    stp     D_l, D_h, [dstend, -64]!
+    ldp     D_l, D_h, [srcend, -64]!
+    subs    count, count, 64
+    b.hi    1b
+
+    // Write the last full set of 64 bytes. The remainder is at most 64
+    // bytes, so it is safe to always copy 64 bytes from the start even if
+    // there is just 1 byte left.
+2:
+    ldp     E_l, E_h, [src, 48]
+    stp     A_l, A_h, [dstend, -16]
+    ldp     A_l, A_h, [src, 32]
+    stp     B_l, B_h, [dstend, -32]
+    ldp     B_l, B_h, [src, 16]
+    stp     C_l, C_h, [dstend, -48]
+    ldp     C_l, C_h, [src]
+    stp     D_l, D_h, [dstend, -64]
+    stp     E_l, E_h, [dstin, 48]
+    stp     A_l, A_h, [dstin, 32]
+    stp     B_l, B_h, [dstin, 16]
+    stp     C_l, C_h, [dstin]
+3:  ret
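The dispatch at the top of InternalMemCopyMem packs the overlap test into one
ccmp: the unsigned difference dstin - src is compared against count, and the
forward path is taken when it is greater or equal (no harmful overlap) or
when count <= 96 (those paths read all data before writing). A C sketch of
the same decision, with hypothetical helper names:

    #include <stdint.h>
    #include <stddef.h>

    static void ForwardCopy (unsigned char *Dst, const unsigned char *Src, size_t Count)
    {
      size_t  Index;

      for (Index = 0; Index < Count; Index++) {
        Dst[Index] = Src[Index];
      }
    }

    static void BackwardCopy (unsigned char *Dst, const unsigned char *Src, size_t Count)
    {
      while (Count > 0) {
        Count--;
        Dst[Count] = Src[Count];
      }
    }

    static void MemMoveSketch (unsigned char *Dst, const unsigned char *Src, size_t Count)
    {
      //
      // Unsigned wrap-around lets one comparison cover both "Dst below Src"
      // and "no overlap at all"; only Src < Dst < Src + Count goes backwards.
      //
      if ((uintptr_t)Dst - (uintptr_t)Src >= Count) {
        ForwardCopy (Dst, Src, Count);
      } else {
        BackwardCopy (Dst, Src, Count);
      }
    }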
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
new file mode 100644
index 0000000000..08e1fbb170
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/ScanMem.S
@@ -0,0 +1,161 @@
+//
+// Copyright (c) 2014, ARM Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of the company nor the names of its contributors
+//    may be used to endorse or promote products derived from this
+//    software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64
+// Neon Available.
+//
+
+// Arguments and results.
+#define srcin     x0
+#define cntin     x1
+#define chrin     w2
+
+#define result    x0
+
+#define src       x3
+#define tmp       x4
+#define wtmp2     w5
+#define synd      x6
+#define soff      x9
+#define cntrem    x10
+
+#define vrepchr   v0
+#define vdata1    v1
+#define vdata2    v2
+#define vhas_chr1 v3
+#define vhas_chr2 v4
+#define vrepmask  v5
+#define vend      v6
+
+//
+// Core algorithm:
+//
+// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
+// per byte. For each tuple, bit 0 is set if the relevant byte matched the
+// requested character and bit 1 is not used (faster than using a 32-bit
+// syndrome). Since the bits in the syndrome reflect exactly the order in
+// which things occur in the original string, counting trailing zeros allows
+// us to identify exactly which byte has matched.
+//
+
+ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
+ASM_PFX(InternalMemScanMem8):
+    // Do not dereference srcin if no bytes to compare.
+    cbz     cntin, .Lzero_length
+    //
+    // Magic constant 0x40100401 allows us to identify which lane matches
+    // the requested byte.
+    //
+    mov     wtmp2, #0x0401
+    movk    wtmp2, #0x4010, lsl #16
+    dup     vrepchr.16b, chrin
+    // Work with aligned 32-byte chunks
+    bic     src, srcin, #31
+    dup     vrepmask.4s, wtmp2
+    ands    soff, srcin, #31
+    and     cntrem, cntin, #31
+    b.eq    .Lloop
+
+    //
+    // Input string is not 32-byte aligned. We calculate the syndrome
+    // value for the aligned 32-byte block containing the first bytes
+    // and mask the irrelevant part.
+    //
+
+    ld1     {vdata1.16b, vdata2.16b}, [src], #32
+    sub     tmp, soff, #32
+    adds    cntin, cntin, tmp
+    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
+    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
+    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b  // 256->128
+    addp    vend.16b, vend.16b, vend.16b            // 128->64
+    mov     synd, vend.d[0]
+    // Clear the soff*2 lower bits
+    lsl     tmp, soff, #1
+    lsr     synd, synd, tmp
+    lsl     synd, synd, tmp
+    // The first block can also be the last
+    b.ls    .Lmasklast
+    // Have we found something already?
+    cbnz    synd, .Ltail
+
+.Lloop:
+    ld1     {vdata1.16b, vdata2.16b}, [src], #32
+    subs    cntin, cntin, #32
+    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
+    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
+    // If we're out of data we finish regardless of the result
+    b.ls    .Lend
+    // Use a fast check for the termination condition
+    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
+    addp    vend.2d, vend.2d, vend.2d
+    mov     synd, vend.d[0]
+    // We're not out of data, loop if we haven't found the character
+    cbz     synd, .Lloop
+
+.Lend:
+    // Termination condition found, let's calculate the syndrome value
+    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
+    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
+    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b  // 256->128
+    addp    vend.16b, vend.16b, vend.16b            // 128->64
+    mov     synd, vend.d[0]
+    // Only do the clear for the last possible block
+    b.hi    .Ltail
+
+.Lmasklast:
+    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
+    add     tmp, cntrem, soff
+    and     tmp, tmp, #31
+    sub     tmp, tmp, #32
+    neg     tmp, tmp, lsl #1
+    lsl     synd, synd, tmp
+    lsr     synd, synd, tmp
+
+.Ltail:
+    // Count the trailing zeros using bit reversing
+    rbit    synd, synd
+    // Compensate the last post-increment
+    sub     src, src, #32
+    // Check that we have found a character
+    cmp     synd, #0
+    // And count the leading zeros
+    clz     synd, synd
+    // Compute the potential result
+    add     result, src, synd, lsr #1
+    // Select result or NULL
+    csel    result, xzr, result, eq
+    ret
+
+.Lzero_length:
+    mov     result, #0
+    ret
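The syndrome idea translates to a straightforward scalar sketch: build a
mask with one bit per byte of the chunk, set where the byte matches, and
count trailing zeros to find the first match in address order (ScanChunk is
a hypothetical helper; the real code builds the mask with NEON compares):

    #include <stdint.h>
    #include <stddef.h>

    static const void *ScanChunk (const unsigned char *Buf, size_t Len, unsigned char Ch)
    {
      size_t    Base;
      size_t    Index;
      size_t    Lim;
      uint64_t  Synd;

      for (Base = 0; Base < Len; Base += 32) {
        Lim  = (Len - Base < 32) ? (Len - Base) : 32;
        Synd = 0;
        for (Index = 0; Index < Lim; Index++) {
          if (Buf[Base + Index] == Ch) {
            Synd |= 1ULL << Index;   // lowest address -> lowest bit
          }
        }
        if (Synd != 0) {
          return Buf + Base + __builtin_ctzll (Synd);
        }
      }
      return NULL;
    }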
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
new file mode 100644
index 0000000000..7f361110d4
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/AArch64/SetMem.S
@@ -0,0 +1,244 @@
+//
+// Copyright (c) 2012 - 2016, Linaro Limited
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//  * Neither the name of the Linaro nor the
+//    names of its contributors may be used to endorse or promote products
+//    derived from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+//
+// Copyright (c) 2015 ARM Ltd
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+// 3. The name of the company may not be used to endorse or promote
+//    products derived from this software without specific prior written
+//    permission.
+//
+// THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+// IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+
+// Assumptions:
+//
+// ARMv8-a, AArch64, unaligned accesses
+//
+
+#define dstin     x0
+#define count     x1
+#define val       x2
+#define valw      w2
+#define dst       x3
+#define dstend    x4
+#define tmp1      x5
+#define tmp1w     w5
+#define tmp2      x6
+#define tmp2w     w6
+#define zva_len   x7
+#define zva_lenw  w7
+
+#define L(l) .L ## l
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
+ASM_PFX(InternalMemSetMem16):
+    dup     v0.8H, valw
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
+ASM_PFX(InternalMemSetMem32):
+    dup     v0.4S, valw
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
+ASM_PFX(InternalMemSetMem64):
+    dup     v0.2D, val
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
+ASM_PFX(InternalMemZeroMem):
+    movi    v0.16B, #0
+    b       0f
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem)
+ASM_PFX(InternalMemSetMem):
+    dup     v0.16B, valw
+0:  add     dstend, dstin, count
+    mov     val, v0.D[0]
+
+    cmp     count, 96
+    b.hi    L(set_long)
+    cmp     count, 16
+    b.hs    L(set_medium)
+
+    // Set 0..15 bytes.
+    tbz     count, 3, 1f
+    str     val, [dstin]
+    str     val, [dstend, -8]
+    ret
+    nop
+1:  tbz     count, 2, 2f
+    str     valw, [dstin]
+    str     valw, [dstend, -4]
+    ret
+2:  cbz     count, 3f
+    strb    valw, [dstin]
+    tbz     count, 1, 3f
+    strh    valw, [dstend, -2]
+3:  ret
+
+    // Set 16..96 bytes.
+L(set_medium):
+    str     q0, [dstin]
+    tbnz    count, 6, L(set96)
+    str     q0, [dstend, -16]
+    tbz     count, 5, 1f
+    str     q0, [dstin, 16]
+    str     q0, [dstend, -32]
+1:  ret
+
+    .p2align 4
+    // Set 64..96 bytes. Write 64 bytes from the start and
+    // 32 bytes from the end.
+L(set96):
+    str     q0, [dstin, 16]
+    stp     q0, q0, [dstin, 32]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+    .p2align 3
+    nop
+L(set_long):
+    bic     dst, dstin, 15
+    str     q0, [dstin]
+    cmp     count, 256
+    ccmp    val, 0, 0, cs
+    b.eq    L(try_zva)
+L(no_zva):
+    sub     count, dstend, dst      // Count is 16 too large.
+    add     dst, dst, 16
+    sub     count, count, 64 + 16   // Adjust count and bias for loop.
+1:  stp     q0, q0, [dst], 64
+    stp     q0, q0, [dst, -32]
+L(tail64):
+    subs    count, count, 64
+    b.hi    1b
+2:  stp     q0, q0, [dstend, -64]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+    .p2align 3
+L(try_zva):
+    mrs     tmp1, dczid_el0
+    tbnz    tmp1w, 4, L(no_zva)
+    and     tmp1w, tmp1w, 15
+    cmp     tmp1w, 4                // ZVA size is 64 bytes.
+    b.ne    L(zva_128)
+
+    // Write the first and last 64 byte aligned block using stp rather
+    // than using DC ZVA. This is faster on some cores.
+L(zva_64):
+    str     q0, [dst, 16]
+    stp     q0, q0, [dst, 32]
+    bic     dst, dst, 63
+    stp     q0, q0, [dst, 64]
+    stp     q0, q0, [dst, 96]
+    sub     count, dstend, dst      // Count is now 128 too large.
+    sub     count, count, 128+64+64 // Adjust count and bias for loop.
+    add     dst, dst, 128
+    nop
+1:  dc      zva, dst
+    add     dst, dst, 64
+    subs    count, count, 64
+    b.hi    1b
+    stp     q0, q0, [dst, 0]
+    stp     q0, q0, [dst, 32]
+    stp     q0, q0, [dstend, -64]
+    stp     q0, q0, [dstend, -32]
+    ret
+
+    .p2align 3
+L(zva_128):
+    cmp     tmp1w, 5                // ZVA size is 128 bytes.
+    b.ne    L(zva_other)
+
+    str     q0, [dst, 16]
+    stp     q0, q0, [dst, 32]
+    stp     q0, q0, [dst, 64]
+    stp     q0, q0, [dst, 96]
+    bic     dst, dst, 127
+    sub     count, dstend, dst      // Count is now 128 too large.
+    sub     count, count, 128+128   // Adjust count and bias for loop.
+    add     dst, dst, 128
+1:  dc      zva, dst
+    add     dst, dst, 128
+    subs    count, count, 128
+    b.hi    1b
+    stp     q0, q0, [dstend, -128]
+    stp     q0, q0, [dstend, -96]
+    stp     q0, q0, [dstend, -64]
+    stp     q0, q0, [dstend, -32]
+    ret
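L(try_zva) keys off DCZID_EL0, and L(zva_other) below derives the block size
from it. A C sketch of the decoding (ZvaBlockSize is a hypothetical helper;
the inline asm assumes a GCC/Clang-style compiler):

    #include <stdint.h>

    static int64_t ZvaBlockSize (void)
    {
      uint64_t  Dczid;

      __asm__ ("mrs %0, dczid_el0" : "=r" (Dczid));
      if ((Dczid & (1U << 4)) != 0) {
        return -1;                  // DZP set: DC ZVA is prohibited
      }
      return 4U << (Dczid & 15);    // BS field: block size is 4 << BS bytes
    }

The special-cased paths above follow directly: BS == 4 gives 64-byte and
BS == 5 gives 128-byte zeroing blocks.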
+
+L(zva_other):
+    mov     tmp2w, 4
+    lsl     zva_lenw, tmp2w, tmp1w
+    add     tmp1, zva_len, 64       // Max alignment bytes written.
+    cmp     count, tmp1
+    blo     L(no_zva)
+
+    sub     tmp2, zva_len, 1
+    add     tmp1, dst, zva_len
+    add     dst, dst, 16
+    subs    count, tmp1, dst        // Actual alignment bytes to write.
+    bic     tmp1, tmp1, tmp2        // Aligned dc zva start address.
+    beq     2f
+1:  stp     q0, q0, [dst], 64
+    stp     q0, q0, [dst, -32]
+    subs    count, count, 64
+    b.hi    1b
+2:  mov     dst, tmp1
+    sub     count, dstend, tmp1     // Remaining bytes to write.
+    subs    count, count, zva_len
+    b.lo    4f
+3:  dc      zva, dst
+    add     dst, dst, zva_len
+    subs    count, count, zva_len
+    b.hs    3b
+4:  add     count, count, zva_len
+    b       L(tail64)
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
index d95eb599ea..64d11b09ef 100644
--- a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
@@ -27,7 +27,7 @@
 #
-#  VALID_ARCHITECTURES           = IA32 X64 ARM
+#  VALID_ARCHITECTURES           = IA32 X64 ARM AARCH64
 #
 
 [Sources]
@@ -127,6 +127,13 @@
   Arm/CopyMem.asm       |RVCT
   Arm/CompareMem.asm    |RVCT
 
+[Sources.AARCH64]
+  AArch64/ScanMem.S
+  AArch64/SetMem.S
+  AArch64/CopyMem.S
+  AArch64/CompareMem.S
+
+[Sources.ARM, Sources.AARCH64]
   Arm/ScanMemGeneric.c
 
 [Sources]