From: Ard Biesheuvel Date: Tue, 6 Sep 2016 14:06:38 +0000 (+0100) Subject: MdePkg/BaseMemoryLibOptDxe: add accelerated ARM routines X-Git-Tag: edk2-stable201903~5780 X-Git-Url: https://git.proxmox.com/?p=mirror_edk2.git;a=commitdiff_plain;h=a37f660599e8aefabf29a1ac9bef02ce55a3130c MdePkg/BaseMemoryLibOptDxe: add accelerated ARM routines This adds ARM support to BaseMemoryLibOptDxe, partially based on the cortex-strings library (ScanMem) and the existing CopyMem() implementation from BaseMemoryLibStm in ArmPkg. All string routines are accelerated except ScanMem16, ScanMem32, ScanMem64 and IsZeroBuffer, which can wait for another day. (Very few occurrences exist in the codebase) Contributed-under: TianoCore Contribution Agreement 1.0 Signed-off-by: Ard Biesheuvel Reviewed-by: Liming Gao --- diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S new file mode 100644 index 0000000000..951d15777a --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.S @@ -0,0 +1,138 @@ +// +// Copyright (c) 2013 - 2016, Linaro Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of the Linaro nor the +// names of its contributors may be used to endorse or promote products +// derived from this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +// Parameters and result. +#define src1 r0 +#define src2 r1 +#define limit r2 +#define result r0 + +// Internal variables. +#define data1 r3 +#define data2 r4 +#define limit_wd r5 +#define diff r6 +#define tmp1 r7 +#define tmp2 r12 +#define pos r8 +#define mask r14 + + .text + .thumb + .syntax unified + .align 5 +ASM_GLOBAL ASM_PFX(InternalMemCompareMem) +ASM_PFX(InternalMemCompareMem): + push {r4-r8, lr} + eor tmp1, src1, src2 + tst tmp1, #3 + bne .Lmisaligned4 + ands tmp1, src1, #3 + bne .Lmutual_align + add limit_wd, limit, #3 + nop.w + lsr limit_wd, limit_wd, #2 + + // Start of performance-critical section -- one 32B cache line. +.Lloop_aligned: + ldr data1, [src1], #4 + ldr data2, [src2], #4 +.Lstart_realigned: + subs limit_wd, limit_wd, #1 + eor diff, data1, data2 // Non-zero if differences found. + cbnz diff, 0f + bne .Lloop_aligned + // End of performance-critical section -- one 32B cache line. + + // Not reached the limit, must have found a diff. +0: cbnz limit_wd, .Lnot_limit + + // Limit % 4 == 0 => all bytes significant. + ands limit, limit, #3 + beq .Lnot_limit + + lsl limit, limit, #3 // Bits -> bytes. 
+ mov mask, #~0 + lsl mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + orr diff, diff, mask + +.Lnot_limit: + rev diff, diff + rev data1, data1 + rev data2, data2 + + // The MS-non-zero bit of DIFF marks either the first bit + // that is different, or the end of the significant data. + // Shifting left now will bring the critical information into the + // top bits. + clz pos, diff + lsl data1, data1, pos + lsl data2, data2, pos + + // But we need to zero-extend (char is unsigned) the value and then + // perform a signed 32-bit subtraction. + lsr data1, data1, #28 + sub result, data1, data2, lsr #28 + pop {r4-r8, pc} + +.Lmutual_align: + // Sources are mutually aligned, but are not currently at an + // alignment boundary. Round down the addresses and then mask off + // the bytes that precede the start point. + bic src1, src1, #3 + bic src2, src2, #3 + add limit, limit, tmp1 // Adjust the limit for the extra. + lsl tmp1, tmp1, #2 // Bytes beyond alignment -> bits. + ldr data1, [src1], #4 + neg tmp1, tmp1 // Bits to alignment -32. + ldr data2, [src2], #4 + mov tmp2, #~0 + + // Little-endian. Early bytes are at LSB. + lsr tmp2, tmp2, tmp1 // Shift (tmp1 & 31). + add limit_wd, limit, #3 + orr data1, data1, tmp2 + orr data2, data2, tmp2 + lsr limit_wd, limit_wd, #2 + b .Lstart_realigned + +.Lmisaligned4: + sub limit, limit, #1 +1: + // Perhaps we can do better than this. + ldrb data1, [src1], #1 + ldrb data2, [src2], #1 + subs limit, limit, #1 + it cs + cmpcs data1, data2 + beq 1b + sub result, data1, data2 + pop {r4-r8, pc} diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm new file mode 100644 index 0000000000..47b49ee164 --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CompareMem.asm @@ -0,0 +1,140 @@ +; +; Copyright (c) 2013 - 2016, Linaro Limited +; All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; * Neither the name of the Linaro nor the +; names of its contributors may be used to endorse or promote products +; derived from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +; + +; Parameters and result. +#define src1 r0 +#define src2 r1 +#define limit r2 +#define result r0 + +; Internal variables. 
+#define data1 r3 +#define data2 r4 +#define limit_wd r5 +#define diff r6 +#define tmp1 r7 +#define tmp2 r12 +#define pos r8 +#define mask r14 + + EXPORT InternalMemCompareMem + THUMB + AREA CompareMem, CODE, READONLY + +InternalMemCompareMem + push {r4-r8, lr} + eor tmp1, src1, src2 + tst tmp1, #3 + bne Lmisaligned4 + ands tmp1, src1, #3 + bne Lmutual_align + add limit_wd, limit, #3 + nop.w + lsr limit_wd, limit_wd, #2 + + ; Start of performance-critical section -- one 32B cache line. +Lloop_aligned + ldr data1, [src1], #4 + ldr data2, [src2], #4 +Lstart_realigned + subs limit_wd, limit_wd, #1 + eor diff, data1, data2 ; Non-zero if differences found. + cbnz diff, L0 + bne Lloop_aligned + ; End of performance-critical section -- one 32B cache line. + + ; Not reached the limit, must have found a diff. +L0 + cbnz limit_wd, Lnot_limit + + // Limit % 4 == 0 => all bytes significant. + ands limit, limit, #3 + beq Lnot_limit + + lsl limit, limit, #3 // Bits -> bytes. + mov mask, #~0 + lsl mask, mask, limit + bic data1, data1, mask + bic data2, data2, mask + + orr diff, diff, mask + +Lnot_limit + rev diff, diff + rev data1, data1 + rev data2, data2 + + ; The MS-non-zero bit of DIFF marks either the first bit + ; that is different, or the end of the significant data. + ; Shifting left now will bring the critical information into the + ; top bits. + clz pos, diff + lsl data1, data1, pos + lsl data2, data2, pos + + ; But we need to zero-extend (char is unsigned) the value and then + ; perform a signed 32-bit subtraction. + lsr data1, data1, #28 + sub result, data1, data2, lsr #28 + pop {r4-r8, pc} + +Lmutual_align + ; Sources are mutually aligned, but are not currently at an + ; alignment boundary. Round down the addresses and then mask off + ; the bytes that precede the start point. + bic src1, src1, #3 + bic src2, src2, #3 + add limit, limit, tmp1 ; Adjust the limit for the extra. + lsl tmp1, tmp1, #2 ; Bytes beyond alignment -> bits. 
+ ldr data1, [src1], #4 + neg tmp1, tmp1 ; Bits to alignment -32. + ldr data2, [src2], #4 + mov tmp2, #~0 + + ; Little-endian. Early bytes are at LSB. + lsr tmp2, tmp2, tmp1 ; Shift (tmp1 & 31). + add limit_wd, limit, #3 + orr data1, data1, tmp2 + orr data2, data2, tmp2 + lsr limit_wd, limit_wd, #2 + b Lstart_realigned + +Lmisaligned4 + sub limit, limit, #1 +L1 + // Perhaps we can do better than this. + ldrb data1, [src1], #1 + ldrb data2, [src2], #1 + subs limit, limit, #1 + it cs + cmpcs data1, data2 + beq L1 + sub result, data1, data2 + pop {r4-r8, pc} + + END diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S new file mode 100644 index 0000000000..fb5293befc --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.S @@ -0,0 +1,172 @@ +#------------------------------------------------------------------------------ +# +# CopyMem() worker for ARM +# +# This file started out as C code that did 64 bit moves if the buffer was +# 32-bit aligned, else it does a byte copy. It also does a byte copy for +# any trailing bytes. It was updated to do 32-byte copies using stm/ldm. +# +# Copyright (c) 2008 - 2010, Apple Inc. All rights reserved.
+# Copyright (c) 2016, Linaro Ltd. All rights reserved.
+# This program and the accompanying materials +# are licensed and made available under the terms and conditions of the BSD License +# which accompanies this distribution. The full text of the license may be found at +# http://opensource.org/licenses/bsd-license.php +# +# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS, +# WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. +# +#------------------------------------------------------------------------------ + + .text + .thumb + .syntax unified + +/** + Copy Length bytes from Source to Destination. Overlap is OK. + + This implementation + + @param Destination Target of copy + @param Source Place to copy from + @param Length Number of bytes to copy + + @return Destination + + +VOID * +EFIAPI +InternalMemCopyMem ( + OUT VOID *DestinationBuffer, + IN CONST VOID *SourceBuffer, + IN UINTN Length + ) +**/ +ASM_GLOBAL ASM_PFX(InternalMemCopyMem) +ASM_PFX(InternalMemCopyMem): + push {r4-r11, lr} + // Save the input parameters in extra registers (r11 = destination, r14 = source, r12 = length) + mov r11, r0 + mov r10, r0 + mov r12, r2 + mov r14, r1 + + cmp r11, r1 + // If (dest < source) + bcc memcopy_check_optim_default + + // If (source + length < dest) + rsb r3, r1, r11 + cmp r12, r3 + bcc memcopy_check_optim_default + b memcopy_check_optim_overlap + +memcopy_check_optim_default: + // Check if we can use an optimized path ((length >= 32) && destination word-aligned && source word-aligned) for the memcopy (optimized path if r0 == 1) + tst r0, #0xF + it ne + movne r0, #0 + bne memcopy_default + tst r1, #0xF + ite ne + movne r3, #0 + moveq r3, #1 + cmp r2, #31 + ite ls + movls r0, #0 + andhi r0, r3, #1 + b memcopy_default + +memcopy_check_optim_overlap: + // r10 = dest_end, r14 = source_end + add r10, r11, r12 + add r14, r12, r1 + + // Are we in the optimized case ((length >= 32) && dest_end word-aligned && source_end word-aligned) + cmp r2, #31 + ite ls + movls r0, #0 + movhi r0, 
#1 + tst r10, #0xF + it ne + movne r0, #0 + tst r14, #0xF + it ne + movne r0, #0 + b memcopy_overlapped + +memcopy_overlapped_non_optim: + // We read 1 byte from the end of the source buffer + sub r3, r14, #1 + sub r12, r12, #1 + ldrb r3, [r3, #0] + sub r2, r10, #1 + cmp r12, #0 + // We write 1 byte at the end of the dest buffer + sub r10, r10, #1 + sub r14, r14, #1 + strb r3, [r2, #0] + bne memcopy_overlapped_non_optim + b memcopy_end + +// r10 = dest_end, r14 = source_end +memcopy_overlapped: + // Are we in the optimized case ? + cmp r0, #0 + beq memcopy_overlapped_non_optim + + // Optimized Overlapped - Read 32 bytes + sub r14, r14, #32 + sub r12, r12, #32 + cmp r12, #31 + ldmia r14, {r2-r9} + + // If length is less than 32 then disable optim + it ls + movls r0, #0 + + cmp r12, #0 + + // Optimized Overlapped - Write 32 bytes + sub r10, r10, #32 + stmia r10, {r2-r9} + + // while (length != 0) + bne memcopy_overlapped + b memcopy_end + +memcopy_default_non_optim: + // Byte copy + ldrb r3, [r14], #1 + sub r12, r12, #1 + strb r3, [r10], #1 + +memcopy_default: + cmp r12, #0 + beq memcopy_end + +// r10 = dest, r14 = source +memcopy_default_loop: + cmp r0, #0 + beq memcopy_default_non_optim + + // Optimized memcopy - Read 32 Bytes + sub r12, r12, #32 + cmp r12, #31 + ldmia r14!, {r2-r9} + + // If length is less than 32 then disable optim + it ls + movls r0, #0 + + cmp r12, #0 + + // Optimized memcopy - Write 32 Bytes + stmia r10!, {r2-r9} + + // while (length != 0) + bne memcopy_default_loop + +memcopy_end: + mov r0, r11 + pop {r4-r11, pc} diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm new file mode 100644 index 0000000000..2034807954 --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/CopyMem.asm @@ -0,0 +1,147 @@ +;------------------------------------------------------------------------------ +; +; CopyMem() worker for ARM +; +; This file started out as C code that did 64 bit moves if the 
buffer was +; 32-bit aligned, else it does a byte copy. It also does a byte copy for +; any trailing bytes. It was updated to do 32-byte copies using stm/ldm. +; +; Copyright (c) 2008 - 2010, Apple Inc. All rights reserved.
+; Copyright (c) 2016, Linaro Ltd. All rights reserved.
+; This program and the accompanying materials +; are licensed and made available under the terms and conditions of the BSD License +; which accompanies this distribution. The full text of the license may be found at +; http://opensource.org/licenses/bsd-license.php +; +; THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS, +; WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. +; +;------------------------------------------------------------------------------ + + EXPORT InternalMemCopyMem + AREA SetMem, CODE, READONLY + THUMB + +InternalMemCopyMem + stmfd sp!, {r4-r11, lr} + // Save the input parameters in extra registers (r11 = destination, r14 = source, r12 = length) + mov r11, r0 + mov r10, r0 + mov r12, r2 + mov r14, r1 + +memcopy_check_overlapped + cmp r11, r1 + // If (dest < source) + bcc memcopy_check_optim_default + + // If (source + length < dest) + rsb r3, r1, r11 + cmp r12, r3 + bcc memcopy_check_optim_default + b memcopy_check_optim_overlap + +memcopy_check_optim_default + // Check if we can use an optimized path ((length >= 32) && destination word-aligned && source word-aligned) for the memcopy (optimized path if r0 == 1) + tst r0, #0xF + movne r0, #0 + bne memcopy_default + tst r1, #0xF + movne r3, #0 + moveq r3, #1 + cmp r2, #31 + movls r0, #0 + andhi r0, r3, #1 + b memcopy_default + +memcopy_check_optim_overlap + // r10 = dest_end, r14 = source_end + add r10, r11, r12 + add r14, r12, r1 + + // Are we in the optimized case ((length >= 32) && dest_end word-aligned && source_end word-aligned) + cmp r2, #31 + movls r0, #0 + movhi r0, #1 + tst r10, #0xF + movne r0, #0 + tst r14, #0xF + movne r0, #0 + b memcopy_overlapped + +memcopy_overlapped_non_optim + // We read 1 byte from the end of the source buffer + sub r3, r14, #1 + sub r12, r12, #1 + ldrb r3, [r3, #0] + sub r2, r10, #1 + cmp r12, #0 + // We write 1 byte at the end of the dest buffer + sub r10, r10, #1 + sub r14, r14, #1 + strb r3, [r2, #0] + bne 
memcopy_overlapped_non_optim + b memcopy_end + +// r10 = dest_end, r14 = source_end +memcopy_overlapped + // Are we in the optimized case ? + cmp r0, #0 + beq memcopy_overlapped_non_optim + + // Optimized Overlapped - Read 32 bytes + sub r14, r14, #32 + sub r12, r12, #32 + cmp r12, #31 + ldmia r14, {r2-r9} + + // If length is less than 32 then disable optim + movls r0, #0 + + cmp r12, #0 + + // Optimized Overlapped - Write 32 bytes + sub r10, r10, #32 + stmia r10, {r2-r9} + + // while (length != 0) + bne memcopy_overlapped + b memcopy_end + +memcopy_default_non_optim + // Byte copy + ldrb r3, [r14], #1 + sub r12, r12, #1 + strb r3, [r10], #1 + +memcopy_default + cmp r12, #0 + beq memcopy_end + +// r10 = dest, r14 = source +memcopy_default_loop + cmp r0, #0 + beq memcopy_default_non_optim + + // Optimized memcopy - Read 32 Bytes + sub r12, r12, #32 + cmp r12, #31 + ldmia r14!, {r2-r9} + + // If length is less than 32 then disable optim + movls r0, #0 + + cmp r12, #0 + + // Optimized memcopy - Write 32 Bytes + stmia r10!, {r2-r9} + + // while (length != 0) + bne memcopy_default_loop + +memcopy_end + mov r0, r11 + ldmfd sp!, {r4-r11, pc} + + END + diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S new file mode 100644 index 0000000000..dc0e74e865 --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.S @@ -0,0 +1,146 @@ +// Copyright (c) 2010-2011, Linaro Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// * Neither the name of Linaro Limited nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +// +// Written by Dave Gilbert +// +// This memchr routine is optimised on a Cortex-A9 and should work on +// all ARMv7 processors. It has a fast path for short sizes, and has +// an optimised path for large data sets; the worst case is finding the +// match early in a large data set. 
+// + + +// 2011-02-07 david.gilbert@linaro.org +// Extracted from local git a5b438d861 +// 2011-07-14 david.gilbert@linaro.org +// Import endianness fix from local git ea786f1b +// 2011-12-07 david.gilbert@linaro.org +// Removed unneeded cbz from align loop + +// this lets us check a flag in a 00/ff byte easily in either endianness +#define CHARTSTMASK(c) 1<<(c*8) + + .text + .thumb + .syntax unified + + .type ASM_PFX(InternalMemScanMem8), %function +ASM_GLOBAL ASM_PFX(InternalMemScanMem8) +ASM_PFX(InternalMemScanMem8): + // r0 = start of memory to scan + // r1 = length + // r2 = character to look for + // returns r0 = pointer to character or NULL if not found + uxtb r2, r2 // Don't think we can trust the caller to actually pass a char + + cmp r1, #16 // If it's short don't bother with anything clever + blt 20f + + tst r0, #7 // If it's already aligned skip the next bit + beq 10f + + // Work up to an aligned point +5: + ldrb r3, [r0],#1 + subs r1, r1, #1 + cmp r3, r2 + beq 50f // If it matches exit found + tst r0, #7 + bne 5b // If not aligned yet then do next byte + +10: + // At this point, we are aligned, we know we have at least 8 bytes to work with + push {r4-r7} + orr r2, r2, r2, lsl #8 // expand the match word across to all bytes + orr r2, r2, r2, lsl #16 + bic r4, r1, #7 // Number of double words to work with + mvns r7, #0 // all F's + movs r3, #0 + +15: + ldmia r0!, {r5,r6} + subs r4, r4, #8 + eor r5, r5, r2 // Get it so that r5,r6 have 00's where the bytes match the target + eor r6, r6, r2 + uadd8 r5, r5, r7 // Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r5, r3, r7 // bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + uadd8 r6, r6, r7 // Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r6, r5, r7 // chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + cbnz r6, 60f + bne 15b // (Flags from the subs above) If not run out of bytes then go around again + + pop 
{r4-r7} + and r2, r2, #0xff // Get r2 back to a single character from the expansion above + and r1, r1, #7 // Leave the count remaining as the number after the double words have been done + +20: + cbz r1, 40f // 0 length or hit the end already then not found + +21: // Post aligned section, or just a short call + ldrb r3, [r0], #1 + subs r1, r1, #1 + eor r3, r3, r2 // r3 = 0 if match - doesn't break flags from sub + cbz r3, 50f + bne 21b // on r1 flags + +40: + movs r0, #0 // not found + bx lr + +50: + subs r0, r0, #1 // found + bx lr + +60: // We're here because the fast path found a hit - now we have to track down exactly which word it was + // r0 points to the start of the double word after the one that was tested + // r5 has the 00/ff pattern for the first word, r6 has the chained value + cmp r5, #0 + itte eq + moveq r5, r6 // the end is in the 2nd word + subeq r0, r0, #3 // Points to 2nd byte of 2nd word + subne r0, r0, #7 // or 2nd byte of 1st word + + // r0 currently points to the 3rd byte of the word containing the hit + tst r5, #CHARTSTMASK(0) // 1st character + bne 61f + adds r0, r0, #1 + tst r5, #CHARTSTMASK(1) // 2nd character + ittt eq + addeq r0, r0 ,#1 + tsteq r5, #(3 << 15) // 2nd & 3rd character + // If not the 3rd must be the last one + addeq r0, r0, #1 + +61: + pop {r4-r7} + subs r0, r0, #1 + bx lr diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm new file mode 100644 index 0000000000..abda87320e --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMem.asm @@ -0,0 +1,147 @@ +; Copyright (c) 2010-2011, Linaro Limited +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; +; * Neither the name of Linaro Limited nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +; + +; +; Written by Dave Gilbert +; +; This memchr routine is optimised on a Cortex-A9 and should work on +; all ARMv7 processors. It has a fast path for short sizes, and has +; an optimised path for large data sets; the worst case is finding the +; match early in a large data set. 
+; + + +; 2011-02-07 david.gilbert@linaro.org +; Extracted from local git a5b438d861 +; 2011-07-14 david.gilbert@linaro.org +; Import endianness fix from local git ea786f1b +; 2011-12-07 david.gilbert@linaro.org +; Removed unneeded cbz from align loop + +; this lets us check a flag in a 00/ff byte easily in either endianness +#define CHARTSTMASK(c) 1<<(c*8) + + EXPORT InternalMemScanMem8 + AREA ScanMem, CODE, READONLY + THUMB + +InternalMemScanMem8 + ; r0 = start of memory to scan + ; r1 = length + ; r2 = character to look for + ; returns r0 = pointer to character or NULL if not found + uxtb r2, r2 ; Don't think we can trust the caller to actually pass a char + + cmp r1, #16 ; If it's short don't bother with anything clever + blt L20 + + tst r0, #7 ; If it's already aligned skip the next bit + beq L10 + + ; Work up to an aligned point +L5 + ldrb r3, [r0],#1 + subs r1, r1, #1 + cmp r3, r2 + beq L50 ; If it matches exit found + tst r0, #7 + bne L5 ; If not aligned yet then do next byte + +L10 + ; At this point, we are aligned, we know we have at least 8 bytes to work with + push {r4-r7} + orr r2, r2, r2, lsl #8 ; expand the match word across to all bytes + orr r2, r2, r2, lsl #16 + bic r4, r1, #7 ; Number of double words to work with + mvns r7, #0 ; all F's + movs r3, #0 + +L15 + ldmia r0!, {r5,r6} + subs r4, r4, #8 + eor r5, r5, r2 ; Get it so that r5,r6 have 00's where the bytes match the target + eor r6, r6, r2 + uadd8 r5, r5, r7 ; Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r5, r3, r7 ; bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + uadd8 r6, r6, r7 ; Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r6, r5, r7 ; chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + cbnz r6, L60 + bne L15 ; (Flags from the subs above) If not run out of bytes then go around again + + pop {r4-r7} + and r2, r2, #0xff ; Get r2 back to a single character from the expansion above + and r1, 
r1, #7 ; Leave the count remaining as the number after the double words have been done + +L20 + cbz r1, L40 ; 0 length or hit the end already then not found + +L21 ; Post aligned section, or just a short call + ldrb r3, [r0], #1 + subs r1, r1, #1 + eor r3, r3, r2 ; r3 = 0 if match - doesn't break flags from sub + cbz r3, L50 + bne L21 ; on r1 flags + +L40 + movs r0, #0 ; not found + bx lr + +L50 + subs r0, r0, #1 ; found + bx lr + +L60 ; We're here because the fast path found a hit - now we have to track down exactly which word it was + ; r0 points to the start of the double word after the one that was tested + ; r5 has the 00/ff pattern for the first word, r6 has the chained value + cmp r5, #0 + itte eq + moveq r5, r6 ; the end is in the 2nd word + subeq r0, r0, #3 ; Points to 2nd byte of 2nd word + subne r0, r0, #7 ; or 2nd byte of 1st word + + ; r0 currently points to the 3rd byte of the word containing the hit + tst r5, #CHARTSTMASK(0) ; 1st character + bne L61 + adds r0, r0, #1 + tst r5, #CHARTSTMASK(1) ; 2nd character + ittt eq + addeq r0, r0 ,#1 + tsteq r5, #(3 << 15) ; 2nd & 3rd character + ; If not the 3rd must be the last one + addeq r0, r0, #1 + +L61 + pop {r4-r7} + subs r0, r0, #1 + bx lr + + END + diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c new file mode 100644 index 0000000000..20fa7e9be6 --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/ScanMemGeneric.c @@ -0,0 +1,142 @@ +/** @file + Architecture Independent Base Memory Library Implementation. + + The following BaseMemoryLib instances contain the same copy of this file: + BaseMemoryLib + PeiMemoryLib + UefiMemoryLib + + Copyright (c) 2006 - 2016, Intel Corporation. All rights reserved.
+ This program and the accompanying materials + are licensed and made available under the terms and conditions of the BSD License + which accompanies this distribution. The full text of the license may be found at + http://opensource.org/licenses/bsd-license.php. + + THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS, + WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED. + +**/ + +#include "../MemLibInternals.h" + +/** + Scans a target buffer for a 16-bit value, and returns a pointer to the + matching 16-bit value in the target buffer. + + @param Buffer The pointer to the target buffer to scan. + @param Length The count of 16-bit value to scan. Must be non-zero. + @param Value The value to search for in the target buffer. + + @return The pointer to the first occurrence, or NULL if not found. + +**/ +CONST VOID * +EFIAPI +InternalMemScanMem16 ( + IN CONST VOID *Buffer, + IN UINTN Length, + IN UINT16 Value + ) +{ + CONST UINT16 *Pointer; + + Pointer = (CONST UINT16*)Buffer; + do { + if (*Pointer == Value) { + return Pointer; + } + ++Pointer; + } while (--Length != 0); + return NULL; +} + +/** + Scans a target buffer for a 32-bit value, and returns a pointer to the + matching 32-bit value in the target buffer. + + @param Buffer The pointer to the target buffer to scan. + @param Length The count of 32-bit value to scan. Must be non-zero. + @param Value The value to search for in the target buffer. + + @return The pointer to the first occurrence, or NULL if not found. + +**/ +CONST VOID * +EFIAPI +InternalMemScanMem32 ( + IN CONST VOID *Buffer, + IN UINTN Length, + IN UINT32 Value + ) +{ + CONST UINT32 *Pointer; + + Pointer = (CONST UINT32*)Buffer; + do { + if (*Pointer == Value) { + return Pointer; + } + ++Pointer; + } while (--Length != 0); + return NULL; +} + +/** + Scans a target buffer for a 64-bit value, and returns a pointer to the + matching 64-bit value in the target buffer. 
+ + @param Buffer The pointer to the target buffer to scan. + @param Length The count of 64-bit value to scan. Must be non-zero. + @param Value The value to search for in the target buffer. + + @return The pointer to the first occurrence, or NULL if not found. + +**/ +CONST VOID * +EFIAPI +InternalMemScanMem64 ( + IN CONST VOID *Buffer, + IN UINTN Length, + IN UINT64 Value + ) +{ + CONST UINT64 *Pointer; + + Pointer = (CONST UINT64*)Buffer; + do { + if (*Pointer == Value) { + return Pointer; + } + ++Pointer; + } while (--Length != 0); + return NULL; +} + +/** + Checks whether the contents of a buffer are all zeros. + + @param Buffer The pointer to the buffer to be checked. + @param Length The size of the buffer (in bytes) to be checked. + + @retval TRUE Contents of the buffer are all zeros. + @retval FALSE Contents of the buffer are not all zeros. + +**/ +BOOLEAN +EFIAPI +InternalMemIsZeroBuffer ( + IN CONST VOID *Buffer, + IN UINTN Length + ) +{ + CONST UINT8 *BufferData; + UINTN Index; + + BufferData = Buffer; + for (Index = 0; Index < Length; Index++) { + if (BufferData[Index] != 0) { + return FALSE; + } + } + return TRUE; +} diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S new file mode 100644 index 0000000000..c1755539d3 --- /dev/null +++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.S @@ -0,0 +1,77 @@ +#------------------------------------------------------------------------------ +# +# Copyright (c) 2016, Linaro Ltd. All rights reserved.
+#
+# This program and the accompanying materials are licensed and made available
+# under the terms and conditions of the BSD License which accompanies this
+# distribution.  The full text of the license may be found at
+# http://opensource.org/licenses/bsd-license.php
+#
+# THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+# WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+#
+#------------------------------------------------------------------------------
+
+// The entry points below build the 64-bit fill pattern in r2:r3 and then
+// FALL THROUGH into InternalMemSetMem64, which does the actual stores.
+// Register roles (visible from the code): r0 = destination, r1 = byte count,
+// r2/r3 = fill pattern. NOTE(review): a zero byte count still stores one
+// byte on the tail path at label 4 below — callers appear expected to
+// guarantee a non-zero length; confirm against the C wrappers.
+
+  .text
+  .thumb
+  .syntax unified
+  .align  5
+ASM_GLOBAL ASM_PFX(InternalMemZeroMem)
+ASM_PFX(InternalMemZeroMem):
+  movs    r2, #0                  // fill byte = 0; fall through
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem)
+ASM_PFX(InternalMemSetMem):
+  uxtb    r2, r2
+  orr     r2, r2, r2, lsl #8      // replicate byte into r2[15:0]; fall through
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem16)
+ASM_PFX(InternalMemSetMem16):
+  uxth    r2, r2
+  orr     r2, r2, r2, lsl #16     // replicate halfword into r2[31:16]; fall through
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem32)
+ASM_PFX(InternalMemSetMem32):
+  mov     r3, r2                  // r3:r2 = 64-bit pattern; fall through
+
+ASM_GLOBAL ASM_PFX(InternalMemSetMem64)
+ASM_PFX(InternalMemSetMem64):
+  push    {r4, lr}
+  cmp     r1, #16                 // fewer than 16 bytes of input?
+  add     r1, r1, r0              // r1 := dst + length
+  add     lr, r0, #16
+  blt     2f
+  bic     lr, lr, #15             // align output pointer
+
+  str     r2, [r0]                // potentially unaligned store of 4 bytes
+  str     r3, [r0, #4]            // potentially unaligned store of 4 bytes
+  str     r2, [r0, #8]            // potentially unaligned store of 4 bytes
+  str     r3, [r0, #12]           // potentially unaligned store of 4 bytes
+  beq     1f                      // flags still from 'cmp r1, #16': length == 16, done
+
+0:add     lr, lr, #16             // advance the output pointer by 16 bytes
+  subs    r4, r1, lr              // past the output?
+  blt     3f                      // break out of the loop
+  strd    r2, r3, [lr, #-16]      // aligned store of 16 bytes
+  strd    r2, r3, [lr, #-8]
+  bne     0b                      // goto beginning of loop
+1:pop     {r4, pc}
+
+2:subs    r4, r1, lr              // short input (< 16 bytes)
+3:adds    r4, r4, #16             // r4 := remaining byte count
+  subs    r1, r1, #8              // r1 := end - 8, base for end-anchored stores
+  cmp     r4, #4                  // between 4 and 15 bytes?
+  blt     4f
+  cmp     r4, #8                  // between 8 and 15 bytes?
+  str     r2, [lr, #-16]          // overlapping store of 4 + (4 + 4) + 4 bytes
+  itt     gt
+  strgt   r3, [lr, #-12]
+  strgt   r2, [r1]
+  str     r3, [r1, #4]
+  pop     {r4, pc}
+
+4:cmp     r4, #2                  // 2 or 3 bytes?
+  strb    r2, [lr, #-16]          // store 1 byte
+  it      ge
+  strhge  r2, [r1, #6]            // store 2 bytes
+  pop     {r4, pc}
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm
new file mode 100644
index 0000000000..2a8dc7d019
--- /dev/null
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/Arm/SetMem.asm
@@ -0,0 +1,84 @@
+;------------------------------------------------------------------------------
+;
+; Copyright (c) 2016, Linaro Ltd. All rights reserved.
+;
+; This program and the accompanying materials are licensed and made available
+; under the terms and conditions of the BSD License which accompanies this
+; distribution.  The full text of the license may be found at
+; http://opensource.org/licenses/bsd-license.php
+;
+; THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
+; WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
+;
+;------------------------------------------------------------------------------
+
+; RVCT (armasm) port of Arm/SetMem.S. The entry points build the 64-bit fill
+; pattern in r2:r3 and FALL THROUGH into InternalMemSetMem64, which performs
+; the stores. Register roles (visible from the code): r0 = destination,
+; r1 = byte count, r2/r3 = fill pattern.
+
+    EXPORT  InternalMemZeroMem
+    EXPORT  InternalMemSetMem
+    EXPORT  InternalMemSetMem16
+    EXPORT  InternalMemSetMem32
+    EXPORT  InternalMemSetMem64
+
+    AREA    SetMem, CODE, READONLY, CODEALIGN, ALIGN=5
+    THUMB
+
+InternalMemZeroMem
+    movs    r2, #0                  ; fill byte = 0; fall through
+
+InternalMemSetMem
+    uxtb    r2, r2
+    orr     r2, r2, r2, lsl #8      ; replicate byte into r2[15:0]; fall through
+
+InternalMemSetMem16
+    uxth    r2, r2
+    orr     r2, r2, r2, lsl #16     ; BUGFIX: was 'lsr #16', which is a no-op
+                                    ; after uxth (top halfword already zero) and
+                                    ; left r2[31:16] = 0. 'lsl #16' replicates
+                                    ; the halfword, matching the GCC version in
+                                    ; Arm/SetMem.S.
+
+InternalMemSetMem32
+    mov     r3, r2                  ; r3:r2 = 64-bit pattern; fall through
+
+InternalMemSetMem64
+    push    {r4, lr}
+    cmp     r1, #16                 ; fewer than 16 bytes of input?
+    add     r1, r1, r0              ; r1 := dst + length
+    add     lr, r0, #16
+    blt     L2
+    bic     lr, lr, #15             ; align output pointer
+
+    str     r2, [r0]                ; potentially unaligned store of 4 bytes
+    str     r3, [r0, #4]            ; potentially unaligned store of 4 bytes
+    str     r2, [r0, #8]            ; potentially unaligned store of 4 bytes
+    str     r3, [r0, #12]           ; potentially unaligned store of 4 bytes
+    beq     L1                      ; flags still from 'cmp r1, #16': length == 16
+
+L0
+    add     lr, lr, #16             ; advance the output pointer by 16 bytes
+    subs    r4, r1, lr              ; past the output?
+    blt     L3                      ; break out of the loop
+    strd    r2, r3, [lr, #-16]      ; aligned store of 16 bytes
+    strd    r2, r3, [lr, #-8]
+    bne     L0                      ; goto beginning of loop
+L1
+    pop     {r4, pc}
+
+L2
+    subs    r4, r1, lr              ; short input (< 16 bytes)
+L3
+    adds    r4, r4, #16             ; r4 := remaining byte count
+    subs    r1, r1, #8              ; r1 := end - 8, base for end-anchored stores
+    cmp     r4, #4                  ; between 4 and 15 bytes?
+    blt     L4
+    cmp     r4, #8                  ; between 8 and 15 bytes?
+    str     r2, [lr, #-16]          ; overlapping store of 4 + (4 + 4) + 4 bytes
+    itt     gt
+    strgt   r3, [lr, #-12]
+    strgt   r2, [r1]
+    str     r3, [r1, #4]
+    pop     {r4, pc}
+
+L4
+    cmp     r4, #2                  ; 2 or 3 bytes?
+    strb    r2, [lr, #-16]          ; store 1 byte
+    it      ge
+    strhge  r2, [r1, #6]            ; store 2 bytes
+    pop     {r4, pc}
+
+    END
diff --git a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
index 71691b9859..d95eb599ea 100644
--- a/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
+++ b/MdePkg/Library/BaseMemoryLibOptDxe/BaseMemoryLibOptDxe.inf
@@ -27,7 +27,7 @@
 #
-#  VALID_ARCHITECTURES           = IA32 X64
+#  VALID_ARCHITECTURES           = IA32 X64 ARM
 #
 
 [Sources]
@@ -79,19 +79,6 @@
   Ia32/CopyMem.nasm
   Ia32/CopyMem.asm
   Ia32/IsZeroBuffer.nasm
-  ScanMem64Wrapper.c
-  ScanMem32Wrapper.c
-  ScanMem16Wrapper.c
-  ScanMem8Wrapper.c
-  ZeroMemWrapper.c
-  CompareMemWrapper.c
-  SetMem64Wrapper.c
-  SetMem32Wrapper.c
-  SetMem16Wrapper.c
-  SetMemWrapper.c
-  CopyMemWrapper.c
-  IsZeroBufferWrapper.c
-  MemLibGuid.c
 
 [Sources.X64]
   X64/ScanMem64.nasm
@@ -128,6 +115,21 @@
   X64/CopyMem.asm
   X64/CopyMem.S
   X64/IsZeroBuffer.nasm
+
+[Sources.ARM]
+  Arm/ScanMem.S       |GCC
+  Arm/SetMem.S        |GCC
+  Arm/CopyMem.S       |GCC
+  Arm/CompareMem.S    |GCC
+
+  Arm/ScanMem.asm     |RVCT
+  Arm/SetMem.asm      |RVCT
+  Arm/CopyMem.asm     |RVCT
+  Arm/CompareMem.asm  |RVCT
+
+  Arm/ScanMemGeneric.c
+
+[Sources]
   ScanMem64Wrapper.c
   ScanMem32Wrapper.c
   ScanMem16Wrapper.c