/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax

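	/*
	 * Copy count/8 qwords with REP MOVSQ, then the remaining
	 * count%8 bytes with REP MOVSB.
	 */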
	movl %edx, %ecx
	shrl $3, %ecx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e:
	.previous

/*
 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
 * memcpy_c. Use memcpy_c_e when possible.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c_e:
	movq %rdi, %rax

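	/*
	 * With ERMS, a single REP MOVSB handles the whole copy.  As in
	 * memcpy_c above, only the low 32 bits of the count are used.
	 */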
	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e_e:
	.previous

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC
	movq %rdi, %rax

	/*
	 * Use a 32-bit CMP here to avoid long NOP padding.
	 */
	cmp $0x20, %edx
	jb .Lhandle_tail

	/*
	 * We check whether a memory false dependence could occur,
	 * then jump to the corresponding copy mode.
	 */
	cmp %dil, %sil
	jl .Lcopy_backward
	subl $0x20, %edx
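	/*
	 * The subl above biases the count by one 32-byte block: the subq
	 * at the top of each iteration then borrows (CF set) exactly when
	 * this iteration copies the last full block, and since movq/leaq
	 * do not touch the flags, the jae at the bottom of the loop still
	 * tests that subq's carry.
	 */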
.Lcopy_forward_loop:
	subq $0x20, %rdx

	/*
	 * Move in blocks of 4x8 bytes:
	 */
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq 2*8(%rsi), %r10
	movq 3*8(%rsi), %r11
	leaq 4*8(%rsi), %rsi

	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, 2*8(%rdi)
	movq %r11, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae .Lcopy_forward_loop
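	/* Undo the bias; %rdx then holds the 0..31 byte tail count. */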
	addq $0x20, %rdx
	jmp .Lhandle_tail

.Lcopy_backward:
	/*
	 * Calculate the copy position from the tail: point %rsi and %rdi
	 * one past the last byte.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
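	/* The count is biased by one block here too, as in the forward path. */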
	/*
	 * At most 3 ALU operations execute in one cycle, so append NOPs
	 * within the same 16-byte chunk to keep the loop aligned.
	 */
	.p2align 4
.Lcopy_backward_loop:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r8
	movq -2*8(%rsi), %r9
	movq -3*8(%rsi), %r10
	movq -4*8(%rsi), %r11
	leaq -4*8(%rsi), %rsi
	movq %r8, -1*8(%rdi)
	movq %r9, -2*8(%rdi)
	movq %r10, -3*8(%rdi)
	movq %r11, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae .Lcopy_backward_loop

	/*
	 * Undo the bias and move %rsi and %rdi back to the head of the
	 * buffers; the remaining 0..31 byte tail is handled below.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
.Lhandle_tail:
	cmpq $16, %rdx
	jb .Lless_16bytes

	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
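	/*
	 * The leading and trailing 16 bytes may overlap when the count is
	 * below 32; all loads are done before any store, so the overlap
	 * is harmless.  The 8..15 and 4..7 byte cases below use the same
	 * trick with 8-byte and 4-byte halves.
	 */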
	movq 0*8(%rsi), %r8
	movq 1*8(%rsi), %r9
	movq -2*8(%rsi, %rdx), %r10
	movq -1*8(%rsi, %rdx), %r11
	movq %r8, 0*8(%rdi)
	movq %r9, 1*8(%rdi)
	movq %r10, -2*8(%rdi, %rdx)
	movq %r11, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_16bytes:
	cmpq $8, %rdx
	jb .Lless_8bytes
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r8
	movq -1*8(%rsi, %rdx), %r9
	movq %r8, 0*8(%rdi)
	movq %r9, -1*8(%rdi, %rdx)
	retq
	.p2align 4
.Lless_8bytes:
	cmpq $4, %rdx
	jb .Lless_3bytes

	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %ecx
	movl -4(%rsi, %rdx), %r8d
	movl %ecx, (%rdi)
	movl %r8d, -4(%rdi, %rdx)
	retq
	.p2align 4
.Lless_3bytes:
	cmpl $0, %edx
	je .Lend
	/*
	 * Move data from 1 byte to 3 bytes.
	 */
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %edx
	jnz .Lloop_1

.Lend:
	retq
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

/*
 * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature.
 * If the feature is supported, memcpy_c_e() is the first choice.
 * If enhanced REP MOVSB copy is not available, use the fast string
 * copy memcpy_c() when possible: it is faster and its code is simpler
 * than the original memcpy().
 * Otherwise, the original memcpy() is used.
 * In the .altinstructions section, the ERMS entry is placed after the
 * REP_GOOD entry to get the right patch order.
 *
 * Only the beginning of memcpy() is replaced: memcpy() itself is used
 * while alternatives are applied, so overwriting the rest of it with
 * NOPs would be fatal - a reboot would be the only outcome.
 */
	.section .altinstructions, "a"
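	/*
	 * Each altinstruction_entry records the instruction to patch, its
	 * replacement, the required CPU feature, and the two length
	 * fields; both lengths here are the size of the replacement.
	 */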
	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
	.previous