arch/arm/crypto/ghash-ce-core.S
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	T2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	.text
	.fpu		crypto-neon-fp-armv8

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
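	/*
	 * For reference, the operation emulated below is a plain
	 * 64x64 -> 128 bit carry-less (polynomial) multiplication. A rough
	 * C sketch of that operation, for illustration only (clmul64() is
	 * not a kernel helper, just a name used here):
	 *
	 *	#include <linux/types.h>
	 *
	 *	static void clmul64(u64 a, u64 b, u64 r[2])
	 *	{
	 *		int i;
	 *
	 *		r[0] = r[1] = 0;
	 *		for (i = 0; i < 64; i++) {
	 *			if ((b >> i) & 1) {
	 *				r[0] ^= a << i;		// low 64 bits
	 *				if (i)
	 *					r[1] ^= a >> (64 - i); // high 64 bits
	 *			}
	 *		}
	 *	}
	 *
	 * The macro below produces the same 128-bit product using eight
	 * vmull.p8 (8x8 -> 16) instructions plus shifts, masks and XORs,
	 * as described in the paper referenced above.
	 */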
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
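	// GHASH works in GF(2^128) defined by the polynomial
	// g(x) = x^128 + x^7 + x^2 + x + 1, so a 256-bit product
	// c1*x^128 + c0 folds back as c0 + c1*(x^7 + x^2 + x + 1). In the
	// bit order used by GHASH this fold amounts to two carry-less
	// multiplications by MASK, i.e. 0xe1 shifted left by 57
	// (0xc2 << 56), which is set up in pmull_ghash_update_p64 below.
	//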
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
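	// Same fold as above, but the multiplications by the reduction
	// constant are replaced with shift-and-XOR sequences: left shifts
	// by 57, 62 and 63, and (effectively) right shifts by 1, 2 and 7
	// for the bits that cross the 64-bit halves.
	//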
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

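	//
	// Per-block GHASH update: for each 16-byte block X the digest
	// becomes dg = (dg ^ X) * H in GF(2^128). The 128x128 bit
	// multiplication is done Karatsuba-style with three 64x64
	// multiplications,
	//
	//	a1*b1, a0*b0 and (a1 + a0)*(b1 + b0),
	//
	// where the middle product yields the cross terms a1*b0 + a0*b1
	// once a1*b1 and a0*b0 have been XORed back into it.
	//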
	.macro		ghash_update, pn
	vld1.64		{XL}, [r1]

	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		1f

0:	vld1.64		{T1}, [r2]!
	subs		r0, r0, #1

1:	/* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	T1, T1
#endif
	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b

	vst1.64		{XL}, [r1]
	bx		lr
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
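	/*
	 * Per AAPCS, blocks/dg/src/k arrive in r0-r3; the fifth argument,
	 * head, is passed on the stack and is loaded from [sp] at the top
	 * of the ghash_update macro.
	 */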
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p64, SHASH_L, SHASH_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

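	/*
	 * Precompute byte-rotated copies of the two key halves (the B1..B4
	 * operands used by __pmull_p8) and the 16/32/48-bit masks, so the
	 * per-block multiplications by the key do not have to regenerate
	 * them.
	 */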
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
ENDPROC(pmull_ghash_update_p8)