crypto: x86/poly1305 - fix overflow during partial reduction
author    Eric Biggers <ebiggers@google.com>
Sun, 31 Mar 2019 20:04:11 +0000 (13:04 -0700)
committer Kleber Sacilotto de Souza <kleber.souza@canonical.com>
Wed, 14 Aug 2019 09:18:49 +0000 (11:18 +0200)
BugLink: https://bugs.launchpad.net/bugs/1838349
commit 678cce4019d746da6c680c48ba9e6d417803e127 upstream.

The x86_64 implementation of Poly1305 produces the wrong result on some
inputs because poly1305_4block_avx2() incorrectly assumes that when
partially reducing the accumulator, the bits carried from limb 'd4' to
limb 'h0' fit in a 32-bit integer.  This is true for poly1305-generic
which processes only one block at a time.  However, it's not true for
the AVX2 implementation, which processes 4 blocks at a time and
therefore can produce intermediate limbs about 4x larger.
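
A rough, back-of-the-envelope illustration (approximate bounds, not taken
from the patch): with 26-bit limbs, a single block yields intermediate
limbs d0..d4 on the order of 2^55, so the carry (d4 >> 26) * 5 stays
somewhat below 2^32 and fits in a 32-bit register.  Folding 4 blocks
together makes d4 about 4x larger, on the order of 2^57, which pushes
(d4 >> 26) * 5 to roughly 2^33 and overflows 32 bits.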

Fix it by making the relevant calculations use 64-bit arithmetic rather
than 32-bit.  Note that most of the carries already used 64-bit
arithmetic, but the d4 -> h0 carry was different for some reason.
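
As a minimal standalone sketch (plain C, not the kernel code; the value
of d4 below is just an illustrative oversized limb), the register-width
change for the d4 -> h0 carry amounts to:

  #include <stdio.h>
  #include <stdint.h>
  #include <inttypes.h>

  int main(void)
  {
          /* Made-up d4 of the size the 4-block path can produce. */
          uint64_t d4 = 0x0123456789abcdefULL;

          /* Old: lea (%eax,%eax,4),%eax / add %eax,%ebx:
           * the multiply by 5 happens in 32 bits and wraps. */
          uint32_t carry32 = (uint32_t)(d4 >> 26) * 5;

          /* New: lea (%rax,%rax,4),%rax / add %rax,%rbx:
           * the full 64-bit carry survives. */
          uint64_t carry64 = (d4 >> 26) * 5;

          printf("32-bit carry: 0x%08" PRIx32 "\n", carry32); /* 0x6c16c16a  */
          printf("64-bit carry: 0x%" PRIx64 "\n", carry64);   /* 0x16c16c16a */
          return 0;
  }

For this d4 the two results differ by exactly 2^32, i.e. the part the old
code silently dropped on its way into h0.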

To be safe I also made the same change to the corresponding SSE2 code,
though that only operates on 1 or 2 blocks at a time.  I don't think
it's really needed for poly1305_block_sse2(), but it doesn't hurt
because it's already x86_64 code.  It *might* be needed for
poly1305_2block_sse2(), but overflows aren't easy to reproduce there.

This bug was originally detected by my patches that improve testmgr to
fuzz algorithms against their generic implementation.  Also add a test
vector which reproduces it directly (in the AVX2 case).

Fixes: b1ccc8f4b631 ("crypto: poly1305 - Add a four block AVX2 variant for x86_64")
Fixes: c70f4abef07a ("crypto: poly1305 - Add a SSE2 SIMD variant for x86_64")
Cc: <stable@vger.kernel.org> # v4.3+
Cc: Martin Willi <martin@strongswan.org>
Cc: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Kamal Mostafa <kamal@canonical.com>
Signed-off-by: Khalid Elmously <khalid.elmously@canonical.com>
arch/x86/crypto/poly1305-avx2-x86_64.S
arch/x86/crypto/poly1305-sse2-x86_64.S
crypto/testmgr.h

diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S
index 3b6e70d085da89775317c8e2a560625ab4799e01..8457cdd47f751167a2321ebf063eb18bdb4ef8aa 100644
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ b/arch/x86/crypto/poly1305-avx2-x86_64.S
@@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2)
        vpaddq          t2,t1,t1
        vmovq           t1x,d4
 
+       # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+       # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+       # amount.  Careful: we must not assume the carry bits 'd0 >> 26',
+       # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+       # integers.  It's true in a single-block implementation, but not here.
+
        # d1 += d0 >> 26
        mov             d0,%rax
        shr             $26,%rax
@@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2)
        # h0 += (d4 >> 26) * 5
        mov             d4,%rax
        shr             $26,%rax
-       lea             (%eax,%eax,4),%eax
-       add             %eax,%ebx
+       lea             (%rax,%rax,4),%rax
+       add             %rax,%rbx
        # h4 = d4 & 0x3ffffff
        mov             d4,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h4
 
        # h1 += h0 >> 26
-       mov             %ebx,%eax
-       shr             $26,%eax
+       mov             %rbx,%rax
+       shr             $26,%rax
        add             %eax,h1
        # h0 = h0 & 0x3ffffff
        andl            $0x3ffffff,%ebx
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
index c88c670cb5fc6d4b6331ba18882fae34038400b4..5851c7418fb73241cd0f346a2d26f008865c5715 100644
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2)
        # h0 += (d4 >> 26) * 5
        mov             d4,%rax
        shr             $26,%rax
-       lea             (%eax,%eax,4),%eax
-       add             %eax,%ebx
+       lea             (%rax,%rax,4),%rax
+       add             %rax,%rbx
        # h4 = d4 & 0x3ffffff
        mov             d4,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h4
 
        # h1 += h0 >> 26
-       mov             %ebx,%eax
-       shr             $26,%eax
+       mov             %rbx,%rax
+       shr             $26,%rax
        add             %eax,h1
        # h0 = h0 & 0x3ffffff
        andl            $0x3ffffff,%ebx
@@ -520,6 +520,12 @@ ENTRY(poly1305_2block_sse2)
        paddq           t2,t1
        movq            t1,d4
 
+       # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
+       # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
+       # amount.  Careful: we must not assume the carry bits 'd0 >> 26',
+       # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
+       # integers.  It's true in a single-block implementation, but not here.
+
        # d1 += d0 >> 26
        mov             d0,%rax
        shr             $26,%rax
@@ -558,16 +564,16 @@ ENTRY(poly1305_2block_sse2)
        # h0 += (d4 >> 26) * 5
        mov             d4,%rax
        shr             $26,%rax
-       lea             (%eax,%eax,4),%eax
-       add             %eax,%ebx
+       lea             (%rax,%rax,4),%rax
+       add             %rax,%rbx
        # h4 = d4 & 0x3ffffff
        mov             d4,%rax
        and             $0x3ffffff,%eax
        mov             %eax,h4
 
        # h1 += h0 >> 26
-       mov             %ebx,%eax
-       shr             $26,%eax
+       mov             %rbx,%rax
+       shr             $26,%rax
        add             %eax,h1
        # h0 = h0 & 0x3ffffff
        andl            $0x3ffffff,%ebx
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 033de45257fc733950e2a501bbf701776fac211f..85e6ad1186de9c0f50db86b1a3c6039f5cddb3e6 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -4727,7 +4727,49 @@ static const struct hash_testvec poly1305_tv_template[] = {
                .psize          = 80,
                .digest         = "\x13\x00\x00\x00\x00\x00\x00\x00"
                                  "\x00\x00\x00\x00\x00\x00\x00\x00",
-       },
+       }, { /* Regression test for overflow in AVX2 implementation */
+               .plaintext      = "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff\xff\xff\xff\xff"
+                                 "\xff\xff\xff\xff",
+               .psize          = 300,
+               .digest         = "\xfb\x5e\x96\xd8\x61\xd5\xc7\xc8"
+                                 "\x78\xe5\x87\xcc\x2d\x5a\x22\xe1",
+       }
 };
 
 /*