[mirror_ubuntu-bionic-kernel.git] / crypto / gf128mul.c

/* gf128mul.c - GF(2^128) multiplication functions
 *
 * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
 * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
 *
 * Based on Dr Brian Gladman's (GPL'd) work published at
 * http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
 * See the original copyright notice below.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

/*
 ---------------------------------------------------------------------------
 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.

 LICENSE TERMS

 The free distribution and use of this software in both source and binary
 form is allowed (with or without changes) provided that:

   1. distributions of this source code include the above copyright
      notice, this list of conditions and the following disclaimer;

   2. distributions in binary form include the above copyright
      notice, this list of conditions and the following disclaimer
      in the documentation and/or other associated materials;

   3. the copyright holder's name is not used to endorse products
      built using this software without specific written permission.

 ALTERNATIVELY, provided that this notice is retained in full, this product
 may be distributed under the terms of the GNU General Public License (GPL),
 in which case the provisions of the GPL apply INSTEAD OF those given above.

 DISCLAIMER

 This software is provided 'as is' with no explicit or implied warranties
 in respect of its properties, including, but not limited to, correctness
 and/or fitness for purpose.
 ---------------------------------------------------------------------------
 Issue 31/01/2006

 This file provides fast multiplication in GF(2^128) as required by several
 cryptographic authentication modes
*/

#include <crypto/gf128mul.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>

/*
 * Expand to a brace-enclosed initializer with 256 entries: q(0x00) through
 * q(0xff) in ascending order, where q is a function-like macro that is
 * applied to every possible byte value.
 */
#define gf128mul_dat(q) { \
	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
}

/*
 * Given a value i in 0..255 as the byte overflow when a field element
 * in GF(2^128) is multiplied by x^8, the following macro returns the
 * 16-bit value that must be XOR-ed into the low-degree end of the
 * product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1.
 *
 * There are two versions of the macro, and hence two tables: one for
 * the "be" convention where the highest-order bit is the coefficient of
 * the highest-degree polynomial term, and one for the "le" convention
 * where the highest-order bit is the coefficient of the lowest-degree
 * polynomial term.  In both cases the values are stored in CPU byte
 * endianness such that the coefficients are ordered consistently across
 * bytes, i.e. in the "be" table bits 15..0 of the stored value
 * correspond to the coefficients of x^15..x^0, and in the "le" table
 * bits 15..0 correspond to the coefficients of x^0..x^15.
 *
 * Therefore, provided that the appropriate byte endianness conversions
 * are done by the multiplication functions (and these must be in place
 * anyway to support both little endian and big endian CPUs), the "be"
 * table can be used for multiplications of both "bbe" and "ble"
 * elements, and the "le" table can be used for multiplications of both
 * "lle" and "lbe" elements.
 */

/*
 * "be" reduction value for overflow byte i: every set bit of i contributes
 * 0x0087 shifted left by that bit's position (bit 0 -> 0x0087, bit 7 ->
 * 0x4380), and the contributions are XOR-ed together.
 *
 * The argument is parenthesized at each use so that an arbitrary integer
 * expression (e.g. "a | b") is handled correctly.  Note that i is evaluated
 * eight times, so it must not have side effects.
 */
#define xda_be(i) ( \
	((i) & 0x80 ? 0x4380 : 0) ^ ((i) & 0x40 ? 0x21c0 : 0) ^ \
	((i) & 0x20 ? 0x10e0 : 0) ^ ((i) & 0x10 ? 0x0870 : 0) ^ \
	((i) & 0x08 ? 0x0438 : 0) ^ ((i) & 0x04 ? 0x021c : 0) ^ \
	((i) & 0x02 ? 0x010e : 0) ^ ((i) & 0x01 ? 0x0087 : 0) \
)

/*
 * "le" reduction value for overflow byte i: every set bit of i contributes
 * 0xe100 shifted right by (7 - bit position) (bit 7 -> 0xe100, bit 0 ->
 * 0x01c2), and the contributions are XOR-ed together.
 *
 * The argument is parenthesized at each use so that an arbitrary integer
 * expression (e.g. "a | b") is handled correctly.  Note that i is evaluated
 * eight times, so it must not have side effects.
 */
#define xda_le(i) ( \
	((i) & 0x80 ? 0xe100 : 0) ^ ((i) & 0x40 ? 0x7080 : 0) ^ \
	((i) & 0x20 ? 0x3840 : 0) ^ ((i) & 0x10 ? 0x1c20 : 0) ^ \
	((i) & 0x08 ? 0x0e10 : 0) ^ ((i) & 0x04 ? 0x0708 : 0) ^ \
	((i) & 0x02 ? 0x0384 : 0) ^ ((i) & 0x01 ? 0x01c2 : 0) \
)

/*
 * Reduction lookup tables, indexed by the overflow byte: entry i holds
 * xda_le(i) and xda_be(i) respectively.
 */
static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);

/*
 * The following functions multiply a field element by x or by x^8 in
 * the polynomial field representation.  They use 64-bit word operations
 * to gain speed but compensate for machine endianness and hence work
 * correctly on both styles of machine.
 */

/*
 * Multiply *x by x ("lle" variant) and store the result in *r.
 *
 * The 128-bit value, read as two big-endian 64-bit words, is shifted right
 * by one bit.  The bit that falls off the low word is moved to position 7
 * to index the "le" table (yielding entry 0x00 or 0x80), and the looked-up
 * value is XOR-ed into the top 16 bits of the high word.
 *
 * Both input words are read before anything is written to *r.
 */
static void gf128mul_x_lle(be128 *r, const be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	const u64 reduction = gf128mul_table_le[(lo << 7) & 0xff];

	r->a = cpu_to_be64((reduction << 48) ^ (hi >> 1));
	r->b = cpu_to_be64((hi << 63) | (lo >> 1));
}

/*
 * Multiply *x by x ("bbe" variant) and store the result in *r.
 *
 * The 128-bit value, read as two big-endian 64-bit words, is shifted left
 * by one bit.  The bit shifted out of the high word selects entry 0 or 1 of
 * the "be" table, which is XOR-ed into the low word.
 *
 * Both input words are read before anything is written to *r.
 */
static void gf128mul_x_bbe(be128 *r, const be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	const u64 reduction = gf128mul_table_be[hi >> 63];

	r->b = cpu_to_be64(reduction ^ (lo << 1));
	r->a = cpu_to_be64((lo >> 63) | (hi << 1));
}

/*
 * Multiply *x by x ("ble" variant) and store the result in *r.
 *
 * Here the two 64-bit words are interpreted as little-endian, with x->a as
 * the low word and x->b as the high word.  The value is shifted left by one
 * bit; the bit shifted out of the high word selects entry 0 or 1 of the
 * "be" table, which is XOR-ed into the low word.
 *
 * Both input words are read before anything is written to *r.
 */
void gf128mul_x_ble(be128 *r, const be128 *x)
{
	const u64 lo = le64_to_cpu(x->a);
	const u64 hi = le64_to_cpu(x->b);
	const u64 reduction = gf128mul_table_be[hi >> 63];

	r->b = cpu_to_le64((lo >> 63) | (hi << 1));
	r->a = cpu_to_le64(reduction ^ (lo << 1));
}
EXPORT_SYMBOL(gf128mul_x_ble);

/*
 * Multiply *x by x^8 in place ("lle" variant).
 *
 * The value, read as two big-endian 64-bit words, is shifted right by one
 * byte.  The byte that falls off the low word indexes the "le" table, and
 * the looked-up 16-bit value is XOR-ed into the top 16 bits of the high
 * word.
 */
static void gf128mul_x8_lle(be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	const u64 reduction = gf128mul_table_le[lo & 0xff];

	x->a = cpu_to_be64((reduction << 48) ^ (hi >> 8));
	x->b = cpu_to_be64((hi << 56) | (lo >> 8));
}

/*
 * Multiply *x by x^8 in place ("bbe" variant).
 *
 * The value, read as two big-endian 64-bit words, is shifted left by one
 * byte.  The byte shifted out of the high word indexes the "be" table, and
 * the looked-up 16-bit value is XOR-ed into the low word.
 */
static void gf128mul_x8_bbe(be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	const u64 reduction = gf128mul_table_be[hi >> 56];

	x->b = cpu_to_be64(reduction ^ (lo << 8));
	x->a = cpu_to_be64((lo >> 56) | (hi << 8));
}

/*
 * Generic multiplication of two field elements in the "lle" convention.
 * On entry *r holds one operand; on return it holds the product of that
 * operand and *b.
 *
 * p[k] is precomputed as the original *r multiplied by x^k (k = 0..7).
 * The multiplier is then consumed one byte at a time, from byte 15 down to
 * byte 0: for every set bit the matching p[k] is XOR-ed into the
 * accumulator, and between bytes the accumulator is multiplied by x^8.
 *
 * The multiplier is copied before *r is cleared, so r and b may point to
 * the same element (in-place squaring); without the copy such a call would
 * read back the zeroed accumulator and always produce 0.
 */
void gf128mul_lle(be128 *r, const be128 *b)
{
	be128 p[8];
	const be128 multiplier = *b;
	const u8 *bytes = (const u8 *)&multiplier;
	int i, k;

	p[0] = *r;
	for (i = 0; i < 7; ++i)
		gf128mul_x_lle(&p[i + 1], &p[i]);

	memset(r, 0, sizeof(*r));
	for (i = 15; i >= 0; --i) {
		const u8 ch = bytes[i];

		/* Bit (0x80 >> k) of the byte selects p[k]. */
		for (k = 0; k < 8; ++k) {
			if (ch & (0x80 >> k))
				be128_xor(r, r, &p[k]);
		}

		/* Make room for the next byte, except after the last one. */
		if (i > 0)
			gf128mul_x8_lle(r);
	}
}
EXPORT_SYMBOL(gf128mul_lle);

/*
 * Generic multiplication of two field elements in the "bbe" convention.
 * On entry *r holds one operand; on return it holds the product of that
 * operand and *b.
 *
 * p[k] is precomputed as the original *r multiplied by x^k (k = 0..7).
 * The multiplier is then consumed one byte at a time, from byte 0 up to
 * byte 15: before every byte but the first the accumulator is multiplied
 * by x^8, and for every set bit the matching p[k] is XOR-ed in.
 *
 * The multiplier is copied before *r is cleared, so r and b may point to
 * the same element (in-place squaring); without the copy such a call would
 * read back the zeroed accumulator and always produce 0.
 */
void gf128mul_bbe(be128 *r, const be128 *b)
{
	be128 p[8];
	const be128 multiplier = *b;
	const u8 *bytes = (const u8 *)&multiplier;
	int i, k;

	p[0] = *r;
	for (i = 0; i < 7; ++i)
		gf128mul_x_bbe(&p[i + 1], &p[i]);

	memset(r, 0, sizeof(*r));
	for (i = 0; i < 16; ++i) {
		const u8 ch = bytes[i];

		/* Shift what has been accumulated from the earlier bytes. */
		if (i > 0)
			gf128mul_x8_bbe(r);

		/* Bit (1 << k) of the byte selects p[k]. */
		for (k = 0; k < 8; ++k) {
			if (ch & (1 << k))
				be128_xor(r, r, &p[k]);
		}
	}
}
EXPORT_SYMBOL(gf128mul_bbe);

/*      This version uses 64k bytes of table space.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(2^128).  If we consider a GF(2^128) value in
    the buffer's lowest byte, we can construct a table of
    the 256 16 byte values that result from the 256 values
    of this byte.  This requires 4096 bytes. But we also
    need tables for each of the 15 higher bytes in the
    buffer as well, which makes 64 kbytes in total.
*/
/* additional explanation
 * t[0][BYTE] contains g*BYTE
 * t[1][BYTE] contains g*x^8*BYTE
 *  ..
 * t[15][BYTE] contains g*x^120*BYTE */
/*
 * Allocate and fill the sixteen 256-entry lookup tables used by
 * gf128mul_64k_bbe() for the element *g.
 *
 * Returns the new table set, or NULL if any allocation fails (anything
 * already allocated is released through gf128mul_free_64k()).
 *
 * Construction:
 *  - table 0: the power-of-two slots 1, 2, 4, ..., 128 are g, g*x, ...,
 *    g*x^7;
 *  - table n > 0: each power-of-two slot is the same slot of table n - 1
 *    multiplied by x^8;
 *  - in every table the remaining slots are the XOR of the power-of-two
 *    slot for their highest set bit and the slot for the remaining bits
 *    (slot 0 stays zero from kzalloc).
 */
struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g)
{
	struct gf128mul_64k *tbl;
	int n, bit, low;

	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
	if (!tbl)
		return NULL;

	for (n = 0; n < 16; n++) {
		tbl->t[n] = kzalloc(sizeof(*tbl->t[n]), GFP_KERNEL);
		if (!tbl->t[n]) {
			gf128mul_free_64k(tbl);
			return NULL;
		}
	}

	/* Seed the power-of-two slots of the first table. */
	tbl->t[0]->t[1] = *g;
	for (bit = 1; bit <= 64; bit <<= 1)
		gf128mul_x_bbe(&tbl->t[0]->t[bit + bit], &tbl->t[0]->t[bit]);

	for (n = 0; n < 16; n++) {
		if (n > 0) {
			/* Derive this table's seeds from the previous table. */
			for (bit = 128; bit > 0; bit >>= 1) {
				tbl->t[n]->t[bit] = tbl->t[n - 1]->t[bit];
				gf128mul_x8_bbe(&tbl->t[n]->t[bit]);
			}
		}

		/* Fill in every slot that is not a power of two. */
		for (bit = 2; bit < 256; bit += bit) {
			for (low = 1; low < bit; ++low)
				be128_xor(&tbl->t[n]->t[bit + low],
					  &tbl->t[n]->t[bit],
					  &tbl->t[n]->t[low]);
		}
	}

	return tbl;
}
EXPORT_SYMBOL(gf128mul_init_64k_bbe);

/*
 * Release a table set created by gf128mul_init_64k_bbe().
 *
 * Each of the 16 sub-tables and then the container itself is passed to
 * kzfree().  Sub-tables that were never allocated are still NULL (the
 * container comes from kzalloc) and are handed to kzfree() as they are,
 * which is what the allocation-failure path of the init function relies on.
 *
 * A NULL @t is accepted and ignored, matching the behaviour callers expect
 * from the kernel's own free routines.
 */
void gf128mul_free_64k(struct gf128mul_64k *t)
{
	int i;

	if (!t)
		return;

	for (i = 0; i < 16; i++)
		kzfree(t->t[i]);
	kzfree(t);
}
EXPORT_SYMBOL(gf128mul_free_64k);

/*
 * Multiply *a in place using the precomputed table set t.
 *
 * Byte 15 of *a indexes table 0, byte 14 indexes table 1, and so on down
 * to byte 0 indexing table 15; the sixteen looked-up entries are XOR-ed
 * together.  The result is built in a local accumulator and written back
 * to *a only at the end, because *a is still being read during the loop.
 */
void gf128mul_64k_bbe(be128 *a, struct gf128mul_64k *t)
{
	const u8 *bytes = (u8 *)a;
	be128 acc;
	int n;

	acc = t->t[0]->t[bytes[15]];
	for (n = 1; n < 16; ++n)
		be128_xor(&acc, &acc, &t->t[n]->t[bytes[15 - n]]);

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_64k_bbe);

/*      This version uses 4k bytes of table space.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(2^128).  If we consider a GF(2^128) value in a
    single byte, we can construct a table of the 256 16 byte
    values that result from the 256 values of this byte.
    This requires 4096 bytes. If we take the highest byte in
    the buffer and use this table to get the result, we then
    have to multiply by x^120 to get the final value. For the
    next highest byte the result has to be multiplied by x^112
    and so on. But we can do this by accumulating the result
    in an accumulator starting with the result for the top
    byte.  We repeatedly multiply the accumulator value by
    x^8 and then add in (i.e. xor) the 16 bytes of the next
    lower byte in the buffer, stopping when we reach the
    lowest byte. This requires a 4096 byte table.
*/
struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[128] = *g;
	for (j = 64; j > 0; j >>= 1)
		gf128mul_x_lle(&t->t[j], &t->t[j+j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_lle);

struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[1] = *g;
	for (j = 1; j <= 64; j <<= 1)
		gf128mul_x_bbe(&t->t[j + j], &t->t[j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_bbe);

/*
 * Multiply *a in place using the single 256-entry table t ("lle" variant).
 *
 * The accumulator starts as the table entry for byte 15 of *a.  For each
 * remaining byte, from 14 down to 0, the accumulator is multiplied by x^8
 * and the table entry for that byte is XOR-ed in.  The result is written
 * back to *a only at the end, because *a is still being read in the loop.
 */
void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t)
{
	const u8 *bytes = (u8 *)a;
	be128 acc;
	int n;

	acc = t->t[bytes[15]];
	for (n = 14; n >= 0; --n) {
		gf128mul_x8_lle(&acc);
		be128_xor(&acc, &acc, &t->t[bytes[n]]);
	}

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_4k_lle);

/*
 * Multiply *a in place using the single 256-entry table t ("bbe" variant).
 *
 * The accumulator starts as the table entry for byte 0 of *a.  For each
 * remaining byte, from 1 up to 15, the accumulator is multiplied by x^8
 * and the table entry for that byte is XOR-ed in.  The result is written
 * back to *a only at the end, because *a is still being read in the loop.
 */
void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t)
{
	const u8 *bytes = (u8 *)a;
	be128 acc;
	int n;

	acc = t->t[bytes[0]];
	for (n = 1; n < 16; ++n) {
		gf128mul_x8_bbe(&acc);
		be128_xor(&acc, &acc, &t->t[bytes[n]]);
	}

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_4k_bbe);

/* Module metadata: license tag and one-line description. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
Commit	Line	Data
c494e070 RS	1	/* gf128mul.c - GF(2^128) multiplication functions
	2	*
	3	* Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
	4	* Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
	5	*
	6	* Based on Dr Brian Gladman's (GPL'd) work published at
8c882f64	7	* http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
c494e070 RS	8	* See the original copyright notice below.
	9	*
	10	* This program is free software; you can redistribute it and/or modify it
	11	* under the terms of the GNU General Public License as published by the Free
	12	* Software Foundation; either version 2 of the License, or (at your option)
	13	* any later version.
	14	*/
	15
	16	/*
	17	---------------------------------------------------------------------------
	18	Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
	19
	20	LICENSE TERMS
	21
	22	The free distribution and use of this software in both source and binary
	23	form is allowed (with or without changes) provided that:
	24
	25	1. distributions of this source code include the above copyright
	26	notice, this list of conditions and the following disclaimer;
	27
	28	2. distributions in binary form include the above copyright
	29	notice, this list of conditions and the following disclaimer
	30	in the documentation and/or other associated materials;
	31
	32	3. the copyright holder's name is not used to endorse products
	33	built using this software without specific written permission.
	34
	35	ALTERNATIVELY, provided that this notice is retained in full, this product
	36	may be distributed under the terms of the GNU General Public License (GPL),
	37	in which case the provisions of the GPL apply INSTEAD OF those given above.
	38
	39	DISCLAIMER
	40
	41	This software is provided 'as is' with no explicit or implied warranties
	42	in respect of its properties, including, but not limited to, correctness
	43	and/or fitness for purpose.
	44	---------------------------------------------------------------------------
	45	Issue 31/01/2006
	46
63be5b53	47	This file provides fast multiplication in GF(2^128) as required by several
c494e070 RS	48	cryptographic authentication modes
	49	*/
	50
	51	#include <crypto/gf128mul.h>
	52	#include <linux/kernel.h>
	53	#include <linux/module.h>
	54	#include <linux/slab.h>
	55
	56	#define gf128mul_dat(q) { \
	57	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
	58	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
	59	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
	60	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
	61	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
	62	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
	63	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
	64	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
	65	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
	66	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
	67	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
	68	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
	69	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
	70	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
	71	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
	72	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
	73	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
	74	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
	75	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
	76	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
	77	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
	78	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
	79	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
	80	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
	81	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
	82	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
	83	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
	84	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
	85	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
	86	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
	87	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
	88	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
	89	}
	90
f33fd647 EB	91	/*
	92	* Given a value i in 0..255 as the byte overflow when a field element
	93	* in GF(2^128) is multiplied by x^8, the following macro returns the
	94	* 16-bit value that must be XOR-ed into the low-degree end of the
	95	* product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1.
	96	*
	97	* There are two versions of the macro, and hence two tables: one for
	98	* the "be" convention where the highest-order bit is the coefficient of
	99	* the highest-degree polynomial term, and one for the "le" convention
	100	* where the highest-order bit is the coefficient of the lowest-degree
	101	* polynomial term. In both cases the values are stored in CPU byte
	102	* endianness such that the coefficients are ordered consistently across
	103	* bytes, i.e. in the "be" table bits 15..0 of the stored value
	104	* correspond to the coefficients of x^15..x^0, and in the "le" table
	105	* bits 15..0 correspond to the coefficients of x^0..x^15.
	106	*
	107	* Therefore, provided that the appropriate byte endianness conversions
	108	* are done by the multiplication functions (and these must be in place
	109	* anyway to support both little endian and big endian CPUs), the "be"
	110	* table can be used for multiplications of both "bbe" and "ble"
	111	* elements, and the "le" table can be used for multiplications of both
	112	* "lle" and "lbe" elements.
	113	*/
c494e070	114
f33fd647	115	#define xda_be(i) ( \
2416e4fa EB	116	(i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \
	117	(i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \
	118	(i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \
	119	(i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \
c494e070 RS	120	)
c494e070 RS	121
f33fd647	122	#define xda_le(i) ( \
2416e4fa EB	123	(i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \
	124	(i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \
	125	(i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \
	126	(i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \
c494e070 RS	127	)
c494e070 RS	128
f33fd647 EB	129	static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
f33fd647 EB	130	static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
c494e070	131
63be5b53 EB	132	/*
	133	* The following functions multiply a field element by x or by x^8 in
	134	* the polynomial field representation. They use 64-bit word operations
	135	* to gain speed but compensate for machine endianness and hence work
c494e070 RS	136	* correctly on both styles of machine.
	137	*/
	138
	139	static void gf128mul_x_lle(be128 r, const be128 x)
	140	{
	141	u64 a = be64_to_cpu(x->a);
	142	u64 b = be64_to_cpu(x->b);
f33fd647	143	u64 _tt = gf128mul_table_le[(b << 7) & 0xff];
c494e070 RS	144
	145	r->b = cpu_to_be64((b >> 1) \| (a << 63));
	146	r->a = cpu_to_be64((a >> 1) ^ (_tt << 48));
	147	}
	148
	149	static void gf128mul_x_bbe(be128 r, const be128 x)
	150	{
	151	u64 a = be64_to_cpu(x->a);
	152	u64 b = be64_to_cpu(x->b);
f33fd647	153	u64 _tt = gf128mul_table_be[a >> 63];
c494e070 RS	154
	155	r->a = cpu_to_be64((a << 1) \| (b >> 63));
	156	r->b = cpu_to_be64((b << 1) ^ _tt);
	157	}
	158
f19f5111 RS	159	void gf128mul_x_ble(be128 r, const be128 x)
	160	{
	161	u64 a = le64_to_cpu(x->a);
	162	u64 b = le64_to_cpu(x->b);
f33fd647	163	u64 _tt = gf128mul_table_be[b >> 63];
f19f5111 RS	164
	165	r->a = cpu_to_le64((a << 1) ^ _tt);
	166	r->b = cpu_to_le64((b << 1) \| (a >> 63));
	167	}
	168	EXPORT_SYMBOL(gf128mul_x_ble);
	169
c494e070 RS	170	static void gf128mul_x8_lle(be128 *x)
	171	{
	172	u64 a = be64_to_cpu(x->a);
	173	u64 b = be64_to_cpu(x->b);
f33fd647	174	u64 _tt = gf128mul_table_le[b & 0xff];
c494e070 RS	175
	176	x->b = cpu_to_be64((b >> 8) \| (a << 56));
	177	x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
	178	}
	179
	180	static void gf128mul_x8_bbe(be128 *x)
	181	{
	182	u64 a = be64_to_cpu(x->a);
	183	u64 b = be64_to_cpu(x->b);
f33fd647	184	u64 _tt = gf128mul_table_be[a >> 56];
c494e070 RS	185
	186	x->a = cpu_to_be64((a << 8) \| (b >> 56));
	187	x->b = cpu_to_be64((b << 8) ^ _tt);
	188	}
	189
	190	void gf128mul_lle(be128 r, const be128 b)
	191	{
	192	be128 p[8];
	193	int i;
	194
	195	p[0] = *r;
	196	for (i = 0; i < 7; ++i)
	197	gf128mul_x_lle(&p[i + 1], &p[i]);
	198
62542663	199	memset(r, 0, sizeof(*r));
c494e070 RS	200	for (i = 0;;) {
	201	u8 ch = ((u8 *)b)[15 - i];
	202
	203	if (ch & 0x80)
	204	be128_xor(r, r, &p[0]);
	205	if (ch & 0x40)
	206	be128_xor(r, r, &p[1]);
	207	if (ch & 0x20)
	208	be128_xor(r, r, &p[2]);
	209	if (ch & 0x10)
	210	be128_xor(r, r, &p[3]);
	211	if (ch & 0x08)
	212	be128_xor(r, r, &p[4]);
	213	if (ch & 0x04)
	214	be128_xor(r, r, &p[5]);
	215	if (ch & 0x02)
	216	be128_xor(r, r, &p[6]);
	217	if (ch & 0x01)
	218	be128_xor(r, r, &p[7]);
	219
	220	if (++i >= 16)
	221	break;
	222
	223	gf128mul_x8_lle(r);
	224	}
	225	}
	226	EXPORT_SYMBOL(gf128mul_lle);
	227
	228	void gf128mul_bbe(be128 r, const be128 b)
	229	{
	230	be128 p[8];
	231	int i;
	232
	233	p[0] = *r;
	234	for (i = 0; i < 7; ++i)
	235	gf128mul_x_bbe(&p[i + 1], &p[i]);
	236
62542663	237	memset(r, 0, sizeof(*r));
c494e070 RS	238	for (i = 0;;) {
	239	u8 ch = ((u8 *)b)[i];
	240
	241	if (ch & 0x80)
	242	be128_xor(r, r, &p[7]);
	243	if (ch & 0x40)
	244	be128_xor(r, r, &p[6]);
	245	if (ch & 0x20)
	246	be128_xor(r, r, &p[5]);
	247	if (ch & 0x10)
	248	be128_xor(r, r, &p[4]);
	249	if (ch & 0x08)
	250	be128_xor(r, r, &p[3]);
	251	if (ch & 0x04)
	252	be128_xor(r, r, &p[2]);
	253	if (ch & 0x02)
	254	be128_xor(r, r, &p[1]);
	255	if (ch & 0x01)
	256	be128_xor(r, r, &p[0]);
	257
	258	if (++i >= 16)
	259	break;
	260
	261	gf128mul_x8_bbe(r);
	262	}
	263	}
	264	EXPORT_SYMBOL(gf128mul_bbe);
	265
	266	/* This version uses 64k bytes of table space.
	267	A 16 byte buffer has to be multiplied by a 16 byte key
63be5b53	268	value in GF(2^128). If we consider a GF(2^128) value in
c494e070 RS	269	the buffer's lowest byte, we can construct a table of
	270	the 256 16 byte values that result from the 256 values
	271	of this byte. This requires 4096 bytes. But we also
	272	need tables for each of the 16 higher bytes in the
	273	buffer as well, which makes 64 kbytes in total.
	274	*/
	275	/* additional explanation
	276	* t[0][BYTE] contains g*BYTE
	277	* t[1][BYTE] contains gx^8BYTE
	278	* ..
	279	* t[15][BYTE] contains gx^120BYTE */
c494e070 RS	280	struct gf128mul_64k gf128mul_init_64k_bbe(const be128 g)
	281	{
	282	struct gf128mul_64k *t;
	283	int i, j, k;
	284
	285	t = kzalloc(sizeof(*t), GFP_KERNEL);
	286	if (!t)
	287	goto out;
	288
	289	for (i = 0; i < 16; i++) {
	290	t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
	291	if (!t->t[i]) {
	292	gf128mul_free_64k(t);
	293	t = NULL;
	294	goto out;
	295	}
	296	}
	297
	298	t->t[0]->t[1] = *g;
	299	for (j = 1; j <= 64; j <<= 1)
	300	gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]);
	301
	302	for (i = 0;;) {
	303	for (j = 2; j < 256; j += j)
	304	for (k = 1; k < j; ++k)
	305	be128_xor(&t->t[i]->t[j + k],
	306	&t->t[i]->t[j], &t->t[i]->t[k]);
	307
	308	if (++i >= 16)
	309	break;
	310
	311	for (j = 128; j > 0; j >>= 1) {
	312	t->t[i]->t[j] = t->t[i - 1]->t[j];
	313	gf128mul_x8_bbe(&t->t[i]->t[j]);
	314	}
	315	}
	316
	317	out:
	318	return t;
	319	}
	320	EXPORT_SYMBOL(gf128mul_init_64k_bbe);
	321
	322	void gf128mul_free_64k(struct gf128mul_64k *t)
	323	{
	324	int i;
	325
	326	for (i = 0; i < 16; i++)
75aa0a7c AC	327	kzfree(t->t[i]);
75aa0a7c AC	328	kzfree(t);
c494e070 RS	329	}
	330	EXPORT_SYMBOL(gf128mul_free_64k);
	331
c494e070 RS	332	void gf128mul_64k_bbe(be128 a, struct gf128mul_64k t)
	333	{
	334	u8 ap = (u8 )a;
	335	be128 r[1];
	336	int i;
	337
	338	*r = t->t[0]->t[ap[15]];
	339	for (i = 1; i < 16; ++i)
	340	be128_xor(r, r, &t->t[i]->t[ap[15 - i]]);
	341	a = r;
	342	}
	343	EXPORT_SYMBOL(gf128mul_64k_bbe);
	344
	345	/* This version uses 4k bytes of table space.
	346	A 16 byte buffer has to be multiplied by a 16 byte key
63be5b53	347	value in GF(2^128). If we consider a GF(2^128) value in a
c494e070 RS	348	single byte, we can construct a table of the 256 16 byte
	349	values that result from the 256 values of this byte.
	350	This requires 4096 bytes. If we take the highest byte in
	351	the buffer and use this table to get the result, we then
	352	have to multiply by x^120 to get the final value. For the
	353	next highest byte the result has to be multiplied by x^112
	354	and so on. But we can do this by accumulating the result
	355	in an accumulator starting with the result for the top
	356	byte. We repeatedly multiply the accumulator value by
	357	x^8 and then add in (i.e. xor) the 16 bytes of the next
	358	lower byte in the buffer, stopping when we reach the
	359	lowest byte. This requires a 4096 byte table.
	360	*/
	361	struct gf128mul_4k gf128mul_init_4k_lle(const be128 g)
	362	{
	363	struct gf128mul_4k *t;
	364	int j, k;
	365
	366	t = kzalloc(sizeof(*t), GFP_KERNEL);
	367	if (!t)
	368	goto out;
	369
	370	t->t[128] = *g;
	371	for (j = 64; j > 0; j >>= 1)
	372	gf128mul_x_lle(&t->t[j], &t->t[j+j]);
	373
	374	for (j = 2; j < 256; j += j)
	375	for (k = 1; k < j; ++k)
	376	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	377
	378	out:
	379	return t;
	380	}
	381	EXPORT_SYMBOL(gf128mul_init_4k_lle);
	382
	383	struct gf128mul_4k gf128mul_init_4k_bbe(const be128 g)
	384	{
	385	struct gf128mul_4k *t;
	386	int j, k;
	387
	388	t = kzalloc(sizeof(*t), GFP_KERNEL);
	389	if (!t)
	390	goto out;
	391
	392	t->t[1] = *g;
	393	for (j = 1; j <= 64; j <<= 1)
	394	gf128mul_x_bbe(&t->t[j + j], &t->t[j]);
	395
	396	for (j = 2; j < 256; j += j)
	397	for (k = 1; k < j; ++k)
	398	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	399
	400	out:
	401	return t;
	402	}
	403	EXPORT_SYMBOL(gf128mul_init_4k_bbe);
	404
	405	void gf128mul_4k_lle(be128 a, struct gf128mul_4k t)
	406	{
	407	u8 ap = (u8 )a;
	408	be128 r[1];
	409	int i = 15;
	410
	411	*r = t->t[ap[15]];
412	while (i--) {
413	gf128mul_x8_lle(r);
414	be128_xor(r, r, &t->t[ap[i]]);
415	}
416	a = r;
417	}
418	EXPORT_SYMBOL(gf128mul_4k_lle);
419
420	void gf128mul_4k_bbe(be128 a, struct gf128mul_4k t)
421	{
422	u8 ap = (u8 )a;
423	be128 r[1];
424	int i = 0;
425
426	*r = t->t[ap[0]];
427	while (++i < 16) {
428	gf128mul_x8_bbe(r);
429	be128_xor(r, r, &t->t[ap[i]]);
430	}
431	a = r;
432	}
433	EXPORT_SYMBOL(gf128mul_4k_bbe);
434
435	MODULE_LICENSE("GPL");
436	MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");