[mirror_ubuntu-artful-kernel.git] / crypto / gf128mul.c

/* gf128mul.c - GF(2^128) multiplication functions
 *
 * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
 * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
 *
 * Based on Dr Brian Gladman's (GPL'd) work published at
 * http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
 * See the original copyright notice below.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

/*
 ---------------------------------------------------------------------------
 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.

 LICENSE TERMS

 The free distribution and use of this software in both source and binary
 form is allowed (with or without changes) provided that:

   1. distributions of this source code include the above copyright
      notice, this list of conditions and the following disclaimer;

   2. distributions in binary form include the above copyright
      notice, this list of conditions and the following disclaimer
      in the documentation and/or other associated materials;

   3. the copyright holder's name is not used to endorse products
      built using this software without specific written permission.

 ALTERNATIVELY, provided that this notice is retained in full, this product
 may be distributed under the terms of the GNU General Public License (GPL),
 in which case the provisions of the GPL apply INSTEAD OF those given above.

 DISCLAIMER

 This software is provided 'as is' with no explicit or implied warranties
 in respect of its properties, including, but not limited to, correctness
 and/or fitness for purpose.
 ---------------------------------------------------------------------------
 Issue 31/01/2006

 This file provides fast multiplication in GF(2^128) as required by several
 cryptographic authentication modes
*/

#include <crypto/gf128mul.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>

/*
 * gf128mul_dat(q) expands to a brace-enclosed initializer list with 256
 * entries: q(v) for every byte value v from 0x00 up to 0xff, in ascending
 * order.
 *
 * gf128mul_dat_row(q, hi) produces the 16 consecutive entries whose high
 * nibble is given by hi.  The value handed to q is wrapped in its own
 * parentheses, so q may safely embed it in a larger expression.
 */
#define gf128mul_dat_row(q, hi) \
	q(((hi) | 0x0)), q(((hi) | 0x1)), q(((hi) | 0x2)), q(((hi) | 0x3)), \
	q(((hi) | 0x4)), q(((hi) | 0x5)), q(((hi) | 0x6)), q(((hi) | 0x7)), \
	q(((hi) | 0x8)), q(((hi) | 0x9)), q(((hi) | 0xa)), q(((hi) | 0xb)), \
	q(((hi) | 0xc)), q(((hi) | 0xd)), q(((hi) | 0xe)), q(((hi) | 0xf))

#define gf128mul_dat(q) { \
	gf128mul_dat_row(q, 0x00), gf128mul_dat_row(q, 0x10), \
	gf128mul_dat_row(q, 0x20), gf128mul_dat_row(q, 0x30), \
	gf128mul_dat_row(q, 0x40), gf128mul_dat_row(q, 0x50), \
	gf128mul_dat_row(q, 0x60), gf128mul_dat_row(q, 0x70), \
	gf128mul_dat_row(q, 0x80), gf128mul_dat_row(q, 0x90), \
	gf128mul_dat_row(q, 0xa0), gf128mul_dat_row(q, 0xb0), \
	gf128mul_dat_row(q, 0xc0), gf128mul_dat_row(q, 0xd0), \
	gf128mul_dat_row(q, 0xe0), gf128mul_dat_row(q, 0xf0) \
}

/*
 * Given a value i in 0..255 as the byte overflow when a field element
 * in GF(2^128) is multiplied by x^8, the following macro returns the
 * 16-bit value that must be XOR-ed into the low-degree end of the
 * product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1.
 *
 * There are two versions of the macro, and hence two tables: one for
 * the "be" convention where the highest-order bit is the coefficient of
 * the highest-degree polynomial term, and one for the "le" convention
 * where the highest-order bit is the coefficient of the lowest-degree
 * polynomial term.  In both cases the values are stored in CPU byte
 * endianness such that the coefficients are ordered consistently across
 * bytes, i.e. in the "be" table bits 15..0 of the stored value
 * correspond to the coefficients of x^15..x^0, and in the "le" table
 * bits 15..0 correspond to the coefficients of x^0..x^15.
 *
 * Therefore, provided that the appropriate byte endianness conversions
 * are done by the multiplication functions (and these must be in place
 * anyway to support both little endian and big endian CPUs), the "be"
 * table can be used for multiplications of both "bbe" and "ble"
 * elements, and the "le" table can be used for multiplications of both
 * "lle" and "lbe" elements.
 */

/*
 * xda_be(i) - reduction word for byte value i, "be" bit convention.
 *
 * Yields the XOR of one 16-bit constant per set bit of i: bit 0
 * contributes 0x0087 (x^7 + x^2 + x + 1) and bit n contributes
 * 0x0087 << n, up to 0x4380 for bit 7.  The result is an integer
 * constant expression whenever i is one, so the macro can be used in
 * static initializers.
 *
 * The parameter is parenthesized at every use so that an expression
 * argument such as xda_be(a | b) is tested as a whole instead of being
 * regrouped by operator precedence.  It is expanded eight times, so the
 * argument must be free of side effects.
 */
#define xda_be(i) ( \
	((i) & 0x80 ? 0x4380 : 0) ^ ((i) & 0x40 ? 0x21c0 : 0) ^ \
	((i) & 0x20 ? 0x10e0 : 0) ^ ((i) & 0x10 ? 0x0870 : 0) ^ \
	((i) & 0x08 ? 0x0438 : 0) ^ ((i) & 0x04 ? 0x021c : 0) ^ \
	((i) & 0x02 ? 0x010e : 0) ^ ((i) & 0x01 ? 0x0087 : 0) \
)

/*
 * xda_le(i) - reduction word for byte value i, "le" bit convention.
 *
 * Yields the XOR of one 16-bit constant per set bit of i: bit 7
 * contributes 0xe100 and bit n contributes 0xe100 >> (7 - n), down to
 * 0x01c2 for bit 0.  The result is an integer constant expression
 * whenever i is one, so the macro can be used in static initializers.
 *
 * The parameter is parenthesized at every use so that an expression
 * argument such as xda_le(a | b) is tested as a whole instead of being
 * regrouped by operator precedence.  It is expanded eight times, so the
 * argument must be free of side effects.
 */
#define xda_le(i) ( \
	((i) & 0x80 ? 0xe100 : 0) ^ ((i) & 0x40 ? 0x7080 : 0) ^ \
	((i) & 0x20 ? 0x3840 : 0) ^ ((i) & 0x10 ? 0x1c20 : 0) ^ \
	((i) & 0x08 ? 0x0e10 : 0) ^ ((i) & 0x04 ? 0x0708 : 0) ^ \
	((i) & 0x02 ? 0x0384 : 0) ^ ((i) & 0x01 ? 0x01c2 : 0) \
)

/*
 * Precomputed reduction words, indexed by byte value: entry i of each
 * table holds xda_le(i) or xda_be(i) respectively.  They are consumed by
 * gf128mul_x8_lle() and gf128mul_x8_bbe().
 */
static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);

/*
 * The following functions multiply a field element by x^8 in
 * the polynomial field representation.  They use 64-bit word operations
 * to gain speed but compensate for machine endianness and hence work
 * correctly on both styles of machine.
 */

/*
 * Multiply *x by x^8 in place, "lle" convention.
 *
 * Both halves are converted to CPU-order 64-bit words.  The 128-bit value
 * is shifted right by eight bits (the low byte of x->a moves into the top
 * byte of x->b); the byte that drops off the low end of x->b selects an
 * entry of gf128mul_table_le, which is XOR-ed into the top 16 bits of
 * x->a.
 */
static void gf128mul_x8_lle(be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	const u64 fold = (u64)gf128mul_table_le[lo & 0xff] << 48;

	x->a = cpu_to_be64(fold ^ (hi >> 8));
	x->b = cpu_to_be64((hi << 56) | (lo >> 8));
}

/*
 * Multiply *x by x^8 in place, "bbe" convention.
 *
 * Both halves are converted to CPU-order 64-bit words.  The 128-bit value
 * is shifted left by eight bits (the top byte of x->b moves into the low
 * byte of x->a); the byte that drops off the top of x->a selects an entry
 * of gf128mul_table_be, which is XOR-ed into the low 16 bits of x->b.
 */
static void gf128mul_x8_bbe(be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	const u64 fold = gf128mul_table_be[hi >> 56];

	x->b = cpu_to_be64(fold ^ (lo << 8));
	x->a = cpu_to_be64((lo >> 56) | (hi << 8));
}

/*
 * gf128mul_lle - multiply *r by *b and store the product in *r ("lle"
 * convention).
 *
 * multiples[k] holds the original *r after k applications of
 * gf128mul_x_lle().  The bytes of *b are consumed from offset 15 down to
 * offset 0.  Before every byte except the first, the accumulator in *r is
 * advanced with gf128mul_x8_lle(); then, for each set bit (0x80 >> k) of
 * the byte, multiples[k] is XOR-ed into the accumulator.
 */
void gf128mul_lle(be128 *r, const be128 *b)
{
	const u8 *src = (const u8 *)b;
	be128 multiples[8];
	int pos, bit;

	multiples[0] = *r;
	for (bit = 1; bit < 8; bit++)
		gf128mul_x_lle(&multiples[bit], &multiples[bit - 1]);

	memset(r, 0, sizeof(*r));
	for (pos = 15; pos >= 0; pos--) {
		u8 ch;

		/* No x^8 step ahead of the very first byte. */
		if (pos != 15)
			gf128mul_x8_lle(r);

		ch = src[pos];
		for (bit = 0; bit < 8; bit++) {
			if (ch & (0x80 >> bit))
				be128_xor(r, r, &multiples[bit]);
		}
	}
}
EXPORT_SYMBOL(gf128mul_lle);

/*
 * gf128mul_bbe - multiply *r by *b and store the product in *r ("bbe"
 * convention).
 *
 * multiples[k] holds the original *r after k applications of
 * gf128mul_x_bbe().  The bytes of *b are consumed from offset 0 up to
 * offset 15.  Before every byte except the first, the accumulator in *r
 * is advanced with gf128mul_x8_bbe(); then, for each set bit (1 << k) of
 * the byte, multiples[k] is XOR-ed into the accumulator.
 */
void gf128mul_bbe(be128 *r, const be128 *b)
{
	const u8 *src = (const u8 *)b;
	be128 multiples[8];
	int pos, bit;

	multiples[0] = *r;
	for (bit = 1; bit < 8; bit++)
		gf128mul_x_bbe(&multiples[bit], &multiples[bit - 1]);

	memset(r, 0, sizeof(*r));
	for (pos = 0; pos < 16; pos++) {
		u8 ch;

		/* No x^8 step ahead of the very first byte. */
		if (pos != 0)
			gf128mul_x8_bbe(r);

		ch = src[pos];
		for (bit = 7; bit >= 0; bit--) {
			if (ch & (1 << bit))
				be128_xor(r, r, &multiples[bit]);
		}
	}
}
EXPORT_SYMBOL(gf128mul_bbe);

/*      This version uses 64k bytes of table space.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(2^128).  If we consider a GF(2^128) value in
    the buffer's lowest byte, we can construct a table of
    the 256 16 byte values that result from the 256 values
    of this byte.  This requires 4096 bytes. But we also
    need tables for each of the 15 higher bytes in the
    buffer as well, which makes 64 kbytes in total.
*/
/* additional explanation
 * t[0][BYTE] contains g*BYTE
 * t[1][BYTE] contains g*x^8*BYTE
 *  ..
 * t[15][BYTE] contains g*x^120*BYTE */
/*
 * gf128mul_init_64k_bbe - allocate and fill the sixteen 256-entry tables
 * consumed by gf128mul_64k_bbe() for the element *g.
 *
 * Table 0 is seeded with *g at index 1 and with successive
 * gf128mul_x_bbe() results at indices 2, 4, ..., 128.  For each later
 * table, the power-of-two entries are the previous table's entries passed
 * through gf128mul_x8_bbe().  Within every table, each index that is not
 * a power of two (and not 0) is then filled with the XOR of the entry for
 * its highest set bit and the entry for its remaining bits.  Index 0 is
 * never written here.
 *
 * Returns the new table set, or NULL if any allocation fails; in that
 * case everything allocated so far is released via gf128mul_free_64k().
 * All allocations use GFP_KERNEL.
 */
struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g)
{
	struct gf128mul_64k *tbl;
	be128 *cur, *prev;
	int idx, bit, rest;

	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
	if (!tbl)
		return NULL;

	for (idx = 0; idx < 16; idx++) {
		tbl->t[idx] = kzalloc(sizeof(*tbl->t[idx]), GFP_KERNEL);
		if (!tbl->t[idx]) {
			gf128mul_free_64k(tbl);
			return NULL;
		}
	}

	/* Seed table 0 at the power-of-two indices. */
	cur = tbl->t[0]->t;
	cur[1] = *g;
	for (bit = 1; bit < 128; bit <<= 1)
		gf128mul_x_bbe(&cur[2 * bit], &cur[bit]);

	for (idx = 0; idx < 16; idx++) {
		cur = tbl->t[idx]->t;

		/* Later tables derive their seeds from the previous table. */
		if (idx > 0) {
			prev = tbl->t[idx - 1]->t;
			for (bit = 128; bit > 0; bit >>= 1) {
				cur[bit] = prev[bit];
				gf128mul_x8_bbe(&cur[bit]);
			}
		}

		/* Combine seeds to fill the remaining indices. */
		for (bit = 2; bit < 256; bit <<= 1) {
			for (rest = 1; rest < bit; rest++)
				be128_xor(&cur[bit + rest],
					  &cur[bit], &cur[rest]);
		}
	}

	return tbl;
}
EXPORT_SYMBOL(gf128mul_init_64k_bbe);

/*
 * gf128mul_free_64k - release a table set obtained from
 * gf128mul_init_64k_bbe().
 *
 * Each of the sixteen sub-tables, and then the container itself, is
 * handed to kzfree().  Sub-table slots that were never allocated (still
 * NULL) are passed through unchanged, which is how the partial-allocation
 * failure path of gf128mul_init_64k_bbe() relies on this function.
 *
 * A NULL @t is accepted and ignored, matching the usual convention for
 * kernel free routines, so callers need not guard the call.
 */
void gf128mul_free_64k(struct gf128mul_64k *t)
{
	int i;

	if (!t)
		return;

	for (i = 0; i < 16; i++)
		kzfree(t->t[i]);
	kzfree(t);
}
EXPORT_SYMBOL(gf128mul_free_64k);

/*
 * gf128mul_64k_bbe - multiply *a in place using a precomputed table set.
 *
 * The byte at offset 15 of *a indexes table 0, the byte at offset 14
 * indexes table 1, and so on down to offset 0 with table 15.  The sixteen
 * selected entries are XOR-ed together and the result is written back to
 * *a.
 */
void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 sum = t->t[0]->t[bytes[15]];
	int off;

	for (off = 14; off >= 0; off--)
		be128_xor(&sum, &sum, &t->t[15 - off]->t[bytes[off]]);

	*a = sum;
}
EXPORT_SYMBOL(gf128mul_64k_bbe);

/*      This version uses 4k bytes of table space.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(2^128).  If we consider a GF(2^128) value in a
    single byte, we can construct a table of the 256 16 byte
    values that result from the 256 values of this byte.
    This requires 4096 bytes. If we take the highest byte in
    the buffer and use this table to get the result, we then
    have to multiply by x^120 to get the final value. For the
    next highest byte the result has to be multiplied by x^112
    and so on. But we can do this by accumulating the result
    in an accumulator starting with the result for the top
    byte.  We repeatedly multiply the accumulator value by
    x^8 and then add in (i.e. xor) the 16 byte table entry for
    the next lower byte in the buffer, stopping when we reach
    the lowest byte. This requires a 4096 byte table.
*/
struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[128] = *g;
	for (j = 64; j > 0; j >>= 1)
		gf128mul_x_lle(&t->t[j], &t->t[j+j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_lle);

struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[1] = *g;
	for (j = 1; j <= 64; j <<= 1)
		gf128mul_x_bbe(&t->t[j + j], &t->t[j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_bbe);

/*
 * gf128mul_4k_lle - multiply *a in place using a precomputed 256-entry
 * table ("lle" convention).
 *
 * The accumulator starts as the table entry for the byte at offset 15 of
 * *a.  For each remaining byte, from offset 14 down to offset 0, the
 * accumulator is advanced with gf128mul_x8_lle() and the table entry for
 * that byte is XOR-ed in.  The final accumulator is written back to *a.
 */
void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc = t->t[bytes[15]];
	int off;

	for (off = 14; off >= 0; off--) {
		gf128mul_x8_lle(&acc);
		be128_xor(&acc, &acc, &t->t[bytes[off]]);
	}

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_4k_lle);

/*
 * gf128mul_4k_bbe - multiply *a in place using a precomputed 256-entry
 * table ("bbe" convention).
 *
 * The accumulator starts as the table entry for the byte at offset 0 of
 * *a.  For each remaining byte, from offset 1 up to offset 15, the
 * accumulator is advanced with gf128mul_x8_bbe() and the table entry for
 * that byte is XOR-ed in.  The final accumulator is written back to *a.
 */
void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc = t->t[bytes[0]];
	int off;

	for (off = 1; off < 16; off++) {
		gf128mul_x8_bbe(&acc);
		be128_xor(&acc, &acc, &t->t[bytes[off]]);
	}

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_4k_bbe);

/* Module metadata: licence tag and one-line description. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
Commit	Line	Data
c494e070 RS	1	/* gf128mul.c - GF(2^128) multiplication functions
	2	*
	3	* Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
	4	* Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
	5	*
	6	* Based on Dr Brian Gladman's (GPL'd) work published at
8c882f64	7	* http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
c494e070 RS	8	* See the original copyright notice below.
	9	*
	10	* This program is free software; you can redistribute it and/or modify it
	11	* under the terms of the GNU General Public License as published by the Free
	12	* Software Foundation; either version 2 of the License, or (at your option)
	13	* any later version.
	14	*/
	15
	16	/*
	17	---------------------------------------------------------------------------
	18	Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
	19
	20	LICENSE TERMS
	21
	22	The free distribution and use of this software in both source and binary
	23	form is allowed (with or without changes) provided that:
	24
	25	1. distributions of this source code include the above copyright
	26	notice, this list of conditions and the following disclaimer;
	27
	28	2. distributions in binary form include the above copyright
	29	notice, this list of conditions and the following disclaimer
	30	in the documentation and/or other associated materials;
	31
	32	3. the copyright holder's name is not used to endorse products
	33	built using this software without specific written permission.
	34
	35	ALTERNATIVELY, provided that this notice is retained in full, this product
	36	may be distributed under the terms of the GNU General Public License (GPL),
	37	in which case the provisions of the GPL apply INSTEAD OF those given above.
	38
	39	DISCLAIMER
	40
	41	This software is provided 'as is' with no explicit or implied warranties
	42	in respect of its properties, including, but not limited to, correctness
	43	and/or fitness for purpose.
	44	---------------------------------------------------------------------------
	45	Issue 31/01/2006
	46
63be5b53	47	This file provides fast multiplication in GF(2^128) as required by several
c494e070 RS	48	cryptographic authentication modes
	49	*/
	50
	51	#include <crypto/gf128mul.h>
	52	#include <linux/kernel.h>
	53	#include <linux/module.h>
	54	#include <linux/slab.h>
	55
	56	#define gf128mul_dat(q) { \
	57	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
	58	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
	59	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
	60	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
	61	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
	62	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
	63	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
	64	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
	65	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
	66	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
	67	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
	68	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
	69	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
	70	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
	71	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
	72	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
	73	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
	74	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
	75	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
	76	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
	77	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
	78	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
	79	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
	80	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
	81	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
	82	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
	83	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
	84	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
	85	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
	86	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
	87	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
	88	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
	89	}
	90
f33fd647 EB	91	/*
	92	* Given a value i in 0..255 as the byte overflow when a field element
	93	* in GF(2^128) is multiplied by x^8, the following macro returns the
	94	* 16-bit value that must be XOR-ed into the low-degree end of the
	95	* product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1.
	96	*
	97	* There are two versions of the macro, and hence two tables: one for
	98	* the "be" convention where the highest-order bit is the coefficient of
	99	* the highest-degree polynomial term, and one for the "le" convention
	100	* where the highest-order bit is the coefficient of the lowest-degree
	101	* polynomial term. In both cases the values are stored in CPU byte
	102	* endianness such that the coefficients are ordered consistently across
	103	* bytes, i.e. in the "be" table bits 15..0 of the stored value
	104	* correspond to the coefficients of x^15..x^0, and in the "le" table
	105	* bits 15..0 correspond to the coefficients of x^0..x^15.
	106	*
	107	* Therefore, provided that the appropriate byte endianness conversions
	108	* are done by the multiplication functions (and these must be in place
	109	* anyway to support both little endian and big endian CPUs), the "be"
	110	* table can be used for multiplications of both "bbe" and "ble"
	111	* elements, and the "le" table can be used for multiplications of both
	112	* "lle" and "lbe" elements.
	113	*/
c494e070	114
f33fd647	115	#define xda_be(i) ( \
2416e4fa EB	116	(i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \
	117	(i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \
	118	(i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \
	119	(i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \
c494e070 RS	120	)
c494e070 RS	121
f33fd647	122	#define xda_le(i) ( \
2416e4fa EB	123	(i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \
	124	(i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \
	125	(i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \
	126	(i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \
c494e070 RS	127	)
c494e070 RS	128
f33fd647 EB	129	static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
f33fd647 EB	130	static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
c494e070	131
63be5b53	132	/*
acb9b159	133	* The following functions multiply a field element by x^8 in
63be5b53 EB	134	* the polynomial field representation. They use 64-bit word operations
63be5b53 EB	135	* to gain speed but compensate for machine endianness and hence work
c494e070 RS	136	* correctly on both styles of machine.
	137	*/
	138
c494e070 RS	139	static void gf128mul_x8_lle(be128 *x)
	140	{
	141	u64 a = be64_to_cpu(x->a);
	142	u64 b = be64_to_cpu(x->b);
f33fd647	143	u64 _tt = gf128mul_table_le[b & 0xff];
c494e070 RS	144
	145	x->b = cpu_to_be64((b >> 8) \| (a << 56));
	146	x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
	147	}
	148
	149	static void gf128mul_x8_bbe(be128 *x)
	150	{
	151	u64 a = be64_to_cpu(x->a);
	152	u64 b = be64_to_cpu(x->b);
f33fd647	153	u64 _tt = gf128mul_table_be[a >> 56];
c494e070 RS	154
	155	x->a = cpu_to_be64((a << 8) \| (b >> 56));
	156	x->b = cpu_to_be64((b << 8) ^ _tt);
	157	}
	158
	159	void gf128mul_lle(be128 r, const be128 b)
	160	{
	161	be128 p[8];
	162	int i;
	163
	164	p[0] = *r;
	165	for (i = 0; i < 7; ++i)
	166	gf128mul_x_lle(&p[i + 1], &p[i]);
	167
62542663	168	memset(r, 0, sizeof(*r));
c494e070 RS	169	for (i = 0;;) {
	170	u8 ch = ((u8 *)b)[15 - i];
	171
	172	if (ch & 0x80)
	173	be128_xor(r, r, &p[0]);
	174	if (ch & 0x40)
	175	be128_xor(r, r, &p[1]);
	176	if (ch & 0x20)
	177	be128_xor(r, r, &p[2]);
	178	if (ch & 0x10)
	179	be128_xor(r, r, &p[3]);
	180	if (ch & 0x08)
	181	be128_xor(r, r, &p[4]);
	182	if (ch & 0x04)
	183	be128_xor(r, r, &p[5]);
	184	if (ch & 0x02)
	185	be128_xor(r, r, &p[6]);
	186	if (ch & 0x01)
	187	be128_xor(r, r, &p[7]);
	188
	189	if (++i >= 16)
	190	break;
	191
	192	gf128mul_x8_lle(r);
	193	}
	194	}
	195	EXPORT_SYMBOL(gf128mul_lle);
	196
	197	void gf128mul_bbe(be128 r, const be128 b)
	198	{
	199	be128 p[8];
	200	int i;
	201
	202	p[0] = *r;
	203	for (i = 0; i < 7; ++i)
	204	gf128mul_x_bbe(&p[i + 1], &p[i]);
	205
62542663	206	memset(r, 0, sizeof(*r));
c494e070 RS	207	for (i = 0;;) {
	208	u8 ch = ((u8 *)b)[i];
	209
	210	if (ch & 0x80)
	211	be128_xor(r, r, &p[7]);
	212	if (ch & 0x40)
	213	be128_xor(r, r, &p[6]);
	214	if (ch & 0x20)
	215	be128_xor(r, r, &p[5]);
	216	if (ch & 0x10)
	217	be128_xor(r, r, &p[4]);
	218	if (ch & 0x08)
	219	be128_xor(r, r, &p[3]);
	220	if (ch & 0x04)
	221	be128_xor(r, r, &p[2]);
	222	if (ch & 0x02)
	223	be128_xor(r, r, &p[1]);
	224	if (ch & 0x01)
	225	be128_xor(r, r, &p[0]);
	226
	227	if (++i >= 16)
	228	break;
	229
	230	gf128mul_x8_bbe(r);
	231	}
	232	}
	233	EXPORT_SYMBOL(gf128mul_bbe);
	234
	235	/* This version uses 64k bytes of table space.
	236	A 16 byte buffer has to be multiplied by a 16 byte key
63be5b53	237	value in GF(2^128). If we consider a GF(2^128) value in
c494e070 RS	238	the buffer's lowest byte, we can construct a table of
	239	the 256 16 byte values that result from the 256 values
	240	of this byte. This requires 4096 bytes. But we also
	241	need tables for each of the 16 higher bytes in the
	242	buffer as well, which makes 64 kbytes in total.
	243	*/
	244	/* additional explanation
	245	* t[0][BYTE] contains g*BYTE
	246	* t[1][BYTE] contains g*x^8*BYTE
	247	* ..
	248	* t[15][BYTE] contains g*x^120*BYTE */
c494e070 RS	249	struct gf128mul_64k gf128mul_init_64k_bbe(const be128 g)
	250	{
	251	struct gf128mul_64k *t;
	252	int i, j, k;
	253
	254	t = kzalloc(sizeof(*t), GFP_KERNEL);
	255	if (!t)
	256	goto out;
	257
	258	for (i = 0; i < 16; i++) {
	259	t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
	260	if (!t->t[i]) {
	261	gf128mul_free_64k(t);
	262	t = NULL;
	263	goto out;
	264	}
	265	}
	266
	267	t->t[0]->t[1] = *g;
	268	for (j = 1; j <= 64; j <<= 1)
	269	gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]);
	270
	271	for (i = 0;;) {
	272	for (j = 2; j < 256; j += j)
	273	for (k = 1; k < j; ++k)
	274	be128_xor(&t->t[i]->t[j + k],
	275	&t->t[i]->t[j], &t->t[i]->t[k]);
	276
	277	if (++i >= 16)
	278	break;
	279
	280	for (j = 128; j > 0; j >>= 1) {
	281	t->t[i]->t[j] = t->t[i - 1]->t[j];
	282	gf128mul_x8_bbe(&t->t[i]->t[j]);
	283	}
	284	}
	285
	286	out:
	287	return t;
	288	}
	289	EXPORT_SYMBOL(gf128mul_init_64k_bbe);
	290
	291	void gf128mul_free_64k(struct gf128mul_64k *t)
	292	{
	293	int i;
	294
	295	for (i = 0; i < 16; i++)
75aa0a7c AC	296	kzfree(t->t[i]);
75aa0a7c AC	297	kzfree(t);
c494e070 RS	298	}
	299	EXPORT_SYMBOL(gf128mul_free_64k);
	300
3ea996dd	301	void gf128mul_64k_bbe(be128 a, const struct gf128mul_64k t)
c494e070 RS	302	{
	303	u8 ap = (u8 )a;
	304	be128 r[1];
	305	int i;
	306
	307	*r = t->t[0]->t[ap[15]];
	308	for (i = 1; i < 16; ++i)
	309	be128_xor(r, r, &t->t[i]->t[ap[15 - i]]);
	310	a = r;
	311	}
	312	EXPORT_SYMBOL(gf128mul_64k_bbe);
	313
	314	/* This version uses 4k bytes of table space.
	315	A 16 byte buffer has to be multiplied by a 16 byte key
63be5b53	316	value in GF(2^128). If we consider a GF(2^128) value in a
c494e070 RS	317	single byte, we can construct a table of the 256 16 byte
	318	values that result from the 256 values of this byte.
	319	This requires 4096 bytes. If we take the highest byte in
	320	the buffer and use this table to get the result, we then
	321	have to multiply by x^120 to get the final value. For the
	322	next highest byte the result has to be multiplied by x^112
	323	and so on. But we can do this by accumulating the result
	324	in an accumulator starting with the result for the top
	325	byte. We repeatedly multiply the accumulator value by
	326	x^8 and then add in (i.e. xor) the 16 bytes of the next
	327	lower byte in the buffer, stopping when we reach the
	328	lowest byte. This requires a 4096 byte table.
	329	*/
	330	struct gf128mul_4k gf128mul_init_4k_lle(const be128 g)
	331	{
	332	struct gf128mul_4k *t;
	333	int j, k;
	334
	335	t = kzalloc(sizeof(*t), GFP_KERNEL);
	336	if (!t)
	337	goto out;
	338
	339	t->t[128] = *g;
	340	for (j = 64; j > 0; j >>= 1)
	341	gf128mul_x_lle(&t->t[j], &t->t[j+j]);
	342
	343	for (j = 2; j < 256; j += j)
	344	for (k = 1; k < j; ++k)
	345	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	346
	347	out:
	348	return t;
	349	}
	350	EXPORT_SYMBOL(gf128mul_init_4k_lle);
	351
	352	struct gf128mul_4k gf128mul_init_4k_bbe(const be128 g)
	353	{
	354	struct gf128mul_4k *t;
	355	int j, k;
	356
	357	t = kzalloc(sizeof(*t), GFP_KERNEL);
	358	if (!t)
	359	goto out;
	360
	361	t->t[1] = *g;
	362	for (j = 1; j <= 64; j <<= 1)
	363	gf128mul_x_bbe(&t->t[j + j], &t->t[j]);
	364
	365	for (j = 2; j < 256; j += j)
	366	for (k = 1; k < j; ++k)
	367	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	368
	369	out:
	370	return t;
	371	}
	372	EXPORT_SYMBOL(gf128mul_init_4k_bbe);
	373
3ea996dd	374	void gf128mul_4k_lle(be128 a, const struct gf128mul_4k t)
c494e070 RS	375	{
	376	u8 ap = (u8 )a;
	377	be128 r[1];
	378	int i = 15;
	379
	380	*r = t->t[ap[15]];
	381	while (i--) {
	382	gf128mul_x8_lle(r);
	383	be128_xor(r, r, &t->t[ap[i]]);
	384	}
	385	a = r;
	386	}
	387	EXPORT_SYMBOL(gf128mul_4k_lle);
	388
3ea996dd	389	void gf128mul_4k_bbe(be128 a, const struct gf128mul_4k t)
c494e070 RS	390	{
	391	u8 ap = (u8 )a;
	392	be128 r[1];
	393	int i = 0;
	394
	395	*r = t->t[ap[0]];
	396	while (++i < 16) {
	397	gf128mul_x8_bbe(r);
	398	be128_xor(r, r, &t->t[ap[i]]);
	399	}
	400	a = r;
	401	}
	402	EXPORT_SYMBOL(gf128mul_4k_bbe);
	403
	404	MODULE_LICENSE("GPL");
	405	MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");