]>
Commit | Line | Data |
---|---|---|
b2441318 | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
773e1c5f GS |
2 | #ifndef _ASM_HASH_H |
3 | #define _ASM_HASH_H | |
4 | ||
5 | /* | |
6 | * HP-PA only implements integer multiply in the FPU. However, for | |
7 | * integer multiplies by constant, it has a number of shift-and-add | |
8 | * (but no shift-and-subtract, sigh!) instructions that a compiler | |
9 | * can synthesize a code sequence with. | |
10 | * | |
11 | * Unfortunately, GCC isn't very efficient at using them. For example | |
12 | * it uses three instructions for "x *= 21" when only two are needed. | |
13 | * But we can find a sequence manually. | |
14 | */ | |
15 | ||
16 | #define HAVE_ARCH__HASH_32 1 | |
17 | ||
18 | /* | |
19 | * This is a multiply by GOLDEN_RATIO_32 = 0x61C88647 optimized for the | |
20 | * PA7100 pairing rules. This is an in-order 2-way superscalar processor. | |
21 | * Only one instruction in a pair may be a shift (by more than 3 bits), | |
22 | * but other than that, simple ALU ops (including shift-and-add by up | |
23 | * to 3 bits) may be paired arbitrarily. | |
24 | * | |
25 | * PA8xxx processors also dual-issue ALU instructions, although with | |
26 | * fewer constraints, so this schedule is good for them, too. | |
27 | * | |
28 | * This 6-step sequence was found by Yevgen Voronenko's implementation | |
29 | * of the Hcub algorithm at http://spiral.ece.cmu.edu/mcm/gen.html. | |
30 | */ | |
31 | static inline u32 __attribute_const__ __hash_32(u32 x) | |
32 | { | |
33 | u32 a, b, c; | |
34 | ||
35 | /* | |
36 | * Phase 1: Compute a = (x << 19) + x, | |
37 | * b = (x << 9) + a, c = (x << 23) + b. | |
38 | */ | |
39 | a = x << 19; /* Two shifts can't be paired */ | |
40 | b = x << 9; a += x; | |
41 | c = x << 23; b += a; | |
42 | c += b; | |
43 | /* Phase 2: Return (b<<11) + (c<<6) + (a<<3) - c */ | |
44 | b <<= 11; | |
45 | a += c << 3; b -= c; | |
46 | return (a << 3) + b; | |
47 | } | |
48 | ||
49 | #if BITS_PER_LONG == 64 | |
50 | ||
51 | #define HAVE_ARCH_HASH_64 1 | |
52 | ||
53 | /* | |
54 | * Finding a good shift-and-add chain for GOLDEN_RATIO_64 is tricky, | |
55 | * because available software for the purpose chokes on constants this | |
56 | * large. (It's mostly designed for compiling FIR filter coefficients | |
57 | * into FPGAs.) | |
58 | * | |
59 | * However, Jason Thong pointed out a work-around. The Hcub software | |
60 | * (http://spiral.ece.cmu.edu/mcm/gen.html) is designed for *multiple* | |
61 | * constant multiplication, and is good at finding shift-and-add chains | |
62 | * which share common terms. | |
63 | * | |
64 | * Looking at 0x0x61C8864680B583EB in binary: | |
65 | * 0110000111001000100001100100011010000000101101011000001111101011 | |
66 | * \______________/ \__________/ \_______/ \________/ | |
67 | * \____________________________/ \____________________/ | |
68 | * you can see the non-zero bits are divided into several well-separated | |
69 | * blocks. Hcub can find algorithms for those terms separately, which | |
70 | * can then be shifted and added together. | |
71 | * | |
72 | * Dividing the input into 2, 3 or 4 blocks, Hcub can find solutions | |
73 | * with 10, 9 or 8 adds, respectively, making a total of 11 for the | |
74 | * whole number. | |
75 | * | |
76 | * Using just two large blocks, 0xC3910C8D << 31 in the high bits, | |
77 | * and 0xB583EB in the low bits, produces as good an algorithm as any, | |
78 | * and with one more small shift than alternatives. | |
79 | * | |
80 | * The high bits are a larger number and more work to compute, as well | |
81 | * as needing one extra cycle to shift left 31 bits before the final | |
82 | * addition, so they are the critical path for scheduling. The low bits | |
83 | * can fit into the scheduling slots left over. | |
84 | */ | |
85 | ||
86 | ||
87 | /* | |
88 | * This _ASSIGN(dst, src) macro performs "dst = src", but prevents GCC | |
89 | * from inferring anything about the value assigned to "dest". | |
90 | * | |
91 | * This prevents it from mis-optimizing certain sequences. | |
92 | * In particular, gcc is annoyingly eager to combine consecutive shifts. | |
93 | * Given "x <<= 19; y += x; z += x << 1;", GCC will turn this into | |
94 | * "y += x << 19; z += x << 20;" even though the latter sequence needs | |
95 | * an additional instruction and temporary register. | |
96 | * | |
97 | * Because no actual assembly code is generated, this construct is | |
98 | * usefully portable across all GCC platforms, and so can be test-compiled | |
99 | * on non-PA systems. | |
100 | * | |
101 | * In two places, additional unused input dependencies are added. This | |
102 | * forces GCC's scheduling so it does not rearrange instructions too much. | |
103 | * Because the PA-8xxx is out of order, I'm not sure how much this matters, | |
104 | * but why make it more difficult for the processor than necessary? | |
105 | */ | |
106 | #define _ASSIGN(dst, src, ...) asm("" : "=r" (dst) : "0" (src), ##__VA_ARGS__) | |
107 | ||
108 | /* | |
109 | * Multiply by GOLDEN_RATIO_64 = 0x0x61C8864680B583EB using a heavily | |
110 | * optimized shift-and-add sequence. | |
111 | * | |
112 | * Without the final shift, the multiply proper is 19 instructions, | |
113 | * 10 cycles and uses only 4 temporaries. Whew! | |
114 | * | |
115 | * You are not expected to understand this. | |
116 | */ | |
117 | static __always_inline u32 __attribute_const__ | |
118 | hash_64(u64 a, unsigned int bits) | |
119 | { | |
120 | u64 b, c, d; | |
121 | ||
122 | /* | |
123 | * Encourage GCC to move a dynamic shift to %sar early, | |
124 | * thereby freeing up an additional temporary register. | |
125 | */ | |
126 | if (!__builtin_constant_p(bits)) | |
127 | asm("" : "=q" (bits) : "0" (64 - bits)); | |
128 | else | |
129 | bits = 64 - bits; | |
130 | ||
131 | _ASSIGN(b, a*5); c = a << 13; | |
132 | b = (b << 2) + a; _ASSIGN(d, a << 17); | |
133 | a = b + (a << 1); c += d; | |
134 | d = a << 10; _ASSIGN(a, a << 19); | |
135 | d = a - d; _ASSIGN(a, a << 4, "X" (d)); | |
136 | c += b; a += b; | |
137 | d -= c; c += a << 1; | |
138 | a += c << 3; _ASSIGN(b, b << (7+31), "X" (c), "X" (d)); | |
139 | a <<= 31; b += d; | |
140 | a += b; | |
141 | return a >> bits; | |
142 | } | |
143 | #undef _ASSIGN /* We're a widely-used header file, so don't litter! */ | |
144 | ||
145 | #endif /* BITS_PER_LONG == 64 */ | |
146 | ||
147 | #endif /* _ASM_HASH_H */ |