]> git.proxmox.com Git - rustc.git/blame - vendor/compiler_builtins/src/int/specialized_div_rem/binary_long.rs
New upstream version 1.51.0+dfsg1
[rustc.git] / vendor / compiler_builtins / src / int / specialized_div_rem / binary_long.rs
CommitLineData
5869c6ff 1/// Creates an unsigned division function that uses binary long division, designed for
29967ef6
XL
2/// computer architectures without division instructions. These functions have good performance for
3/// microarchitectures with large branch miss penalties and architectures without the ability to
4/// predicate instructions. For architectures with predicated instructions, one of the algorithms
5/// described in the documentation of these functions probably has higher performance, and a custom
6/// assembly routine should be used instead.
5869c6ff 7#[doc(hidden)]
29967ef6
XL
8#[macro_export]
9macro_rules! impl_binary_long {
10 (
5869c6ff 11 $fn:ident, // name of the unsigned division function
29967ef6
XL
12 $zero_div_fn:ident, // function called when division by zero is attempted
13 $normalization_shift:ident, // function for finding the normalization shift
14 $n:tt, // the number of bits in a $iX or $uX
5869c6ff
XL
15 $uX:ident, // unsigned integer type for the inputs and outputs of `$fn`
16 $iX:ident // signed integer type with same bitwidth as `$uX`
29967ef6
XL
17 ) => {
18 /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
19 /// tuple.
5869c6ff 20 pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) {
29967ef6
XL
21 let mut duo = duo;
22 // handle edge cases before calling `$normalization_shift`
23 if div == 0 {
24 $zero_div_fn()
25 }
26 if duo < div {
5869c6ff 27 return (0, duo);
29967ef6
XL
28 }
29
30 // There are many variations of binary division algorithm that could be used. This
31 // documentation gives a tour of different methods so that future readers wanting to
32 // optimize further do not have to painstakingly derive them. The SWAR variation is
33 // especially hard to understand without reading the less convoluted methods first.
34
35 // You may notice that a `duo < div_original` check is included in many these
36 // algorithms. A critical optimization that many algorithms miss is handling of
37 // quotients that will turn out to have many trailing zeros or many leading zeros. This
38 // happens in cases of exact or close-to-exact divisions, divisions by power of two, and
39 // in cases where the quotient is small. The `duo < div_original` check handles these
40 // cases of early returns and ends up replacing other kinds of mundane checks that
41 // normally terminate a binary division algorithm.
42 //
43 // Something you may see in other algorithms that is not special-cased here is checks
44 // for division by powers of two. The `duo < div_original` check handles this case and
45 // more, however it can be checked up front before the bisection using the
46 // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because
47 // compilers should handle most cases where divisions by power of two occur, and we do
48 // not want to add on a few cycles for every division operation just to save a few
49 // cycles rarely.
50
51 // The following example is the most straightforward translation from the way binary
52 // long division is typically visualized:
53 // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to
54 // the result from `$normalization_shift(duo, div, false)`.
55 //
56 // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set
57 // and `duo` is kept unchanged.
58 // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5
59 //
60 // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step.
61 // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4
62 //
63 // Step 2: Continue based on `sub`. The `quo` bits start accumulating.
64 // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3
65 // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2
66 // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1
67 // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0
68 // The `duo < div_original` check terminates the algorithm with the correct quotient of
69 // 29u8 and remainder of 4u8
70 /*
71 let div_original = div;
72 let mut shl = $normalization_shift(duo, div, false);
73 let mut quo = 0;
74 loop {
75 let div_shifted = div << shl;
76 let sub = duo.wrapping_sub(div_shifted);
77 // it is recommended to use `println!`s like this if functionality is unclear
78 /*
79 println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}",
80 duo,
81 div_shifted,
82 sub,
83 quo,
84 shl
85 );
86 */
87 if 0 <= (sub as $iX) {
88 duo = sub;
89 quo += 1 << shl;
90 if duo < div_original {
91 // this branch is optional
92 return (quo, duo)
93 }
94 }
95 if shl == 0 {
96 return (quo, duo)
97 }
98 shl -= 1;
99 }
100 */
101
102 // This restoring binary long division algorithm reduces the number of operations
103 // overall via:
104 // - `pow` can be shifted right instead of recalculating from `shl`
105 // - starting `div` shifted left and shifting it right for each step instead of
106 // recalculating from `shl`
107 // - The `duo < div_original` branch is used to terminate the algorithm instead of the
108 // `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and
109 // `div` from being shifted off the end. This check also only occurs on half of steps
110 // on average, since it is behind the `(sub as $iX) >= 0` branch.
111 // - `shl` is now not needed by any aspect of of the loop and thus only 3 variables are
112 // being updated between steps
113 //
114 // There are many variations of this algorithm, but this encompases the largest number
115 // of architectures and does not rely on carry flags, add-with-carry, or SWAR
116 // complications to be decently fast.
117 /*
118 let div_original = div;
119 let shl = $normalization_shift(duo, div, false);
120 let mut div: $uX = div << shl;
121 let mut pow: $uX = 1 << shl;
122 let mut quo: $uX = 0;
123 loop {
124 let sub = duo.wrapping_sub(div);
125 if 0 <= (sub as $iX) {
126 duo = sub;
127 quo |= pow;
128 if duo < div_original {
129 return (quo, duo)
130 }
131 }
132 div >>= 1;
133 pow >>= 1;
134 }
135 */
136
137 // If the architecture has flags and predicated arithmetic instructions, it is possible
138 // to do binary long division without branching and in only 3 or 4 instructions. This is
139 // a variation of a 3 instruction central loop from
140 // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt.
141 //
142 // What allows doing division in only 3 instructions is realizing that instead of
143 // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in
144 // place and `duo` can be shifted left. This means `div` does not have to be updated,
145 // but causes edge case problems and makes `duo < div_original` tests harder. Some
146 // architectures have an option to shift an argument in an arithmetic operation, which
147 // means `duo` can be shifted left and subtracted from in one instruction. The other two
148 // instructions are updating `quo` and undoing the subtraction if it turns out things
149 // were not normalized.
150
151 /*
152 // Perform one binary long division step on the already normalized arguments, because
153 // the main. Note that this does a full normalization since the central loop needs
154 // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original
155 // variation only did normalization to the nearest 4 steps, but this makes handling edge
156 // cases much harder. We do a full normalization and perform a binary long division
157 // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb
158 // of `duo`, then the edge case handler shifts `div` right and does another long
159 // division step to always insure `duo.leading_zeros() + 1 >= div.leading_zeros()`.
160 let div_original = div;
161 let mut shl = $normalization_shift(duo, div, true);
162 let mut div: $uX = (div << shl);
163 let mut quo: $uX = 1;
164 duo = duo.wrapping_sub(div);
165 if duo < div_original {
166 return (1 << shl, duo);
167 }
168 let div_neg: $uX;
169 if (div as $iX) < 0 {
170 // A very ugly edge case where the most significant bit of `div` is set (after
171 // shifting to match `duo` when its most significant bit is at the sign bit), which
172 // leads to the sign bit of `div_neg` being cut off and carries not happening when
173 // they should. This branch performs a long division step that keeps `duo` in place
174 // and shifts `div` down.
175 div >>= 1;
176 div_neg = div.wrapping_neg();
177 let (sub, carry) = duo.overflowing_add(div_neg);
178 duo = sub;
179 quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
180 if !carry {
181 duo = duo.wrapping_add(div);
182 }
183 shl -= 1;
184 } else {
185 div_neg = div.wrapping_neg();
186 }
187 // The add-with-carry that updates `quo` needs to have the carry set when a normalized
188 // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the
189 // subtraction generates a carry when an unnormalized subtract happens, which is the
190 // opposite of what we want. Instead, we use
191 // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`.
192 let mut i = shl;
193 loop {
194 if i == 0 {
195 break;
196 }
197 i -= 1;
198 // `ADDS duo, div, duo, LSL #1`
199 // (add `div` to `duo << 1` and set flags)
200 let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg);
201 duo = sub;
202 // `ADC quo, quo, quo`
203 // (add with carry). Effectively shifts `quo` left by 1 and sets the least
204 // significant bit to the carry.
205 quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
206 // `ADDCC duo, duo, div`
207 // (add if carry clear). Undoes the subtraction if no carry was generated.
208 if !carry {
209 duo = duo.wrapping_add(div);
210 }
211 }
212 return (quo, duo >> shl);
213 */
214
215 // This is the SWAR (SIMD within in a register) restoring division algorithm.
216 // This combines several ideas of the above algorithms:
217 // - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction
218 // restoring division algorithm, some architectures can do the shifting and
219 // subtraction step in one instruction.
220 // - `quo` can be constructed by adding powers-of-two to it or shifting it left by one
221 // and adding one.
222 // - Every time `duo` is shifted left, there is another unused 0 bit shifted into the
223 // LSB, so what if we use those bits to store `quo`?
224 // Through a complex setup, it is possible to manage `duo` and `quo` in the same
225 // register, and perform one step with 2 or 3 instructions. The only major downsides are
226 // that there is significant setup (it is only saves instructions if `shl` is
227 // approximately more than 4), `duo < div_original` checks are impractical once SWAR is
228 // initiated, and the number of division steps taken has to be exact (we cannot do more
229 // division steps than `shl`, because it introduces edge cases where quotient bits in
230 // `duo` start to collide with the real part of `div`.
231 /*
232 // first step. The quotient bit is stored in `quo` for now
233 let div_original = div;
234 let mut shl = $normalization_shift(duo, div, true);
235 let mut div: $uX = (div << shl);
236 duo = duo.wrapping_sub(div);
237 let mut quo: $uX = 1 << shl;
238 if duo < div_original {
239 return (quo, duo);
240 }
241
242 let mask: $uX;
243 if (div as $iX) < 0 {
244 // deal with same edge case as the 3 instruction restoring division algorithm, but
245 // the quotient bit from this step also has to be stored in `quo`
246 div >>= 1;
247 shl -= 1;
248 let tmp = 1 << shl;
249 mask = tmp - 1;
250 let sub = duo.wrapping_sub(div);
251 if (sub as $iX) >= 0 {
252 // restore
253 duo = sub;
254 quo |= tmp;
255 }
256 if duo < div_original {
257 return (quo, duo);
258 }
259 } else {
260 mask = quo - 1;
261 }
262 // There is now room for quotient bits in `duo`.
263
264 // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from
265 // `div` and end up with the subset of `shl` bits being all being set. This subset acts
266 // just like a two's complement negative one. The subset of `div` containing the divisor
267 // had 1 subtracted from it, but a carry will always be generated from the `shl` subset
268 // as long as the quotient stays positive.
269 //
270 // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset
271 // adds a quotient bit to the least significant bit.
272 // For example, 89 (0b01011001) divided by 3 (0b11):
273 //
274 // shl:4, div:0b00110000
275 // first step:
276 // duo:0b01011001
277 // + div_neg:0b11010000
278 // ____________________
279 // 0b00101001
280 // quo is set to 0b00010000 and mask is set to 0b00001111 for later
281 //
282 // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the
283 // quotient part of `duo` with `^`s.
284 // chars.
285 // div:0b00110000
286 // ^^^^
287 // + 0b11111111
288 // ________________
289 // 0b00101111
290 // ^^^^
291 // div_neg:0b11010001
292 //
293 // first SWAR step:
294 // duo_shl1:0b01010010
295 // ^
296 // + div_neg:0b11010001
297 // ____________________
298 // 0b00100011
299 // ^
300 // second:
301 // duo_shl1:0b01000110
302 // ^^
303 // + div_neg:0b11010001
304 // ____________________
305 // 0b00010111
306 // ^^
307 // third:
308 // duo_shl1:0b00101110
309 // ^^^
310 // + div_neg:0b11010001
311 // ____________________
312 // 0b11111111
313 // ^^^
314 // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real
315 // part of `duo` is negative and the third step was an unnormalized step. The restore
316 // branch then restores `duo`. Note that the restore branch does not shift `duo` left.
317 //
318 // duo:0b11111111
319 // ^^^
320 // + div:0b00101111
321 // ^^^^
322 // ________________
323 // 0b00101110
324 // ^^^
325 // `duo` is now back in the `duo_shl1` state it was at in the the third step, with an
326 // unset quotient bit.
327 //
328 // final step (`shl` was 4, so exactly 4 steps must be taken)
329 // duo_shl1:0b01011100
330 // ^^^^
331 // + div_neg:0b11010001
332 // ____________________
333 // 0b00101101
334 // ^^^^
335 // The quotient includes the `^` bits added with the `quo` bits from the beginning that
336 // contained the first step and potential edge case step,
337 // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`.
338 // The remainder is the bits remaining in `duo` that are not part of the quotient bits,
339 // `duo:0b00101101 >> shl == 0b0010 == 2u8`.
340 let div: $uX = div.wrapping_sub(1);
341 let mut i = shl;
342 loop {
343 if i == 0 {
344 break;
345 }
346 i -= 1;
347 duo = duo.wrapping_shl(1).wrapping_sub(div);
348 if (duo as $iX) < 0 {
349 // restore
350 duo = duo.wrapping_add(div);
351 }
352 }
353 // unpack the results of SWAR
354 return ((duo & mask) | quo, duo >> shl);
355 */
356
357 // The problem with the conditional restoring SWAR algorithm above is that, in practice,
358 // it requires assembly code to bring out its full unrolled potential (It seems that
359 // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit
360 // that my algorithm intends. On architectures without predicated instructions, the code
361 // gen is especially bad. We need a default software division algorithm that is
362 // guaranteed to get decent code gen for the central loop.
363
364 // For non-SWAR algorithms, there is a way to do binary long division without
365 // predication or even branching. This involves creating a mask from the sign bit and
366 // performing different kinds of steps using that.
367 /*
368 let shl = $normalization_shift(duo, div, true);
369 let mut div: $uX = div << shl;
370 let mut pow: $uX = 1 << shl;
371 let mut quo: $uX = 0;
372 loop {
373 let sub = duo.wrapping_sub(div);
374 let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX);
375 duo -= div & sign_mask;
376 quo |= pow & sign_mask;
377 div >>= 1;
378 pow >>= 1;
379 if pow == 0 {
380 break;
381 }
382 }
383 return (quo, duo);
384 */
385 // However, it requires about 4 extra operations (smearing the sign bit, negating the
386 // mask, and applying the mask twice) on top of the operations done by the actual
387 // algorithm. With SWAR however, just 2 extra operations are needed, making it
388 // practical and even the most optimal algorithm for some architectures.
389
390 // What we do is use custom assembly for predicated architectures that need software
391 // division, and for the default algorithm use a mask based restoring SWAR algorithm
392 // without conditionals or branches. On almost all architectures, this Rust code is
393 // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM
394 // will unroll it in a decent way.
395
396 // standard opening for SWAR algorithm with first step and edge case handling
397 let div_original = div;
398 let mut shl = $normalization_shift(duo, div, true);
399 let mut div: $uX = (div << shl);
400 duo = duo.wrapping_sub(div);
401 let mut quo: $uX = 1 << shl;
402 if duo < div_original {
403 return (quo, duo);
404 }
405 let mask: $uX;
406 if (div as $iX) < 0 {
407 div >>= 1;
408 shl -= 1;
409 let tmp = 1 << shl;
410 mask = tmp - 1;
411 let sub = duo.wrapping_sub(div);
412 if (sub as $iX) >= 0 {
413 duo = sub;
414 quo |= tmp;
415 }
416 if duo < div_original {
417 return (quo, duo);
418 }
419 } else {
420 mask = quo - 1;
421 }
422
423 // central loop
424 div = div.wrapping_sub(1);
425 let mut i = shl;
426 loop {
427 if i == 0 {
5869c6ff 428 break;
29967ef6
XL
429 }
430 i -= 1;
431 // shift left 1 and subtract
432 duo = duo.wrapping_shl(1).wrapping_sub(div);
433 // create mask
434 let mask = (duo as $iX).wrapping_shr($n - 1) as $uX;
435 // restore
436 duo = duo.wrapping_add(div & mask);
437 }
438 // unpack
439 return ((duo & mask) | quo, duo >> shl);
440
441 // miscellanious binary long division algorithms that might be better for specific
442 // architectures
443
444 // Another kind of long division uses an interesting fact that `div` and `pow` can be
445 // negated when `duo` is negative to perform a "negated" division step that works in
446 // place of any normalization mechanism. This is a non-restoring division algorithm that
447 // is very similar to the non-restoring division algorithms that can be found on the
448 // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can
449 // be viewed as shifting the least significant set bit right (e.x. if we enter a series
450 // of negated binary long division steps starting with `quo == 0b1011_0000` and
451 // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100,
452 // 0b1010_0010, 0b1010_0001).
453 /*
454 let div_original = div;
455 let shl = $normalization_shift(duo, div, true);
456 let mut div: $uX = (div << shl);
457 let mut pow: $uX = 1 << shl;
458 let mut quo: $uX = pow;
459 duo = duo.wrapping_sub(div);
460 if duo < div_original {
461 return (quo, duo);
462 }
463 div >>= 1;
464 pow >>= 1;
465 loop {
466 if (duo as $iX) < 0 {
467 // Negated binary long division step.
468 duo = duo.wrapping_add(div);
469 quo = quo.wrapping_sub(pow);
470 } else {
471 // Normal long division step.
472 if duo < div_original {
473 return (quo, duo)
474 }
475 duo = duo.wrapping_sub(div);
476 quo = quo.wrapping_add(pow);
477 }
478 pow >>= 1;
479 div >>= 1;
480 }
481 */
482
483 // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with
484 // SWAR techniques that makes the only difference between steps be negation of `div`.
485 // If there was an architecture with an instruction that negated inputs to an adder
486 // based on conditionals, and in place shifting (or a three input addition operation
487 // that can have `duo` as two of the inputs to effectively shift it left by 1), then a
488 // single instruction central loop is possible. Microarchitectures often have inputs to
489 // their ALU that can invert the arguments and carry in of adders, but the architectures
490 // unfortunately do not have an instruction to dynamically invert this input based on
491 // conditionals.
492 /*
493 // SWAR opening
494 let div_original = div;
495 let mut shl = $normalization_shift(duo, div, true);
496 let mut div: $uX = (div << shl);
497 duo = duo.wrapping_sub(div);
498 let mut quo: $uX = 1 << shl;
499 if duo < div_original {
500 return (quo, duo);
501 }
502 let mask: $uX;
503 if (div as $iX) < 0 {
504 div >>= 1;
505 shl -= 1;
506 let tmp = 1 << shl;
507 let sub = duo.wrapping_sub(div);
508 if (sub as $iX) >= 0 {
509 // restore
510 duo = sub;
511 quo |= tmp;
512 }
513 if duo < div_original {
514 return (quo, duo);
515 }
516 mask = tmp - 1;
517 } else {
518 mask = quo - 1;
519 }
520
521 // central loop
522 let div: $uX = div.wrapping_sub(1);
523 let mut i = shl;
524 loop {
525 if i == 0 {
526 break;
527 }
528 i -= 1;
529 // note: the `wrapping_shl(1)` can be factored out, but would require another
530 // restoring division step to prevent `(duo as $iX)` from overflowing
531 if (duo as $iX) < 0 {
532 // Negated binary long division step.
533 duo = duo.wrapping_shl(1).wrapping_add(div);
534 } else {
535 // Normal long division step.
536 duo = duo.wrapping_shl(1).wrapping_sub(div);
537 }
538 }
539 if (duo as $iX) < 0 {
540 // Restore. This was not needed in the original nonrestoring algorithm because of
541 // the `duo < div_original` checks.
542 duo = duo.wrapping_add(div);
543 }
544 // unpack
545 return ((duo & mask) | quo, duo >> shl);
546 */
547 }
5869c6ff 548 };
29967ef6 549}