]>
Commit | Line | Data |
---|---|---|
5869c6ff | 1 | /// Creates an unsigned division function that uses binary long division, designed for |
29967ef6 XL |
2 | /// computer architectures without division instructions. These functions have good performance for |
3 | /// microarchitectures with large branch miss penalties and architectures without the ability to | |
4 | /// predicate instructions. For architectures with predicated instructions, one of the algorithms | |
5 | /// described in the documentation of these functions probably has higher performance, and a custom | |
6 | /// assembly routine should be used instead. | |
5869c6ff | 7 | #[doc(hidden)] |
29967ef6 XL |
8 | #[macro_export] |
9 | macro_rules! impl_binary_long { | |
10 | ( | |
5869c6ff | 11 | $fn:ident, // name of the unsigned division function |
29967ef6 XL |
12 | $zero_div_fn:ident, // function called when division by zero is attempted |
13 | $normalization_shift:ident, // function for finding the normalization shift | |
14 | $n:tt, // the number of bits in a $iX or $uX | |
5869c6ff XL |
15 | $uX:ident, // unsigned integer type for the inputs and outputs of `$fn` |
16 | $iX:ident // signed integer type with same bitwidth as `$uX` | |
29967ef6 XL |
17 | ) => { |
18 | /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a | |
19 | /// tuple. | |
5869c6ff | 20 | pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) { |
29967ef6 XL |
21 | let mut duo = duo; |
22 | // handle edge cases before calling `$normalization_shift` | |
23 | if div == 0 { | |
24 | $zero_div_fn() | |
25 | } | |
26 | if duo < div { | |
5869c6ff | 27 | return (0, duo); |
29967ef6 XL |
28 | } |
29 | ||
30 | // There are many variations of binary division algorithm that could be used. This | |
31 | // documentation gives a tour of different methods so that future readers wanting to | |
32 | // optimize further do not have to painstakingly derive them. The SWAR variation is | |
33 | // especially hard to understand without reading the less convoluted methods first. | |
34 | ||
35 | // You may notice that a `duo < div_original` check is included in many these | |
36 | // algorithms. A critical optimization that many algorithms miss is handling of | |
37 | // quotients that will turn out to have many trailing zeros or many leading zeros. This | |
38 | // happens in cases of exact or close-to-exact divisions, divisions by power of two, and | |
39 | // in cases where the quotient is small. The `duo < div_original` check handles these | |
40 | // cases of early returns and ends up replacing other kinds of mundane checks that | |
41 | // normally terminate a binary division algorithm. | |
42 | // | |
43 | // Something you may see in other algorithms that is not special-cased here is checks | |
44 | // for division by powers of two. The `duo < div_original` check handles this case and | |
45 | // more, however it can be checked up front before the bisection using the | |
46 | // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because | |
47 | // compilers should handle most cases where divisions by power of two occur, and we do | |
48 | // not want to add on a few cycles for every division operation just to save a few | |
49 | // cycles rarely. | |
50 | ||
51 | // The following example is the most straightforward translation from the way binary | |
52 | // long division is typically visualized: | |
53 | // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to | |
54 | // the result from `$normalization_shift(duo, div, false)`. | |
55 | // | |
56 | // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set | |
57 | // and `duo` is kept unchanged. | |
58 | // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5 | |
59 | // | |
60 | // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step. | |
61 | // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4 | |
62 | // | |
63 | // Step 2: Continue based on `sub`. The `quo` bits start accumulating. | |
64 | // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3 | |
65 | // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2 | |
66 | // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1 | |
67 | // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0 | |
68 | // The `duo < div_original` check terminates the algorithm with the correct quotient of | |
69 | // 29u8 and remainder of 4u8 | |
70 | /* | |
71 | let div_original = div; | |
72 | let mut shl = $normalization_shift(duo, div, false); | |
73 | let mut quo = 0; | |
74 | loop { | |
75 | let div_shifted = div << shl; | |
76 | let sub = duo.wrapping_sub(div_shifted); | |
77 | // it is recommended to use `println!`s like this if functionality is unclear | |
78 | /* | |
79 | println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}", | |
80 | duo, | |
81 | div_shifted, | |
82 | sub, | |
83 | quo, | |
84 | shl | |
85 | ); | |
86 | */ | |
87 | if 0 <= (sub as $iX) { | |
88 | duo = sub; | |
89 | quo += 1 << shl; | |
90 | if duo < div_original { | |
91 | // this branch is optional | |
92 | return (quo, duo) | |
93 | } | |
94 | } | |
95 | if shl == 0 { | |
96 | return (quo, duo) | |
97 | } | |
98 | shl -= 1; | |
99 | } | |
100 | */ | |
101 | ||
102 | // This restoring binary long division algorithm reduces the number of operations | |
103 | // overall via: | |
104 | // - `pow` can be shifted right instead of recalculating from `shl` | |
105 | // - starting `div` shifted left and shifting it right for each step instead of | |
106 | // recalculating from `shl` | |
107 | // - The `duo < div_original` branch is used to terminate the algorithm instead of the | |
108 | // `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and | |
109 | // `div` from being shifted off the end. This check also only occurs on half of steps | |
110 | // on average, since it is behind the `(sub as $iX) >= 0` branch. | |
111 | // - `shl` is now not needed by any aspect of of the loop and thus only 3 variables are | |
112 | // being updated between steps | |
113 | // | |
114 | // There are many variations of this algorithm, but this encompases the largest number | |
115 | // of architectures and does not rely on carry flags, add-with-carry, or SWAR | |
116 | // complications to be decently fast. | |
117 | /* | |
118 | let div_original = div; | |
119 | let shl = $normalization_shift(duo, div, false); | |
120 | let mut div: $uX = div << shl; | |
121 | let mut pow: $uX = 1 << shl; | |
122 | let mut quo: $uX = 0; | |
123 | loop { | |
124 | let sub = duo.wrapping_sub(div); | |
125 | if 0 <= (sub as $iX) { | |
126 | duo = sub; | |
127 | quo |= pow; | |
128 | if duo < div_original { | |
129 | return (quo, duo) | |
130 | } | |
131 | } | |
132 | div >>= 1; | |
133 | pow >>= 1; | |
134 | } | |
135 | */ | |
136 | ||
137 | // If the architecture has flags and predicated arithmetic instructions, it is possible | |
138 | // to do binary long division without branching and in only 3 or 4 instructions. This is | |
139 | // a variation of a 3 instruction central loop from | |
140 | // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt. | |
141 | // | |
142 | // What allows doing division in only 3 instructions is realizing that instead of | |
143 | // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in | |
144 | // place and `duo` can be shifted left. This means `div` does not have to be updated, | |
145 | // but causes edge case problems and makes `duo < div_original` tests harder. Some | |
146 | // architectures have an option to shift an argument in an arithmetic operation, which | |
147 | // means `duo` can be shifted left and subtracted from in one instruction. The other two | |
148 | // instructions are updating `quo` and undoing the subtraction if it turns out things | |
149 | // were not normalized. | |
150 | ||
151 | /* | |
152 | // Perform one binary long division step on the already normalized arguments, because | |
153 | // the main. Note that this does a full normalization since the central loop needs | |
154 | // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original | |
155 | // variation only did normalization to the nearest 4 steps, but this makes handling edge | |
156 | // cases much harder. We do a full normalization and perform a binary long division | |
157 | // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb | |
158 | // of `duo`, then the edge case handler shifts `div` right and does another long | |
159 | // division step to always insure `duo.leading_zeros() + 1 >= div.leading_zeros()`. | |
160 | let div_original = div; | |
161 | let mut shl = $normalization_shift(duo, div, true); | |
162 | let mut div: $uX = (div << shl); | |
163 | let mut quo: $uX = 1; | |
164 | duo = duo.wrapping_sub(div); | |
165 | if duo < div_original { | |
166 | return (1 << shl, duo); | |
167 | } | |
168 | let div_neg: $uX; | |
169 | if (div as $iX) < 0 { | |
170 | // A very ugly edge case where the most significant bit of `div` is set (after | |
171 | // shifting to match `duo` when its most significant bit is at the sign bit), which | |
172 | // leads to the sign bit of `div_neg` being cut off and carries not happening when | |
173 | // they should. This branch performs a long division step that keeps `duo` in place | |
174 | // and shifts `div` down. | |
175 | div >>= 1; | |
176 | div_neg = div.wrapping_neg(); | |
177 | let (sub, carry) = duo.overflowing_add(div_neg); | |
178 | duo = sub; | |
179 | quo = quo.wrapping_add(quo).wrapping_add(carry as $uX); | |
180 | if !carry { | |
181 | duo = duo.wrapping_add(div); | |
182 | } | |
183 | shl -= 1; | |
184 | } else { | |
185 | div_neg = div.wrapping_neg(); | |
186 | } | |
187 | // The add-with-carry that updates `quo` needs to have the carry set when a normalized | |
188 | // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the | |
189 | // subtraction generates a carry when an unnormalized subtract happens, which is the | |
190 | // opposite of what we want. Instead, we use | |
191 | // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`. | |
192 | let mut i = shl; | |
193 | loop { | |
194 | if i == 0 { | |
195 | break; | |
196 | } | |
197 | i -= 1; | |
198 | // `ADDS duo, div, duo, LSL #1` | |
199 | // (add `div` to `duo << 1` and set flags) | |
200 | let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg); | |
201 | duo = sub; | |
202 | // `ADC quo, quo, quo` | |
203 | // (add with carry). Effectively shifts `quo` left by 1 and sets the least | |
204 | // significant bit to the carry. | |
205 | quo = quo.wrapping_add(quo).wrapping_add(carry as $uX); | |
206 | // `ADDCC duo, duo, div` | |
207 | // (add if carry clear). Undoes the subtraction if no carry was generated. | |
208 | if !carry { | |
209 | duo = duo.wrapping_add(div); | |
210 | } | |
211 | } | |
212 | return (quo, duo >> shl); | |
213 | */ | |
214 | ||
215 | // This is the SWAR (SIMD within in a register) restoring division algorithm. | |
216 | // This combines several ideas of the above algorithms: | |
217 | // - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction | |
218 | // restoring division algorithm, some architectures can do the shifting and | |
219 | // subtraction step in one instruction. | |
220 | // - `quo` can be constructed by adding powers-of-two to it or shifting it left by one | |
221 | // and adding one. | |
222 | // - Every time `duo` is shifted left, there is another unused 0 bit shifted into the | |
223 | // LSB, so what if we use those bits to store `quo`? | |
224 | // Through a complex setup, it is possible to manage `duo` and `quo` in the same | |
225 | // register, and perform one step with 2 or 3 instructions. The only major downsides are | |
226 | // that there is significant setup (it is only saves instructions if `shl` is | |
227 | // approximately more than 4), `duo < div_original` checks are impractical once SWAR is | |
228 | // initiated, and the number of division steps taken has to be exact (we cannot do more | |
229 | // division steps than `shl`, because it introduces edge cases where quotient bits in | |
230 | // `duo` start to collide with the real part of `div`. | |
231 | /* | |
232 | // first step. The quotient bit is stored in `quo` for now | |
233 | let div_original = div; | |
234 | let mut shl = $normalization_shift(duo, div, true); | |
235 | let mut div: $uX = (div << shl); | |
236 | duo = duo.wrapping_sub(div); | |
237 | let mut quo: $uX = 1 << shl; | |
238 | if duo < div_original { | |
239 | return (quo, duo); | |
240 | } | |
241 | ||
242 | let mask: $uX; | |
243 | if (div as $iX) < 0 { | |
244 | // deal with same edge case as the 3 instruction restoring division algorithm, but | |
245 | // the quotient bit from this step also has to be stored in `quo` | |
246 | div >>= 1; | |
247 | shl -= 1; | |
248 | let tmp = 1 << shl; | |
249 | mask = tmp - 1; | |
250 | let sub = duo.wrapping_sub(div); | |
251 | if (sub as $iX) >= 0 { | |
252 | // restore | |
253 | duo = sub; | |
254 | quo |= tmp; | |
255 | } | |
256 | if duo < div_original { | |
257 | return (quo, duo); | |
258 | } | |
259 | } else { | |
260 | mask = quo - 1; | |
261 | } | |
262 | // There is now room for quotient bits in `duo`. | |
263 | ||
264 | // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from | |
265 | // `div` and end up with the subset of `shl` bits being all being set. This subset acts | |
266 | // just like a two's complement negative one. The subset of `div` containing the divisor | |
267 | // had 1 subtracted from it, but a carry will always be generated from the `shl` subset | |
268 | // as long as the quotient stays positive. | |
269 | // | |
270 | // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset | |
271 | // adds a quotient bit to the least significant bit. | |
272 | // For example, 89 (0b01011001) divided by 3 (0b11): | |
273 | // | |
274 | // shl:4, div:0b00110000 | |
275 | // first step: | |
276 | // duo:0b01011001 | |
277 | // + div_neg:0b11010000 | |
278 | // ____________________ | |
279 | // 0b00101001 | |
280 | // quo is set to 0b00010000 and mask is set to 0b00001111 for later | |
281 | // | |
282 | // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the | |
283 | // quotient part of `duo` with `^`s. | |
284 | // chars. | |
285 | // div:0b00110000 | |
286 | // ^^^^ | |
287 | // + 0b11111111 | |
288 | // ________________ | |
289 | // 0b00101111 | |
290 | // ^^^^ | |
291 | // div_neg:0b11010001 | |
292 | // | |
293 | // first SWAR step: | |
294 | // duo_shl1:0b01010010 | |
295 | // ^ | |
296 | // + div_neg:0b11010001 | |
297 | // ____________________ | |
298 | // 0b00100011 | |
299 | // ^ | |
300 | // second: | |
301 | // duo_shl1:0b01000110 | |
302 | // ^^ | |
303 | // + div_neg:0b11010001 | |
304 | // ____________________ | |
305 | // 0b00010111 | |
306 | // ^^ | |
307 | // third: | |
308 | // duo_shl1:0b00101110 | |
309 | // ^^^ | |
310 | // + div_neg:0b11010001 | |
311 | // ____________________ | |
312 | // 0b11111111 | |
313 | // ^^^ | |
314 | // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real | |
315 | // part of `duo` is negative and the third step was an unnormalized step. The restore | |
316 | // branch then restores `duo`. Note that the restore branch does not shift `duo` left. | |
317 | // | |
318 | // duo:0b11111111 | |
319 | // ^^^ | |
320 | // + div:0b00101111 | |
321 | // ^^^^ | |
322 | // ________________ | |
323 | // 0b00101110 | |
324 | // ^^^ | |
325 | // `duo` is now back in the `duo_shl1` state it was at in the the third step, with an | |
326 | // unset quotient bit. | |
327 | // | |
328 | // final step (`shl` was 4, so exactly 4 steps must be taken) | |
329 | // duo_shl1:0b01011100 | |
330 | // ^^^^ | |
331 | // + div_neg:0b11010001 | |
332 | // ____________________ | |
333 | // 0b00101101 | |
334 | // ^^^^ | |
335 | // The quotient includes the `^` bits added with the `quo` bits from the beginning that | |
336 | // contained the first step and potential edge case step, | |
337 | // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`. | |
338 | // The remainder is the bits remaining in `duo` that are not part of the quotient bits, | |
339 | // `duo:0b00101101 >> shl == 0b0010 == 2u8`. | |
340 | let div: $uX = div.wrapping_sub(1); | |
341 | let mut i = shl; | |
342 | loop { | |
343 | if i == 0 { | |
344 | break; | |
345 | } | |
346 | i -= 1; | |
347 | duo = duo.wrapping_shl(1).wrapping_sub(div); | |
348 | if (duo as $iX) < 0 { | |
349 | // restore | |
350 | duo = duo.wrapping_add(div); | |
351 | } | |
352 | } | |
353 | // unpack the results of SWAR | |
354 | return ((duo & mask) | quo, duo >> shl); | |
355 | */ | |
356 | ||
357 | // The problem with the conditional restoring SWAR algorithm above is that, in practice, | |
358 | // it requires assembly code to bring out its full unrolled potential (It seems that | |
359 | // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit | |
360 | // that my algorithm intends. On architectures without predicated instructions, the code | |
361 | // gen is especially bad. We need a default software division algorithm that is | |
362 | // guaranteed to get decent code gen for the central loop. | |
363 | ||
364 | // For non-SWAR algorithms, there is a way to do binary long division without | |
365 | // predication or even branching. This involves creating a mask from the sign bit and | |
366 | // performing different kinds of steps using that. | |
367 | /* | |
368 | let shl = $normalization_shift(duo, div, true); | |
369 | let mut div: $uX = div << shl; | |
370 | let mut pow: $uX = 1 << shl; | |
371 | let mut quo: $uX = 0; | |
372 | loop { | |
373 | let sub = duo.wrapping_sub(div); | |
374 | let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX); | |
375 | duo -= div & sign_mask; | |
376 | quo |= pow & sign_mask; | |
377 | div >>= 1; | |
378 | pow >>= 1; | |
379 | if pow == 0 { | |
380 | break; | |
381 | } | |
382 | } | |
383 | return (quo, duo); | |
384 | */ | |
385 | // However, it requires about 4 extra operations (smearing the sign bit, negating the | |
386 | // mask, and applying the mask twice) on top of the operations done by the actual | |
387 | // algorithm. With SWAR however, just 2 extra operations are needed, making it | |
388 | // practical and even the most optimal algorithm for some architectures. | |
389 | ||
390 | // What we do is use custom assembly for predicated architectures that need software | |
391 | // division, and for the default algorithm use a mask based restoring SWAR algorithm | |
392 | // without conditionals or branches. On almost all architectures, this Rust code is | |
393 | // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM | |
394 | // will unroll it in a decent way. | |
395 | ||
396 | // standard opening for SWAR algorithm with first step and edge case handling | |
397 | let div_original = div; | |
398 | let mut shl = $normalization_shift(duo, div, true); | |
399 | let mut div: $uX = (div << shl); | |
400 | duo = duo.wrapping_sub(div); | |
401 | let mut quo: $uX = 1 << shl; | |
402 | if duo < div_original { | |
403 | return (quo, duo); | |
404 | } | |
405 | let mask: $uX; | |
406 | if (div as $iX) < 0 { | |
407 | div >>= 1; | |
408 | shl -= 1; | |
409 | let tmp = 1 << shl; | |
410 | mask = tmp - 1; | |
411 | let sub = duo.wrapping_sub(div); | |
412 | if (sub as $iX) >= 0 { | |
413 | duo = sub; | |
414 | quo |= tmp; | |
415 | } | |
416 | if duo < div_original { | |
417 | return (quo, duo); | |
418 | } | |
419 | } else { | |
420 | mask = quo - 1; | |
421 | } | |
422 | ||
423 | // central loop | |
424 | div = div.wrapping_sub(1); | |
425 | let mut i = shl; | |
426 | loop { | |
427 | if i == 0 { | |
5869c6ff | 428 | break; |
29967ef6 XL |
429 | } |
430 | i -= 1; | |
431 | // shift left 1 and subtract | |
432 | duo = duo.wrapping_shl(1).wrapping_sub(div); | |
433 | // create mask | |
434 | let mask = (duo as $iX).wrapping_shr($n - 1) as $uX; | |
435 | // restore | |
436 | duo = duo.wrapping_add(div & mask); | |
437 | } | |
438 | // unpack | |
439 | return ((duo & mask) | quo, duo >> shl); | |
440 | ||
441 | // miscellanious binary long division algorithms that might be better for specific | |
442 | // architectures | |
443 | ||
444 | // Another kind of long division uses an interesting fact that `div` and `pow` can be | |
445 | // negated when `duo` is negative to perform a "negated" division step that works in | |
446 | // place of any normalization mechanism. This is a non-restoring division algorithm that | |
447 | // is very similar to the non-restoring division algorithms that can be found on the | |
448 | // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can | |
449 | // be viewed as shifting the least significant set bit right (e.x. if we enter a series | |
450 | // of negated binary long division steps starting with `quo == 0b1011_0000` and | |
451 | // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100, | |
452 | // 0b1010_0010, 0b1010_0001). | |
453 | /* | |
454 | let div_original = div; | |
455 | let shl = $normalization_shift(duo, div, true); | |
456 | let mut div: $uX = (div << shl); | |
457 | let mut pow: $uX = 1 << shl; | |
458 | let mut quo: $uX = pow; | |
459 | duo = duo.wrapping_sub(div); | |
460 | if duo < div_original { | |
461 | return (quo, duo); | |
462 | } | |
463 | div >>= 1; | |
464 | pow >>= 1; | |
465 | loop { | |
466 | if (duo as $iX) < 0 { | |
467 | // Negated binary long division step. | |
468 | duo = duo.wrapping_add(div); | |
469 | quo = quo.wrapping_sub(pow); | |
470 | } else { | |
471 | // Normal long division step. | |
472 | if duo < div_original { | |
473 | return (quo, duo) | |
474 | } | |
475 | duo = duo.wrapping_sub(div); | |
476 | quo = quo.wrapping_add(pow); | |
477 | } | |
478 | pow >>= 1; | |
479 | div >>= 1; | |
480 | } | |
481 | */ | |
482 | ||
483 | // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with | |
484 | // SWAR techniques that makes the only difference between steps be negation of `div`. | |
485 | // If there was an architecture with an instruction that negated inputs to an adder | |
486 | // based on conditionals, and in place shifting (or a three input addition operation | |
487 | // that can have `duo` as two of the inputs to effectively shift it left by 1), then a | |
488 | // single instruction central loop is possible. Microarchitectures often have inputs to | |
489 | // their ALU that can invert the arguments and carry in of adders, but the architectures | |
490 | // unfortunately do not have an instruction to dynamically invert this input based on | |
491 | // conditionals. | |
492 | /* | |
493 | // SWAR opening | |
494 | let div_original = div; | |
495 | let mut shl = $normalization_shift(duo, div, true); | |
496 | let mut div: $uX = (div << shl); | |
497 | duo = duo.wrapping_sub(div); | |
498 | let mut quo: $uX = 1 << shl; | |
499 | if duo < div_original { | |
500 | return (quo, duo); | |
501 | } | |
502 | let mask: $uX; | |
503 | if (div as $iX) < 0 { | |
504 | div >>= 1; | |
505 | shl -= 1; | |
506 | let tmp = 1 << shl; | |
507 | let sub = duo.wrapping_sub(div); | |
508 | if (sub as $iX) >= 0 { | |
509 | // restore | |
510 | duo = sub; | |
511 | quo |= tmp; | |
512 | } | |
513 | if duo < div_original { | |
514 | return (quo, duo); | |
515 | } | |
516 | mask = tmp - 1; | |
517 | } else { | |
518 | mask = quo - 1; | |
519 | } | |
520 | ||
521 | // central loop | |
522 | let div: $uX = div.wrapping_sub(1); | |
523 | let mut i = shl; | |
524 | loop { | |
525 | if i == 0 { | |
526 | break; | |
527 | } | |
528 | i -= 1; | |
529 | // note: the `wrapping_shl(1)` can be factored out, but would require another | |
530 | // restoring division step to prevent `(duo as $iX)` from overflowing | |
531 | if (duo as $iX) < 0 { | |
532 | // Negated binary long division step. | |
533 | duo = duo.wrapping_shl(1).wrapping_add(div); | |
534 | } else { | |
535 | // Normal long division step. | |
536 | duo = duo.wrapping_shl(1).wrapping_sub(div); | |
537 | } | |
538 | } | |
539 | if (duo as $iX) < 0 { | |
540 | // Restore. This was not needed in the original nonrestoring algorithm because of | |
541 | // the `duo < div_original` checks. | |
542 | duo = duo.wrapping_add(div); | |
543 | } | |
544 | // unpack | |
545 | return ((duo & mask) | quo, duo >> shl); | |
546 | */ | |
547 | } | |
5869c6ff | 548 | }; |
29967ef6 | 549 | } |