]>
Commit | Line | Data |
---|---|---|
b2441318 | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
180ae203 PM |
2 | .section .text..SHmedia32,"ax" |
3 | .align 2 | |
4 | .global __udivdi3 | |
/*
 * __udivdi3: unsigned division helper.
 *
 * Register roles as they appear in the code below:
 *   r3  - divisor; it is read but never written (see also the
 *         divide-by-zero remark before the first return).
 *   r2  - value that multiples of r3 are subtracted from; it is overwritten
 *         with the accumulated quotient immediately before each return.
 *   r18 - return address (ptabs r18,tr0 followed by blink tr0,r63).
 *   r63 - used as a constant zero operand throughout.
 * Registers written: r0, r1, r2, r4, r5, r6, r7, r8, r9, r19, r20, r21,
 * r22, r25 and tr0.
 * NOTE(review): presumably r2/r3 are the first two argument registers and
 * r2 the return register of the SH-5 calling convention - confirm.
 */
5 | __udivdi3: | |
/* Normalise the divisor: r22 receives the nsb result for r3 >> 1 and
   r6 = r3 << r22.  r5 = r6 >> 49 seeds the reciprocal refinement below. */
6 | shlri r3,1,r4 | |
7 | nsb r4,r22 | |
8 | shlld r3,r22,r6 | |
9 | shlri r6,49,r5 | |
10 | movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */ | |
11 | sub r21,r5,r1 | |
12 | mmulfx.w r1,r1,r4 | |
13 | mshflo.w r1,r63,r1 | |
14 | sub r63,r22,r20 // r63 == 64 % 64 | |
15 | mmulfx.w r5,r4,r4 | |
16 | pta large_divisor,tr0 | |
17 | addi r20,32,r9 | |
18 | msub.w r1,r4,r1 | |
19 | madd.w r1,r1,r1 | |
20 | mmulfx.w r1,r1,r4 | |
/* r7 = upper 32 bits of the normalised divisor r6. */
21 | shlri r6,32,r7 | |
/* r9 = 32 - r22 (r20 = -r22 from the sub above); the branch is taken to
   large_divisor when r9 is greater than zero.
   NOTE(review): assumes bgt compares first operand > second - confirm
   against the SHmedia instruction reference. */
22 | bgt/u r9,r63,tr0 // large_divisor | |
/* Fall-through path: r19 = r2 >> 46 feeds the first quotient estimate in
   r5; r5 * r3, shifted left by r0, is then subtracted from r2. */
23 | mmulfx.w r5,r4,r4 | |
24 | shlri r2,32+14,r19 | |
25 | addi r22,-31,r0 | |
26 | msub.w r1,r4,r1 | |
27 | ||
28 | mulu.l r1,r7,r4 | |
29 | addi r1,-3,r5 | |
30 | mulu.l r5,r19,r5 | |
31 | sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 | |
32 | shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as | |
33 | the case may be, %0000000000000000 000.11111111111, still */ | |
34 | muls.l r1,r4,r4 /* leaving at least one sign bit. */ | |
35 | mulu.l r5,r3,r8 | |
36 | mshalds.l r1,r21,r1 | |
37 | shari r4,26,r4 | |
38 | shlld r8,r0,r8 | |
39 | add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) | |
40 | sub r2,r8,r2 | |
41 | /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */ | |
42 | ||
/* r8 starts accumulating the quotient here: first partial quotient
   (r5 << r0) plus the second-stage estimate in r21. */
43 | shlri r2,22,r21 | |
44 | mulu.l r21,r1,r21 | |
45 | shlld r5,r0,r8 | |
46 | addi r20,30-22,r0 | |
47 | shlrd r21,r0,r21 | |
48 | mulu.l r21,r3,r5 | |
49 | add r8,r21,r8 | |
50 | mcmpgt.l r21,r63,r21 // See Note 1 | |
51 | addi r20,30,r0 | |
52 | mshfhi.l r63,r21,r21 | |
53 | sub r2,r5,r2 | |
54 | andc r2,r21,r2 | |
55 | ||
56 | /* small divisor: need a third divide step */ | |
57 | mulu.l r2,r1,r7 | |
/* Load the return target from r18 ahead of the final blink. */
58 | ptabs r18,tr0 | |
59 | addi r2,1,r2 | |
60 | shlrd r7,r0,r7 | |
61 | mulu.l r7,r3,r5 | |
62 | add r8,r7,r8 | |
63 | sub r2,r3,r2 | |
64 | cmpgt r2,r5,r5 | |
/* Result: r2 = accumulated quotient (r8) plus the cmpgt result in r5. */
65 | add r8,r5,r2 | |
66 | /* could test r3 here to check for divide by zero. */ | |
67 | blink tr0,r63 | |
68 | ||
/* Large-divisor path: works on r25 = r2 >> r9 and on r7, the upper 32 bits
   of the normalised divisor (set before the branch). */
69 | large_divisor: | |
70 | mmulfx.w r5,r4,r4 | |
71 | shlrd r2,r9,r25 | |
72 | shlri r25,32,r8 | |
73 | msub.w r1,r4,r1 | |
74 | ||
75 | mulu.l r1,r7,r4 | |
76 | addi r1,-3,r5 | |
77 | mulu.l r5,r8,r5 | |
78 | sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 | |
79 | shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as | |
80 | the case may be, %0000000000000000 000.11111111111, still */ | |
81 | muls.l r1,r4,r4 /* leaving at least one sign bit. */ | |
82 | shlri r5,14-1,r8 | |
83 | mulu.l r8,r7,r5 | |
84 | mshalds.l r1,r21,r1 | |
85 | shari r4,26,r4 | |
86 | add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) | |
87 | sub r25,r5,r25 | |
88 | /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */ | |
89 | ||
90 | shlri r25,22,r21 | |
91 | mulu.l r21,r1,r21 | |
92 | pta no_lo_adj,tr0 | |
93 | addi r22,32,r0 | |
94 | shlri r21,40,r21 | |
95 | mulu.l r21,r7,r5 | |
96 | add r8,r21,r8 | |
97 | shlld r2,r0,r2 | |
98 | sub r25,r5,r25 | |
/* The next two instructions (quotient r8 += 1, remainder r25 -= r7) are
   skipped when the branch to no_lo_adj is taken.
   NOTE(review): assumes bgtu branches when r7 > r25 (unsigned) - confirm. */
99 | bgtu/u r7,r25,tr0 // no_lo_adj | |
100 | addi r8,1,r8 | |
101 | sub r25,r7,r25 | |
102 | no_lo_adj: | |
103 | mextr4 r2,r25,r2 | |
104 | ||
105 | /* large_divisor: only needs a few adjustments. */ | |
106 | mulu.l r8,r6,r5 | |
/* Load the return target from r18 ahead of the final blink. */
107 | ptabs r18,tr0 | |
108 | /* bubble */ | |
109 | cmpgtu r5,r2,r5 | |
/* Result: r2 = quotient estimate (r8) minus the cmpgtu result in r5. */
110 | sub r8,r5,r2 | |
111 | blink tr0,r63 | |
112 | ||
113 | /* Note 1: Shifting the result of the second divide stage so that it | |
114 | always fits into 32 bits, while still reducing the remainder sufficiently, | |
115 | would require a lot of instructions to get the shifts just right. Using | |
116 | the full 64 bit shift result to multiply with the divisor would require | |
117 | four extra instructions for the upper 32 bits (shift / mulu / shift / sub). | |
118 | Fortunately, if the upper 32 bits of the shift result are nonzero, we | |
119 | know that the remainder after taking this partial result into account will | |
120 | fit into 32 bits. So we just clear the upper 32 bits of the remainder if the | |
121 | upper 32 bits of the partial result are nonzero. */ |