//===- README_ALTIVEC.txt -------------------------------------------------===//
1 | //===- README_ALTIVEC.txt - Notes for improving Altivec code gen ----------===// |
2 | ||
3 | Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector | |
4 | registers, to generate better spill code. | |
5 | ||
6 | //===----------------------------------------------------------------------===// | |
7 | ||
8 | The first of the two functions below should compile to a single lvx from the | |
9 | constant pool; the second should compile to a xor/stvx: | |
10 | ||
11 | void foo(void) { | |
12 | int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 }; | |
13 | bar (x); | |
14 | } | |
15 | ||
16 | #include <string.h> | |
17 | void foo(void) { | |
18 | int x[8] __attribute__((aligned(128))); | |
19 | memset (x, 0, sizeof (x)); | |
20 | bar (x); | |
21 | } | |
22 | ||
23 | //===----------------------------------------------------------------------===// | |
24 | ||
25 | Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0: | |
26 | http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763 | |
27 | ||
28 | When -ffast-math is on, we can use 0.0. | |
29 | ||
30 | //===----------------------------------------------------------------------===// | |
31 | ||
32 | Consider this: | |
33 | v4f32 Vector; | |
34 | v4f32 Vector2 = { Vector.X, Vector.X, Vector.X, Vector.X }; | |
35 | ||
36 | Since we know that "Vector" is 16-byte aligned and we know the element offset | |
37 | of ".X", we should change the load into a lve*x instruction, instead of doing | |
38 | a load/store/lve*x sequence. | |
39 | ||
40 | //===----------------------------------------------------------------------===// | |
41 | ||
42 | For functions that use altivec AND have calls, we are VRSAVE'ing all call | |
43 | clobbered regs. | |
44 | ||
45 | //===----------------------------------------------------------------------===// | |
46 | ||
47 | Implement passing vectors by value into calls and receiving them as arguments. | |
48 | ||
49 | //===----------------------------------------------------------------------===// | |
50 | ||
51 | GCC apparently tries to codegen { C1, C2, Variable, C3 } as a constant pool load | |
52 | of C1/C2/C3, then a load and vperm of Variable. | |
53 | ||
54 | //===----------------------------------------------------------------------===// | |
55 | ||
56 | We need a way to teach tblgen that some operands of an intrinsic are required to | |
57 | be constants. The verifier should enforce this constraint. | |
58 | ||
59 | //===----------------------------------------------------------------------===// | |
60 | ||
61 | We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte | |
62 | aligned stack slot, followed by a load/vperm. We should probably just store it | |
63 | to a scalar stack slot, then use lvsl/vperm to load it. If the value is already | |
64 | in memory this is a big win. | |
65 | ||
66 | //===----------------------------------------------------------------------===// | |
67 | ||
68 | extract_vector_elt of an arbitrary constant vector can be done with the | |
69 | following instructions: | |
70 | ||
71 | vTemp = vec_splat(v0,2); // 2 is the element the src is in. | |
72 | vec_ste(&destloc,0,vTemp); | |
73 | ||
74 | We can do an arbitrary non-constant value by using lvsr/perm/ste. | |
75 | ||
76 | //===----------------------------------------------------------------------===// | |
77 | ||
78 | If we want to tie instruction selection into the scheduler, we can do some | |
79 | constant formation with different instructions. For example, we can generate | |
80 | "vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with | |
81 | "vsplti 0" or "vxor", each of which uses different execution units, thus could | |
82 | help scheduling. | |
83 | ||
84 | This is probably only reasonable for a post-pass scheduler. | |
85 | ||
86 | //===----------------------------------------------------------------------===// | |
87 | ||
88 | For this function: | |
89 | ||
90 | void test(vector float *A, vector float *B) { | |
91 | vector float C = (vector float)vec_cmpeq(*A, *B); | |
92 | if (!vec_any_eq(*A, *B)) | |
93 | *B = (vector float){0,0,0,0}; | |
94 | *A = C; | |
95 | } | |
96 | ||
97 | we get the following basic block: | |
98 | ||
99 | ... | |
100 | lvx v2, 0, r4 | |
101 | lvx v3, 0, r3 | |
102 | vcmpeqfp v4, v3, v2 | |
103 | vcmpeqfp. v2, v3, v2 | |
104 | bne cr6, LBB1_2 ; cond_next | |
105 | ||
106 | The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the | |
107 | vcmpeqfp. result is used by a branch. This can be improved. | |
108 | ||
109 | //===----------------------------------------------------------------------===// | |
110 | ||
111 | The code generated for this is truly awful: | |
112 | ||
113 | vector float test(float a, float b) { | |
114 | return (vector float){ 0.0, a, 0.0, 0.0}; | |
115 | } | |
116 | ||
117 | LCPI1_0: ; float | |
118 | .space 4 | |
119 | .text | |
120 | .globl _test | |
121 | .align 4 | |
122 | _test: | |
123 | mfspr r2, 256 | |
124 | oris r3, r2, 4096 | |
125 | mtspr 256, r3 | |
126 | lis r3, ha16(LCPI1_0) | |
127 | addi r4, r1, -32 | |
128 | stfs f1, -16(r1) | |
129 | addi r5, r1, -16 | |
130 | lfs f0, lo16(LCPI1_0)(r3) | |
131 | stfs f0, -32(r1) | |
132 | lvx v2, 0, r4 | |
133 | lvx v3, 0, r5 | |
134 | vmrghw v3, v3, v2 | |
135 | vspltw v2, v2, 0 | |
136 | vmrghw v2, v2, v3 | |
137 | mtspr 256, r2 | |
138 | blr | |
139 | ||
140 | //===----------------------------------------------------------------------===// | |
141 | ||
142 | int foo(vector float *x, vector float *y) { | |
143 | if (vec_all_eq(*x,*y)) return 3245; | |
144 | else return 12; | |
145 | } | |
146 | ||
147 | A predicate compare being used in a select_cc should have the same peephole | |
148 | applied to it as a predicate compare used by a br_cc. There should be no | |
149 | mfcr here: | |
150 | ||
151 | _foo: | |
152 | mfspr r2, 256 | |
153 | oris r5, r2, 12288 | |
154 | mtspr 256, r5 | |
155 | li r5, 12 | |
156 | li r6, 3245 | |
157 | lvx v2, 0, r4 | |
158 | lvx v3, 0, r3 | |
159 | vcmpeqfp. v2, v3, v2 | |
160 | mfcr r3, 2 | |
161 | rlwinm r3, r3, 25, 31, 31 | |
162 | cmpwi cr0, r3, 0 | |
163 | bne cr0, LBB1_2 ; entry | |
164 | LBB1_1: ; entry | |
165 | mr r6, r5 | |
166 | LBB1_2: ; entry | |
167 | mr r3, r6 | |
168 | mtspr 256, r2 | |
169 | blr | |
170 | ||
171 | //===----------------------------------------------------------------------===// | |
172 | ||
173 | CodeGen/PowerPC/vec_constants.ll has an and operation that should be | |
174 | codegen'd to andc. The issue is that the 'all ones' build vector is | |
175 | SelectNodeTo'd a VSPLTISB instruction node before the and/xor is selected | |
176 | which prevents the vnot pattern from matching. | |
177 | ||
178 | ||
179 | //===----------------------------------------------------------------------===// | |
180 | ||
181 | An alternative to the store/store/load approach for illegal insert element | |
182 | lowering would be: | |
183 | ||
184 | 1. store element to any ol' slot | |
185 | 2. lvx the slot | |
186 | 3. lvsl 0; splat index; vcmpeq to generate a select mask | |
187 | 4. lvsl slot + x; vperm to rotate result into correct slot | |
188 | 5. vsel result together. | |
189 | ||
190 | //===----------------------------------------------------------------------===// | |
191 | ||
192 | Should codegen branches on vec_any/vec_all to avoid mfcr. Two examples: | |
193 | ||
194 | #include <altivec.h> | |
195 | int f(vector float a, vector float b) | |
196 | { | |
197 | int aa = 0; | |
198 | if (vec_all_ge(a, b)) | |
199 | aa |= 0x1; | |
200 | if (vec_any_ge(a,b)) | |
201 | aa |= 0x2; | |
202 | return aa; | |
203 | } | |
204 | ||
205 | vector float f(vector float a, vector float b) { | |
206 | if (vec_any_eq(a, b)) | |
207 | return a; | |
208 | else | |
209 | return b; | |
210 | } | |
211 |