]> git.proxmox.com Git - rustc.git/blob - src/llvm/lib/Target/PowerPC/PPCScheduleP7.td
Imported Upstream version 1.0.0+dfsg1
[rustc.git] / src / llvm / lib / Target / PowerPC / PPCScheduleP7.td
1 //===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file defines the itinerary class data for the POWER7 processor.
11 //
12 //===----------------------------------------------------------------------===//
13
14 // Primary reference:
15 // IBM POWER7 multicore server processor
16 // B. Sinharoy, et al.
17 // IBM J. Res. & Dev. (55) 3. May/June 2011.
18
19 // Scheduling for the P7 involves tracking two types of resources:
20 // 1. The dispatch bundle slots
21 // 2. The functional unit resources
22
23 // Dispatch units (the slots of a dispatch bundle):
24 def P7_DU1 : FuncUnit; // DU1-DU4: slots used by the non-branch itineraries
25 def P7_DU2 : FuncUnit;
26 def P7_DU3 : FuncUnit;
27 def P7_DU4 : FuncUnit;
28 def P7_DU5 : FuncUnit; // DU5-DU6: slots used by the branch itineraries
29 def P7_DU6 : FuncUnit;
30
31 def P7_LS1 : FuncUnit; // Load/Store pipeline 1 (also accepts simple FX ops)
32 def P7_LS2 : FuncUnit; // Load/Store pipeline 2 (also accepts simple FX ops)
33
34 def P7_FX1 : FuncUnit; // FX (fixed-point) pipeline 1
35 def P7_FX2 : FuncUnit; // FX (fixed-point) pipeline 2
36
37 // VS pipeline 1 (vector integer ops. are always issued here)
38 def P7_VS1 : FuncUnit; // VS pipeline 1
39 // VS pipeline 2 (128-bit stores and permutes are issued here)
40 def P7_VS2 : FuncUnit; // VS pipeline 2
41
42 def P7_CRU : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
43 def P7_BRU : FuncUnit; // BR (branch) unit
44
45 // Notes:
46 // Each LSU pipeline can also execute FX add and logical instructions.
47 // Each LSU pipeline can complete a load or store in one cycle.
48 //
49 // Each store is broken into two parts, AGEN goes to the LSU while a
50 // "data steering" op. goes to the FXU or VSU.
51 //
52 // FX loads have a two cycle load-to-use latency (so one "bubble" cycle).
53 // VSU loads have a three cycle load-to-use latency (so two "bubble" cycles).
54 //
55 // Frequent FX ops. take only one cycle and results can be used again in the
56 // next cycle (there is a self-bypass). Getting results from the other FX
57 // pipeline takes an additional cycle.
58 //
59 // The VSU XS is similar to the POWER6, but with a pipeline length of 2 cycles
60 // (instead of 3 cycles on the POWER6). VSU XS handles vector FX-style ops.
61 // Dispatch of an instruction to VS1 that uses four single prec. inputs
62 // (either to a float or XC op.) prevents dispatch in that cycle to VS2 of any
63 // floating point instruction.
64 //
65 // The VSU PM is similar to the POWER6, but with a pipeline length of 3 cycles
66 // (instead of 4 cycles on the POWER6). vsel is handled by the PM pipeline
67 // (unlike on the POWER6).
68 //
69 // FMA from the VSUs can forward results in 6 cycles. VS1 XS and vector FP
70 // share the same write-back, and have a 5-cycle latency difference, so the
71 // IFU/IDU will not dispatch an XS instruction 5 cycles after a vector FP
72 // op. has been dispatched to VS1.
73 //
74 // Three cycles after an L1 cache hit, a dependent VSU instruction can issue.
75 //
76 // Instruction dispatch groups have (at most) four non-branch instructions, and
77 // two branches. Unlike on the POWER4/5, a branch does not automatically
78 // end the dispatch group, but a second branch must be the last in the group.
79
80 def P7Itineraries : ProcessorItineraries< // Itinerary-class data for the POWER7: functional units, bypasses, per-class stages
81 [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6,
82 P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [ // second list (bypasses) is empty
83 InstrItinData<IIC_IntSimple , [InstrStage<1, [P7_DU1, P7_DU2,
84 P7_DU3, P7_DU4], 0>, // third InstrStage argument 0: the next stage starts in the same cycle
85 InstrStage<1, [P7_FX1, P7_FX2,
86 P7_LS1, P7_LS2]>], // simple FX ops may also use either LSU pipeline
87 [1, 1, 1]>, // operand cycles, in operand order
88 InstrItinData<IIC_IntGeneral , [InstrStage<1, [P7_DU1, P7_DU2,
89 P7_DU3, P7_DU4], 0>,
90 InstrStage<1, [P7_FX1, P7_FX2]>],
91 [1, 1, 1]>,
92 InstrItinData<IIC_IntCompare , [InstrStage<1, [P7_DU1, P7_DU2,
93 P7_DU3, P7_DU4], 0>,
94 InstrStage<1, [P7_FX1, P7_FX2]>],
95 [1, 1, 1]>,
96 // FIXME: Add record-form itinerary data.
97 InstrItinData<IIC_IntDivW , [InstrStage<1, [P7_DU1], 0>, // separate DU1 and DU2 stages: both slots are reserved
98 InstrStage<1, [P7_DU2], 0>,
99 InstrStage<36, [P7_FX1, P7_FX2]>], // holds one FX pipeline for 36 cycles
100 [36, 1, 1]>,
101 InstrItinData<IIC_IntDivD , [InstrStage<1, [P7_DU1], 0>,
102 InstrStage<1, [P7_DU2], 0>,
103 InstrStage<68, [P7_FX1, P7_FX2]>], // holds one FX pipeline for 68 cycles
104 [68, 1, 1]>,
105 InstrItinData<IIC_IntMulHW , [InstrStage<1, [P7_DU1, P7_DU2,
106 P7_DU3, P7_DU4], 0>,
107 InstrStage<1, [P7_FX1, P7_FX2]>],
108 [4, 1, 1]>,
109 InstrItinData<IIC_IntMulHWU , [InstrStage<1, [P7_DU1, P7_DU2,
110 P7_DU3, P7_DU4], 0>,
111 InstrStage<1, [P7_FX1, P7_FX2]>],
112 [4, 1, 1]>,
113 InstrItinData<IIC_IntMulLI , [InstrStage<1, [P7_DU1, P7_DU2,
114 P7_DU3, P7_DU4], 0>,
115 InstrStage<1, [P7_FX1, P7_FX2]>],
116 [4, 1, 1]>,
117 InstrItinData<IIC_IntRotate , [InstrStage<1, [P7_DU1, P7_DU2,
118 P7_DU3, P7_DU4], 0>,
119 InstrStage<1, [P7_FX1, P7_FX2]>],
120 [1, 1, 1]>,
121 InstrItinData<IIC_IntRotateD , [InstrStage<1, [P7_DU1, P7_DU2,
122 P7_DU3, P7_DU4], 0>,
123 InstrStage<1, [P7_FX1, P7_FX2]>],
124 [1, 1, 1]>,
125 InstrItinData<IIC_IntShift , [InstrStage<1, [P7_DU1, P7_DU2,
126 P7_DU3, P7_DU4], 0>,
127 InstrStage<1, [P7_FX1, P7_FX2]>],
128 [1, 1, 1]>,
129 InstrItinData<IIC_IntTrapW , [InstrStage<1, [P7_DU1, P7_DU2,
130 P7_DU3, P7_DU4], 0>,
131 InstrStage<1, [P7_FX1, P7_FX2]>],
132 [1, 1]>,
133 InstrItinData<IIC_IntTrapD , [InstrStage<1, [P7_DU1, P7_DU2,
134 P7_DU3, P7_DU4], 0>,
135 InstrStage<1, [P7_FX1, P7_FX2]>],
136 [1, 1]>,
137 InstrItinData<IIC_BrB , [InstrStage<1, [P7_DU5, P7_DU6], 0>, // branches dispatch through DU5/DU6
138 InstrStage<1, [P7_BRU]>],
139 [3, 1, 1]>,
140 InstrItinData<IIC_BrCR , [InstrStage<1, [P7_DU1], 0>,
141 InstrStage<1, [P7_CRU]>],
142 [3, 1, 1]>,
143 InstrItinData<IIC_BrMCR , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
144 InstrStage<1, [P7_BRU]>],
145 [3, 1, 1]>,
146 InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
147 InstrStage<1, [P7_BRU]>],
148 [3, 1, 1]>, // NOTE(review): IIC_BrMCRX has a second, different entry further down (marked "mtcr"); confirm which one is intended
149 InstrItinData<IIC_LdStLoad , [InstrStage<1, [P7_DU1, P7_DU2,
150 P7_DU3, P7_DU4], 0>,
151 InstrStage<1, [P7_LS1, P7_LS2]>],
152 [2, 1, 1]>, // FX loads: two-cycle load-to-use latency
153 InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P7_DU1], 0>,
154 InstrStage<1, [P7_DU2], 0>,
155 InstrStage<1, [P7_LS1, P7_LS2], 0>,
156 InstrStage<1, [P7_FX1, P7_FX2]>],
157 [2, 2, 1, 1]>,
158 InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P7_DU1], 0>,
159 InstrStage<1, [P7_DU2], 0>,
160 InstrStage<1, [P7_DU3], 0>,
161 InstrStage<1, [P7_DU4], 0>,
162 InstrStage<1, [P7_FX1, P7_FX2]>,
163 InstrStage<1, [P7_LS1, P7_LS2], 0>,
164 InstrStage<1, [P7_FX1, P7_FX2]>],
165 [3, 3, 1, 1]>,
166 InstrItinData<IIC_LdStLD , [InstrStage<1, [P7_DU1, P7_DU2,
167 P7_DU3, P7_DU4], 0>,
168 InstrStage<1, [P7_LS1, P7_LS2]>],
169 [2, 1, 1]>,
170 InstrItinData<IIC_LdStLDU , [InstrStage<1, [P7_DU1], 0>,
171 InstrStage<1, [P7_DU2], 0>,
172 InstrStage<1, [P7_LS1, P7_LS2], 0>,
173 InstrStage<1, [P7_FX1, P7_FX2]>],
174 [2, 2, 1, 1]>,
175 InstrItinData<IIC_LdStLDUX , [InstrStage<1, [P7_DU1], 0>,
176 InstrStage<1, [P7_DU2], 0>,
177 InstrStage<1, [P7_DU3], 0>,
178 InstrStage<1, [P7_DU4], 0>,
179 InstrStage<1, [P7_FX1, P7_FX2]>,
180 InstrStage<1, [P7_LS1, P7_LS2], 0>,
181 InstrStage<1, [P7_FX1, P7_FX2]>],
182 [3, 3, 1, 1]>,
183 InstrItinData<IIC_LdStLFD , [InstrStage<1, [P7_DU1, P7_DU2,
184 P7_DU3, P7_DU4], 0>,
185 InstrStage<1, [P7_LS1, P7_LS2]>],
186 [3, 1, 1]>, // VSU loads: three-cycle load-to-use latency
187 InstrItinData<IIC_LdStLVecX , [InstrStage<1, [P7_DU1, P7_DU2,
188 P7_DU3, P7_DU4], 0>,
189 InstrStage<1, [P7_LS1, P7_LS2]>],
190 [3, 1, 1]>,
191 InstrItinData<IIC_LdStLFDU , [InstrStage<1, [P7_DU1], 0>,
192 InstrStage<1, [P7_DU2], 0>,
193 InstrStage<1, [P7_LS1, P7_LS2], 0>,
194 InstrStage<1, [P7_FX1, P7_FX2]>],
195 [3, 3, 1, 1]>,
196 InstrItinData<IIC_LdStLFDUX , [InstrStage<1, [P7_DU1], 0>, // NOTE(review): unlike the other *UX load entries, this one reserves no DU3/DU4 and has no leading FX stage; confirm intent
197 InstrStage<1, [P7_DU2], 0>,
198 InstrStage<1, [P7_LS1, P7_LS2], 0>,
199 InstrStage<1, [P7_FX1, P7_FX2]>],
200 [3, 3, 1, 1]>,
201 InstrItinData<IIC_LdStLHA , [InstrStage<1, [P7_DU1], 0>,
202 InstrStage<1, [P7_DU2], 0>,
203 InstrStage<1, [P7_LS1, P7_LS2]>,
204 InstrStage<1, [P7_FX1, P7_FX2]>], // FX stage follows the LSU stage
205 [3, 1, 1]>,
206 InstrItinData<IIC_LdStLHAU , [InstrStage<1, [P7_DU1], 0>,
207 InstrStage<1, [P7_DU2], 0>,
208 InstrStage<1, [P7_LS1, P7_LS2], 0>,
209 InstrStage<1, [P7_FX1, P7_FX2]>,
210 InstrStage<1, [P7_FX1, P7_FX2]>],
211 [4, 4, 1, 1]>,
212 InstrItinData<IIC_LdStLHAUX , [InstrStage<1, [P7_DU1], 0>,
213 InstrStage<1, [P7_DU2], 0>,
214 InstrStage<1, [P7_DU3], 0>,
215 InstrStage<1, [P7_DU4], 0>,
216 InstrStage<1, [P7_FX1, P7_FX2]>,
217 InstrStage<1, [P7_LS1, P7_LS2], 0>,
218 InstrStage<1, [P7_FX1, P7_FX2]>,
219 InstrStage<1, [P7_FX1, P7_FX2]>],
220 [4, 4, 1, 1]>,
221 InstrItinData<IIC_LdStLWA , [InstrStage<1, [P7_DU1], 0>,
222 InstrStage<1, [P7_DU2], 0>,
223 InstrStage<1, [P7_LS1, P7_LS2]>,
224 InstrStage<1, [P7_FX1, P7_FX2]>],
225 [3, 1, 1]>,
226 InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P7_DU1], 0>, // reserves all four non-branch dispatch slots
227 InstrStage<1, [P7_DU2], 0>,
228 InstrStage<1, [P7_DU3], 0>,
229 InstrStage<1, [P7_DU4], 0>,
230 InstrStage<1, [P7_LS1, P7_LS2]>],
231 [3, 1, 1]>,
232 InstrItinData<IIC_LdStLDARX, [InstrStage<1, [P7_DU1], 0>, // reserves all four non-branch dispatch slots
233 InstrStage<1, [P7_DU2], 0>,
234 InstrStage<1, [P7_DU3], 0>,
235 InstrStage<1, [P7_DU4], 0>,
236 InstrStage<1, [P7_LS1, P7_LS2]>],
237 [3, 1, 1]>,
238 InstrItinData<IIC_LdStLMW , [InstrStage<1, [P7_DU1, P7_DU2,
239 P7_DU3, P7_DU4], 0>,
240 InstrStage<1, [P7_LS1, P7_LS2]>],
241 [2, 1, 1]>,
242 InstrItinData<IIC_LdStStore , [InstrStage<1, [P7_DU1, P7_DU2,
243 P7_DU3, P7_DU4], 0>,
244 InstrStage<1, [P7_LS1, P7_LS2], 0>, // store AGEN part goes to the LSU...
245 InstrStage<1, [P7_FX1, P7_FX2]>], // ...and the "data steering" part to the FXU
246 [1, 1, 1]>,
247 InstrItinData<IIC_LdStSTD , [InstrStage<1, [P7_DU1, P7_DU2,
248 P7_DU3, P7_DU4], 0>,
249 InstrStage<1, [P7_LS1, P7_LS2], 0>,
250 InstrStage<1, [P7_FX1, P7_FX2]>],
251 [1, 1, 1]>,
252 InstrItinData<IIC_LdStSTDU , [InstrStage<1, [P7_DU1], 0>,
253 InstrStage<1, [P7_DU2], 0>,
254 InstrStage<1, [P7_LS1, P7_LS2], 0>,
255 InstrStage<1, [P7_FX1, P7_FX2]>,
256 InstrStage<1, [P7_FX1, P7_FX2]>],
257 [2, 1, 1, 1]>,
258 InstrItinData<IIC_LdStSTDUX , [InstrStage<1, [P7_DU1], 0>,
259 InstrStage<1, [P7_DU2], 0>,
260 InstrStage<1, [P7_DU3], 0>,
261 InstrStage<1, [P7_DU4], 0>,
262 InstrStage<1, [P7_LS1, P7_LS2], 0>,
263 InstrStage<1, [P7_FX1, P7_FX2]>,
264 InstrStage<1, [P7_FX1, P7_FX2]>],
265 [2, 1, 1, 1]>,
266 InstrItinData<IIC_LdStSTFD , [InstrStage<1, [P7_DU1, P7_DU2,
267 P7_DU3, P7_DU4], 0>,
268 InstrStage<1, [P7_LS1, P7_LS2], 0>,
269 InstrStage<1, [P7_VS1, P7_VS2]>], // data steering for this store goes to the VSU
270 [1, 1, 1]>,
271 InstrItinData<IIC_LdStSTFDU , [InstrStage<1, [P7_DU1], 0>,
272 InstrStage<1, [P7_DU2], 0>,
273 InstrStage<1, [P7_LS1, P7_LS2], 0>,
274 InstrStage<1, [P7_FX1, P7_FX2], 0>,
275 InstrStage<1, [P7_VS1, P7_VS2]>],
276 [2, 1, 1, 1]>,
277 InstrItinData<IIC_LdStSTVEBX , [InstrStage<1, [P7_DU1, P7_DU2,
278 P7_DU3, P7_DU4], 0>,
279 InstrStage<1, [P7_LS1, P7_LS2], 0>,
280 InstrStage<1, [P7_VS2]>], // 128-bit stores use VS pipeline 2 only
281 [1, 1, 1]>,
282 InstrItinData<IIC_LdStSTDCX , [InstrStage<1, [P7_DU1], 0>, // reserves all four non-branch dispatch slots
283 InstrStage<1, [P7_DU2], 0>,
284 InstrStage<1, [P7_DU3], 0>,
285 InstrStage<1, [P7_DU4], 0>,
286 InstrStage<1, [P7_LS1, P7_LS2]>],
287 [1, 1, 1]>,
288 InstrItinData<IIC_LdStSTWCX , [InstrStage<1, [P7_DU1], 0>, // reserves all four non-branch dispatch slots
289 InstrStage<1, [P7_DU2], 0>,
290 InstrStage<1, [P7_DU3], 0>,
291 InstrStage<1, [P7_DU4], 0>,
292 InstrStage<1, [P7_LS1, P7_LS2]>],
293 [1, 1, 1]>,
294 InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU1], 0>,
295 InstrStage<1, [P7_DU2], 0>,
296 InstrStage<1, [P7_DU3], 0>,
297 InstrStage<1, [P7_DU4], 0>,
298 InstrStage<1, [P7_CRU]>,
299 InstrStage<1, [P7_FX1, P7_FX2]>],
300 [3, 1]>, // mtcr. NOTE(review): second entry for IIC_BrMCRX (an earlier one uses DU5/DU6 + BRU); confirm which entry TableGen applies
301 InstrItinData<IIC_SprMFCR , [InstrStage<1, [P7_DU1], 0>,
302 InstrStage<1, [P7_CRU]>],
303 [6, 1]>,
304 InstrItinData<IIC_SprMFCRF , [InstrStage<1, [P7_DU1], 0>,
305 InstrStage<1, [P7_CRU]>],
306 [3, 1]>,
307 InstrItinData<IIC_SprMTSPR , [InstrStage<1, [P7_DU1], 0>,
308 InstrStage<1, [P7_FX1]>], // FX pipeline 1 only
309 [4, 1]>, // mtctr
310 InstrItinData<IIC_FPGeneral , [InstrStage<1, [P7_DU1, P7_DU2,
311 P7_DU3, P7_DU4], 0>,
312 InstrStage<1, [P7_VS1, P7_VS2]>],
313 [5, 1, 1]>,
314 InstrItinData<IIC_FPCompare , [InstrStage<1, [P7_DU1, P7_DU2,
315 P7_DU3, P7_DU4], 0>,
316 InstrStage<1, [P7_VS1, P7_VS2]>],
317 [8, 1, 1]>,
318 InstrItinData<IIC_FPDivD , [InstrStage<1, [P7_DU1, P7_DU2,
319 P7_DU3, P7_DU4], 0>,
320 InstrStage<1, [P7_VS1, P7_VS2]>], // VS unit is held for 1 cycle only; the long latency is carried by the operand cycles
321 [33, 1, 1]>,
322 InstrItinData<IIC_FPDivS , [InstrStage<1, [P7_DU1, P7_DU2,
323 P7_DU3, P7_DU4], 0>,
324 InstrStage<1, [P7_VS1, P7_VS2]>],
325 [27, 1, 1]>,
326 InstrItinData<IIC_FPSqrtD , [InstrStage<1, [P7_DU1, P7_DU2,
327 P7_DU3, P7_DU4], 0>,
328 InstrStage<1, [P7_VS1, P7_VS2]>],
329 [44, 1, 1]>,
330 InstrItinData<IIC_FPSqrtS , [InstrStage<1, [P7_DU1, P7_DU2,
331 P7_DU3, P7_DU4], 0>,
332 InstrStage<1, [P7_VS1, P7_VS2]>],
333 [32, 1, 1]>,
334 InstrItinData<IIC_FPFused , [InstrStage<1, [P7_DU1, P7_DU2,
335 P7_DU3, P7_DU4], 0>,
336 InstrStage<1, [P7_VS1, P7_VS2]>],
337 [5, 1, 1, 1]>, // four operand cycles: one more operand than the other FP entries
338 InstrItinData<IIC_FPRes , [InstrStage<1, [P7_DU1, P7_DU2,
339 P7_DU3, P7_DU4], 0>,
340 InstrStage<1, [P7_VS1, P7_VS2]>],
341 [5, 1, 1]>,
342 InstrItinData<IIC_VecGeneral , [InstrStage<1, [P7_DU1], 0>,
343 InstrStage<1, [P7_VS1]>], // vector integer ops always use VS pipeline 1
344 [2, 1, 1]>,
345 InstrItinData<IIC_VecVSL , [InstrStage<1, [P7_DU1], 0>,
346 InstrStage<1, [P7_VS1]>],
347 [2, 1, 1]>,
348 InstrItinData<IIC_VecVSR , [InstrStage<1, [P7_DU1], 0>,
349 InstrStage<1, [P7_VS1]>],
350 [2, 1, 1]>,
351 InstrItinData<IIC_VecFP , [InstrStage<1, [P7_DU1], 0>,
352 InstrStage<1, [P7_VS1, P7_VS2]>],
353 [6, 1, 1]>,
354 InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P7_DU1], 0>,
355 InstrStage<1, [P7_VS1, P7_VS2]>],
356 [6, 1, 1]>,
357 InstrItinData<IIC_VecFPRound , [InstrStage<1, [P7_DU1], 0>,
358 InstrStage<1, [P7_VS1, P7_VS2]>],
359 [6, 1, 1]>,
360 InstrItinData<IIC_VecComplex , [InstrStage<1, [P7_DU1], 0>,
361 InstrStage<1, [P7_VS1]>],
362 [7, 1, 1]>,
363 InstrItinData<IIC_VecPerm , [InstrStage<1, [P7_DU1, P7_DU2], 0>,
364 InstrStage<1, [P7_VS2]>], // permutes use VS pipeline 2 only
365 [3, 1, 1]>
366 ]>;
367
368 // ===---------------------------------------------------------------------===//
369 // P7 machine model for scheduling and other instruction cost heuristics.
370
371 def P7Model : SchedMachineModel {
372 let IssueWidth = 6; // Dispatch bundle size: 4 non-branch instructions plus
373 // up to 2 branches are dispatched per cycle.
374 // Note that the total internal issue bandwidth per
375 // cycle (from all queues) is 8.
376
377 let MinLatency = 0; // Out-of-order dispatch.
378 let LoadLatency = 3; // Optimistic load latency assuming bypass.
379 // This is overridden by OperandCycles if the
380 // Itineraries are queried instead.
381 let MispredictPenalty = 16; // Penalty charged for a mispredicted branch (presumably in cycles; TODO confirm).
382
383 // Try to make sure we have at least 10 dispatch groups in a loop.
384 let LoopMicroOpBufferSize = 40; // presumably 10 groups x 4 non-branch slots; TODO confirm
385
386 let Itineraries = P7Itineraries; // Attach the P7 itinerary data to this model.
387 }
388