1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ; the following defines control the operation of the macros below and
32 ; need to be defined in the including file
33 ; KEY_ROUNDS - number of key rounds needed based on key length: 128bit - 11, 192bit - 13 or 256bit - 15
34 ; EARLY_BLOCKS - number of data blocks to load before starting computations
35 ; PARALLEL_BLOCKS - number of blocks of data to process in parallel also the number of xmm regs to reserve for data
36 ; IV_CNT - number of xmm regs to use for IV data valid values of 0 or 1
37 ; TMP_CNT - number of tmp xmm register to reserve
38 ; XMM_USAGE - number of xmm registers to use. must be at least the same as PARALLEL_BLOCKS + 2
41 %include "reg_sizes.asm"
48 ; the following instructions set specific macros must be defined in the user file
49 ; to make use of the AES macros below
50 ; MOVDQ - move from memory to xmm reg
51 ; PXOR - XOR of two xmm registers pxor
52 ; AES_DEC - AES block decode for early key rounds
53 ; AES_DEC_LAST - AES block decode for last key round
55 ; AES_ENC - AES block encode for early key rounds
56 ; AES_ENC_LAST - AES block encode for last key round
58 ; xmm register indexes are assigned in this order: data blocks, IV holder, temps, cached round keys
59 ; CKEY_CNT is XMM_USAGE - PARALLEL_BLOCKS - IV_CNT - TMP_CNT, i.e. the xmm regs left over for caching keys
; FIRST_XDATA: index of the first data-block register
60 %assign FIRST_XDATA (0)
; IV_IDX: index of the IV register, placed right after the PARALLEL_BLOCKS data registers
61 %assign IV_IDX (FIRST_XDATA + PARALLEL_BLOCKS)
; TMP: index of the first temp register, placed after the IV_CNT IV registers
65 %assign TMP (IV_IDX + IV_CNT)
; FIRST_CKEY: index of the first cached-key register, placed after the TMP_CNT temp registers
67 %assign FIRST_CKEY (TMP + TMP_CNT)
68 %assign CKEY_CNT (XMM_USAGE - (PARALLEL_BLOCKS + IV_CNT + TMP_CNT))
70 ; Abstract xmm register usages that identify the expected contents of the register.
; All four expand identically to xmm<i>; the different names only document intent at the use site.
71 %define reg(i) xmm %+ i
72 %define XDATA(i) xmm %+ i
73 %define KEY_REG(i) xmm %+ i
74 %define IV_REG(i) xmm %+ i
83 ; AES CBC ENCODE MACROS
89 ; Decrypts a number of blocks using the AES_PARALLEL_ENC_BLOCKS macro,
90 ; finalizes the CBC decryption and saves the results in the output,
91 ; leaves the last block's cipher text in XDATA(IV_IDX) for the next buffer,
92 ; and updates the index and the number of bytes left.
;
; The invocation below forwards sixteen values to AES_PARALLEL_ENC_BLOCKS.
; On exit %%IDX has been advanced by %%num_blocks*16 and %%LEN reduced by the same amount.
; Uses the file-level IV_IDX index and the XDATA() register naming macro.
94 %macro CBC_DECRYPT_BLOCKS 17
95 %define %%TOT_ROUNDS %1
96 %define %%num_blocks %2 ; can be 0..13
97 %define %%EARLY_LOADS %3 ; number of data blocks to load before processing
101 %define %%AES_DEC_LAST %7
102 %define %%CACHED_KEYS %8 ; number of key data cached in xmm regs
104 %define %%TMP_CNT %10
105 %define %%FIRST_CKEY %11
106 %define %%KEY_DATA %12
107 %define %%FIRST_XDATA %13
108 %define %%IN %14 ; input data
109 %define %%OUT %15 ; output data
110 %define %%IDX %16 ; index into input and output data buffers
113 AES_PARALLEL_ENC_BLOCKS %%TOT_ROUNDS, %%num_blocks, %%EARLY_LOADS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST, %%CACHED_KEYS, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%KEY_DATA, %%FIRST_XDATA, %%IN, %%OUT, %%IDX
116 ; XOR the result of each block's decrypt with the previous block's cipher text (C)
120 %%PXOR XDATA(i), XDATA(IV_IDX) ; XOR result with previous block's C
121 %%MOVDQ [%%OUT + %%IDX + i*16], XDATA(i) ; save plain text to out
; Address the cipher text through the %%IDX argument (same index as the store above)
; rather than a file-level IDX symbol, so the macro works whatever the caller names its index.
122 %%MOVDQ XDATA(IV_IDX), [%%IN + %%IDX + i*16] ; load IV with current block C
126 add %%IDX, %%num_blocks*16
127 sub %%LEN, %%num_blocks*16
133 ; XOR first data block with the IV data
; Loads the block at [%%IN + %%IDX] into XDATA(%%P_FIRST), loads the IV from
; memory at [%%IV] into reg(%%IV_IDX), then XORs the IV into the data block.
134 %macro CBC_ENC_INIT 7
140 %define %%IN %6 ; input data
141 %define %%IDX %7 ; index into input and output data buffers
; first data block of the buffer (offset 0 from the current index)
143 %%MOVDQ XDATA(%%P_FIRST), [%%IN + %%IDX + 0*16]
; fetch the IV from memory into its xmm register
144 %%MOVDQ reg(%%IV_IDX), [%%IV]
; first block ^= IV
145 %%PXOR XDATA(%%P_FIRST), reg(%%IV_IDX)
150 ; LEN is length of data remaining
151 ; IDX is offset into the data buffer
154 ; if data > 16 load next block into the next XDATA reg (XDATA(p_next))
155 ; load first uncached key into TMP0 (if any)
156 ; AES block encrypt XDATA(P_FIRST)
157 ; if data > 16 XOR next block (XDATA(p_next)) with current (XDATA(P_FIRST))
158 ; save current (XDATA(P_FIRST))
159 ; update indexes for P_FIRST
;
; p_first / p_next are assembly-time register indexes: p_first is the block
; being encrypted, p_next receives the following block so it can be chained
; (XORed with the cipher text just produced) before its own encryption.
162 %macro CBC_ENC_SUBLOOP 17
163 %define %%TOT_ROUNDS %1
164 %define %%BLOCKS %2 ; can be 1...14
165 %define %%START_DATA %3
169 %define %%AES_DEC_LAST %7
172 %define %%FIRST_CKEY %10
173 %define %%CKEY_CNT %11
175 %define %%CACHED_KEYS %13
176 %define %%IN %14 ; input data
177 %define %%OUT %15 ; output data
178 %define %%IDX %16 ; index into input and output data buffers
183 %assign p_first %%START_DATA
184 %assign p_next (p_first+1)
185 ; for number of blocks to be processed in a loop
188 ; if data > 16 load next block into the next XDATA reg (XDATA(p_next))
192 %%MOVDQ XDATA(p_next), [%%IN + %%IDX + next_blk*16]
; encrypt the current block in place in XDATA(p_first)
196 AES_ENC_BLOCKS %%TOT_ROUNDS, p_first, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%CKEY_CNT, %%KEYS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST
198 ; if data > 16 XOR next block (XDATA(p_next)) with current (XDATA(p_first))
201 je %$skip_next_blk_start
; CBC chaining: next plain text block ^= cipher text just produced
202 %%PXOR XDATA(p_next), XDATA(p_first)
203 %$skip_next_blk_start:
206 ; save current (XDATA(p_first))
207 %%MOVDQ [%%OUT + %%IDX + this_blk*16], XDATA(p_first)
208 ; update indexes for p_first
212 %if (blk < %%BLOCKS) ; only insert jz if NOT last block
; leave the sub-loop early when ZF is set
214 jz %%END_CBC_ENC_SUBLOOP
215 %endif ; (blk < %%BLOCKS)
217 %assign p_first (p_next)
219 %if (blk == %%BLOCKS) ; the last rep loop's read of the next block needs to be into START_DATA
220 %assign p_next (%%START_DATA)
221 %elif (1 == %%BLOCKS)
; single-block configuration: copy the preloaded next block into the START_DATA register
222 %%MOVDQ XDATA(%%START_DATA), XDATA(p_next)
224 %assign p_next (p_next+1)
228 %%END_CBC_ENC_SUBLOOP:
229 %endm ; CBC_ENC_SUBLOOP
234 ; AES BLOCK ENCODE MACROS
240 ; Load key data into the cache key xmm regs
; For a round index rnd below %%CACHED_KEYS, the 16-byte round key at
; [%%KEY_DATA + rnd*16] is loaded into KEY_REG(rnd + %%CKEY_START).
241 %macro FILL_KEY_CACHE 4
242 %define %%CACHED_KEYS %1
243 %define %%CKEY_START %2
244 %define %%KEY_DATA %3
249 %if (rnd < %%CACHED_KEYS) ; find the round's key data
; c = index of the xmm register that caches round rnd's key
250 %assign c (rnd + %%CKEY_START)
251 %%MOVDQ KEY_REG(c), [%%KEY_DATA + rnd*16] ;load sub key into an available register
259 ; pre-loads message data into xmm regs
260 ; updates global 'blocks_loaded' that tracks which data blocks have been loaded
261 ; 'blocks_loaded' is an in/out global and must be declared in the using macro or function
;
; Each invocation emits at most one load: block number 'blocks_loaded' is read
; from [%%IN + %%IDX + blocks_loaded*16] into XDATA(blocks_loaded) and the
; counter is incremented. Once blocks_loaded reaches %%PARALLEL_DATA the
; conditional below emits nothing.
262 %macro SCHEDULE_DATA_LOAD 5
263 %define %%PARALLEL_DATA %1
264 %define %%EARLY_LOADS %2
269 %if (blocks_loaded < %%PARALLEL_DATA)
271 %%MOVDQ XDATA(blocks_loaded), [%%IN + %%IDX + blocks_loaded*16]
272 %assign blocks_loaded (blocks_loaded+1)
273 %endif ; (blocks_loaded < %%PARALLEL_DATA)
274 %endmacro ; SCHEDULE_DATA_LOAD
278 ; prepares key selection: resets 'current_tmp' to the first temp register and,
; when not all round keys are cached, loads the first uncached key into it
279 ; 'current_tmp' is an in/out global and must be declared in the using macro or function
280 %macro INIT_SELECT_KEY 6
281 %define %%TOT_ROUNDS %1
282 %define %%CACHED_KEYS %2
283 %define %%KEY_DATA %3
284 %define %%FIRST_TMP %4
288 %assign current_tmp (%%FIRST_TMP)
289 %if (%%TOT_ROUNDS > %%CACHED_KEYS) ; load the first uncached key into temp reg
; round keys 0..CACHED_KEYS-1 are cached, so round CACHED_KEYS is the first one read from memory
290 %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + %%CACHED_KEYS*16]
291 %endif ; (%%TOT_ROUNDS > %%CACHED_KEYS)
292 %endmacro ; INIT_SELECT_KEY
296 ; determine which xmm reg holds the key data needed or loads it into the temp register if not cached
297 ; 'current_tmp' is an in/out global and must be declared in the using macro or function
; The result is returned in the assembly-time symbol 'key': the index of the
; xmm register holding the key for round %%ROUND.
300 %define %%TOT_ROUNDS %2
301 %define %%CACHED_KEYS %3
302 %define %%FIRST_KEY %4
303 %define %%KEY_DATA %5
304 %define %%FIRST_TMP %6
308 ; find the key data for this round
309 %if (%%ROUND < %%CACHED_KEYS) ; is it cached
; cached keys live in consecutive registers starting at %%FIRST_KEY
310 %assign key (%%ROUND + %%FIRST_KEY)
312 ; Load non-cached key %%ROUND data ping-ponging between temp regs if more than one
313 %assign key (current_tmp) ; use the temp reg selected by the previous step
315 %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + %%ROUND*16] ; load round %%ROUND's key data into the current temp reg
317 %assign next_round (%%ROUND+1)
318 %if (next_round < %%TOT_ROUNDS) ; if more rounds to be done
319 %if (current_tmp == %%FIRST_TMP) ; calc the next temp reg to use
320 %assign current_tmp (current_tmp + 1)
322 %assign current_tmp (%%FIRST_TMP)
323 %endif ; (current_tmp == %%FIRST_TMP)
324 %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + next_round*16] ; prefetch the next round's key data into the other temp reg
326 %endif ; (next_round < %%TOT_ROUNDS)
327 %endif ; (1 < %%TMP_CNT)
328 %endif ; (%%ROUND < %%CACHED_KEYS)
329 %endmacro ; SELECT_KEY
333 ; AES_PARALLEL_ENC_BLOCKS
334 ; preloads some data blocks to be worked on
335 ; starts the AES block processing while loading the other blocks to be done in parallel
336 ; applies each key round to every block before moving on to the next round
;
; Uses the assembly-time globals 'blocks_loaded', 'current_tmp', 'round', 'key' and 'i'.
; The round-0 step is a plain %%PXOR with the key; the final round uses
; %%AES_DEC_LAST and every round in between uses %%AES_DEC.
; The round loop count and the initial temp register come from the macro's own
; arguments (%%KEY_ROUNDS, %%TMP), matching the last-round test and the
; INIT_SELECT_KEY / SELECT_KEY invocations below.
337 %macro AES_PARALLEL_ENC_BLOCKS 16
338 %define %%KEY_ROUNDS %1
339 %define %%PARALLEL_DATA %2
340 %define %%EARLY_LOADS %3
344 %define %%AES_DEC_LAST %7
345 %define %%CACHED_KEYS %8
347 %define %%TMP_CNT %10
348 %define %%FIRST_CKEY %11
349 %define %%KEY_DATA %12
350 %define %%FIRST_XDATA %13
351 %define %%IN %14 ; input data
352 %define %%OUT %15 ; output data
353 %define %%IDX %16 ; index into input and output data buffers
355 %assign blocks_loaded 0
358 SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX ; updates blocks_loaded
359 %endrep ; %%EARLY_LOADS
361 %assign current_tmp (%%TMP)
362 INIT_SELECT_KEY %%KEY_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
366 %rep %%KEY_ROUNDS ; for all key rounds
367 SELECT_KEY round, %%KEY_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
369 %assign i %%FIRST_XDATA
370 %rep %%PARALLEL_DATA ; for each block do the AES block step for this round
372 %%PXOR XDATA(i), KEY_REG(key) ; first round's step
; interleave the remaining data loads with the first round's XORs
373 SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX
375 %elif ( (%%KEY_ROUNDS-1) == round )
376 %%AES_DEC_LAST XDATA(i), KEY_REG(key) ; last round's step
379 %%AES_DEC XDATA(i), KEY_REG(key) ; middle round's (1..last-1) step
383 %endrep ;%%PARALLEL_DATA
384 %assign round (round+1)
386 %endmacro ; AES_PARALLEL_ENC_BLOCKS
392 ; load first uncached key into TMP0 (if any)
393 ; AES block encrypt XDATA(p_first)
394 ; before using uncached key in TMP0, load next key in TMP1
395 ; before using uncached key in TMP1, load next key in TMP0
396 %macro AES_ENC_BLOCKS 11
397 %define %%TOT_ROUNDS %1
398 %define %%ENC_BLOCK %2
401 %define %%FIRST_CKEY %5
402 %define %%CACHED_KEYS %6
403 %define %%KEY_DATA %7
406 %define %%AES_ENC %10
407 %define %%AES_ENC_LAST %11
409 %assign current_tmp (%%TMP)
410 INIT_SELECT_KEY %%TOT_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
413 %assign key (round + %%FIRST_CKEY)
414 %rep %%TOT_ROUNDS ; for all key rounds
415 ; find the key data for this round
416 SELECT_KEY round, %%TOT_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
420 %%PXOR XDATA(%%ENC_BLOCK), KEY_REG(key) ; round zero step
421 %elif ( (%%TOT_ROUNDS-1) == round )
422 %%AES_ENC_LAST XDATA(%%ENC_BLOCK), KEY_REG(key) ; last round's step
424 %%AES_ENC XDATA(%%ENC_BLOCK), KEY_REG(key) ; rounds 1..last-1 step
425 %endif ; (0 == round)
427 %assign round (round+1)